# Lab - Making predictions with logistic regression
1. Create a query or queries to extract the information you think may be relevant for building the prediction model. It should include some film features and some rental features (X).
2. Create a query to get the list of all unique film titles and a boolean indicating if it was rented (rental_date) in May 2005. (Create new column called - 'rented_in_may'). This will be our TARGET (y) variable.
3. Read the data into a Pandas dataframe. At this point you should have 1000 rows. Number of columns depends on the number of features you chose.
4. Analyze extracted features (X) and transform them. You may need to encode some categorical variables, or scale numerical variables.
5. Create a logistic regression model to predict 'rented_in_may' from the cleaned data.

In [1]:
import pandas as pd
import numpy as np

In [13]:
import pymysql
from sqlalchemy import create_engine
import getpass  # To get the password without showing the input
password = getpass.getpass()

 ········


In [3]:
#1 
connection_string = 'mysql+pymysql://root:' + password + '@localhost/sakila'
engine = create_engine(connection_string)

In [4]:
#using read table
pd.read_sql_table('inventory',engine)

  self.meta.reflect(bind=self.con, only=[table_name], views=True)


Unnamed: 0,inventory_id,film_id,store_id,last_update
0,1,1,1,2006-02-15 05:09:17
1,2,1,1,2006-02-15 05:09:17
2,3,1,1,2006-02-15 05:09:17
3,4,1,1,2006-02-15 05:09:17
4,5,1,2,2006-02-15 05:09:17
...,...,...,...,...
4576,4577,1000,1,2006-02-15 05:09:17
4577,4578,1000,2,2006-02-15 05:09:17
4578,4579,1000,2,2006-02-15 05:09:17
4579,4580,1000,2,2006-02-15 05:09:17


In [5]:
#decided not to keep "original_language_id" because there were lots of nulls
#count distinct to avoid overcounting in the inventory
query = '''SELECT 
    f.film_id,
    f.title,
    f.release_year,
    f.language_id,
    f.rental_duration,
    f.length, 
    f.rental_rate,
    f.rating,
    f.replacement_cost,
    fc.category_id,
    COUNT(DISTINCT i.inventory_id) AS num_inventory_may,
    COUNT(r.rental_id) AS num_rentals_may
FROM 
    film f
LEFT JOIN 
    inventory i ON f.film_id = i.film_id
LEFT JOIN 
    rental r ON i.inventory_id = r.inventory_id
LEFT JOIN
    film_category fc ON f.film_id=fc.film_id
WHERE 
    (r.rental_date IS NULL) OR (r.rental_date >= '2005-05-01' AND r.rental_date < '2005-06-01')
GROUP BY 
    f.film_id, f.title, f.release_year, f.language_id, f.rental_duration, 
    f.length, f.rental_rate, f.rating, f.replacement_cost, fc.category_id;'''

data = pd.read_sql_query(query, engine)



In [6]:
data.head(10)
#data.describe().T

Unnamed: 0,film_id,title,release_year,language_id,rental_duration,length,rental_rate,rating,replacement_cost,category_id,num_inventory_may,num_rentals_may
0,1,ACADEMY DINOSAUR,2006,1,6,86,0.99,PG,20.99,6,3,2
1,3,ADAPTATION HOLES,2006,1,7,50,2.99,NC-17,18.99,6,1,1
2,4,AFFAIR PREJUDICE,2006,1,5,117,2.99,G,26.99,11,2,2
3,5,AFRICAN EGG,2006,1,6,130,2.99,G,22.99,8,1,1
4,6,AGENT TRUMAN,2006,1,3,169,2.99,PG,17.99,9,2,2
5,8,AIRPORT POLLOCK,2006,1,6,54,4.99,R,15.99,11,3,3
6,11,ALAMO VIDEOTAPE,2006,1,6,126,0.99,G,16.99,9,2,2
7,12,ALASKA PHANTOM,2006,1,6,136,0.99,PG,22.99,12,2,2
8,14,ALICE FANTASIA,2006,1,6,94,0.99,NC-17,23.99,4,0,0
9,15,ALIEN CENTER,2006,1,5,46,2.99,NC-17,10.99,9,2,2


In [7]:
data.isna().sum()

film_id              0
title                0
release_year         0
language_id          0
rental_duration      0
length               0
rental_rate          0
rating               0
replacement_cost     0
category_id          0
num_inventory_may    0
num_rentals_may      0
dtype: int64

In [8]:
#Create new column called 'rented_in_may'
query = '''SELECT 
    f.film_id,
    f.title,
    f.release_year,
    f.language_id,
    f.rental_duration,
    f.length, 
    f.rental_rate,
    f.rating,
    f.replacement_cost,
    fc.category_id,
    COUNT(DISTINCT i.inventory_id) AS num_inventory_may,
    COUNT(r.rental_id) AS num_rentals_may,
    max(CASE WHEN r.rental_date BETWEEN '2005-05-01' AND '2005-05-31' THEN 1 ELSE 0 END) AS rented_in_may
FROM 
    film f
LEFT JOIN 
    inventory i ON f.film_id = i.film_id
LEFT JOIN 
    rental r ON i.inventory_id = r.inventory_id
LEFT JOIN
    film_category fc ON f.film_id=fc.film_id   
GROUP BY 
    f.film_id, f.title, f.release_year, f.language_id, f.rental_duration, 
    f.length, f.rental_rate, f.rating, f.replacement_cost, fc.category_id;'''

data = pd.read_sql_query(query, engine)
data.head(3)
#data.shape

Unnamed: 0,film_id,title,release_year,language_id,rental_duration,length,rental_rate,rating,replacement_cost,category_id,num_inventory_may,num_rentals_may,rented_in_may
0,1,ACADEMY DINOSAUR,2006,1,6,86,0.99,PG,20.99,6,8,23,1
1,2,ACE GOLDFINGER,2006,1,3,48,4.99,G,12.99,11,3,7,0
2,3,ADAPTATION HOLES,2006,1,7,50,2.99,NC-17,18.99,6,4,12,0


In [9]:
data.dtypes

film_id                int64
title                 object
release_year           int64
language_id            int64
rental_duration        int64
length                 int64
rental_rate          float64
rating                object
replacement_cost     float64
category_id            int64
num_inventory_may      int64
num_rentals_may        int64
rented_in_may          int64
dtype: object

In [10]:
data['language_id'].value_counts(dropna=False)

data['rating'].value_counts(dropna=False)

rating
PG-13    223
NC-17    210
R        195
PG       194
G        178
Name: count, dtype: int64

In [17]:
data['category_id']=data['category_id'].astype(object)
data['category_id'].dtypes

dtype('O')

# Train test

In [11]:
#always TRAIN TEST BEFORE transforming any data

In [22]:
##x and y##
y = data[['rented_in_may']] #'rented_in_may' is our (y) target
X = data.drop(['rented_in_may','title'], axis=1) #X is all the other columns

##train test split##
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [23]:
X_train_cat = X_train.select_dtypes(include = object)
X_test_cat = X_train.select_dtypes(include = object)
X_train_num = X_train.select_dtypes(include = np.number)
X_test_num = X_test.select_dtypes(include = np.number)

# Transform variables
You may need to encode some categorical variables, or scale numerical variables.

In [24]:
##encoding FILM CAT##
from sklearn.preprocessing import OneHotEncoder

In [25]:
#Encoding TRAIN values
encoder = OneHotEncoder(drop='first').fit(X_train_cat)
cols = encoder.get_feature_names_out(input_features=X_train_cat.columns) #uses the labels by default
X_train_cat_encode = pd.DataFrame(encoder.transform(X_train_cat).toarray(),columns=cols)
X_train_cat_encode.head()

Unnamed: 0,rating_NC-17,rating_PG,rating_PG-13,rating_R,category_id_2,category_id_3,category_id_4,category_id_5,category_id_6,category_id_7,category_id_8,category_id_9,category_id_10,category_id_11,category_id_12,category_id_13,category_id_14,category_id_15,category_id_16
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
#Encoding TEST values
cols = encoder.get_feature_names_out(input_features=X_test_cat.columns) #uses the labels by default
X_test_cat_encode = pd.DataFrame(encoder.transform(X_test_cat).toarray(),columns=cols)

# Scaling min max

In [21]:
#With train data
from sklearn.preprocessing import MinMaxScaler

# Scaling 
transformer = MinMaxScaler().fit(X_train_num) # need to keep transformer
X_train_normalized = transformer.transform(X_train_num)
X_train_norm = pd.DataFrame(X_train_normalized, columns=X_train_num.columns)
X_train_norm

Unnamed: 0,film_id,release_year,language_id,rental_duration,length,rental_rate,replacement_cost,num_inventory_may,num_rentals_may
0,0.174699,0.0,0.0,0.25,0.906475,1.0,0.55,0.375,0.382353
1,0.940763,0.0,0.0,0.75,0.129496,0.0,0.50,0.375,0.264706
2,0.346386,0.0,0.0,1.00,0.287770,0.0,0.00,0.500,0.500000
3,0.961847,0.0,0.0,1.00,0.827338,1.0,0.65,0.625,0.676471
4,0.832329,0.0,0.0,0.50,0.928058,0.0,0.35,0.500,0.411765
...,...,...,...,...,...,...,...,...,...
795,0.728916,0.0,0.0,0.50,0.942446,0.5,0.95,0.500,0.323529
796,0.630522,0.0,0.0,0.25,0.129496,0.0,0.10,0.625,0.558824
797,0.184739,0.0,0.0,0.50,0.079137,0.5,0.15,0.375,0.323529
798,0.093373,0.0,0.0,0.25,0.884892,0.5,0.60,0.375,0.382353


In [27]:
# Scaling 
X_test_normalized = transformer.transform(X_train_num)
X_test_norm = pd.DataFrame(X_train_normalized, columns=X_train_num.columns)
X_test_norm

Unnamed: 0,film_id,release_year,language_id,rental_duration,length,rental_rate,replacement_cost,num_inventory_may,num_rentals_may
0,0.174699,0.0,0.0,0.25,0.906475,1.0,0.55,0.375,0.382353
1,0.940763,0.0,0.0,0.75,0.129496,0.0,0.50,0.375,0.264706
2,0.346386,0.0,0.0,1.00,0.287770,0.0,0.00,0.500,0.500000
3,0.961847,0.0,0.0,1.00,0.827338,1.0,0.65,0.625,0.676471
4,0.832329,0.0,0.0,0.50,0.928058,0.0,0.35,0.500,0.411765
...,...,...,...,...,...,...,...,...,...
795,0.728916,0.0,0.0,0.50,0.942446,0.5,0.95,0.500,0.323529
796,0.630522,0.0,0.0,0.25,0.129496,0.0,0.10,0.625,0.558824
797,0.184739,0.0,0.0,0.50,0.079137,0.5,0.15,0.375,0.323529
798,0.093373,0.0,0.0,0.25,0.884892,0.5,0.60,0.375,0.382353


# Plotting

In [32]:
import matplotlib.pyplot as plt
import seaborn as sns
corr_matrix=X.corr()  
#fig, ax = plt.subplots(figsize=(10, 8))
#ax = sns.heatmap(corr_matrix, annot=True)
plt.show()

ValueError: could not convert string to float: 'PG'

In [30]:
sns.displot(X_train_norm, kde=True)
plt.show()

NameError: name 'sns' is not defined

In [5]:
from sklearn.linear_model import LogisticRegression #log reg will guess already 1 and 0 
LRmodel = LinearRegression()
LRmodel = LRmodel.fit(X_train, y_train)
LRmodel.score(X_test,y_test)

0.6844267283527068

In [33]:
X_train_categorical = X_train.select_dtypes(include = object)
X_train_cat = pd.get_dummies(X_train_categorical, 
                             drop_first=True)
#get_dummies Convert categorical variable into dummy/indicator variables.
#Each variable is converted in as many 0/1 variables as there are different values. Columns in the output are each named after a value; if the input is a DataFrame
X_train_cat.head()

Unnamed: 0,rating_NC-17,rating_PG,rating_PG-13,rating_R,category_id_2,category_id_3,category_id_4,category_id_5,category_id_6,category_id_7,category_id_8,category_id_9,category_id_10,category_id_11,category_id_12,category_id_13,category_id_14,category_id_15,category_id_16
892,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
972,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
749,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
283,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
861,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
