In [74]:
import pandas as pd
import numpy as np

from sqlalchemy import create_engine
from getpass import getpass

In [75]:
password = getpass()

In [76]:
connection_string = 'mysql+pymysql://root:'+password+'@localhost/sakila'
engine = create_engine(connection_string)

In [77]:
rented_may = pd.DataFrame(engine.execute('''
SELECT film.title, COUNT(rental_date) AS rented
FROM film
LEFT JOIN
    (SELECT film_id, title, rental_date
    FROM film
    JOIN inventory USING (film_id)
    JOIN rental r USING (inventory_id)
    WHERE DATE_FORMAT(CONVERT(rental_date, DATE), '%%Y %%M') = '2005 May') sub
    USING (film_id)
GROUP BY film.title;
'''))
rented_may

Unnamed: 0,title,rented
0,ACADEMY DINOSAUR,2
1,ACE GOLDFINGER,0
2,ADAPTATION HOLES,1
3,AFFAIR PREJUDICE,2
4,AFRICAN EGG,1
...,...,...
995,YOUNG LANGUAGE,0
996,YOUTH KICK,0
997,ZHIVAGO CORE,1
998,ZOOLANDER FICTION,1


In [78]:
rented_may_io = rented_may
rented_may_io['rented'] = rented_may['rented'].apply(lambda x: 1 if x>0 else 0)
rented_may_io

Unnamed: 0,title,rented
0,ACADEMY DINOSAUR,1
1,ACE GOLDFINGER,0
2,ADAPTATION HOLES,1
3,AFFAIR PREJUDICE,1
4,AFRICAN EGG,1
...,...,...
995,YOUNG LANGUAGE,0
996,YOUTH KICK,0
997,ZHIVAGO CORE,1
998,ZOOLANDER FICTION,1


In [79]:
rented_june = pd.DataFrame(engine.execute('''
SELECT film.title, COUNT(rental_date) AS rented
FROM film
LEFT JOIN
    (SELECT film_id, title, rental_date
    FROM film
    JOIN inventory USING (film_id)
    JOIN rental r USING (inventory_id)
    WHERE DATE_FORMAT(CONVERT(rental_date, DATE), '%%Y %%M') = '2005 May') sub
    USING (film_id)
GROUP BY film.title;
'''))

rented_june_io = rented_june
rented_june_io['rented'] = rented_june['rented'].apply(lambda x: 1 if x>0 else 0)

In [80]:
film_bonus = pd.DataFrame(engine.execute('''
SELECT title, special_features
FROM film
ORDER BY title;
'''))
film_bonus.special_features.value_counts(dropna=False)
film_bonus = film_bonus['special_features'].str.get_dummies(sep=',')
film_bonus

Unnamed: 0,Behind the Scenes,Commentaries,Deleted Scenes,Trailers
0,1,0,1,0
1,0,0,1,1
2,0,0,1,1
3,1,1,0,0
4,0,0,1,0
...,...,...,...,...
995,1,0,0,1
996,1,0,0,1
997,0,0,1,0
998,0,0,1,1


In [83]:
film_categories = pd.DataFrame(engine.execute('''
SELECT f.title, c.name, f.rating, f.rental_duration, f.rental_rate
FROM film f
JOIN film_category fa USING (film_id)
JOIN category c USING (category_id)
ORDER BY f.title;
''')).astype(object)
film_categories

Unnamed: 0,title,name,rating,rental_duration,rental_rate
0,ACADEMY DINOSAUR,Documentary,PG,6,0.99
1,ACE GOLDFINGER,Horror,G,3,4.99
2,ADAPTATION HOLES,Documentary,NC-17,7,2.99
3,AFFAIR PREJUDICE,Horror,G,5,2.99
4,AFRICAN EGG,Family,G,6,2.99
...,...,...,...,...,...
995,YOUNG LANGUAGE,Documentary,G,6,0.99
996,YOUTH KICK,Music,NC-17,4,0.99
997,ZHIVAGO CORE,Horror,NC-17,6,0.99
998,ZOOLANDER FICTION,Children,R,5,2.99


In [84]:
film_lens = pd.DataFrame(engine.execute('''
SELECT f.title, f.length
FROM film f;
'''))
film_lens.isna().sum()

title     0
length    0
dtype: int64

In [85]:
# film_nums = pd.concat((film_lens.length, film_rates.rental_rate, film_durations.rental_duration), axis=1)
film_nums = film_lens.length

In [87]:
from sklearn.preprocessing import OneHotEncoder

film_cats = pd.concat((film_bonus, film_categories.drop(['title'], axis=1)), axis=1)
film_cats

encoder = OneHotEncoder(drop='first').fit(pd.DataFrame(film_cats))
encoded = encoder.transform(pd.DataFrame(film_cats)).toarray()

cols = encoder.get_feature_names_out(input_features=film_cats.columns)

onehot_encoded_cats = pd.DataFrame(encoded, columns=cols).astype(object)
onehot_encoded_cats

Unnamed: 0,Behind the Scenes_1,Commentaries_1,Deleted Scenes_1,Trailers_1,name_Animation,name_Children,name_Classics,name_Comedy,name_Documentary,name_Drama,...,rating_NC-17,rating_PG,rating_PG-13,rating_R,rental_duration_4,rental_duration_5,rental_duration_6,rental_duration_7,rental_rate_2.99,rental_rate_4.99
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
996,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
998,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [88]:
from sklearn.model_selection import train_test_split  

X = pd.concat((film_nums, onehot_encoded_cats), axis=1)
y = rented_may_io['rented']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
X_num_train = X_train.select_dtypes(include=np.number).reset_index(drop=True)
X_num_test = X_test.select_dtypes(include=np.number).reset_index(drop=True)
X_cat_train = X_train.select_dtypes(include=object).reset_index(drop=True).astype(float)
X_cat_test = X_test.select_dtypes(include=object).reset_index(drop=True).astype(float)

In [89]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_num_train)
X_train_norm = scaler.transform(X_num_train)
X_train_norm = pd.DataFrame(X_train_norm,columns=X_num_train.columns)

X_test_norm = scaler.transform(X_num_test)
X_test_norm = pd.DataFrame(X_test_norm,columns=X_num_train.columns)

X_train_final = pd.concat((X_train_norm, X_cat_train), axis=1)
X_test_final = pd.concat((X_test_norm, X_cat_test), axis=1)
X_test_final

Unnamed: 0,length,Behind the Scenes_1,Commentaries_1,Deleted Scenes_1,Trailers_1,name_Animation,name_Children,name_Classics,name_Comedy,name_Documentary,...,rating_NC-17,rating_PG,rating_PG-13,rating_R,rental_duration_4,rental_duration_5,rental_duration_6,rental_duration_7,rental_rate_2.99,rental_rate_4.99
0,0.654676,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.949640,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.251799,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.798561,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.402878,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.093525,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
196,0.575540,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
197,0.381295,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
198,0.949640,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [90]:
corr = pd.concat((X_cat_train, y_train) ,axis=1).corr()
corr.rented.sort_values()


rental_duration_7     -0.098173
Trailers_1            -0.080521
name_Travel           -0.065901
name_New              -0.056648
name_Horror           -0.055261
rental_rate_2.99      -0.034740
Behind the Scenes_1   -0.033555
rating_PG             -0.030520
name_Foreign          -0.027231
name_Games            -0.015743
name_Sports           -0.013945
name_Music            -0.011227
name_Children         -0.008430
name_Classics         -0.004880
rating_PG-13          -0.001854
rating_NC-17          -0.000577
rental_rate_4.99       0.001989
rating_R               0.002133
name_Comedy            0.003682
rental_duration_6      0.008550
rental_duration_5      0.019197
Deleted Scenes_1       0.019866
name_Sci-Fi            0.022054
Commentaries_1         0.023859
name_Animation         0.040972
name_Family            0.043946
name_Documentary       0.047864
name_Drama             0.059123
rental_duration_4      0.061352
rented                 1.000000
Name: rented, dtype: float64

In [91]:
from sklearn.linear_model import LogisticRegression

classification = LogisticRegression(random_state=0, solver='saga',
                  multi_class='multinomial').fit(X_train_final, y_train)

predictions = classification.predict(X_train_final)
print(classification.score(X_train_final, y_train))

predictions_test = classification.predict(X_test_final)
print(classification.score(X_test_final, y_test))

0.69375
0.69


In [92]:
y2 = rented_june_io['rented']

X2 = pd.concat((X_train_final, X_test_final), axis=0)
X2

Unnamed: 0,length,Behind the Scenes_1,Commentaries_1,Deleted Scenes_1,Trailers_1,name_Animation,name_Children,name_Classics,name_Comedy,name_Documentary,...,rating_NC-17,rating_PG,rating_PG-13,rating_R,rental_duration_4,rental_duration_5,rental_duration_6,rental_duration_7,rental_rate_2.99,rental_rate_4.99
0,0.258993,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.136691,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.489209,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.179856,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.280576,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.093525,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
196,0.575540,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
197,0.381295,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
198,0.949640,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [93]:
predictions2 = classification.predict(X2)
print(classification.score(X2, y2))         

0.665


In [94]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y2, predictions2)         

array([[  8, 306],
       [ 29, 657]])

In [95]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

precision_score(y2, predictions2)
recall_score(y2, predictions2)

0.9577259475218659

In [104]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=7, weights='uniform')
clf.fit(X_train_final, y_train)
predictions_clf = clf.predict(X2)
clf.score(X2, y2)

0.631

In [105]:
confusion_matrix(y2, predictions_clf)

array([[ 41, 273],
       [ 96, 590]])