# ID2214: Programming for Data Science
## Best Practices in Modelling
### Amir Hossein A. Rahnama

In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

Always remember to set the seed for your experiments so that it is reproducible:

In [4]:
# need to change the seed number to change the random setting to make sure
# that your results do not coincidently work fine
seed_number = 10
np.random.seed(seed_number)

Let us reload and merge data as we did in the last notebook:

In [6]:
# imdb_ratings=pd.read_csv('./data/IMDb ratings.csv',usecols=['weighted_average_vote'])
# imdb_titles=pd.read_csv('./data/IMDb movies.csv', usecols=['title','year','genre'])
# ratings = pd.DataFrame({'Title':imdb_titles.title,
#                     'Release Year':imdb_titles.year,
#                     'Rating': imdb_ratings.weighted_average_vote,
#                     'Genre':imdb_titles.genre})
# ratings.drop_duplicates(subset=['Title','Release Year','Rating'], inplace=True)
# ratings.shape

In [5]:
netflix_overall = pd.read_csv('./data/netflix_titles.csv')

In [7]:
ratings.dropna()
joint_data=ratings.merge(netflix_overall, left_on='Title', right_on='title', how='inner')
joint_data=joint_data.sort_values(by='Rating', ascending=False)

NameError: name 'ratings' is not defined

### Simple recommendation model

In [1]:
#removing stopwords
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
netflix_overall['description'] = netflix_overall['description'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix_description = tfidf.fit_transform(netflix_overall['description'])

#Output the shape of tfidf_matrix
tfidf_matrix_description.shape

NameError: name 'TfidfVectorizer' is not defined

In [None]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix_description, tfidf_matrix_description)

In [None]:
indices = pd.Series(netflix_overall.index, index=netflix_overall['title']).drop_duplicates()

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return netflix_overall['title'].iloc[movie_indices]

In [None]:
get_recommendations('Peaky Blinders')

### Rate prediction

In [None]:
tfidf_genre = CountVectorizer(stop_words='english')

#Replace NaN with an empty string
joint_data['Genre'] = joint_data['Genre'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_genre_matrix = tfidf_genre.fit_transform(joint_data['Genre'])

#Output the shape of tfidf_matrix
tfidf_genre_matrix.shape

In [None]:
genre_dictionary = ['' for i in range(len(list(tfidf_genre.vocabulary_.keys())))]

for gen in tfidf_genre.vocabulary_:
    genre_dictionary[tfidf_genre.vocabulary_[gen]] = gen

In [None]:
tfidf_director = CountVectorizer(stop_words='english')

#Replace NaN with an empty string
joint_data['director'] = joint_data['director'].replace(np.nan, '', regex=True)
#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_director_matrix = tfidf_director.fit_transform(joint_data['director'])

#Output the shape of tfidf_matrix
tfidf_director_matrix.shape

In [None]:
director_dictionary = ['' for i in range(len(list(tfidf_director.vocabulary_.keys())))]

for di in tfidf_director.vocabulary_:
    director_dictionary[tfidf_director.vocabulary_[di]] = di

In [None]:
tfidf_country = CountVectorizer(stop_words='english')

#Replace NaN with an empty string
joint_data['country'] = joint_data['country'].replace(np.nan, '', regex=True)
#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_country_matrix = tfidf_country.fit_transform(joint_data['country'])

#Output the shape of tfidf_matrix
tfidf_country_matrix.shape

In [None]:
country_dictionary = ['' for i in range(len(list(tfidf_country.vocabulary_.keys())))]

for co in tfidf_country.vocabulary_:
    country_dictionary[tfidf_country.vocabulary_[co]] = co

In [None]:
types_data = np.argmax(pd.get_dummies(joint_data['type'], prefix='type').values, axis=1).reshape(-1, 1)

In [None]:
X = np.hstack([tfidf_genre_matrix.toarray(), tfidf_country_matrix.toarray(), tfidf_director_matrix.toarray(), types_data])
y = np.around(joint_data['Rating'].values).astype(int)

In [None]:
column_names = np.hstack((genre_dictionary, country_dictionary, director_dictionary, ['type']))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
reg = Ridge(random_state=seed_number).fit(X_train, y_train)
y_pred = reg.predict(X_test)

In [None]:
print(reg.score(X_train, y_train), mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred))

In [None]:
largest_weights = np.abs(reg.coef_).argsort()[-10:][::-1]

In [None]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(7, 5), dpi=100)

# Example data
people = ('Tom', 'Dick', 'Harry', 'Slim', 'Jim')
y_pos = np.arange(len(largest_weights))

ax.barh(y_pos, reg.coef_[largest_weights], align='center')
ax.set_yticks(y_pos)
ax.set_yticklabels(column_names[largest_weights])
ax.invert_yaxis()  
ax.set_xlabel('Importance')
ax.set_title('Important Features in Ridge Regression Model')

plt.show()

In [None]:
from sklearn import tree

from sklearn.tree import DecisionTreeRegressor

In [None]:
d_tree = DecisionTreeRegressor(max_depth=4, random_state= seed_number).fit(X_train, y_train)
y_pred = d_tree.predict(X_test)

In [None]:
print(reg.score(X_train, y_train), mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred))

In [None]:
import graphviz 

dot_data = tree.export_graphviz(d_tree, out_file=None, 
                                feature_names=column_names,  filled=True, rounded=True) 
graph = graphviz.Source(dot_data)

graph

## Hyperparameter Optimization

## GridSearch

### Linear Model

In [None]:
param_lr = {'alpha': np.linspace(0, 1, 10),
                 'normalize': [True, False],
                 'solver': ['auto', 'cholesky', 'sparse_cg']
                }

score = 'neg_mean_absolute_error'

ridge = GridSearchCV(Ridge(random_state=seed_number), param_grid=param_lr, scoring=score)
ridge.fit(X_train, y_train)

In [None]:
print('Best parameterse for score {}: {}'.format(s, ridge.best_params_)) 

In [None]:
y_pred = ridge.predict(X_test)

In [None]:
print(ridge.score(X_train, y_train), mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred))

#### Decision Tree

In [None]:
param_dt = {"max_depth": np.arange(2, 20), 
             'splitter': ['best', 'random'],
              'min_samples_split': np.arange(2, 5)}

In [None]:
score = 'neg_mean_absolute_error'

dt_tree_reg = GridSearchCV(DecisionTreeRegressor(random_state=seed_number), param_grid=param_dt, scoring=score)
dt_tree_reg.fit(X_train, y_train)
print('Best parameterse for score {}: {}'.format(s, dt_tree_reg.best_params_))

In [None]:
y_pred_dt = dt_tree_reg.predict(X_test)

In [None]:
print(dt_tree_reg.score(X_train, y_train), mean_squared_error(y_test, y_pred_dt), mean_absolute_error(y_test, y_pred_dt))

### Random Search

### Linear Model

In [None]:
# run randomized search
n_iter_search = 20
random_lr = RandomizedSearchCV(Ridge(random_state=seed_number), param_distributions=param_lr,
                                   n_iter=n_iter_search)

random_lr.fit(X_train, y_train)
print('Best parameterse for score {}: {}'.format(s, random_dtree.best_params_))

In [None]:
y_pred_r_lreg = random_lr.predict(X_test)

In [None]:
print(random_lr.score(X_train, y_train), mean_squared_error(y_test, y_pred_r_lreg), mean_absolute_error(y_test, y_pred_r_lreg))

### Decision Tree

In [None]:
# run randomized search
n_iter_search = 20
random_dtree = RandomizedSearchCV(DecisionTreeRegressor(random_state=seed_number), param_distributions=param_dt,
                                   n_iter=n_iter_search)

random_dtree.fit(X_train, y_train)
print('Best parameterse for score {}: {}'.format(s, random_dtree.best_params_))

In [None]:
y_pred_dt_random = random_dtree.predict(X_test)

In [None]:
print(random_dtree.score(X_train, y_train), mean_squared_error(y_test, y_pred_dt_random), mean_absolute_error(y_test, y_pred_dt_random))

Let us visually compare the results of both techniques:

In [None]:
lr_grid_score = ridge.score(X_train, y_train)
lr_random_score = random_lr.score(X_train, y_train)

lr_grid_mse =  mean_squared_error(y_test, y_pred)
lr_random_mse = mean_squared_error(y_test, y_pred_r_lreg)

lr_grid_mae = mean_absolute_error(y_test, y_pred)
lr_random_mae = mean_absolute_error(y_test, y_pred_r_lreg)


lr_grid_all_scores = [lr_grid_score, lr_grid_mse, lr_random_mae]
lr_random_all_scores = [lr_random_score, lr_random_mse, lr_random_mae]

In [None]:
dt_grid_score = dt_tree_reg.score(X_train, y_train)
dt_random_score = random_dtree.score(X_train, y_train)

dt_grid_mse =  mean_squared_error(y_test, y_pred_dt)
dt_random_mse = mean_squared_error(y_test, y_pred_dt_random)

dt_grid_mae = mean_absolute_error(y_test, y_pred_dt)
dt_random_mae =  mean_absolute_error(y_test, y_pred_dt_random)

dt_grid_all_scores = [dt_grid_score, dt_grid_mse, dt_grid_mae]
dt_random_all_scores = [dt_random_score, dt_random_mse, dt_random_mae]

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(15, 5), dpi=100)

N = 3
width = 0.1

ind = np.arange(N)

ax[0].bar(ind + width, lr_grid_all_scores, width, label='Grid')
ax[0].bar(ind - width, lr_random_all_scores, width, label='Random')

ax[0].set_title('Ridge Regression')
ax[0].set_xticks(ind + width / 2)
ax[0].set_xticklabels(('Score', 'MSE', 'MAE'))

ax[0].legend()
ax[0].autoscale_view()


ax[1].bar(ind + width, dt_grid_all_scores, width, label='Grid')
ax[1].bar(ind - width, dt_random_all_scores, width, label='Random')

ax[1].set_title('Decision Tree')
ax[1].set_xticks(ind + width / 2)
ax[1].set_xticklabels(('Score', 'MSE', 'MAE'))

ax[1].legend()
ax[1].autoscale_view()


plt.show()