# *** THIS PROJECT IS A COLLABORATION BETWEEN BLESSING NWOGU, DIANA CASTILLO, GILA KOHANBASH, AND RACHEL FINLEY ***

In [165]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
train_df = pd.read_csv("checkpoint_train.csv")
test_df = pd.read_csv("checkpoint_test.csv")    

In [5]:
text_columns = ['overview', 'tagline', 'title', 'all_keywords']

In [10]:
train_df[text_columns].head(1)

Unnamed: 0,overview,tagline,title,all_keywords
0,Karl Childers is a mentally disabled man who h...,A simple man A difficult choice,Sling Blade,independent film repair shop southern death th...


In [7]:
train_df[text_columns] = train_df[text_columns].fillna("").astype(str)
test_df[text_columns] = test_df[text_columns].fillna("").astype(str)

In [8]:
train_df['combined_text'] = train_df['overview'] + train_df['tagline'] + train_df['title'] + train_df['all_keywords']
test_df['combined_text'] = test_df['overview'] + test_df['tagline'] + test_df['title'] + test_df['all_keywords']

In [12]:
test_df['combined_text'][0]

'Uncle Fester has been missing for 25 years An evil doctor finds out and introduces a fake Fester in an attempt to get the Adams Familys money The youngest daughter has some doubts about the new uncle Fester but the fake uncle adapts very well to the strange family Can the doctor carry out her evil plans and take over the Adams Familys fortune?Weird Is RelativeThe Addams Familydead wish vampire black humor uncle eccentric werewolf macabre loan shark accountant'

In [14]:
vectorizer = TfidfVectorizer()
vectorizer.fit(train_df['combined_text'])
tfidf_train_matrix = vectorizer.transform(train_df['combined_text'])
tfidf_test_matrix = vectorizer.transform(test_df['combined_text'])

In [23]:
y_train = train_df['rating']
X_train = tfidf_train_matrix
X_test = tfidf_test_matrix

In [197]:
model = make_pipeline(StandardScaler(with_mean = False), LinearRegression( n_jobs = -1))

In [198]:
model.fit(X_train, y_train)



Pipeline(steps=[('standardscaler', StandardScaler(with_mean=False)),
                ('linearregression', LinearRegression(n_jobs=-1))])

In [26]:
test_df['rating'] = model.predict(X_test)
test_df['movieId'] = test_df['movieId'].astype(int).astype (str)
test_df['userId']= test_df['userId'].astype(int).astype (str)
test_df['userId_movieId'] = test_df['userId'] + '_' + test_df['movieId']
cols = ['userId_movieId', 'rating']
test_df = test_df[cols]

In [27]:
test_df.to_csv('submission.csv', index = False)

Model RMSE: 0.2018

### Now, lets try adding the rest of the features together into a data frame with the TFIDF matrix and see if we can improve score

In [229]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest, f_regression

In [230]:
train_df = pd.read_csv("checkpoint_train.csv")
test_df = pd.read_csv("checkpoint_test.csv")    

In [231]:
text_columns = ['overview', 'tagline', 'title', 'all_keywords']

In [232]:
num_train = train_df.drop(columns = text_columns)
num_test = test_df.drop(columns = text_columns)

In [233]:
train_df[text_columns].head(1)

Unnamed: 0,overview,tagline,title,all_keywords
0,Karl Childers is a mentally disabled man who h...,A simple man A difficult choice,Sling Blade,independent film repair shop southern death th...


In [234]:
train_df[text_columns] = train_df[text_columns].fillna("").astype(str)
test_df[text_columns] = test_df[text_columns].fillna("").astype(str)

In [235]:
train_df['combined_text'] = train_df['overview'] + train_df['tagline'] + train_df['title'] + train_df['all_keywords']
test_df['combined_text'] = test_df['overview'] + test_df['tagline'] + test_df['title'] + test_df['all_keywords']

In [236]:
test_df['combined_text'][0]

'Uncle Fester has been missing for 25 years An evil doctor finds out and introduces a fake Fester in an attempt to get the Adams Familys money The youngest daughter has some doubts about the new uncle Fester but the fake uncle adapts very well to the strange family Can the doctor carry out her evil plans and take over the Adams Familys fortune?Weird Is RelativeThe Addams Familydead wish vampire black humor uncle eccentric werewolf macabre loan shark accountant'

In [237]:
vectorizer = TfidfVectorizer()
vectorizer.fit(train_df['combined_text'])
tfidf_train_matrix = vectorizer.transform(train_df['combined_text'])
tfidf_test_matrix = vectorizer.transform(test_df['combined_text'])

- would be interesting to try different 'n_components' values. it is the amount of columns that the tfidf matrix will truncate into

In [238]:
svd_model = TruncatedSVD(n_components = 21)
svd_matrix_train = svd_model.fit_transform(tfidf_train_matrix)
svd_matrix_test = svd_model.fit_transform(tfidf_test_matrix)

In [239]:
train_df = pd.merge(num_train, pd.DataFrame(svd_matrix_train), left_index = True, right_index = True)
test_df = pd.merge(num_test, pd.DataFrame(svd_matrix_test), left_index = True, right_index = True)

In [240]:
X_train = train_df.drop(labels = 'rating', axis = 1)
y_train = train_df['rating']

X_test = test_df

In [241]:
X_train.head(1)

Unnamed: 0,userId,movieId,budget,popularity,release_date,revenue,runtime,vote_average,vote_count,part_of_collection,...,11,12,13,14,15,16,17,18,19,20
0,10,1358,1000000.0,8.46,1996,24444121.0,135,7.4,109,0,...,0.034951,-0.026155,-0.00564,-0.020971,-0.012126,0.029952,-0.039815,-0.029681,0.007032,-0.028253


- it would be worth trying different models with this approach: Random Forest, Lasso, etc. Diff approaches could use slightly different data engineering approaches as well as their own individual hyperparameter tuning

- setting normalize to true or not also has an effect on the way that the model is assinging weights

In [242]:
model = make_pipeline(StandardScaler(with_mean = False),
                      SelectKBest(f_regression, k = 10),
                      LinearRegression( n_jobs = -1))

In [243]:
model.fit(X_train, y_train)

  correlation_coefficient /= X_norms


Pipeline(steps=[('standardscaler', StandardScaler(with_mean=False)),
                ('selectkbest',
                 SelectKBest(score_func=<function f_regression at 0x0000029941BF9B80>)),
                ('linearregression', LinearRegression(n_jobs=-1))])

In [244]:
test_df['rating'] = model.predict(X_test)
test_df['movieId'] = test_df['movieId'].astype(int).astype (str)
test_df['userId']= test_df['userId'].astype(int).astype (str)
test_df['userId_movieId'] = test_df['userId'] + '_' + test_df['movieId']
cols = ['userId_movieId', 'rating']
test_df = test_df[cols]



In [246]:
test_df.to_csv('submission.csv', index = False)

Model RMSE: 0.1944

#### Feature weights sorted in decending value!!

In [225]:
import seaborn as sns
import matplotlib.pyplot as plt

In [228]:
model['linearregression'].coef_

array([-0.01193822, -0.00253509,  0.00297744,  0.08184927, -0.00140408,
        0.00509809,  0.00144883,  0.00190261,  0.0017924 ,  0.00098468])

In [226]:
''' Get feature coefficients '''
coefs = model['linearregression'].coef_

''' Create a DataFrame with feature names and coefficients '''
df_feature_importances = pd.DataFrame({'Feature': train_df.drop(columns='rating').columns, 'Coefficient': coefs})

''' Sort top 10 values based on coefficient magnitude '''
df_feature_importances = df_feature_importances.reindex(df_feature_importances['Coefficient'].abs().sort_values(ascending=False).index)

''' Get the top 10 highest coefficient columns '''
top_features = df_feature_importances['Feature'].iloc[:10].tolist()

''' Make it pretty, plot the data '''
plt.figure(figsize = (12, 8))
sns.set_style('whitegrid')
sns.barplot(x = 'Feature', y = 'Coefficient', data = df_feature_importances, palette = 'RdYlBu_r')
plt.xticks(rotation = 90)
plt.xlabel('Feature')
plt.ylabel('Coefficient')
plt.title('Linear Regression Feature Importance')
plt.show()

ValueError: All arrays must be of the same length