In [7]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV, Lasso
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads later in the notebook.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# Load the pre-split modelling data. Every CSV carries a leftover index column
# ('Unnamed: 0'); the feature files additionally drop two columns that the
# rest of this notebook does not use.
feature_drop_columns = ['Unnamed: 0', 'production_countries', 'genres']
X_train = pd.read_csv('../../data/model_data/X_train.csv').drop(columns=feature_drop_columns)
X_test = pd.read_csv('../../data/model_data/X_test.csv').drop(columns=feature_drop_columns)
y_train = pd.read_csv('../../data/model_data/y_train.csv').drop(columns=['Unnamed: 0'])
y_test = pd.read_csv('../../data/model_data/y_test.csv').drop(columns=['Unnamed: 0'])

# TfidfVectorizer raises on NaN documents, so blank out missing text in BOTH
# splits (previously only the training descriptions were filled, which left
# the test set able to crash at transform time).
for features in (X_train, X_test):
    for text_column in ('title', 'description'):
        features[text_column] = features[text_column].fillna('')

In [9]:
# Column groups consumed by the preprocessing cells.
ohe_columns = [
    "role",
    "type",
]  # categorical columns expanded with pd.get_dummies
tfid_columns = [
    "title",
    "description",
]  # free-text columns vectorised with TfidfVectorizer
poly_columns = [
    "seasons",
    "runtime",
    "release_year",
    "person_id",
]  # not referenced by the cells visible in this notebook

# Feature Engineering

In [10]:
# Build TF-IDF features for the two text columns and swap them in for the raw text.
# Stop words are removed from descriptions only; the title vectorizer is given none.
stop_words = stopwords.words('english')
ct_count_vec = ColumnTransformer(
    [
        ('tfid_t', TfidfVectorizer(ngram_range=(1, 1), min_df=2), tfid_columns[0]),
        ('tfid_d', TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 1), min_df=3), tfid_columns[1]),
    ]
)
# The vocabulary is learned from the training split only and reused for test.
ct_count_vec.fit(X_train, y_train)
tfidf_feature_names = ct_count_vec.get_feature_names_out()

# `.toarray()` replaces the deprecated `.A` shortcut on SciPy sparse results.
# Passing the source index keeps the axis=1 concat below row-aligned even if
# the frames do not carry a default RangeIndex.
X_train_cv = pd.DataFrame(
    ct_count_vec.transform(X_train).toarray(),
    columns=tfidf_feature_names,
    index=X_train.index,
)
X_test_cv = pd.DataFrame(
    ct_count_vec.transform(X_test).toarray(),
    columns=tfidf_feature_names,
    index=X_test.index,
)

# Drop the raw text without mutating in place, then append the TF-IDF columns.
X_train = pd.concat([X_train.drop(columns=tfid_columns), X_train_cv], axis=1)
X_test = pd.concat([X_test.drop(columns=tfid_columns), X_test_cv], axis=1)

# GetDummies

In [11]:
# One-hot encode the categorical columns.
# pd.get_dummies(frame, columns=...) already returns the WHOLE frame with the
# listed columns replaced by their dummies, so concatenating it back onto the
# original frame (as was done before) duplicated every other feature column.
X_train = pd.get_dummies(X_train, columns=ohe_columns, drop_first=True)

# Encode test independently, then force it onto the training column layout:
# categories unseen in test become all-zero columns and categories that only
# occur in test are discarded, so the fitted model always sees the same features.
X_test = (
    pd.get_dummies(X_test, columns=ohe_columns, drop_first=True)
    .reindex(columns=X_train.columns, fill_value=0)
)

# Final Model

In [12]:
# Final model: standardise every feature, then fit a Lasso whose
# regularisation strength is chosen by LassoCV's built-in cross-validation.
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('lasso', LassoCV()),
])
pipe.fit(X_train, y_train['tmdb_popularity'])
preds = pipe.predict(X_test)

# Score on the held-out split. RMSE is taken as sqrt(MSE) explicitly because
# the `squared=False` argument of mean_squared_error is deprecated and has
# been removed from recent scikit-learn releases.
test_r2 = r2_score(y_test['tmdb_popularity'], preds)
test_rmse = float(np.sqrt(mean_squared_error(y_test['tmdb_popularity'], preds)))
test_r2, test_rmse

(0.9869131397326231, 21.889730399252002)

NOTE: I ran this on Google Colab Pro. I need to rerun it to extract the best features, and I will incorporate that into my git repo.

# Conclusion

> My model greatly outperformed the baseline model. The baseline's R² (the proportion of variance in the target that the model explains) was -0.0022, and its root mean squared error was 191.56. My final model's R² was 0.987, with a root mean squared error of 21.89. In sum, the model was good at finding the linear relationship between HBO movies/TV shows and TMDB popularity (`tmdb_popularity`, the target used above) based on a specific set of features. My model will help HBO determine whether their next show is going to be a hit.
