In [5]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoCV, Lasso
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Import Data

In [6]:
X_train = pd.read_csv('./X_train.csv').drop(columns=['Unnamed: 0', 'production_countries', 'genres'])
X_train['description'] = X_train['description'].fillna('')
X_test = pd.read_csv('./X_test.csv').drop(columns=['Unnamed: 0', 'production_countries', 'genres'])
y_train = pd.read_csv('./y_train.csv').drop(columns=['Unnamed: 0'])
y_test = pd.read_csv('./y_test.csv').drop(columns=['Unnamed: 0'])

# Get Columns name for Transformations

In [7]:
ohe_columns = ['role', 'type']
tfid_columns = ['title', 'description']
poly_columns = ['seasons', 'runtime', 'release_year', 'person_id']

# TfidfVectorizer

In [8]:
stop_words  = stopwords.words('english')
ct_count_vec = ColumnTransformer(
                            [
                                ('tfid_t', TfidfVectorizer(ngram_range=(1,1), min_df=2), tfid_columns[0]),
                                ('tfid_d', TfidfVectorizer(stop_words=stop_words, ngram_range=(1,1), min_df=3), tfid_columns[1]),
                            ]
                        )
ct_count_vec.fit(X_train, y_train)
X_train_cv = pd.DataFrame(ct_count_vec.transform(X_train).A, columns=ct_count_vec.get_feature_names_out())
X_test_cv = pd.DataFrame(ct_count_vec.transform(X_test).A, columns=ct_count_vec.get_feature_names_out())
X_train.drop(columns=tfid_columns, inplace=True)
X_test.drop(columns=tfid_columns, inplace=True)

X_train = pd.concat([X_train, X_train_cv], axis=1) 
X_test = pd.concat([X_test, X_test_cv], axis=1) 

# Getdummies

In [9]:
X_train = pd.concat([X_train, pd.get_dummies(X_train, columns=ohe_columns, drop_first=True)], axis=1).drop(columns=ohe_columns)
X_test = pd.concat([X_test, pd.get_dummies(X_test, columns=ohe_columns, drop_first=True)], axis=1).drop(columns=ohe_columns)

# Model Testing

## Model 1

In [6]:
rf = RandomForestRegressor(max_depth=10)
rf.fit(X_train, y_train['tmdb_popularity'])
preds = rf.predict(X_test)
r2_score(y_test['tmdb_popularity'], preds), mean_squared_error(y_test['tmdb_popularity'], preds, squared=False)

(0.9700976171488305, 33.08838564714761)

## Model 2

In [7]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('lasso', LassoCV()),
])
pipe.fit(X_train, y_train['tmdb_popularity'])
preds = pipe.predict(X_test)
r2_score(y_test['tmdb_popularity'], preds), mean_squared_error(y_test['tmdb_popularity'], preds, squared=False)

(0.9869131397326231, 21.889730399251995)

## Model 3

In [8]:
pipe2 = Pipeline([
    ('ss', StandardScaler()),
    ('lasso', Lasso()),
])
pipe2.fit(X_train, y_train['tmdb_popularity'])
preds = pipe2.predict(X_test)
r2_score(y_test['tmdb_popularity'], preds), mean_squared_error(y_test['tmdb_popularity'], preds, squared=False)

(0.9741896713590367, 30.7411030832573)

## Model 4

In [10]:
pipe3 = Pipeline([
    ('ss', StandardScaler()),
    ('lasso', Lasso()),
])
pipe3_parameters = {
    'lasso__alpha': np.logspace(1, 10, 15),
    'lasso__fit_intercept': [True, False],
    'lasso__normalize': [True, False],
    'lasso__selection':['cyclic', 'random']
}

gs = GridSearchCV( 
    pipe3, 
    pipe3_parameters,
    n_jobs = -1
)
gs.fit(X_train, y_train['tmdb_popularity'])
preds = gs.predict(X_test)
r2_score(y_test['tmdb_popularity'], preds), mean_squared_error(y_test['tmdb_popularity'], preds, squared=False)



(0.8933472680444862, 62.48973346765479)