# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error
# Fetch the NLTK 'stopwords' corpus; `stopwords.words('english')` further down
# reads from it to build the stop-word list for the description vectorizer.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Load the pre-split feature matrices. 'Unnamed: 0' (presumably the index column
# written by an earlier `to_csv` — confirm) and the two list-like columns
# 'production_countries' / 'genres' are discarded.
X_train = pd.read_csv('../../data/model_data/X_train.csv').drop(columns=['Unnamed: 0', 'production_countries', 'genres'])
X_test = pd.read_csv('../../data/model_data/X_test.csv').drop(columns=['Unnamed: 0', 'production_countries', 'genres'])

# TfidfVectorizer rejects NaN documents, so missing text has to be blanked in
# BOTH splits: cleaning only the training frame lets `transform` fail on the
# test frame as soon as it contains a missing title/description.
for _frame in (X_train, X_test):
    for _text_col in ('title', 'description'):
        _frame[_text_col] = _frame[_text_col].fillna('')

# Targets, with the same leftover 'Unnamed: 0' column removed.
y_train = pd.read_csv('../../data/model_data/y_train.csv').drop(columns=['Unnamed: 0'])
y_test = pd.read_csv('../../data/model_data/y_test.csv').drop(columns=['Unnamed: 0'])


In [3]:
# Column groups consumed by the feature-engineering cells.

# Categorical columns that get one-hot encoded.
ohe_columns = [
    "role",
    "type",
]

# Free-text columns that get TF-IDF vectorized (index 0 = title, 1 = description).
tfid_columns = [
    "title",
    "description",
]

# Numeric columns; presumably intended for polynomial features — not referenced
# in the cells visible here (TODO confirm usage).
poly_columns = [
    "seasons",
    "runtime",
    "release_year",
    "person_id",
]

# Feature Engineering

## Vectorizer

In [4]:
# English stop words are stripped from descriptions only; the title vectorizer
# is given no stop-word list.
stop_words = stopwords.words('english')

# One independent TF-IDF vocabulary per text column (unigrams only). A term must
# appear in at least `min_df` training documents to be kept.
ct_count_vec = ColumnTransformer(
    [
        ('tfid_t', TfidfVectorizer(ngram_range=(1, 1), min_df=2), tfid_columns[0]),
        ('tfid_d', TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 1), min_df=3), tfid_columns[1]),
    ]
)
# Fit on the training split only, so the vocabulary and IDF weights come from
# training data alone.
ct_count_vec.fit(X_train, y_train)


def _tfidf_frame(frame):
    """Return the fitted TF-IDF features of `frame` as a dense DataFrame.

    The result reuses `frame.index`, so a later `pd.concat(..., axis=1)` lines
    rows up by label even if the source frame is not indexed 0..n-1.
    """
    matrix = ct_count_vec.transform(frame)
    # `.toarray()` is the supported spelling; the `.A` shorthand has been
    # removed from SciPy sparse matrices. ColumnTransformer can also hand back
    # a plain ndarray, which has no `.toarray()`.
    dense = matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)
    return pd.DataFrame(dense, columns=ct_count_vec.get_feature_names_out(), index=frame.index)


X_train_cv = _tfidf_frame(X_train)
X_test_cv = _tfidf_frame(X_test)

# Swap the raw text columns for their TF-IDF features (non-mutating drop, so the
# frames are never left in a half-transformed state).
X_train = pd.concat([X_train.drop(columns=tfid_columns), X_train_cv], axis=1)
X_test = pd.concat([X_test.drop(columns=tfid_columns), X_test_cv], axis=1)

## One-Hot Encoding (OHE)

In [5]:
# `pd.get_dummies(frame, columns=...)` already returns the WHOLE frame with the
# listed columns replaced by their indicator columns. Concatenating that result
# with the original frame would duplicate every other feature, so the encoded
# frame is used directly.
X_train = pd.get_dummies(X_train, columns=ohe_columns, drop_first=True)

# Encode the test split without `drop_first` and then project it onto the
# training columns: this discards the same baseline category that was dropped
# for train, removes categories unseen in training, and zero-fills categories
# that never occur in the test split — guaranteeing identical columns and order.
X_test = pd.get_dummies(X_test, columns=ohe_columns).reindex(columns=X_train.columns, fill_value=0)

# Baseline

In [6]:
# `r2_score` is not part of the notebook's top import cell, so it is imported here.
from sklearn.metrics import r2_score, mean_squared_error

# Baseline: always predict the training-set mean of `tmdb_popularity`.
# The features are ignored by DummyRegressor; they are passed only to satisfy
# the estimator API.
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train, y_train['tmdb_popularity'])
preds = dummy_regr.predict(X_test)

# RMSE computed as sqrt(MSE): the `squared=False` shortcut is deprecated in
# recent scikit-learn releases and later removed, while this form works on
# every version and yields the same value.
baseline_rmse = float(np.sqrt(mean_squared_error(y_test['tmdb_popularity'], preds)))

# Displayed as (R^2, RMSE) on the held-out split.
r2_score(y_test['tmdb_popularity'], preds), baseline_rmse

(-0.002248232031702546, 191.56242644478877)