In [369]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
import scipy.sparse as sp
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV, Lasso

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [370]:
# Free-text columns; each one is vectorised with its own TfidfVectorizer.
tfid_columns = [
    'title',
    'description',
    'genres',
    'production_countries',
]
# Categorical columns that are one-hot encoded.
ohe_columns = [
    'role',
    'type',
]

In [371]:
# Read the modelling table from a CSV located in the current working directory.
df = pd.read_csv(filepath_or_buffer='./model_ready_data.csv')

In [372]:
# Replace missing values in the text columns with empty strings (these columns
# are later passed to TfidfVectorizer).
# The filled frame is assigned back instead of calling `fillna(inplace=True)`
# on a selected column: that chained in-place form can end up updating a
# temporary copy, so pandas warns about it and, with Copy-on-Write enabled,
# leaves `df` unchanged.
df = df.fillna({'description': '', 'genres': '', 'production_countries': ''})
# Per-column count of values that are still missing.
df.isna().sum()


person_id               0
role                    0
title                   0
type                    0
description             0
release_year            0
runtime                 0
genres                  0
production_countries    0
imdb_score              0
imdb_votes              0
tmdb_popularity         0
tmdb_score              0
dtype: int64

## Data size (downsampling)

In [373]:
# Keep a 5% random subset of the rows. The fixed seed makes the subset, and
# every score computed from it, repeatable across kernel restarts.
# NOTE: this rebinds `df`, so re-running the cell shrinks the data again.
df = df.sample(frac=0.05, random_state=200)
df.shape

(3094, 13)

# Supervised Learning

## Single-Target Linear Regression

In [363]:
# Features: every column except the four score/popularity columns.
X = df.drop(
    columns=['imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score'],
)
# Single regression target.
y = df['imdb_votes']
# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=200,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((2475, 9), (619, 9), (2475,), (619,))

In [364]:
stop_words = stopwords.words('english')

# One-hot encode the categorical columns.
# - `sparse_threshold=0` makes the ColumnTransformer return a dense array. It
#   replaces `OneHotEncoder(sparse=False)`, whose `sparse` keyword was
#   deprecated in scikit-learn 1.2 and later removed.
# - `handle_unknown='ignore'` encodes a category that was not seen during fit
#   as all zeros instead of raising when the test split is transformed.
ohe_ct = ColumnTransformer(
                      [
                        ('ohe', OneHotEncoder(handle_unknown='ignore'), ohe_columns)
                      ],
                      sparse_threshold=0
                    )
X_train_ohe = pd.DataFrame(ohe_ct.fit_transform(X_train), columns=ohe_ct.get_feature_names_out())
X_test_ohe = pd.DataFrame(ohe_ct.transform(X_test), columns=ohe_ct.get_feature_names_out())

# One TF-IDF vocabulary per text column, fitted on the training split only and
# then applied unchanged to the test split.
tfid_ct = ColumnTransformer(
                      [
                        ('tfid_t', TfidfVectorizer(ngram_range=(1,1), min_df=5), tfid_columns[0]),
                        ('tfid_d', TfidfVectorizer(stop_words=stop_words, ngram_range=(1,1), min_df=10), tfid_columns[1]),
                        ('tfid_g', TfidfVectorizer(ngram_range=(1,1)), tfid_columns[2]),
                        ('tfid_pc', TfidfVectorizer(ngram_range=(1,1)), tfid_columns[3])
                      ]
                    )
X_train_tfid = pd.DataFrame.sparse.from_spmatrix(tfid_ct.fit_transform(X_train), columns=tfid_ct.get_feature_names_out())
X_test_tfid = pd.DataFrame.sparse.from_spmatrix(tfid_ct.transform(X_test), columns=tfid_ct.get_feature_names_out())

# Remove the raw text/categorical columns now that they are encoded. The
# result is rebound rather than dropped in place, so the frames returned by
# train_test_split are not mutated.
X_train = X_train.drop(columns=tfid_columns + ohe_columns)
X_test = X_test.drop(columns=tfid_columns + ohe_columns)

# The encoded frames carry a fresh 0..n-1 index, so the remaining columns are
# re-indexed the same way before the column-wise concat; otherwise rows would
# be aligned on the shuffled original index.
X_train = pd.concat([X_train.reset_index(drop=True), X_train_ohe.reset_index(drop=True), X_train_tfid.reset_index(drop=True)], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), X_test_ohe.reset_index(drop=True), X_test_tfid.reset_index(drop=True)], axis=1)


In [365]:
# Ordinary least squares on the assembled feature matrix.
lin_reg = LinearRegression(fit_intercept=True)
lin_reg.fit(X_train, y_train)
preds = lin_reg.predict(X_test)
# Displayed tuple: (train R², test R², test R² via r2_score, test RMSE).
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# `mean_squared_error` was deprecated in scikit-learn 1.4 and later removed.
lin_reg.score(X_train, y_train), lin_reg.score(X_test, y_test), r2_score(y_test, preds), np.sqrt(mean_squared_error(y_test, preds))

  "pandas.DataFrame with sparse columns found."
  "pandas.DataFrame with sparse columns found."
  "pandas.DataFrame with sparse columns found."
  "pandas.DataFrame with sparse columns found."


(0.9927431981693902,
 0.6051758599979884,
 0.6051758599979884,
 172612.37362386246)

## Multi-Target Linear Regression

In [374]:
# Features: every column except the four score/popularity columns.
X = df.drop(
    columns=['imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score'],
)
# All four score/popularity columns are regression targets here.
y = df[['imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score']]
# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=200,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((2475, 9), (619, 9), (2475, 4), (619, 4))

In [375]:
stop_words = stopwords.words('english')

# One-hot encode the categorical columns.
# - `sparse_threshold=0` makes the ColumnTransformer return a dense array. It
#   replaces `OneHotEncoder(sparse=False)`, whose `sparse` keyword was
#   deprecated in scikit-learn 1.2 and later removed.
# - `handle_unknown='ignore'` encodes a category that was not seen during fit
#   as all zeros instead of raising when the test split is transformed.
ohe_ct = ColumnTransformer(
                      [
                        ('ohe', OneHotEncoder(handle_unknown='ignore'), ohe_columns)
                      ],
                      sparse_threshold=0
                    )
X_train_ohe = pd.DataFrame(ohe_ct.fit_transform(X_train), columns=ohe_ct.get_feature_names_out())
X_test_ohe = pd.DataFrame(ohe_ct.transform(X_test), columns=ohe_ct.get_feature_names_out())

# One TF-IDF vocabulary per text column, fitted on the training split only and
# then applied unchanged to the test split.
tfid_ct = ColumnTransformer(
                      [
                        ('tfid_t', TfidfVectorizer(ngram_range=(1,1), min_df=5), tfid_columns[0]),
                        ('tfid_d', TfidfVectorizer(stop_words=stop_words, ngram_range=(1,1), min_df=10), tfid_columns[1]),
                        ('tfid_g', TfidfVectorizer(ngram_range=(1,1)), tfid_columns[2]),
                        ('tfid_pc', TfidfVectorizer(ngram_range=(1,1)), tfid_columns[3])
                      ]
                    )
X_train_tfid = pd.DataFrame.sparse.from_spmatrix(tfid_ct.fit_transform(X_train), columns=tfid_ct.get_feature_names_out())
X_test_tfid = pd.DataFrame.sparse.from_spmatrix(tfid_ct.transform(X_test), columns=tfid_ct.get_feature_names_out())

# Remove the raw text/categorical columns now that they are encoded. The
# result is rebound rather than dropped in place, so the frames returned by
# train_test_split are not mutated.
X_train = X_train.drop(columns=tfid_columns + ohe_columns)
X_test = X_test.drop(columns=tfid_columns + ohe_columns)

# The encoded frames carry a fresh 0..n-1 index, so the remaining columns are
# re-indexed the same way before the column-wise concat; otherwise rows would
# be aligned on the shuffled original index.
X_train = pd.concat([X_train.reset_index(drop=True), X_train_ohe.reset_index(drop=True), X_train_tfid.reset_index(drop=True)], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), X_test_ohe.reset_index(drop=True), X_test_tfid.reset_index(drop=True)], axis=1)


In [376]:
class MultiRegression:
  """Fit one independent linear regression per target column.

  Every target gets its own ``StandardScaler`` -> ``LinearRegression``
  pipeline; all pipelines are trained on the same feature matrix.

  Attributes:
    columns: Target column names, in the order they are fitted and reported.
    pipes: Dict mapping target column -> fitted pipeline (empty until ``fit``).
    preds: Dict mapping target column -> predictions of the latest
      ``predict`` call (empty until ``predict``).
  """

  def __init__(self, columns=None):
    """Store the target column names.

    Args:
      columns: Iterable of target column names. Defaults to no targets.
    """
    # Copy into a fresh list: a mutable default (`columns=[]`) is created once
    # and would be shared by every instance built without an argument.
    self.columns = list(columns) if columns is not None else []
    self.pipes = {}
    self.preds = {}

  def fit(self, X_train, y_train):
    """Train one scaler + regression pipeline per target column.

    Args:
      X_train: Training features, passed unchanged to every pipeline.
      y_train: Training targets, indexable by each name in ``self.columns``.
    """
    fitted = {}
    for column in self.columns:
      pipe = Pipeline(
          [
            ('ss', StandardScaler()),
            ('lin_reg', LinearRegression(fit_intercept=True))
          ]
      )
      pipe.fit(X_train, y_train[column])
      fitted[column] = pipe
    # Publish only after every target trained, so a failure midway does not
    # leave a half-updated set of pipelines behind.
    self.pipes = fitted

  def predict(self, X_test, y_test=None):
    """Predict every target and optionally print the R² score per target.

    Args:
      X_test: Features to predict on.
      y_test: Optional true targets, indexable by each name in
        ``self.columns``. When given, one line per target is printed: the R²
        score followed by the column name.

    Returns:
      pandas.DataFrame with one column of predictions per target, in the
      order of ``self.columns``.

    Raises:
      RuntimeError: If a target column has no fitted pipeline, i.e. ``fit``
        has not been called for the current ``columns``.
    """
    unfitted = [column for column in self.columns if column not in self.pipes]
    if unfitted:
      raise RuntimeError(
          'MultiRegression.predict called before fit for columns: %s' % unfitted
      )

    self.preds = {
        column: self.pipes[column].predict(X_test) for column in self.columns
    }

    if y_test is not None:
      for column in self.columns:
        print(r2_score(y_test[column], self.preds[column]), column)

    return pd.DataFrame(self.preds, columns=self.columns)
# imdb_votes

# Train one model per score/popularity column, then run predict on the
# held-out split (predict prints an R² score for each column).
mult_lin_reg = MultiRegression(
    columns=['imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score'],
)
mult_lin_reg.fit(X_train, y_train)
preds = mult_lin_reg.predict(X_test, y_test)

  "pandas.DataFrame with sparse columns found."
  "pandas.DataFrame with sparse columns found."
  "pandas.DataFrame with sparse columns found."
  "pandas.DataFrame with sparse columns found."
  "pandas.DataFrame with sparse columns found."
  "pandas.DataFrame with sparse columns found."
  "pandas.DataFrame with sparse columns found."
  "pandas.DataFrame with sparse columns found."
  "pandas.DataFrame with sparse columns found."
  "pandas.DataFrame with sparse columns found."
  "pandas.DataFrame with sparse columns found."
  "pandas.DataFrame with sparse columns found."


-4.214239096312847e+25 imdb_score
-8.45933879252115e+24 imdb_votes
-4.468719047840346 tmdb_popularity
-8.114412898756825e+24 tmdb_score


In [354]:
# Display the assembled training feature matrix (remaining numeric columns
# followed by the one-hot and TF-IDF columns).
X_train

Unnamed: 0,person_id,release_year,runtime,ohe__role_ACTOR,ohe__type_MOVIE,ohe__type_SHOW,tfid_t__10,tfid_t__12,tfid_t__1984,tfid_t__2001,...,tfid_pc__se,tfid_pc__sn,tfid_pc__su,tfid_pc__tr,tfid_pc__tw,tfid_pc__us,tfid_pc__uy,tfid_pc__xc,tfid_pc__xx,tfid_pc__za
0,16019.0,1986.0,111.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0
1,125722.0,1953.0,94.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,9739.0,2009.0,100.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.292857,0.0,0.0,0.0,0.0
3,153265.0,1939.0,83.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0
4,12018.0,2006.0,28.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2470,60089.0,2008.0,104.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2471,14142.0,2021.0,80.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0
2472,33873.0,1988.0,98.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2473,2889.0,2011.0,106.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.187247,0.0,0.0,0.0,0.0


# Scratch: per-target R² debugging

In [288]:
# def mod_r2_score(y_test, preds, column):
#   print(y_test[column], preds[column])
#   return r2_score(y_test[column], preds[column])

# mod_r2_score(y_test, preds, 'tmdb_popularity')

31251    72.272
60911    11.869
691       8.294
5251     13.802
13054    11.923
          ...  
34916    36.169
7729     13.161
38227     5.583
57256    30.184
2251      5.420
Name: tmdb_popularity, Length: 3714, dtype: float64 0       6.501381
1       6.766617
2       7.926925
3       8.609810
4       7.105972
          ...   
3709    6.306679
3710    6.700394
3711    5.395286
3712    6.159080
3713    7.291672
Name: tmdb_popularity, Length: 3714, dtype: float64


-5.140334297053516e+19

# Deep Learning