In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import accuracy_score
import math
from py_files.d2v_embed import d2v_embed

In [2]:
# Load the pre-built feature table; index_col=0 makes the first CSV column
# (the tt... identifiers shown by df.head() below) the row index.
df = pd.read_csv('df_with_features.csv', index_col=0)

In [3]:
# Peek at the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label,primaryTitleFormatted,Year,oscar_noms,...,nm9925241,nm9933959,nm9942830,nm9946633,nm9955258,nm9958352,nm9958353,nm9980769,nm9985316,nm9985837
tt0010600,The Doll,Die Puppe,1919.0,,66.0,1898.0,True,the_doll,1919,,...,0,0,0,0,0,0,0,0,0,0
tt0011841,Way Down East,Way Down East,1920.0,,145.0,5376.0,True,way_down_east,1920,,...,0,0,0,0,0,0,0,0,0,0
tt0012494,Déstiny,Der müde Tod,1921.0,,97.0,5842.0,True,destiny,1921,,...,0,0,0,0,0,0,0,0,0,0
tt0015163,The Navigator,The Navigator,1924.0,,59.0,9652.0,True,the_navigator,1924,,...,0,0,0,0,0,0,0,0,0,0
tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925.0,,93.0,17887.0,True,the_phantom_of_the_opera,1925,3.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Summarise index range, column count, dtype breakdown and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7959 entries, tt0010600 to tt9911196
Columns: 17948 entries, primaryTitle to nm9985837
dtypes: bool(3), float64(30), int64(17909), object(6)
memory usage: 1.1+ GB


In [5]:
# List the boolean columns (this is where the target column 'label' lives).
df.select_dtypes(include=['bool']).columns

Index(['label', 'hasRemake', 'isEN'], dtype='object')

In [6]:
# List the floating-point columns; these are the ones that can hold NaN.
df.select_dtypes(include=['float']).columns

Index(['startYear', 'endYear', 'runtimeMinutes', 'numVotes', 'oscar_noms',
       'oscar_wins', 'Rank', 'Worldwide', 'Domestic', 'Foreign', 'year',
       'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'],
      dtype='object')

In [7]:
# List the object-dtype (text) columns that need encoding before modelling.
df.select_dtypes(include=['object']).columns

Index(['primaryTitle', 'originalTitle', 'primaryTitleFormatted',
       'title_language', 'titleFormatted', 'genres'],
      dtype='object')

In [8]:
# Embed the 'primaryTitle' text column with the project helper d2v_embed
# (imported from py_files.d2v_embed; implementation not shown in this notebook).
# NOTE(review): presumably returns a DataFrame of embedding columns indexed like df,
# since it is later attached with df.join - confirm against the helper.
prim_title_df = d2v_embed(df['primaryTitle'])

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:27<00:00,  1.48s/it]


In [9]:
# Embed the 'originalTitle' text column with the project helper d2v_embed
# (imported from py_files.d2v_embed; implementation not shown in this notebook).
# NOTE(review): presumably returns a DataFrame of embedding columns indexed like df - confirm.
orig_title_df = d2v_embed(df['originalTitle'])

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:30<00:00,  1.11it/s]


In [10]:
# Embed the 'primaryTitleFormatted' text column with the project helper d2v_embed
# (imported from py_files.d2v_embed; implementation not shown in this notebook).
# NOTE(review): presumably returns a DataFrame of embedding columns indexed like df - confirm.
prim_title_formatted_df = d2v_embed(df['primaryTitleFormatted'])

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:25<00:00,  1.17it/s]


In [11]:
# Embed the 'titleFormatted' text column with the project helper d2v_embed
# (imported from py_files.d2v_embed; implementation not shown in this notebook).
# NOTE(review): presumably returns a DataFrame of embedding columns indexed like df - confirm.
title_formatted_df = d2v_embed(df['titleFormatted'])

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:18<00:00,  1.27it/s]


In [12]:
# Embed the 'genres' text column with the project helper d2v_embed
# (imported from py_files.d2v_embed; implementation not shown in this notebook).
# The joined output further down shows columns named genres_<i> coming from this frame.
# NOTE(review): presumably indexed like df so that df.join can align it - confirm.
genres_df = d2v_embed(df['genres'])

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:14<00:00,  1.35it/s]


In [13]:
# Replace the title_language strings with integer codes. pd.factorize returns
# (codes, uniques); only the codes are kept.
language_codes, _ = pd.factorize(df['title_language'])
df['title_language'] = language_codes

In [14]:
# Remove every column that is still object-typed; they cannot be fed to the model as-is.
object_columns = df.select_dtypes(include='object').columns
df.drop(columns=object_columns, inplace=True)

In [15]:
# Check the frame after the object columns were dropped.
df.head()

Unnamed: 0,startYear,endYear,runtimeMinutes,numVotes,label,Year,oscar_noms,oscar_wins,Rank,Worldwide,...,nm9925241,nm9933959,nm9942830,nm9946633,nm9955258,nm9958352,nm9958353,nm9980769,nm9985316,nm9985837
tt0010600,1919.0,,66.0,1898.0,True,1919,,,,,...,0,0,0,0,0,0,0,0,0,0
tt0011841,1920.0,,145.0,5376.0,True,1920,,,,,...,0,0,0,0,0,0,0,0,0,0
tt0012494,1921.0,,97.0,5842.0,True,1921,,,,,...,0,0,0,0,0,0,0,0,0,0
tt0015163,1924.0,,59.0,9652.0,True,1924,,,,,...,0,0,0,0,0,0,0,0,0,0
tt0016220,1925.0,,93.0,17887.0,True,1925,3.0,0.0,,,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# dealing with nan values
# Done column-wise instead of with df.iterrows(): the frame has thousands of
# columns, so building one Series per row and writing single cells with .at is
# needlessly slow. The vectorised fillna calls below produce the same values.

# For missing startYear or endYear entries, insert the other, if it exists.
# Rows where both are missing stay NaN. The second fill can safely read the
# already-updated startYear: wherever endYear is NaN, startYear was left untouched.
df['startYear'] = df['startYear'].fillna(df['endYear'])
df['endYear'] = df['endYear'].fillna(df['startYear'])

# For missing oscar_noms and oscar_wins, insert 0
oscar_columns = ['oscar_noms', 'oscar_wins']
df[oscar_columns] = df[oscar_columns].fillna(0)

df.head()

Unnamed: 0,startYear,endYear,runtimeMinutes,numVotes,label,Year,oscar_noms,oscar_wins,Rank,Worldwide,...,nm9925241,nm9933959,nm9942830,nm9946633,nm9955258,nm9958352,nm9958353,nm9980769,nm9985316,nm9985837
tt0010600,1919.0,1919.0,66.0,1898.0,True,1919,0.0,0.0,,,...,0,0,0,0,0,0,0,0,0,0
tt0011841,1920.0,1920.0,145.0,5376.0,True,1920,0.0,0.0,,,...,0,0,0,0,0,0,0,0,0,0
tt0012494,1921.0,1921.0,97.0,5842.0,True,1921,0.0,0.0,,,...,0,0,0,0,0,0,0,0,0,0
tt0015163,1924.0,1924.0,59.0,9652.0,True,1924,0.0,0.0,,,...,0,0,0,0,0,0,0,0,0,0
tt0016220,1925.0,1925.0,93.0,17887.0,True,1925,3.0,0.0,,,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Append the columns of every embedding frame to df, in the same order as before.
# DataFrame.join matches rows on the index (left join by default).
embedding_frames = (
    prim_title_df,
    orig_title_df,
    prim_title_formatted_df,
    title_formatted_df,
    genres_df,
)
for embedding_df in embedding_frames:
    df = df.join(embedding_df)
df.head()

Unnamed: 0,startYear,endYear,runtimeMinutes,numVotes,label,Year,oscar_noms,oscar_wins,Rank,Worldwide,...,genres_118,genres_119,genres_120,genres_121,genres_122,genres_123,genres_124,genres_125,genres_126,genres_127
tt0010600,1919.0,1919.0,66.0,1898.0,True,1919,0.0,0.0,,,...,-0.030807,0.153879,-0.093956,-0.165557,0.107428,-0.108622,0.097445,0.213741,-0.027062,0.289416
tt0011841,1920.0,1920.0,145.0,5376.0,True,1920,0.0,0.0,,,...,-0.044073,0.062299,-0.023901,0.174868,-0.107422,0.117729,0.133439,0.067754,0.023237,-0.195334
tt0012494,1921.0,1921.0,97.0,5842.0,True,1921,0.0,0.0,,,...,0.002146,-0.001138,0.002069,-0.001281,-0.001445,0.002688,0.00159,0.001819,0.002143,-0.002463
tt0015163,1924.0,1924.0,59.0,9652.0,True,1924,0.0,0.0,,,...,-0.000615,-0.002997,-0.003896,-0.001055,0.001461,0.00069,-0.003348,-0.000171,0.001314,0.002341
tt0016220,1925.0,1925.0,93.0,17887.0,True,1925,3.0,0.0,,,...,0.003321,0.000408,-0.002946,-0.000956,-0.001717,0.001184,0.000563,-0.001644,0.000851,0.003044


In [18]:
# Hold out 30% of the rows for validation. The split is shuffled, stratified on
# the 'label' column and seeded so it is reproducible.
train_df, valid_df = train_test_split(
    df,
    train_size=0.7,
    random_state=17,
    stratify=df['label'],
    shuffle=True,
)

In [19]:
# Train a LightGBM classifier on every column except the target.
is_feature = train_df.columns != 'label'  # boolean mask over the columns
model = lgb.LGBMClassifier(random_state=17)
model.fit(train_df.loc[:, is_feature], train_df['label'], eval_metric='logloss')

LGBMClassifier(random_state=17)

In [20]:
# Cast the boolean target of the validation split to 0/1 integers.
valid_df['label'] = valid_df['label'].astype('int')
# Select the feature columns with a mask built from valid_df itself. The previous
# version reused train_df.columns, which only works while both frames happen to
# have exactly the same column layout.
val_preds = model.predict(valid_df.loc[:, valid_df.columns != 'label'])

In [21]:
# Share of validation rows whose predicted class equals the true label.
y_valid = valid_df['label'].astype('int')
acc_valid = accuracy_score(y_true=y_valid, y_pred=val_preds)

In [22]:
# Display the validation accuracy computed in the previous cell.
acc_valid

0.7889447236180904