In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

import multiprocessing as mp
from sklearn.metrics import accuracy_score

import lightgbm as lgb

In [2]:
#Read directory and create file list
files = os.listdir()
files = [file for file in files if 'train-' in file]

In [3]:
files

['train-1.csv',
 'train-2.csv',
 'train-3.csv',
 'train-4.csv',
 'train-5.csv',
 'train-6.csv',
 'train-7.csv',
 'train-8.csv']

In [4]:
#Create initial df by concatating all files in list
df = pd.concat(map(pd.read_csv, files))
df.set_index('tconst', inplace = True)

In [5]:
df.head()

Unnamed: 0_level_0,Unnamed: 0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
tt0010600,4,The Doll,Die Puppe,1919,\N,66,1898.0,True
tt0011841,7,Way Down East,Way Down East,1920,\N,145,5376.0,True
tt0012494,9,Déstiny,Der müde Tod,1921,\N,97,5842.0,True
tt0015163,25,The Navigator,The Navigator,1924,\N,59,9652.0,True
tt0016220,38,The Phantom of the Opera,The Phantom of the Opera,1925,\N,93,17887.0,True


In [6]:
# Remove Unnamed column (which is old index list)
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

In [7]:
df.head()

Unnamed: 0_level_0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
tt0010600,The Doll,Die Puppe,1919,\N,66,1898.0,True
tt0011841,Way Down East,Way Down East,1920,\N,145,5376.0,True
tt0012494,Déstiny,Der müde Tod,1921,\N,97,5842.0,True
tt0015163,The Navigator,The Navigator,1924,\N,59,9652.0,True
tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925,\N,93,17887.0,True


In [8]:
SEED = 17

In [9]:
# # Cleaning titles and merging with oscar noms
# df["primaryTitle"] = df["primaryTitle"].str.lower()\
#                                        .str.normalize('NFKD')\
#                                        .str.encode('ascii', errors='ignore')\
#                                        .str.decode('utf-8')\
#                                        .str.replace(" ", "_", regex=True)\
#                                        .str.replace("\W", "", regex=True)

# For string columns, fill NAN entries with whitespace
for col in ['primaryTitle', 'originalTitle']:
    df[col] = df[col].fillna(" ")
    df[col] = df[col].str.lower()\
                        .str.normalize('NFKD')\
                        .str.encode('ascii', errors='ignore')\
                        .str.decode('utf-8')\
                        .str.replace("\W", " ", regex=True)
df.head()

Unnamed: 0_level_0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
tt0010600,the doll,die puppe,1919,\N,66,1898.0,True
tt0011841,way down east,way down east,1920,\N,145,5376.0,True
tt0012494,destiny,der mude tod,1921,\N,97,5842.0,True
tt0015163,the navigator,the navigator,1924,\N,59,9652.0,True
tt0016220,the phantom of the opera,the phantom of the opera,1925,\N,93,17887.0,True


In [48]:
def d2v_embed(df_col, max_epochs = 100, vec_size = 100, alpha = 0.025):
    
    tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(df_col)]

    model = Doc2Vec(vector_size=vec_size,
                    alpha=alpha, 
                    min_alpha=0.00025,
                    min_count=1,
                    dm =1,
                    workers = mp.cpu_count())
  
    model.build_vocab(tagged_data)

    for epoch in range(max_epochs):
    #     print('iteration {0}'.format(epoch))
        model.train(tagged_data,
                    total_examples=model.corpus_count,
                    epochs=model.epochs)
        # decrease the learning rate
        model.alpha -= 0.0002
        # fix the learning rate, no decay
        model.min_alpha = primTitleD2V.alpha
    
    # save model
    model.save(f"doc2vec_model_{df_col.name}.model")
    
    #return df with doc embeddings
    return pd.DataFrame([model.docvecs[i] for i in range(df_col.shape[0])], index = df_col.index)

In [49]:
df = d2v_embed(df['primaryTitle'], max_epochs = 100, vec_size = 100, alpha = 0.025)
df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0010600,-0.159284,0.072122,0.381893,-0.291232,0.363767,-0.401523,0.415001,-0.686272,-0.015962,0.171462,...,0.643781,-0.165972,-0.287152,-0.04372,-0.186789,-0.007745,0.023346,0.376694,-0.512286,-0.200289
tt0011841,0.474351,-0.037624,-0.105854,-0.470612,-0.027656,0.181307,-0.039609,0.11169,-0.106913,-0.396222,...,-0.391907,0.14142,-0.096461,0.233054,-0.171814,0.278082,0.049548,0.484113,0.019187,0.532084
tt0012494,0.57956,-0.468167,0.075424,-0.137524,0.255278,0.235488,0.152141,-0.468264,-0.248279,0.249112,...,0.115424,-0.122997,0.108823,-0.051666,0.272828,0.143414,0.122895,0.667337,0.272697,-0.289365
tt0015163,0.117276,-0.050688,0.268759,-0.326602,0.134885,0.099501,0.101406,-0.444524,-0.331852,-0.017976,...,-0.005192,-0.172694,0.314073,-0.181717,-0.343734,-0.072333,0.237675,0.392752,-0.513758,0.078036
tt0016220,0.06511,-0.069539,0.355802,-0.278576,0.382869,-0.428268,0.13639,-0.347611,0.013706,0.175766,...,0.215957,0.098567,-0.250401,0.301944,-0.404063,-0.585265,-0.349682,0.648196,-0.302803,0.102346


In [21]:
primTitleD2V.save("primTitleD2V.model")

In [19]:
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(df['originalTitle'])]

max_epochs = 100
vec_size = 100
alpha = 0.025

origTitleD2V = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1,
                workers = mp.cpu_count())
  
origTitleD2V.build_vocab(tagged_data)

for epoch in range(max_epochs):
#     print('iteration {0}'.format(epoch))
    origTitleD2V.train(tagged_data,
                total_examples=origTitleD2V.corpus_count,
                epochs=origTitleD2V.epochs)
    # decrease the learning rate
    origTitleD2V.alpha -= 0.0002
    # fix the learning rate, no decay
    origTitleD2V.min_alpha = origTitleD2V.alpha

In [22]:
origTitleD2V.save("origTitleD2V.model")

In [10]:
primTitleD2V = Doc2Vec.load("primTitleD2V.model")
origTitleD2V = Doc2Vec.load("origTitleD2V.model")

In [11]:
primaryTitleD2V = pd.DataFrame([primTitleD2V.docvecs[i] for i in range(df.shape[0])], index = df.index)
#primaryTitleD2V = pd.DataFrame(primTitleD2V.docvecs[:])
#primaryTitleD2V.head()

In [12]:
primaryTitleD2V.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0010600,-0.440142,0.306454,-0.070401,-0.057359,0.000581,-0.071423,0.051526,-0.076256,-0.269771,-0.028364,...,0.009934,0.358975,-0.31075,0.289994,0.018177,0.134371,-0.049273,0.235176,-0.455158,0.531315
tt0011841,-0.073622,0.358497,-0.277526,-0.439501,0.245974,0.224028,0.409261,0.39762,-0.260262,0.236719,...,-0.566846,0.13406,-0.421964,-0.094485,-0.290325,-0.197263,-0.190978,0.081143,-0.896569,-0.069563
tt0012494,0.18217,0.012225,-0.059819,-0.300158,0.062693,0.314501,0.297345,-0.371924,-0.129783,0.153722,...,-0.661549,0.358284,-0.273053,0.765994,6.5e-05,0.203705,0.031249,0.531458,-0.302416,0.098668
tt0015163,-0.162405,-0.108079,-0.058337,-0.096047,-0.052548,0.345115,0.04432,-0.179015,-0.020992,0.356106,...,-0.417272,-0.061615,-0.193513,0.191552,-0.058161,-0.342857,-0.011617,0.114098,-0.456539,0.182062
tt0016220,0.181273,-0.3677,0.005288,-0.191084,0.383556,0.365448,0.392199,-0.026652,-0.159548,-0.01137,...,-0.35497,0.479828,-0.333378,0.396658,0.034621,-0.127124,0.319341,0.372939,-0.181004,-0.014595


In [14]:
originalTitleD2V = pd.DataFrame([origTitleD2V.docvecs[i] for i in range(df.shape[0])], 
                                index = df.index)

In [15]:
originalTitleD2V.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0010600,0.104987,-0.28874,-0.295158,-0.325333,0.026754,0.20457,0.116409,-0.006599,0.636242,0.292284,...,-0.466755,0.239823,-0.647431,0.366631,-0.104582,0.121624,-0.635718,0.396445,-0.34031,-0.394536
tt0011841,0.000353,-0.036141,-0.670399,-0.414776,0.18489,-0.00311,0.620009,-0.267304,0.076645,-0.139206,...,-0.444145,0.170897,-0.781565,0.605423,-0.329683,0.388482,-0.607193,0.202279,-0.729887,-0.175332
tt0012494,0.221099,-0.423083,0.031141,-0.013665,-0.410172,0.171698,-0.355385,-0.55549,0.371891,0.160266,...,-0.941476,-0.282652,-0.409407,0.520303,-0.437225,-0.365602,-0.726512,0.314618,0.129482,-0.0612
tt0015163,0.136383,-0.000208,0.002486,-0.333616,-0.400127,0.18581,0.510572,-0.20235,-0.173713,0.288232,...,-0.580744,0.215948,-0.77183,0.493596,0.066465,0.457513,0.260721,-0.039111,-0.109017,0.028599
tt0016220,-0.038602,-0.481189,0.290622,-0.368133,0.731881,-0.237527,0.837575,0.082099,0.392058,0.37572,...,-0.483001,0.291847,-0.284678,0.511878,-0.274454,0.449384,-0.115485,0.6067,-0.093008,-0.149644


In [16]:
originalTitleD2V = pd.DataFrame([origTitleD2V.docvecs[i] for i in range(df.shape[0])], 
                                index = df.index,
                                columns = list(range(primaryTitleD2V.shape[1],primaryTitleD2V.shape[1]+originalTitleD2V.shape[1])))

In [17]:
originalTitleD2V.head()

Unnamed: 0_level_0,100,101,102,103,104,105,106,107,108,109,...,190,191,192,193,194,195,196,197,198,199
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0010600,0.104987,-0.28874,-0.295158,-0.325333,0.026754,0.20457,0.116409,-0.006599,0.636242,0.292284,...,-0.466755,0.239823,-0.647431,0.366631,-0.104582,0.121624,-0.635718,0.396445,-0.34031,-0.394536
tt0011841,0.000353,-0.036141,-0.670399,-0.414776,0.18489,-0.00311,0.620009,-0.267304,0.076645,-0.139206,...,-0.444145,0.170897,-0.781565,0.605423,-0.329683,0.388482,-0.607193,0.202279,-0.729887,-0.175332
tt0012494,0.221099,-0.423083,0.031141,-0.013665,-0.410172,0.171698,-0.355385,-0.55549,0.371891,0.160266,...,-0.941476,-0.282652,-0.409407,0.520303,-0.437225,-0.365602,-0.726512,0.314618,0.129482,-0.0612
tt0015163,0.136383,-0.000208,0.002486,-0.333616,-0.400127,0.18581,0.510572,-0.20235,-0.173713,0.288232,...,-0.580744,0.215948,-0.77183,0.493596,0.066465,0.457513,0.260721,-0.039111,-0.109017,0.028599
tt0016220,-0.038602,-0.481189,0.290622,-0.368133,0.731881,-0.237527,0.837575,0.082099,0.392058,0.37572,...,-0.483001,0.291847,-0.284678,0.511878,-0.274454,0.449384,-0.115485,0.6067,-0.093008,-0.149644


In [18]:
# For numerical columns, replace non-numerical entries with zeroes
for col in ["startYear", "endYear", "runtimeMinutes"]:
    df[col] = df[col].fillna(" ").replace('\\N', '0').astype(int)

# For missing startYear or endYear entries, insert the other, if it exists.
for index, row in df.iterrows():
    if row['startYear']==0:
        if row['endYear']!=0:
            df.at[index,'startYear']=df.at[index,'endYear']
    
    if row['endYear']==0:
        if row['startYear']!=0:
            df.at[index,'endYear']=df.at[index,'startYear']

df.head()

Unnamed: 0_level_0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
tt0010600,the doll,die puppe,1919,1919,66,1898.0,True
tt0011841,way down east,way down east,1920,1920,145,5376.0,True
tt0012494,destiny,der mude tod,1921,1921,97,5842.0,True
tt0015163,the navigator,the navigator,1924,1924,59,9652.0,True
tt0016220,the phantom of the opera,the phantom of the opera,1925,1925,93,17887.0,True


In [19]:
# For the float column, replace NANs with mean of the rest
df['numVotes'] = df['numVotes'].fillna(df['numVotes'].mean(skipna=True))
df['runtimeMinutes'] = df['runtimeMinutes'].fillna(df['runtimeMinutes'].mean(skipna=True))
df.head()

Unnamed: 0_level_0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
tt0010600,the doll,die puppe,1919,1919,66,1898.0,True
tt0011841,way down east,way down east,1920,1920,145,5376.0,True
tt0012494,destiny,der mude tod,1921,1921,97,5842.0,True
tt0015163,the navigator,the navigator,1924,1924,59,9652.0,True
tt0016220,the phantom of the opera,the phantom of the opera,1925,1925,93,17887.0,True


In [172]:
# # Use TF-IDF to convert column of String to feature matrix
# text_transformer = TfidfVectorizer()

In [20]:
# # Convert primaryTitle column to feature matrix using TF-IDF
# primaryTitleTFIDF = pd.DataFrame(text_transformer.fit_transform(df['primaryTitle']).toarray(), 
#                                  index = df.index)
# primaryTitleTFIDF.head()

In [21]:
# # Convert originalTitle column to feature matrix using TF-IDF
# originalTitleTFIDF = pd.DataFrame(text_transformer.fit_transform(df['originalTitle']).toarray(), 
#                                   index = df.index)
# originalTitleTFIDF.head()

In [22]:
# # rename the originalTitle feature matrix columns with simple incremental values, just to keep the column names unique
# originalTitleTFIDF = pd.DataFrame(originalTitleTFIDF.values, 
#                                   index = df.index,
#                                   columns = list(range(primaryTitleTFIDF.shape[1],primaryTitleTFIDF.shape[1]+originalTitleTFIDF.shape[1])))
# originalTitleTFIDF.head()

In [23]:
# Remove old string columns
data = df.drop(['primaryTitle', 'originalTitle'], axis=1)
data.head()

Unnamed: 0_level_0,startYear,endYear,runtimeMinutes,numVotes,label
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt0010600,1919,1919,66,1898.0,True
tt0011841,1920,1920,145,5376.0,True
tt0012494,1921,1921,97,5842.0,True
tt0015163,1924,1924,59,9652.0,True
tt0016220,1925,1925,93,17887.0,True


In [24]:
# Add primaryTitle feature matrix
data = data.join(primaryTitleD2V)
data.head()

Unnamed: 0_level_0,startYear,endYear,runtimeMinutes,numVotes,label,0,1,2,3,4,...,90,91,92,93,94,95,96,97,98,99
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0010600,1919,1919,66,1898.0,True,-0.440142,0.306454,-0.070401,-0.057359,0.000581,...,0.009934,0.358975,-0.31075,0.289994,0.018177,0.134371,-0.049273,0.235176,-0.455158,0.531315
tt0011841,1920,1920,145,5376.0,True,-0.073622,0.358497,-0.277526,-0.439501,0.245974,...,-0.566846,0.13406,-0.421964,-0.094485,-0.290325,-0.197263,-0.190978,0.081143,-0.896569,-0.069563
tt0012494,1921,1921,97,5842.0,True,0.18217,0.012225,-0.059819,-0.300158,0.062693,...,-0.661549,0.358284,-0.273053,0.765994,6.5e-05,0.203705,0.031249,0.531458,-0.302416,0.098668
tt0015163,1924,1924,59,9652.0,True,-0.162405,-0.108079,-0.058337,-0.096047,-0.052548,...,-0.417272,-0.061615,-0.193513,0.191552,-0.058161,-0.342857,-0.011617,0.114098,-0.456539,0.182062
tt0016220,1925,1925,93,17887.0,True,0.181273,-0.3677,0.005288,-0.191084,0.383556,...,-0.35497,0.479828,-0.333378,0.396658,0.034621,-0.127124,0.319341,0.372939,-0.181004,-0.014595


In [25]:
# Add originalTitle feature matrix
data = data.join(originalTitleD2V)
data.head()

Unnamed: 0_level_0,startYear,endYear,runtimeMinutes,numVotes,label,0,1,2,3,4,...,190,191,192,193,194,195,196,197,198,199
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0010600,1919,1919,66,1898.0,True,-0.440142,0.306454,-0.070401,-0.057359,0.000581,...,-0.466755,0.239823,-0.647431,0.366631,-0.104582,0.121624,-0.635718,0.396445,-0.34031,-0.394536
tt0011841,1920,1920,145,5376.0,True,-0.073622,0.358497,-0.277526,-0.439501,0.245974,...,-0.444145,0.170897,-0.781565,0.605423,-0.329683,0.388482,-0.607193,0.202279,-0.729887,-0.175332
tt0012494,1921,1921,97,5842.0,True,0.18217,0.012225,-0.059819,-0.300158,0.062693,...,-0.941476,-0.282652,-0.409407,0.520303,-0.437225,-0.365602,-0.726512,0.314618,0.129482,-0.0612
tt0015163,1924,1924,59,9652.0,True,-0.162405,-0.108079,-0.058337,-0.096047,-0.052548,...,-0.580744,0.215948,-0.77183,0.493596,0.066465,0.457513,0.260721,-0.039111,-0.109017,0.028599
tt0016220,1925,1925,93,17887.0,True,0.181273,-0.3677,0.005288,-0.191084,0.383556,...,-0.483001,0.291847,-0.284678,0.511878,-0.274454,0.449384,-0.115485,0.6067,-0.093008,-0.149644


In [26]:
data.columns = data.columns.astype(str)
data.head()

Unnamed: 0_level_0,startYear,endYear,runtimeMinutes,numVotes,label,0,1,2,3,4,...,190,191,192,193,194,195,196,197,198,199
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0010600,1919,1919,66,1898.0,True,-0.440142,0.306454,-0.070401,-0.057359,0.000581,...,-0.466755,0.239823,-0.647431,0.366631,-0.104582,0.121624,-0.635718,0.396445,-0.34031,-0.394536
tt0011841,1920,1920,145,5376.0,True,-0.073622,0.358497,-0.277526,-0.439501,0.245974,...,-0.444145,0.170897,-0.781565,0.605423,-0.329683,0.388482,-0.607193,0.202279,-0.729887,-0.175332
tt0012494,1921,1921,97,5842.0,True,0.18217,0.012225,-0.059819,-0.300158,0.062693,...,-0.941476,-0.282652,-0.409407,0.520303,-0.437225,-0.365602,-0.726512,0.314618,0.129482,-0.0612
tt0015163,1924,1924,59,9652.0,True,-0.162405,-0.108079,-0.058337,-0.096047,-0.052548,...,-0.580744,0.215948,-0.77183,0.493596,0.066465,0.457513,0.260721,-0.039111,-0.109017,0.028599
tt0016220,1925,1925,93,17887.0,True,0.181273,-0.3677,0.005288,-0.191084,0.383556,...,-0.483001,0.291847,-0.284678,0.511878,-0.274454,0.449384,-0.115485,0.6067,-0.093008,-0.149644


In [27]:
data.columns

Index(['startYear', 'endYear', 'runtimeMinutes', 'numVotes', 'label', '0', '1',
       '2', '3', '4',
       ...
       '190', '191', '192', '193', '194', '195', '196', '197', '198', '199'],
      dtype='object', length=205)

In [28]:
# save processsed dataset to file
data.to_csv("data.csv")

In [29]:
# split in train and validation sets
train_df, valid_df = train_test_split(data, train_size=0.7, shuffle=True, stratify=df['label'], random_state=SEED)

In [34]:
# model = LogisticRegression(random_state=SEED, 
#                            max_iter=500,
#                            n_jobs=mp.cpu_count(), 
#                            verbose=1)
# model.fit(train_df.loc[:, train_df.columns != 'label'], train_df['label'])

In [38]:
model = lgb.LGBMClassifier(random_state=SEED)
model.fit(train_df.loc[:, train_df.columns != 'label'],train_df['label'],
          verbose=20,eval_metric='logloss')



LGBMClassifier(random_state=17)

In [39]:
valid_df['label'] = valid_df['label'].astype('int')
val_preds = model.predict(valid_df.loc[:, train_df.columns != 'label'])

In [40]:
acc_valid = accuracy_score(y_true=valid_df['label'].astype('int'), y_pred=val_preds)

In [41]:
acc_valid

0.7269681742043551