In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
import multiprocessing as mp
from sklearn.metrics import accuracy_score

In [2]:
# Read the working directory and build the list of training shards.
# os.listdir() returns entries in arbitrary order, so the names are sorted
# (lexicographically) to keep the concatenation order reproducible between runs.
# Only .csv files are kept because every entry is later handed to pd.read_csv.
files = sorted(
    name for name in os.listdir()
    if 'train-' in name and name.endswith('.csv')
)

In [3]:
# Show which training files were picked up from the working directory
files

['train-1.csv',
 'train-2.csv',
 'train-3.csv',
 'train-4.csv',
 'train-5.csv',
 'train-6.csv',
 'train-7.csv',
 'train-8.csv']

In [4]:
# Build the initial DataFrame by concatenating every file in the list,
# then index the rows by their 'tconst' value
df = pd.concat(pd.read_csv(path) for path in files).set_index('tconst')

In [5]:
# Preview the first rows of the concatenated data
df.head()

Unnamed: 0_level_0,Unnamed: 0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
tt0010600,4,The Doll,Die Puppe,1919,\N,66,1898.0,True
tt0011841,7,Way Down East,Way Down East,1920,\N,145,5376.0,True
tt0012494,9,Déstiny,Der müde Tod,1921,\N,97,5842.0,True
tt0015163,25,The Navigator,The Navigator,1924,\N,59,9652.0,True
tt0016220,38,The Phantom of the Opera,The Phantom of the Opera,1925,\N,93,17887.0,True


In [6]:
# Drop every column whose name starts with "Unnamed" (the old index list)
is_unnamed = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~is_unnamed]

In [7]:
# Preview the frame after the "Unnamed" column was removed
df.head()

Unnamed: 0_level_0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
tt0010600,The Doll,Die Puppe,1919,\N,66,1898.0,True
tt0011841,Way Down East,Way Down East,1920,\N,145,5376.0,True
tt0012494,Déstiny,Der müde Tod,1921,\N,97,5842.0,True
tt0015163,The Navigator,The Navigator,1924,\N,59,9652.0,True
tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925,\N,93,17887.0,True


In [9]:
# Random seed shared by the train/validation split and the classifier
SEED = 17
# Name of the target column. Later cells read TARGET_FIELD, but no visible cell
# defined it, so a fresh kernel would fail with a NameError without this line.
TARGET_FIELD = 'label'

In [10]:
# Text columns: missing titles (NaN) are replaced by a single space
title_cols = ['primaryTitle', 'originalTitle']
df[title_cols] = df[title_cols].fillna(" ")

df.head()

Unnamed: 0_level_0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
tt0010600,The Doll,Die Puppe,1919,\N,66,1898.0,True
tt0011841,Way Down East,Way Down East,1920,\N,145,5376.0,True
tt0012494,Déstiny,Der müde Tod,1921,\N,97,5842.0,True
tt0015163,The Navigator,The Navigator,1924,\N,59,9652.0,True
tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925,\N,93,17887.0,True


In [11]:
# For numerical columns, replace non-numerical entries with zeroes.
# pd.to_numeric(errors='coerce') turns the '\N' placeholder, NaN and any other
# unparsable value into NaN, which is then filled with 0. Filling NaN with a
# blank string instead would make astype(int) raise a ValueError.
for col in ["startYear", "endYear", "runtimeMinutes"]:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

# For missing startYear or endYear entries, insert the other, if it exists.
# Both masks are taken before anything is overwritten, and the operation is
# vectorised so it does not rely on row-by-row label lookups.
start_missing = df['startYear'].eq(0)
end_missing = df['endYear'].eq(0)
df['startYear'] = df['startYear'].mask(start_missing, df['endYear'])
# Rows where both years were missing still hold 0 in startYear here, so
# endYear stays 0 for them.
df['endYear'] = df['endYear'].mask(end_missing, df['startYear'])

df.head()

Unnamed: 0_level_0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
tt0010600,The Doll,Die Puppe,1919,1919,66,1898.0,True
tt0011841,Way Down East,Way Down East,1920,1920,145,5376.0,True
tt0012494,Déstiny,Der müde Tod,1921,1921,97,5842.0,True
tt0015163,The Navigator,The Navigator,1924,1924,59,9652.0,True
tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925,1925,93,17887.0,True


In [15]:
# Float column: missing vote counts take the mean of the known vote counts
mean_votes = df['numVotes'].mean(skipna=True)
df['numVotes'] = df['numVotes'].fillna(mean_votes)
df.head()

Unnamed: 0_level_0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
tt0010600,The Doll,Die Puppe,1919,1919,66,1898.0,True
tt0011841,Way Down East,Way Down East,1920,1920,145,5376.0,True
tt0012494,Déstiny,Der müde Tod,1921,1921,97,5842.0,True
tt0015163,The Navigator,The Navigator,1924,1924,59,9652.0,True
tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925,1925,93,17887.0,True


In [16]:
# Use TF-IDF to convert column of String to feature matrix.
# The vectorizer is created with its default settings.
text_transformer = TfidfVectorizer()

In [17]:
# Convert primaryTitle column to feature matrix using TF-IDF.
# A clone of the shared vectorizer (same parameters, unfitted) is fitted here so
# the vocabulary learned for primaryTitle lives on its own estimator and is not
# lost if text_transformer is fitted on another column.
primaryTitleVectorizer = clone(text_transformer)
primaryTitleTFIDF = pd.DataFrame(primaryTitleVectorizer.fit_transform(df['primaryTitle']).toarray(),
                                 index = df.index)
primaryTitleTFIDF.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,8717,8718,8719,8720,8721,8722,8723,8724,8725,8726
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0010600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0011841,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0012494,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0015163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0016220,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Convert originalTitle column to feature matrix using TF-IDF.
# A clone of the shared vectorizer (same parameters, unfitted) is fitted here so
# that fitting on originalTitle does not overwrite whatever vocabulary
# text_transformer itself may already hold.
originalTitleVectorizer = clone(text_transformer)
originalTitleTFIDF = pd.DataFrame(originalTitleVectorizer.fit_transform(df['originalTitle']).toarray(),
                                  index = df.index)
originalTitleTFIDF.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,5795,5796,5797,5798,5799,5800,5801,5802,5803,5804
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0010600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0011841,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0012494,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0015163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0016220,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Renumber the originalTitle feature columns so they continue right after the
# primaryTitle ones; the only purpose is to keep every column name unique
first_col = primaryTitleTFIDF.shape[1]
last_col = first_col + originalTitleTFIDF.shape[1]
originalTitleTFIDF = pd.DataFrame(
    originalTitleTFIDF.values,
    index=df.index,
    columns=list(range(first_col, last_col)),
)
originalTitleTFIDF.head()

Unnamed: 0_level_0,8727,8728,8729,8730,8731,8732,8733,8734,8735,8736,...,14522,14523,14524,14525,14526,14527,14528,14529,14530,14531
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0010600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0011841,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0012494,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0015163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0016220,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Start the model table from df without the two raw title columns
data = df.drop(columns=['primaryTitle', 'originalTitle'])
data.head()

Unnamed: 0_level_0,startYear,endYear,runtimeMinutes,numVotes,label
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt0010600,1919,1919,66,1898.0,True
tt0011841,1920,1920,145,5376.0,True
tt0012494,1921,1921,97,5842.0,True
tt0015163,1924,1924,59,9652.0,True
tt0016220,1925,1925,93,17887.0,True


In [21]:
# Attach the primaryTitle TF-IDF features, matching rows on the index
data = data.join(primaryTitleTFIDF, how='left')
data.head()

Unnamed: 0_level_0,startYear,endYear,runtimeMinutes,numVotes,label,0,1,2,3,4,...,8717,8718,8719,8720,8721,8722,8723,8724,8725,8726
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0010600,1919,1919,66,1898.0,True,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0011841,1920,1920,145,5376.0,True,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0012494,1921,1921,97,5842.0,True,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0015163,1924,1924,59,9652.0,True,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0016220,1925,1925,93,17887.0,True,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Add originalTitle feature matrix.
# The TF-IDF columns are named with integers while the remaining columns have
# string names; recent scikit-learn versions refuse to fit or predict on a
# DataFrame whose column names mix types. Both sides are therefore renamed to
# strings before joining, which also keeps the overlap check of join() effective
# if this cell is run a second time.
data = data.rename(columns=str).join(originalTitleTFIDF.rename(columns=str))
data.head()

Unnamed: 0_level_0,startYear,endYear,runtimeMinutes,numVotes,label,0,1,2,3,4,...,14522,14523,14524,14525,14526,14527,14528,14529,14530,14531
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0010600,1919,1919,66,1898.0,True,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0011841,1920,1920,145,5376.0,True,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0012494,1921,1921,97,5842.0,True,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0015163,1924,1924,59,9652.0,True,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0016220,1925,1925,93,17887.0,True,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Persist the processed dataset (index included) as a CSV file
output_path = "data.csv"
data.to_csv(output_path)

In [23]:
# Split into train (70%) and validation sets, shuffled and seeded with SEED.
# The stratification labels are read from `data`, the frame being split, so
# they always line up row for row with the samples.
train_df, valid_df = train_test_split(
    data,
    train_size=0.7,
    shuffle=True,
    stratify=data[TARGET_FIELD],
    random_state=SEED,
)

In [24]:
# Logistic-regression classifier: seeded with SEED, capped at 500 iterations,
# one job per CPU reported by multiprocessing, with progress output enabled
n_workers = mp.cpu_count()
model = LogisticRegression(
    max_iter=500,
    n_jobs=n_workers,
    random_state=SEED,
    verbose=1,
)

In [25]:
# Fit on every column except 'label'; 'label' itself is the target
model.fit(
    train_df.drop(columns='label'),
    train_df['label'],
)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 out of   1 | elapsed:   52.9s finished


LogisticRegression(max_iter=500, n_jobs=12, random_state=17, verbose=1)

In [27]:
# Cast the validation target to int. assign() builds a new frame instead of
# writing a column into the object returned by train_test_split, which can
# trigger pandas' chained-assignment warning.
valid_df = valid_df.assign(**{TARGET_FIELD: valid_df[TARGET_FIELD].astype('int')})
# Predict on every column except 'label'; the column mask is built from
# valid_df itself rather than from another frame.
val_preds = model.predict(valid_df.loc[:, valid_df.columns != 'label'])



In [28]:
# Validation accuracy: integer-cast true labels against the model predictions
y_valid = valid_df[TARGET_FIELD].astype('int')
acc_valid = accuracy_score(y_true=y_valid, y_pred=val_preds)

In [29]:
# Display the accuracy measured on the validation set
acc_valid

0.6758793969849246