# Import necessary libraries

In [1]:
import json
import numpy as np
import pandas as pd
from itertools import groupby

from py_files.writer_director_to_one_hot import writer_director_to_one_hot
from py_files.add_merge_begin_end_year import merge_start_end_year
from py_files.load_box_office_data import load_and_aggregate_box_office
from py_files.add_remake_feature import create_remake_column
from py_files.add_langoriginaltitle_feature import add_language_of_original_title
from py_files.add_ENvsNonEN_feature import add_english_title_or_not
from py_files.add_movie_genre_feature import add_movie_genre

from py_files.d2v_embed import d2v_embed
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import accuracy_score
import math

# Loading the data

In [2]:
from py_files.load_original_data import load_original_data

df_original = load_original_data()

Found files: train-1.csv, train-2.csv, train-3.csv, train-4.csv, train-5.csv, train-6.csv, train-7.csv, train-8.csv


# Preprocessing of original columns

In [3]:
# copy the dataframe so we leave the original untouched
df_preprocessed = df_original.copy(deep=True)

# start the preprocessing
df_preprocessed = df_original.replace("\\N", np.nan)
df_preprocessed["primaryTitleFormatted"] = df_preprocessed["primaryTitle"].str.lower()\
                                                                          .str.normalize('NFKD')\
                                                                          .str.encode('ascii', errors='ignore')\
                                                                          .str.decode('utf-8')\
                                                                          .str.replace(" ", "_", regex=True)\
                                                                          .str.replace("\W", "", regex=True)

# merge endYear into beginYear when beginYear is not available --> rename Year
df_preprocessed = merge_start_end_year(df_preprocessed)

# set the datatypes of the dataframe correctly
df_preprocessed['Year'] = df_preprocessed['Year'].astype(int)
df_preprocessed['runtimeMinutes'] = df_preprocessed['runtimeMinutes'].astype(float)

df_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7959 entries, 0 to 7958
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   tconst                 7959 non-null   object 
 1   primaryTitle           7959 non-null   object 
 2   originalTitle          3971 non-null   object 
 3   startYear              7173 non-null   object 
 4   endYear                786 non-null    object 
 5   runtimeMinutes         7946 non-null   float64
 6   numVotes               7169 non-null   float64
 7   label                  7959 non-null   bool   
 8   primaryTitleFormatted  7959 non-null   object 
 9   Year                   7959 non-null   int32  
dtypes: bool(1), float64(2), int32(1), object(6)
memory usage: 536.4+ KB


## Preprocessing of exogenous data

### Oscar data

In [4]:
oscars = pd.read_csv("additional_data/oscars.csv")

oscars["film"] = oscars["film"].str.lower()\
                               .str.normalize('NFKD')\
                               .str.encode('ascii', errors='ignore')\
                               .str.decode('utf-8')\
                               .str.replace(" ", "_", regex=True)\
                               .str.replace("\W", "", regex=True)

# Counting oscar nominations and wins per movie
oscar_noms = pd.merge(df_preprocessed, oscars, left_on = "primaryTitleFormatted", right_on = "film").groupby("tconst")["winner"].count()
oscar_wins = pd.merge(df_preprocessed, oscars, left_on = "primaryTitleFormatted", right_on = "film").groupby("tconst")["winner"].sum()

### Writer and Director data

In [5]:
# Find writers and directors per movie and combine the two
written_and_directed = (writer_director_to_one_hot("writers") + writer_director_to_one_hot("directors")).fillna(0).astype(int).loc[df_preprocessed['tconst']]

### TMDB data

### Box Office data

In [6]:
df_box_office_mojo = load_and_aggregate_box_office()

# process the 'release group' (read movie title) in the same way as the formatted title
df_box_office_mojo["Release Group"] = df_box_office_mojo["Release Group"].str.lower()\
                                       .str.normalize('NFKD')\
                                       .str.encode('ascii', errors='ignore')\
                                       .str.decode('utf-8')\
                                       .str.replace(" ", "_", regex=True)\
                                       .str.replace("\W", "", regex=True)
df_box_office_mojo.drop(['%', '%.1'], axis=1, inplace=True)

Found files: box_office_mojo\1977.csv, box_office_mojo\1978.csv, box_office_mojo\1979.csv, box_office_mojo\1980.csv, box_office_mojo\1981.csv, box_office_mojo\1982.csv, box_office_mojo\1983.csv, box_office_mojo\1984.csv, box_office_mojo\1985.csv, box_office_mojo\1986.csv, box_office_mojo\1987.csv, box_office_mojo\1988.csv, box_office_mojo\1989.csv, box_office_mojo\1990.csv, box_office_mojo\1991.csv, box_office_mojo\1992.csv, box_office_mojo\1993.csv, box_office_mojo\1994.csv, box_office_mojo\1995.csv, box_office_mojo\1996.csv, box_office_mojo\1997.csv, box_office_mojo\1998.csv, box_office_mojo\1999.csv, box_office_mojo\2000.csv, box_office_mojo\2001.csv, box_office_mojo\2002.csv, box_office_mojo\2003.csv, box_office_mojo\2004.csv, box_office_mojo\2005.csv, box_office_mojo\2006.csv, box_office_mojo\2007.csv, box_office_mojo\2008.csv, box_office_mojo\2009.csv, box_office_mojo\2010.csv, box_office_mojo\2011.csv, box_office_mojo\2012.csv, box_office_mojo\2013.csv, box_office_mojo\2014.csv,

# Adding of exogenous columns

In [7]:
df_incl_exog = df_preprocessed.copy(deep=True)
df_incl_exog = df_incl_exog.rename({"tconst" : "id"}, axis = 1).set_index("id")
df_incl_exog.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7959 entries, tt0010600 to tt9911196
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   primaryTitle           7959 non-null   object 
 1   originalTitle          3971 non-null   object 
 2   startYear              7173 non-null   object 
 3   endYear                786 non-null    object 
 4   runtimeMinutes         7946 non-null   float64
 5   numVotes               7169 non-null   float64
 6   label                  7959 non-null   bool   
 7   primaryTitleFormatted  7959 non-null   object 
 8   Year                   7959 non-null   int32  
dtypes: bool(1), float64(2), int32(1), object(5)
memory usage: 536.3+ KB


## add oscar data

In [8]:
df_incl_exog["oscar_noms"] = oscar_noms
df_incl_exog["oscar_wins"] = oscar_wins

## add mojo box office

In [9]:
df_incl_exog = df_incl_exog.reset_index().merge(df_box_office_mojo, left_on=['primaryTitleFormatted', 'Year'], right_on=['Release Group', 'year'], how="left").set_index('id')
df_incl_exog.drop(['Release Group', 'year'], axis=1, inplace=True)

df_incl_exog.loc[df_incl_exog['Worldwide'] == '-', 'Worldwide'] = np.nan
df_incl_exog.loc[df_incl_exog['Domestic'] == '-', 'Domestic'] = np.nan
df_incl_exog.loc[df_incl_exog['Foreign'] == '-', 'Foreign'] = np.nan
df_incl_exog.loc[df_incl_exog['Worldwide'].notnull(), 'Worldwide'] = df_incl_exog.loc[df_incl_exog['Worldwide'].notnull(), 'Worldwide'].apply(lambda x: float(x.replace('$', '').replace(',', '')))
df_incl_exog.loc[df_incl_exog['Domestic'].notnull(), 'Domestic'] = df_incl_exog.loc[df_incl_exog['Domestic'].notnull(), 'Domestic'].apply(lambda x: float(x.replace('$', '').replace(',', '')))
df_incl_exog.loc[df_incl_exog['Foreign'].notnull(), 'Foreign'] = df_incl_exog.loc[df_incl_exog['Foreign'].notnull(), 'Foreign'].apply(lambda x: float(x.replace('$', '').replace(',', '')))

## add remake column

In [10]:
df_incl_exog = create_remake_column(df_incl_exog)

## add title language

In [11]:
# # add the language of the original title, currently commented for training data usage and not wait 15 min every time
# df_incl_exog = add_language_of_original_title(df_incl_exog)

df_added_lang = pd.read_csv('additional_data/df_added_lang.csv', index_col=0)
df_added_lang = df_added_lang.rename({"tconst" : "id"}, axis = 1).set_index("id")
df_incl_exog = df_incl_exog.join(df_added_lang['title_language'], how='left')

## add whether title is English or not

In [12]:
df_incl_exog = add_english_title_or_not(df_incl_exog)

## add movie genres

In [13]:
df_incl_exog = add_movie_genre(df_incl_exog)

## add writers and directors

In [14]:
df_incl_exog = pd.concat([df_incl_exog.T, written_and_directed.T]).T

## add TMDB data

## save dataframe with features

In [15]:
df_incl_exog.to_csv('df_with_features.csv')

# Preparing data for classifier

Convert non-numeric columns to numeric.
We use Doc2Vec to embed each string column into n-by-128 array 

In [2]:
df = pd.read_csv('df_with_features.csv', index_col=0)

In [3]:
prim_title_df = d2v_embed(df['primaryTitle'])
orig_title_df = d2v_embed(df['originalTitle'])
prim_title_formatted_df = d2v_embed(df['primaryTitleFormatted'])
title_formatted_df = d2v_embed(df['titleFormatted'])
genres_df = d2v_embed(df['genres'])

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [03:48<00:00,  2.28s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [03:24<00:00,  2.04s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [03:05<00:00,  1.86s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:27<00:00,  1.14it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [01:21<00:00,  1.22it/s]


In [4]:
# just encode languages into ints for this column
df['title_language'] = pd.factorize(df['title_language'])[0]

# Selecting data for predicting

In [5]:
df.drop(columns = df.select_dtypes(include='object').columns, inplace=True)

In [6]:
# dealing with (some) nan values
for index, row in df.iterrows():
    # For missing startYear or endYear entries, insert the other, if it exists.
    if math.isnan(row['startYear']):
        if not math.isnan(row['endYear']):
            df.at[index,'startYear']=df.at[index,'endYear']
    if math.isnan(row['endYear']):
        if not math.isnan(row['startYear']):
            df.at[index,'endYear']=df.at[index,'startYear']
    
    # For missing oscar_noms and oscar_wins, insert 0
    if math.isnan(row['oscar_noms']):
        df.at[index,'oscar_noms'] = 0
    if math.isnan(row['oscar_wins']):
        df.at[index,'oscar_wins'] = 0

df['numVotes'] = df['numVotes'].fillna(df['numVotes'].mean(skipna=True))
df['runtimeMinutes'] = df['runtimeMinutes'].fillna(df['runtimeMinutes'].mean(skipna=True))

In [7]:
df = df.join(prim_title_df)
df = df.join(orig_title_df)
df = df.join(prim_title_formatted_df)
df = df.join(title_formatted_df)
# df = df.join(genres_df)
df.head()

Unnamed: 0,startYear,endYear,runtimeMinutes,numVotes,label,Year,oscar_noms,oscar_wins,Rank,Worldwide,...,titleFormatted_118,titleFormatted_119,titleFormatted_120,titleFormatted_121,titleFormatted_122,titleFormatted_123,titleFormatted_124,titleFormatted_125,titleFormatted_126,titleFormatted_127
tt0010600,1919.0,1919.0,66.0,1898.0,True,1919,0.0,0.0,,,...,-0.055023,-0.009641,-0.006508,0.360667,0.190912,-0.138266,0.039766,-0.014868,0.217307,-0.140372
tt0011841,1920.0,1920.0,145.0,5376.0,True,1920,0.0,0.0,,,...,-0.056519,-0.049163,-0.051074,0.049195,-0.067826,-0.086067,-0.073648,-0.074869,-0.104015,-0.118744
tt0012494,1921.0,1921.0,97.0,5842.0,True,1921,0.0,0.0,,,...,0.003397,-0.001142,-0.001421,-0.001415,-0.002896,-9.1e-05,0.002653,-0.000516,-0.002534,0.00259
tt0015163,1924.0,1924.0,59.0,9652.0,True,1924,0.0,0.0,,,...,-0.002926,-0.00349,-0.003414,-0.003009,-0.002648,-0.003438,0.003772,0.000833,0.00316,-0.002091
tt0016220,1925.0,1925.0,93.0,17887.0,True,1925,3.0,0.0,,,...,0.002632,-0.001953,0.003525,-0.001428,-0.003035,0.002686,-0.002837,0.002153,0.001681,-0.002182


df_added_dataclassifier

In [8]:
train_df, valid_df = train_test_split(df, train_size=0.7, shuffle=True, stratify=df['label'], random_state=17)

# Evaluating classifier

In [9]:
model_lgbm = lgb.LGBMClassifier(objective='binary',
                                learning_rate=0.01,
                                num_iterations=1000,
                                feature_fraction=0.8,
                                verbosity=1,
                                random_state=17)
model_lgbm.fit(train_df.loc[:, train_df.columns != 'label'],
              train_df['label'],
              eval_metric='logloss')



[LightGBM] [Info] Number of positive: 2793, number of negative: 2778
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 132362
[LightGBM] [Info] Number of data points in the train set: 5571, number of used features: 546
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501346 -> initscore=0.005385
[LightGBM] [Info] Start training from score 0.005385


LGBMClassifier(feature_fraction=0.8, learning_rate=0.01, num_iterations=1000,
               objective='binary', random_state=17, verbosity=1)

# Predicting

In [10]:
# current predictions are made on training data only
valid_df['label'] = valid_df['label'].astype('int')
val_preds = model_lgbm.predict(valid_df.loc[:, train_df.columns != 'label'])
acc_valid_lgbm = accuracy_score(y_true=valid_df['label'].astype('int'), y_pred=val_preds)
# acc_score = 
acc_valid_lgbm

0.7931323283082077