# Import necessary libraries

In [1]:
import json
import ast
import numpy as np
import pandas as pd
from itertools import groupby

from py_files.writer_director_to_one_hot import writer_director_to_one_hot
from py_files.add_merge_begin_end_year import merge_start_end_year
from py_files.load_box_office_data import load_and_aggregate_box_office
from py_files.add_remake_feature import create_remake_column
from py_files.add_langoriginaltitle_feature import add_language_of_original_title
from py_files.add_ENvsNonEN_feature import add_english_title_or_not
from py_files.add_movie_genre_feature import add_movie_genre
from py_files.df_processor_enrichment import df_processor_enrichment

from py_files.df_model_prep import df_model_prep
from py_files.d2v_embed import d2v_embed
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import accuracy_score
import math

# Loading the data

In [2]:
from py_files.load_original_data import load_original_data

df_original = load_original_data()

Found files: train-1.csv, train-2.csv, train-3.csv, train-4.csv, train-5.csv, train-6.csv, train-7.csv, train-8.csv


# Preprocessing of original columns

In [143]:
# copy the dataframe so we leave the original untouched
df_preprocessed = df_original.copy(deep=True)

# start the preprocessing
df_preprocessed = df_original.replace("\\N", np.nan)
df_preprocessed["primaryTitleFormatted"] = df_preprocessed["primaryTitle"].str.lower()\
                                                                          .str.normalize('NFKD')\
                                                                          .str.encode('ascii', errors='ignore')\
                                                                          .str.decode('utf-8')\
                                                                          .str.replace(" ", "_", regex=True)\
                                                                          .str.replace("\W", "", regex=True)

# merge endYear into beginYear when beginYear is not available --> rename Year
# df_preprocessed = merge_start_end_year(df_preprocessed)

# set the datatypes of the dataframe correctly
# df_preprocessed['Year'] = df_preprocessed['Year'].astype(int)
df_preprocessed['runtimeMinutes'] = df_preprocessed['runtimeMinutes'].astype(float)

df_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7959 entries, 0 to 7958
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   tconst                 7959 non-null   object 
 1   primaryTitle           7959 non-null   object 
 2   originalTitle          3971 non-null   object 
 3   startYear              7173 non-null   object 
 4   endYear                786 non-null    object 
 5   runtimeMinutes         7946 non-null   float64
 6   numVotes               7169 non-null   float64
 7   label                  7959 non-null   bool   
 8   primaryTitleFormatted  7959 non-null   object 
dtypes: bool(1), float64(2), object(6)
memory usage: 505.3+ KB


## Preprocessing of exogenous data

### Oscar data

In [144]:
oscars = pd.read_csv("additional_data/oscars.csv")

oscars["film"] = oscars["film"].str.lower()\
                               .str.normalize('NFKD')\
                               .str.encode('ascii', errors='ignore')\
                               .str.decode('utf-8')\
                               .str.replace(" ", "_", regex=True)\
                               .str.replace("\W", "", regex=True)

# Counting oscar nominations and wins per movie
oscar_noms = pd.merge(df_preprocessed, oscars, left_on = "primaryTitleFormatted", right_on = "film").groupby("tconst")["winner"].count()
oscar_wins = pd.merge(df_preprocessed, oscars, left_on = "primaryTitleFormatted", right_on = "film").groupby("tconst")["winner"].sum()

### Writer and Director data

In [10]:
# Find writers and directors per movie and combine the two
writers = writer_director_to_one_hot("writers")
directors = writer_director_to_one_hot("directors")
written_and_directed = writers.add(directors, fill_value=0).fillna(0).astype(int).loc[df_preprocessed["tconst"]]

### TMDB data

In [126]:
df_TMDB = pd.read_csv("additional_data/TMDB.csv")[["budget", "genres", "imdb_id", 
                                                   "original_language", "overview", 
                                                   "popularity", "production_companies", 
                                                   "tagline", "Keywords", "revenue"]]

In [127]:
def dict_to_list(dictionary):
    try:
        d = ast.literal_eval(dictionary)
    except ValueError:
        return []
    
    return [i["name"] for i in d]

In [128]:
df_TMDB["genres"] = df_TMDB["genres"].apply(lambda x: dict_to_list(x))
df_TMDB["Keywords"] = df_TMDB["Keywords"].apply(lambda x: dict_to_list(x))
df_TMDB["production_companies"] = df_TMDB["production_companies"].apply(lambda x: dict_to_list(x))
df_TMDB = df_TMDB.set_index("imdb_id")

### Metacritic data

In [129]:
df_meta = pd.read_csv("additional_data/Metacritic.csv").drop("Unnamed: 0", axis=1).set_index("movie")
df_meta["overview"] = df_meta["overview"].apply(lambda x: eval(x))
df_meta["overview"] = df_meta["overview"].apply(lambda x: x[0] if x else str(x))

In [130]:
overviews = pd.merge(df_TMDB["overview"], df_meta["overview"], left_index=True, right_index=True, how="outer")
overviews["overview"] = overviews["overview_x"].str.cat(overviews["overview_y"], na_rep="")
overviews = overviews.drop(["overview_x", "overview_y"], axis=1)
df_TMDB = df_TMDB.drop("overview", axis=1)

### Box Office data

In [131]:
df_box_office_mojo = load_and_aggregate_box_office()

# process the 'release group' (read movie title) in the same way as the formatted title
df_box_office_mojo["Release Group"] = df_box_office_mojo["Release Group"].str.lower()\
                                       .str.normalize('NFKD')\
                                       .str.encode('ascii', errors='ignore')\
                                       .str.decode('utf-8')\
                                       .str.replace(" ", "_", regex=True)\
                                       .str.replace("\W", "", regex=True)
df_box_office_mojo.drop(['%', '%.1'], axis=1, inplace=True)

Found files: box_office_mojo\1977.csv, box_office_mojo\1978.csv, box_office_mojo\1979.csv, box_office_mojo\1980.csv, box_office_mojo\1981.csv, box_office_mojo\1982.csv, box_office_mojo\1983.csv, box_office_mojo\1984.csv, box_office_mojo\1985.csv, box_office_mojo\1986.csv, box_office_mojo\1987.csv, box_office_mojo\1988.csv, box_office_mojo\1989.csv, box_office_mojo\1990.csv, box_office_mojo\1991.csv, box_office_mojo\1992.csv, box_office_mojo\1993.csv, box_office_mojo\1994.csv, box_office_mojo\1995.csv, box_office_mojo\1996.csv, box_office_mojo\1997.csv, box_office_mojo\1998.csv, box_office_mojo\1999.csv, box_office_mojo\2000.csv, box_office_mojo\2001.csv, box_office_mojo\2002.csv, box_office_mojo\2003.csv, box_office_mojo\2004.csv, box_office_mojo\2005.csv, box_office_mojo\2006.csv, box_office_mojo\2007.csv, box_office_mojo\2008.csv, box_office_mojo\2009.csv, box_office_mojo\2010.csv, box_office_mojo\2011.csv, box_office_mojo\2012.csv, box_office_mojo\2013.csv, box_office_mojo\2014.csv,

# Adding of exogenous columns

In [132]:
df_incl_exog = df_preprocessed.copy(deep=True)
df_incl_exog = df_incl_exog.rename({"tconst" : "id"}, axis = 1).set_index("id")
df_incl_exog.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7959 entries, tt0010600 to tt9911196
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   primaryTitle           7959 non-null   object 
 1   originalTitle          3971 non-null   object 
 2   startYear              7173 non-null   object 
 3   endYear                786 non-null    object 
 4   runtimeMinutes         7946 non-null   float64
 5   numVotes               7169 non-null   float64
 6   label                  7959 non-null   bool   
 7   primaryTitleFormatted  7959 non-null   object 
 8   Year                   7959 non-null   int32  
dtypes: bool(1), float64(2), int32(1), object(5)
memory usage: 536.3+ KB


## add oscar data

In [133]:
df_incl_exog["oscar_noms"] = oscar_noms
df_incl_exog["oscar_wins"] = oscar_wins

## add mojo box office

In [134]:
df_incl_exog = df_incl_exog.reset_index().merge(df_box_office_mojo, left_on=['primaryTitleFormatted', 'Year'], right_on=['Release Group', 'year'], how="left").set_index('id')
df_incl_exog.drop(['Release Group', 'year'], axis=1, inplace=True)

df_incl_exog.loc[df_incl_exog['Worldwide'] == '-', 'Worldwide'] = np.nan
df_incl_exog.loc[df_incl_exog['Domestic'] == '-', 'Domestic'] = np.nan
df_incl_exog.loc[df_incl_exog['Foreign'] == '-', 'Foreign'] = np.nan
df_incl_exog.loc[df_incl_exog['Worldwide'].notnull(), 'Worldwide'] = df_incl_exog.loc[df_incl_exog['Worldwide'].notnull(), 'Worldwide'].apply(lambda x: float(x.replace('$', '').replace(',', '')))
df_incl_exog.loc[df_incl_exog['Domestic'].notnull(), 'Domestic'] = df_incl_exog.loc[df_incl_exog['Domestic'].notnull(), 'Domestic'].apply(lambda x: float(x.replace('$', '').replace(',', '')))
df_incl_exog.loc[df_incl_exog['Foreign'].notnull(), 'Foreign'] = df_incl_exog.loc[df_incl_exog['Foreign'].notnull(), 'Foreign'].apply(lambda x: float(x.replace('$', '').replace(',', '')))

## add remake column

In [135]:
df_incl_exog = create_remake_column(df_incl_exog)

## add title language

In [136]:
# # add the language of the original title, currently commented for training data usage and not wait 15 min every time
# df_incl_exog = add_language_of_original_title(df_incl_exog)

df_added_lang = pd.read_csv('additional_data/df_added_lang.csv', index_col=0)
df_added_lang = df_added_lang.rename({"tconst" : "id"}, axis = 1).set_index("id")
df_incl_exog = df_incl_exog.join(df_added_lang['title_language'], how='left')

## add whether title is English or not

In [137]:
df_incl_exog = add_english_title_or_not(df_incl_exog)

## add movie genres

In [138]:
df_incl_exog = add_movie_genre(df_incl_exog)

## add writers and directors

In [24]:
df_incl_exog = pd.concat([df_incl_exog.T, written_and_directed.T]).T

## add TMDB & Metacritic overviews

In [140]:
df_incl_exog = pd.merge(df_incl_exog, df_TMDB, how="left", left_index=True, right_index=True)
df_incl_exog = pd.merge(df_incl_exog, overviews, how="left", left_index=True, right_index=True)

In [141]:
df_incl_exog["overview"].str.len().sort_values().dropna()

id
tt1950235       2.0
tt1285009       2.0
tt3488184       2.0
tt0490170      18.0
tt1796603      41.0
              ...  
tt7253506    1749.0
tt0840361    1966.0
tt1149361    1987.0
tt1729637    1996.0
tt0926084    2002.0
Name: overview, Length: 3127, dtype: float64

## save dataframe with features

In [23]:
df_incl_exog.to_csv('df_with_features.csv')

# Preparing data for classifier

Convert non-numeric columns to numeric.
We use Doc2Vec to embed each string column into n-by-128 array 

In [9]:
# from py_files.df_processor_enrichment import df_processor_enrichment
# train_df = pd.read_csv('df_with_features.csv', index_col=0)
train_df = df_processor_enrichment('train')
train_df.head()

Looking for pre made file...


Unnamed: 0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label,primaryTitleFormatted,Year,oscar_noms,...,nm9925241,nm9933959,nm9942830,nm9946633,nm9955258,nm9958352,nm9958353,nm9980769,nm9985316,nm9985837
tt0010600,The Doll,Die Puppe,1919.0,,66.0,1898.0,True,the_doll,1919,,...,0,0,0,0,0,0,0,0,0,0
tt0011841,Way Down East,Way Down East,1920.0,,145.0,5376.0,True,way_down_east,1920,,...,0,0,0,0,0,0,0,0,0,0
tt0012494,Déstiny,Der müde Tod,1921.0,,97.0,5842.0,True,destiny,1921,,...,0,0,0,0,0,0,0,0,0,0
tt0015163,The Navigator,The Navigator,1924.0,,59.0,9652.0,True,the_navigator,1924,,...,0,0,0,0,0,0,0,0,0,0
tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925.0,,93.0,17887.0,True,the_phantom_of_the_opera,1925,3.0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# from py_files.df_model_prep import df_model_prep

train_df_prepped = df_model_prep(train_df,'train')
train_df_prepped.head()

Looking for pre made file...


Unnamed: 0,startYear,endYear,runtimeMinutes,numVotes,label,Year,oscar_noms,oscar_wins,Rank,Worldwide,...,genres_118,genres_119,genres_120,genres_121,genres_122,genres_123,genres_124,genres_125,genres_126,genres_127
tt0010600,1919.0,1919.0,66.0,1898.0,True,1919,0.0,0.0,,,...,0.11625,-0.001943,0.09604,-0.145849,-0.051422,-0.252837,0.115215,0.327335,0.103121,0.042877
tt0011841,1920.0,1920.0,145.0,5376.0,True,1920,0.0,0.0,,,...,-0.001922,0.000793,0.063636,-0.000612,0.079247,-0.00119,-0.113678,-0.047694,0.08594,0.033783
tt0012494,1921.0,1921.0,97.0,5842.0,True,1921,0.0,0.0,,,...,-0.001502,-0.002725,-0.00099,-0.001626,-0.00197,0.002729,-0.000598,-0.000955,0.000612,0.000262
tt0015163,1924.0,1924.0,59.0,9652.0,True,1924,0.0,0.0,,,...,0.000813,-0.002928,-0.003572,-0.001633,-0.001751,-0.001878,-0.002639,0.000714,0.000631,-0.003067
tt0016220,1925.0,1925.0,93.0,17887.0,True,1925,3.0,0.0,,,...,0.000298,0.000112,0.003719,-0.002776,-0.000493,0.000173,0.001278,3.3e-05,0.001724,0.001425


# Evaluating classifier

In [11]:
model_lgbm = lgb.LGBMClassifier(objective='binary',
                                learning_rate=0.01,
                                num_iterations=1000,
                                feature_fraction=0.8,
                                verbosity=1,
                                random_state=17)
model_lgbm.fit(train_df_prepped.loc[:, train_df_prepped.columns != 'label'],
              train_df_prepped['label'],
              eval_metric='logloss')



[LightGBM] [Info] Number of positive: 3990, number of negative: 3969
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 165089
[LightGBM] [Info] Number of data points in the train set: 7959, number of used features: 674
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501319 -> initscore=0.005277
[LightGBM] [Info] Start training from score 0.005277


LGBMClassifier(feature_fraction=0.8, learning_rate=0.01, num_iterations=1000,
               objective='binary', random_state=17, verbosity=1)

# Predicting

## Add and process train and valid data

In [3]:
valid_df = df_processor_enrichment('validation_hidden.csv')
valid_df.head()

Looking for pre made file...
File not found, creating a new one..
Found files: box_office_mojo\1977.csv, box_office_mojo\1978.csv, box_office_mojo\1979.csv, box_office_mojo\1980.csv, box_office_mojo\1981.csv, box_office_mojo\1982.csv, box_office_mojo\1983.csv, box_office_mojo\1984.csv, box_office_mojo\1985.csv, box_office_mojo\1986.csv, box_office_mojo\1987.csv, box_office_mojo\1988.csv, box_office_mojo\1989.csv, box_office_mojo\1990.csv, box_office_mojo\1991.csv, box_office_mojo\1992.csv, box_office_mojo\1993.csv, box_office_mojo\1994.csv, box_office_mojo\1995.csv, box_office_mojo\1996.csv, box_office_mojo\1997.csv, box_office_mojo\1998.csv, box_office_mojo\1999.csv, box_office_mojo\2000.csv, box_office_mojo\2001.csv, box_office_mojo\2002.csv, box_office_mojo\2003.csv, box_office_mojo\2004.csv, box_office_mojo\2005.csv, box_office_mojo\2006.csv, box_office_mojo\2007.csv, box_office_mojo\2008.csv, box_office_mojo\2009.csv, box_office_mojo\2010.csv, box_office_mojo\2011.csv, box_office_

Unnamed: 0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,primaryTitleFormatted,Year,oscar_noms,oscar_wins,...,nm9925241,nm9933959,nm9942830,nm9946633,nm9955258,nm9958352,nm9958353,nm9980769,nm9985316,nm9985837
tt0003740,Cabiria,,1914,,148.0,3452.0,cabiria,1914,,,...,0,0,0,0,0,0,0,0,0,0
tt0008663,A Man There Was,Terje Vigen,1917,,65.0,1882.0,a_man_there_was,1917,,,...,0,0,0,0,0,0,0,0,0,0
tt0010307,J'accuse!,,1919,,166.0,1692.0,jaccuse,1919,,,...,0,0,0,0,0,0,0,0,0,0
tt0014429,Safety Last!,Safety Last!,1923,,74.0,19898.0,safety_last,1923,,,...,0,0,0,0,0,0,0,0,0,0
tt0015175,Die Nibelungen: Siegfried,,1924,,143.0,5676.0,die_nibelungen_siegfried,1924,,,...,0,0,0,0,0,0,0,0,0,0


In [4]:
valid_df_prepped = df_model_prep(valid_df, 'valid')
valid_df_prepped.head()

Looking for pre made file...


Unnamed: 0,startYear,endYear,runtimeMinutes,numVotes,Year,oscar_noms,oscar_wins,Rank,Worldwide,Domestic,...,genres_118,genres_119,genres_120,genres_121,genres_122,genres_123,genres_124,genres_125,genres_126,genres_127
tt0003740,1914.0,1914.0,148.0,3452.0,1914,0.0,0.0,,,,...,0.005018,-0.083632,-0.309236,0.083878,-0.032624,-0.081726,-0.138933,-0.00248,-0.038279,-0.205428
tt0008663,1917.0,1917.0,65.0,1882.0,1917,0.0,0.0,,,,...,0.00024,0.002041,0.002875,-0.001802,-0.000609,-0.001934,0.000139,-0.000298,-8.3e-05,0.002119
tt0010307,1919.0,1919.0,166.0,1692.0,1919,0.0,0.0,,,,...,0.002928,-0.001719,-0.002222,0.000331,-0.003333,0.003408,-0.0001,0.001668,-0.00229,0.001684
tt0014429,1923.0,1923.0,74.0,19898.0,1923,0.0,0.0,,,,...,-0.025711,0.0101,-0.068962,0.006616,0.055832,0.022382,-0.051938,-0.010245,0.067933,0.024606
tt0015175,1924.0,1924.0,143.0,5676.0,1924,0.0,0.0,,,,...,0.00301,-0.00116,0.003869,0.000499,-0.001284,-0.00184,0.001352,0.002752,0.001206,0.000316


In [5]:
valid_df_prepped.info()

<class 'pandas.core.frame.DataFrame'>
Index: 955 entries, tt0003740 to tt9812614
Columns: 18582 entries, startYear to genres_127
dtypes: bool(2), float64(670), int64(17910)
memory usage: 135.4+ MB


In [6]:
test_df = df_processor_enrichment('test_hidden.csv')
test_df.head()

Looking for pre made file...


Unnamed: 0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,primaryTitleFormatted,Year,oscar_noms,oscar_wins,...,nm9925241,nm9933959,nm9942830,nm9946633,nm9955258,nm9958352,nm9958353,nm9980769,nm9985316,nm9985837
tt0014972,He Who Gets Slapped,He Who Gets Slapped,1924.0,,95.0,3654.0,he_who_gets_slapped,1924,,,...,0,0,0,0,0,0,0,0,0,0
tt0015016,The Iron Horse,,1924.0,,150.0,2136.0,the_iron_horse,1924,,,...,0,0,0,0,0,0,0,0,0,0
tt0015174,Die Nibelungen: Kriemhild's Revenge,,1924.0,,129.0,4341.0,die_nibelungen_kriemhilds_revenge,1924,,,...,0,0,0,0,0,0,0,0,0,0
tt0015214,At 3:25,,,1925.0,59.0,1724.0,at_325,1925,,,...,0,0,0,0,0,0,0,0,0,0
tt0015863,Go West,,1925.0,,69.0,4188.0,go_west,1925,,,...,0,0,0,0,0,0,0,0,0,0


In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1086 entries, tt0014972 to tt9526826
Columns: 17947 entries, primaryTitle to nm9985837
dtypes: bool(2), float64(31), int64(17909), object(5)
memory usage: 148.7+ MB


In [8]:
test_df_prepped = df_model_prep(test_df, 'test')
test_df_prepped.head()

Looking for pre made file...
No file found, creating a new one


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:18<00:00,  5.54it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:16<00:00,  6.11it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:16<00:00,  6.03it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:15<00:00,  6.55it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:14<00:00,  6.78it/s]


Unnamed: 0,startYear,endYear,runtimeMinutes,numVotes,Year,oscar_noms,oscar_wins,Rank,Worldwide,Domestic,...,genres_118,genres_119,genres_120,genres_121,genres_122,genres_123,genres_124,genres_125,genres_126,genres_127
tt0014972,1924.0,1924.0,95.0,3654.0,1924,0.0,0.0,,,,...,-0.07083,-0.063842,-0.011708,0.097846,-0.098989,0.031874,0.065344,-0.000238,-0.052941,-0.005215
tt0015016,1924.0,1924.0,150.0,2136.0,1924,0.0,0.0,,,,...,-0.000991,-0.001031,-0.002112,0.002662,0.000402,-0.001271,-0.002833,-0.001403,-0.002354,0.001079
tt0015174,1924.0,1924.0,129.0,4341.0,1924,0.0,0.0,,,,...,0.002657,-0.002607,-0.002448,-0.00052,-0.003271,0.001987,-2.3e-05,-0.002696,-0.002066,-0.002315
tt0015214,1925.0,1925.0,59.0,1724.0,1925,0.0,0.0,,,,...,-0.003289,-0.00241,0.00246,0.001951,0.000382,-0.001222,-0.002446,-0.001669,0.001374,-0.00276
tt0015863,1925.0,1925.0,69.0,4188.0,1925,0.0,0.0,,,,...,-0.041088,-0.030583,0.014165,0.137364,-0.087731,0.133544,0.1596,-0.085428,-0.11009,0.289789


In [15]:
test_df_prepped.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1086 entries, tt0014972 to tt9526826
Columns: 18582 entries, startYear to genres_127
dtypes: bool(2), float32(640), float64(30), int64(17910)
memory usage: 151.3+ MB


In [12]:
val_preds_lgbm = model_lgbm.predict(valid_df_prepped)
test_preds_lgbm = model_lgbm.predict(test_df_prepped)

In [13]:
with open('val_preds_lgbm.txt', 'w+') as f:
    for val in val_preds_lgbm:
        f.write(f"{str(val)}\n")

In [14]:
with open('test_preds_lgbm.txt', 'w+') as f:
    for val in test_preds_lgbm:
        f.write(f"{str(val)}\n")