In [1]:
import pandas as pd
import numpy as np

from itertools import groupby
from sklearn.preprocessing import MultiLabelBinarizer

import json
import glob
import ast

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
def openfile(data):
    if data == "writers":
        with open("writing.json") as f:
            writers = f.read()
        
        return json.loads(writers)
    elif data == "directors":
        with open("directing.json") as f:
            directors = f.read()
        
        return json.loads(directors)    
    else:
        return NotImplemented    

In [3]:
def json_to_one_hot(kind = "writers"):

    if kind in ["writers", "directors"]:
        json_file = openfile(kind)
    else:
        return NotImplemented
  
    if kind == "writers":
         # Group writers/directors by movie
        groups = groupby([(i["movie"], i["writer"]) for i in json_file], key = lambda x : x[0])

        # Turn groupby object into a json-like dict
        grouped = {writer: [i[1] for i in movies] for writer, movies in groups}

        # Convert to Series
        df = pd.DataFrame.from_records(list(grouped.items())).set_index(0).squeeze()
    else:
        df = pd.DataFrame(json_file).groupby("movie")["director"].apply(lambda x: x.values)
    
    # Create one-hot encoded DataFrame
    mlb = MultiLabelBinarizer()
    res = pd.DataFrame(mlb.fit_transform(df),
                       columns=mlb.classes_,
                       index=df.index)
    
    return res.drop("\\N", axis = 1)

In [4]:
all_files = glob.glob("train*.csv")

print(f"Found files: {', '.join(all_files)}")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

df = pd.concat(li, axis=0, ignore_index=True).drop("Unnamed: 0", axis = 1)

Found files: train-1.csv, train-2.csv, train-3.csv, train-4.csv, train-5.csv, train-6.csv, train-7.csv, train-8.csv


In [5]:
df = df.replace("\\N", np.nan)

In [6]:
df.head()

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
0,tt0010600,The Doll,Die Puppe,1919,,66,1898.0,True
1,tt0011841,Way Down East,Way Down East,1920,,145,5376.0,True
2,tt0012494,Déstiny,Der müde Tod,1921,,97,5842.0,True
3,tt0015163,The Navigator,The Navigator,1924,,59,9652.0,True
4,tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925,,93,17887.0,True


In [7]:
df["label"].value_counts()

True     3990
False    3969
Name: label, dtype: int64

In [8]:
df.isna().mean()

tconst            0.000000
primaryTitle      0.000000
originalTitle     0.501068
startYear         0.098756
endYear           0.901244
runtimeMinutes    0.001633
numVotes          0.099259
label             0.000000
dtype: float64

In [9]:
oscars = pd.read_csv("additional_data/oscars.csv")

In [10]:
oscars.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
2,1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
4,1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False


In [11]:
# Cleaning titles and merging with oscar noms
df["primaryTitle"] = df["primaryTitle"].str.lower()\
                                       .str.normalize('NFKD')\
                                       .str.encode('ascii', errors='ignore')\
                                       .str.decode('utf-8')\
                                       .str.replace(" ", "_", regex=True)\
                                       .str.replace("\W", "", regex=True)

oscars["film"] = oscars["film"].str.lower()\
                               .str.normalize('NFKD')\
                               .str.encode('ascii', errors='ignore')\
                               .str.decode('utf-8')\
                               .str.replace(" ", "_", regex=True)\
                               .str.replace("\W", "", regex=True)

In [12]:
# Counting oscar nominations and wins per movie
oscar_noms = pd.merge(df, oscars, left_on = "primaryTitle", right_on = "film").groupby("tconst")["winner"].count()
oscar_wins = pd.merge(df, oscars, left_on = "primaryTitle", right_on = "film").groupby("tconst")["winner"].sum()

In [13]:
df = df.rename({"tconst" : "id"}, axis = 1).set_index("id")

In [14]:
df["oscar_noms"] = oscar_noms
df["oscar_wins"] = oscar_wins

In [15]:
# Find writers and directors per movie and combine the two
writers = json_to_one_hot("writers")
directors = json_to_one_hot("directors")
written_and_directed = writers.add(directors, fill_value=0).fillna(0).astype(int).loc[df.index]

In [16]:
# Drop unused writers/directors
means = written_and_directed.mean(axis=0)
written_and_directed = written_and_directed.loc[:, means[means > 0].index]

In [17]:
# Add writer/director data to df (the transposes are done to speed things up)
df = pd.concat([df.T, written_and_directed.T]).T

In [18]:
df

Unnamed: 0_level_0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label,oscar_noms,oscar_wins,nm0000005,...,nm9922522,nm9925241,nm9933959,nm9942830,nm9946633,nm9955258,nm9958352,nm9958353,nm9985316,nm9985837
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0010600,the_doll,Die Puppe,1919,,66,1898.0,True,,,0,...,0,0,0,0,0,0,0,0,0,0
tt0011841,way_down_east,Way Down East,1920,,145,5376.0,True,,,0,...,0,0,0,0,0,0,0,0,0,0
tt0012494,destiny,Der müde Tod,1921,,97,5842.0,True,,,0,...,0,0,0,0,0,0,0,0,0,0
tt0015163,the_navigator,The Navigator,1924,,59,9652.0,True,,,0,...,0,0,0,0,0,0,0,0,0,0
tt0016220,the_phantom_of_the_opera,The Phantom of the Opera,1925,,93,17887.0,True,3.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt9625664,trauma_center,,2019,,87,12951.0,False,,,0,...,0,0,0,0,0,0,0,0,0,0
tt9741310,slaxx,Slaxx,2020,,77,2464.0,False,,,0,...,0,0,0,0,0,0,0,0,0,0
tt9742392,kindred,Kindred,2020,,101,1719.0,False,,,0,...,0,0,0,0,0,0,0,0,0,0
tt9850386,the_bee_gees_how_can_you_mend_a_broken_heart,,2020,,111,4144.0,True,,,0,...,0,0,0,0,0,0,0,0,0,0


## Add TMDB data

In [19]:
df_2 = pd.read_csv("additional_data/TMDB.csv")

In [20]:
df_2 = df_2[["budget", "genres", "imdb_id", 
             "original_language", "overview", 
             "popularity", "production_companies", 
             "tagline", "Keywords", "revenue"]]

In [21]:
def dict_to_list(dictionary):
    try:
        d = ast.literal_eval(dictionary)
    except ValueError:
        return []
    
    return [i["name"] for i in d]

In [22]:
df_2["genres"] = df_2["genres"].apply(lambda x: dict_to_list(x))

In [23]:
df_2["Keywords"] = df_2["Keywords"].apply(lambda x: dict_to_list(x))

In [24]:
df_2["production_companies"] = df_2["production_companies"].apply(lambda x: dict_to_list(x))

In [25]:
df.reset_index()

Unnamed: 0,id,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label,oscar_noms,oscar_wins,...,nm9922522,nm9925241,nm9933959,nm9942830,nm9946633,nm9955258,nm9958352,nm9958353,nm9985316,nm9985837
0,tt0010600,the_doll,Die Puppe,1919,,66,1898.0,True,,,...,0,0,0,0,0,0,0,0,0,0
1,tt0011841,way_down_east,Way Down East,1920,,145,5376.0,True,,,...,0,0,0,0,0,0,0,0,0,0
2,tt0012494,destiny,Der müde Tod,1921,,97,5842.0,True,,,...,0,0,0,0,0,0,0,0,0,0
3,tt0015163,the_navigator,The Navigator,1924,,59,9652.0,True,,,...,0,0,0,0,0,0,0,0,0,0
4,tt0016220,the_phantom_of_the_opera,The Phantom of the Opera,1925,,93,17887.0,True,3.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7954,tt9625664,trauma_center,,2019,,87,12951.0,False,,,...,0,0,0,0,0,0,0,0,0,0
7955,tt9741310,slaxx,Slaxx,2020,,77,2464.0,False,,,...,0,0,0,0,0,0,0,0,0,0
7956,tt9742392,kindred,Kindred,2020,,101,1719.0,False,,,...,0,0,0,0,0,0,0,0,0,0
7957,tt9850386,the_bee_gees_how_can_you_mend_a_broken_heart,,2020,,111,4144.0,True,,,...,0,0,0,0,0,0,0,0,0,0


In [26]:
df_full = pd.merge(df.reset_index(), df_2, how = "left", left_on = "id", right_on = "imdb_id").set_index("id")

In [27]:
df_full

Unnamed: 0_level_0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label,oscar_noms,oscar_wins,nm0000005,...,budget,genres,imdb_id,original_language,overview,popularity,production_companies,tagline,Keywords,revenue
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0010600,the_doll,Die Puppe,1919,,66,1898.0,True,,,0,...,,,,,,,,,,
tt0011841,way_down_east,Way Down East,1920,,145,5376.0,True,,,0,...,,,,,,,,,,
tt0012494,destiny,Der müde Tod,1921,,97,5842.0,True,,,0,...,,,,,,,,,,
tt0015163,the_navigator,The Navigator,1924,,59,9652.0,True,,,0,...,,,,,,,,,,
tt0016220,the_phantom_of_the_opera,The Phantom of the Opera,1925,,93,17887.0,True,3.0,0.0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt9625664,trauma_center,,2019,,87,12951.0,False,,,0,...,,,,,,,,,,
tt9741310,slaxx,Slaxx,2020,,77,2464.0,False,,,0,...,,,,,,,,,,
tt9742392,kindred,Kindred,2020,,101,1719.0,False,,,0,...,,,,,,,,,,
tt9850386,the_bee_gees_how_can_you_mend_a_broken_heart,,2020,,111,4144.0,True,,,0,...,,,,,,,,,,


In [28]:
def embed_overviews(df_full, vector_size, epochs):
    df = df_full.copy()
    
    df["overview"] = df["overview"].str.lower().str.split()
    
    texts = df[~df["overview"].isna()]["overview"]
        
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(texts)]
    model = Doc2Vec(documents, vector_size = vector_size, epochs = epochs, min_count = 1)
        
    embeddings = df["overview"].apply(lambda x: 
                                      model.infer_vector(x)
                                      if type(x) == list
                                      else [np.nan] * vector_size)
                 
    return embeddings
    
vector_size = 10
epochs = 10
embeddings = pd.DataFrame(embed_overviews(df_full, vector_size, epochs).to_list(), 
                          columns = [f"embedding_{i}" for i in range(vector_size)]).set_index(df_full.index)

In [29]:
df_full = pd.merge(df_full, embeddings, left_index=True, right_index=True)

In [38]:
df_full.shape

(7959, 15090)

# Inspection of numVotes

In [31]:
df[df['numVotes'].isna()]

Unnamed: 0_level_0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label,oscar_noms,oscar_wins,nm0000005,...,nm9922522,nm9925241,nm9933959,nm9942830,nm9946633,nm9955258,nm9958352,nm9958353,nm9985316,nm9985837
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0023973,the_eagle_and_the_hawk,,1933,,73,,True,,,0,...,0,0,0,0,0,0,0,0,0,0
tt0023986,employees_entrance,,1933,,75,,True,,,0,...,0,0,0,0,0,0,0,0,0,0
tt0027478,the_crime_of_monsieur_lange,Le crime de Monsieur Lange,1936,,80,,True,,,0,...,0,0,0,0,0,0,0,0,0,0
tt0028333,swing_time,,1936,,103,,True,2.0,1.0,0,...,0,0,0,0,0,0,0,0,0,0
tt0040626,my_dear_secretary,My Dear Secretary,1948,,94,,False,,,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt7134096,the_rhythm_section,The Rhythm Section,2020,,109,,False,,,0,...,0,0,0,0,0,0,0,0,0,0
tt8017136,tony_my_mentor_the_serial_killer,Tony,,2018,124,,True,,,0,...,0,0,0,0,0,0,0,0,0,0
tt8671462,invoking_5,Invoking 5,2018,,90,,False,,,0,...,0,0,0,0,0,0,1,1,0,0
tt8694228,mikhael,,2019,,150,,False,,,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
type(df.loc['tt0029146']['numVotes'])

float

In [33]:
df[['numVotes', 'label']]

Unnamed: 0_level_0,numVotes,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0010600,1898.0,True
tt0011841,5376.0,True
tt0012494,5842.0,True
tt0015163,9652.0,True
tt0016220,17887.0,True
...,...,...
tt9625664,12951.0,False
tt9741310,2464.0,False
tt9742392,1719.0,False
tt9850386,4144.0,True


In [34]:
df['numVotes'] = df['numVotes'].fillna(0)
df["label"] = df["label"].astype(int)

In [35]:
abs(df["label"].corr(df["numVotes"]))

0.16106479047663125

In [36]:
df.corr()

Unnamed: 0,numVotes,label
numVotes,1.0,0.161065
label,0.161065,1.0


In [37]:
display(df.sort_values(by='numVotes', ascending=False).head(100))
df.sort_values(by='numVotes', ascending=False).head(1000).corr()

Unnamed: 0_level_0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label,oscar_noms,oscar_wins,nm0000005,...,nm9922522,nm9925241,nm9933959,nm9942830,nm9946633,nm9955258,nm9958352,nm9958353,nm9985316,nm9985837
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0111161,the_shawshank_redemption,The Shawshank Redemption,1994,,142,2503641.0,1,7.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
tt0468569,the_dark_knight,The Dark Knight,,2008,152,2453191.0,1,8.0,2.0,0,...,0,0,0,0,0,0,0,0,0,0
tt0137523,fight_club,Fight Club,1999,,139,1969585.0,1,1.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
tt0109830,forrest_gump,Forrest Gump,1994,,142,1932502.0,1,13.0,6.0,0,...,0,0,0,0,0,0,0,0,0,0
tt0133093,the_matrix,The Matrix,1999,,136,1787064.0,1,4.0,4.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt0441773,kung_fu_panda,Kung Fu Panda,2008,,92,443161.0,1,1.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
tt1454029,the_help,,2011,,146,442091.0,1,4.0,1.0,0,...,0,0,0,0,0,0,0,0,0,0
tt1535109,captain_phillips,Captain Phillips,2013,,134,441709.0,1,6.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
tt1515091,sherlock_holmes_a_game_of_shadows,Sherlock Holmes: A Game of Shadows,2011,,129,437947.0,1,,,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,numVotes,label
numVotes,1.0,0.255227
label,0.255227,1.0
