In [1]:
import pandas as pd
import numpy as np

from itertools import groupby
from sklearn.preprocessing import MultiLabelBinarizer

import json
import glob

In [2]:
def openfile(data):
    if data == "writers":
        with open("writing.json") as f:
            writers = f.read()
        
        return json.loads(writers)
    elif data == "directors":
        with open("directing.json") as f:
            directors = f.read()
        
        return json.loads(directors)    
    else:
        return NotImplemented    

In [3]:
def json_to_one_hot(kind = "writers"):

    if kind in ["writers", "directors"]:
        json_file = openfile(kind)
    else:
        return NotImplemented
  
    if kind == "writers":
         # Group writers/directors by movie
        groups = groupby([(i["movie"], i["writer"]) for i in json_file], key = lambda x : x[0])

        # Turn groupby object into a json-like dict
        grouped = {writer: [i[1] for i in movies] for writer, movies in groups}

        # Convert to Series
        df = pd.DataFrame.from_records(list(grouped.items())).set_index(0).squeeze()
    else:
        df = pd.DataFrame(json_file).groupby("movie")["director"].apply(lambda x: x.values)
    
    # Create one-hot encoded DataFrame
    mlb = MultiLabelBinarizer()
    res = pd.DataFrame(mlb.fit_transform(df),
                       columns=mlb.classes_,
                       index=df.index)
    
    return res.drop("\\N", axis = 1)

In [4]:
all_files = glob.glob("train*.csv")

print(f"Found files: {', '.join(all_files)}")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

df = pd.concat(li, axis=0, ignore_index=True).drop("Unnamed: 0", axis = 1)

Found files: train-6.csv, train-7.csv, train-5.csv, train-4.csv, train-1.csv, train-3.csv, train-2.csv, train-8.csv


In [5]:
df = df.replace("\\N", np.nan)

In [6]:
df.head()

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
0,tt0013257,Häxan,Häxan,1922,,91,13679.0,True
1,tt0013556,Robin Hood,,1922,,143,2178.0,True
2,tt0014341,Our Hospitality,Our Hospitality,1923,,65,10911.0,True
3,tt0014538,Three Ages,,1923,,63,4312.0,True
4,tt0017925,The General,,1926,,67,87784.0,True


In [7]:
df["label"].value_counts()

True     3990
False    3969
Name: label, dtype: int64

In [8]:
df.isna().mean()

tconst            0.000000
primaryTitle      0.000000
originalTitle     0.501068
startYear         0.098756
endYear           0.901244
runtimeMinutes    0.001633
numVotes          0.099259
label             0.000000
dtype: float64

In [9]:
oscars = pd.read_csv("oscars.csv")

In [10]:
oscars.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
2,1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
4,1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False


In [11]:
# Cleaning titles and merging with oscar noms
df["primaryTitle"] = df["primaryTitle"].str.lower()\
                                       .str.normalize('NFKD')\
                                       .str.encode('ascii', errors='ignore')\
                                       .str.decode('utf-8')\
                                       .str.replace(" ", "_", regex=True)\
                                       .str.replace("\W", "", regex=True)

oscars["film"] = oscars["film"].str.lower()\
                               .str.normalize('NFKD')\
                               .str.encode('ascii', errors='ignore')\
                               .str.decode('utf-8')\
                               .str.replace(" ", "_", regex=True)\
                               .str.replace("\W", "", regex=True)

In [12]:
# Counting oscar nominations and wins per movie
oscar_noms = pd.merge(df, oscars, left_on = "primaryTitle", right_on = "film").groupby("tconst")["winner"].count()
oscar_wins = pd.merge(df, oscars, left_on = "primaryTitle", right_on = "film").groupby("tconst")["winner"].sum()

In [13]:
df = df.rename({"tconst" : "id"}, axis = 1).set_index("id")

In [14]:
df["oscar_noms"] = oscar_noms
df["oscar_wins"] = oscar_wins

In [15]:
# Find writers and directors per movie and combine the two
written_and_directed = (json_to_one_hot("writers") + json_to_one_hot("directors")).fillna(0).astype(int).loc[df.index]

In [16]:
# Add writer/director data to df (the transposes are done to speed things up)
df = pd.concat([df.T, written_and_directed.T]).T

In [17]:
df.head(20)

Unnamed: 0_level_0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label,oscar_noms,oscar_wins,nm0000005,...,nm9925241,nm9933959,nm9942830,nm9946633,nm9955258,nm9958352,nm9958353,nm9980769,nm9985316,nm9985837
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0013257,haxan,Häxan,1922.0,,91,13679.0,True,,,0,...,0,0,0,0,0,0,0,0,0,0
tt0013556,robin_hood,,1922.0,,143,2178.0,True,1.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
tt0014341,our_hospitality,Our Hospitality,1923.0,,65,10911.0,True,,,0,...,0,0,0,0,0,0,0,0,0,0
tt0014538,three_ages,,1923.0,,63,4312.0,True,,,0,...,0,0,0,0,0,0,0,0,0,0
tt0017925,the_general,,1926.0,,67,87784.0,True,,,0,...,0,0,0,0,0,0,0,0,0,0
tt0018192,napoleon,Napoléon vu par Abel Gance,1927.0,,330,7622.0,True,,,0,...,0,0,0,0,0,0,0,0,0,0
tt0018526,underworld,,1927.0,,80,2912.0,True,1.0,1.0,0,...,0,0,0,0,0,0,0,0,0,0
tt0018806,the_crowd,The Crowd,,1928.0,98,8197.0,True,2.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
tt0019788,coquette,Coquette,,1929.0,76,2127.0,False,1.0,1.0,0,...,0,0,0,0,0,0,0,0,0,0
tt0023871,a_song_of_lisbon,A Canção de Lisboa,1933.0,,85,1432.0,True,,,0,...,0,0,0,0,0,0,0,0,0,0


# Inspection of numVotes

In [18]:
df[df['numVotes'].isna()]

Unnamed: 0_level_0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label,oscar_noms,oscar_wins,nm0000005,...,nm9925241,nm9933959,nm9942830,nm9946633,nm9955258,nm9958352,nm9958353,nm9980769,nm9985316,nm9985837
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0029146,the_life_of_emile_zola,The Life of Emile Zola,1937,,116,,True,10.0,3.0,0,...,0,0,0,0,0,0,0,0,0,0
tt0034521,black_dragons,,1942,,64,,False,,,0,...,0,0,0,0,0,0,0,0,0,0
tt0039651,record_of_a_tenement_gentleman,,1947,,72,,True,,,0,...,0,0,0,0,0,0,0,0,0,0
tt0042436,les_enfants_terribles,Les enfants terribles,1950,,105,,True,,,0,...,0,0,0,0,0,0,0,0,0,0
tt0046446,torch_song,,1953,,90,,False,1.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt7134096,the_rhythm_section,The Rhythm Section,2020,,109,,False,,,0,...,0,0,0,0,0,0,0,0,0,0
tt8017136,tony_my_mentor_the_serial_killer,Tony,,2018,124,,True,,,0,...,0,0,0,0,0,0,0,0,0,0
tt8671462,invoking_5,Invoking 5,2018,,90,,False,,,0,...,0,0,0,0,0,0,0,0,0,0
tt8694228,mikhael,,2019,,150,,False,,,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
type(df.loc['tt0029146']['numVotes'])

float

In [20]:
df[['numVotes', 'label']]

Unnamed: 0_level_0,numVotes,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0013257,13679.0,True
tt0013556,2178.0,True
tt0014341,10911.0,True
tt0014538,4312.0,True
tt0017925,87784.0,True
...,...,...
tt9625664,12951.0,False
tt9741310,2464.0,False
tt9742392,1719.0,False
tt9850386,4144.0,True


In [21]:
df['numVotes'] = df['numVotes'].fillna(0)
df["label"] = df["label"].astype(int)

In [22]:
abs(df["label"].corr(df["numVotes"]))

0.16106479047663133

In [23]:
df.corr()

Unnamed: 0,numVotes,label
numVotes,1.0,0.161065
label,0.161065,1.0


In [49]:
display(df.sort_values(by='numVotes', ascending=False).head(100))
df.sort_values(by='numVotes', ascending=False).head(1000).corr()

Unnamed: 0_level_0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label,oscar_noms,oscar_wins,nm0000005,...,nm9925241,nm9933959,nm9942830,nm9946633,nm9955258,nm9958352,nm9958353,nm9980769,nm9985316,nm9985837
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0111161,the_shawshank_redemption,The Shawshank Redemption,1994,,142,2503641.0,1,7.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
tt0468569,the_dark_knight,The Dark Knight,,2008,152,2453191.0,1,8.0,2.0,0,...,0,0,0,0,0,0,0,0,0,0
tt0137523,fight_club,Fight Club,1999,,139,1969585.0,1,1.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
tt0109830,forrest_gump,Forrest Gump,1994,,142,1932502.0,1,13.0,6.0,0,...,0,0,0,0,0,0,0,0,0,0
tt0133093,the_matrix,The Matrix,1999,,136,1787064.0,1,4.0,4.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt0441773,kung_fu_panda,Kung Fu Panda,2008,,92,443161.0,1,1.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
tt1454029,the_help,,2011,,146,442091.0,1,4.0,1.0,0,...,0,0,0,0,0,0,0,0,0,0
tt1535109,captain_phillips,Captain Phillips,2013,,134,441709.0,1,6.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
tt1515091,sherlock_holmes_a_game_of_shadows,Sherlock Holmes: A Game of Shadows,2011,,129,437947.0,1,,,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,numVotes,label
numVotes,1.0,0.255227
label,0.255227,1.0
