# Import necessary libraries

In [1]:
import json
from itertools import groupby

import numpy as np
import pandas as pd

from py_files.load_original_data import load_original_data
from py_files.writer_director_to_one_hot import writer_director_to_one_hot


# Loading the data

In [2]:
# Load the raw training data through the project helper (imported in the
# top-of-notebook import cell). The helper reports the files it found.
df_original = load_original_data()

Found files: train-8.csv, train-2.csv, train-7.csv, train-5.csv, train-3.csv, train-4.csv, train-1.csv, train-6.csv


# Preprocessing of original columns

In [3]:
# Replace the literal string "\N" with NaN so pandas treats those cells as missing.
df_preprocessed = df_original.replace("\\N", np.nan)

# Build a normalised title key used to join against external data sets:
# lower-case, strip accents (NFKD decomposition + dropping non-ASCII bytes),
# turn spaces into underscores, then remove every remaining non-word character.
df_preprocessed["primaryTitleFormatted"] = (
    df_preprocessed["primaryTitle"]
    .str.lower()
    .str.normalize("NFKD")
    .str.encode("ascii", errors="ignore")
    .str.decode("utf-8")
    # A single space is a literal, so no regex engine is needed here.
    .str.replace(" ", "_", regex=False)
    # Raw string: "\W" in a normal string literal is an invalid escape sequence.
    .str.replace(r"\W", "", regex=True)
)

## Preprocessing of exogenous data

### Oscar data

In [4]:
oscars = pd.read_csv("additional_data/oscars.csv")

# Normalise the film titles with the same steps used for
# df_preprocessed["primaryTitleFormatted"], so both sides of the join share a key:
# lower-case, strip accents, spaces -> underscores, drop non-word characters.
oscars["film"] = (
    oscars["film"]
    .str.lower()
    .str.normalize("NFKD")
    .str.encode("ascii", errors="ignore")
    .str.decode("utf-8")
    .str.replace(" ", "_", regex=False)
    # Raw string: "\W" in a normal string literal is an invalid escape sequence.
    .str.replace(r"\W", "", regex=True)
)

# Counting oscar nominations and wins per movie.
# The (inner) join is computed once and reused for both aggregates.
# NOTE(review): the join key is the formatted title only, so distinct films that
# share a title will be matched to each other - confirm this is acceptable.
oscar_matches = pd.merge(
    df_preprocessed,
    oscars,
    left_on="primaryTitleFormatted",
    right_on="film",
)
winner_by_movie = oscar_matches.groupby("tconst")["winner"]
oscar_noms = winner_by_movie.count()  # number of non-null "winner" rows per movie
oscar_wins = winner_by_movie.sum()    # sum of the "winner" values per movie

### Writer and Director data

In [5]:
# Find writers and directors per movie and combine the two.
# NOTE(review): assumes the helper returns a pandas object indexed by tconst
# with one column per person id - confirm against py_files.
writers_one_hot = writer_director_to_one_hot("writers")
directors_one_hot = writer_director_to_one_hot("directors")

# Use .add(fill_value=0) rather than "+": with "+", any row/column label present
# in only one of the two frames becomes NaN, and the following fillna(0) would
# then silently erase those credits. fill_value=0 treats the missing side as 0.
# The trailing fillna(0) still covers cells that are absent from both frames.
written_and_directed = (
    writers_one_hot.add(directors_one_hot, fill_value=0)
    .fillna(0)
    .astype(int)
    # Keep only the movies in the preprocessed data, in the same order.
    .loc[df_preprocessed["tconst"]]
)

### TMDB data

# Adding exogenous columns

In [8]:
# Re-key the preprocessed data by movie id. rename() and set_index() both return
# new frames, so df_preprocessed itself is left untouched.
df_incl_exog = df_preprocessed.rename({"tconst": "id"}, axis=1).set_index("id")

# add oscar data (assignment aligns the Series on the movie-id index; movies
# without a match in the oscar data get NaN)
df_incl_exog["oscar_noms"] = oscar_noms
df_incl_exog["oscar_wins"] = oscar_wins

# add writers and directors
# Concatenate column-wise on the shared movie-id index. This replaces a
# transpose -> concat -> transpose round trip, which upcasts every column of a
# mixed-dtype frame to object and is expensive on a very wide frame.
df_incl_exog = pd.concat([df_incl_exog, written_and_directed], axis=1)

# Selecting data for predicting

Unnamed: 0,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label,primaryTitleFormatted,oscar_noms,oscar_wins,...,nm9925241,nm9933959,nm9942830,nm9946633,nm9955258,nm9958352,nm9958353,nm9980769,nm9985316,nm9985837
tt0015224,Peter Pan,,1924,,105,1042.0,True,peter_pan,,,...,0,0,0,0,0,0,0,0,0,0
tt0015864,The Gold Rush,,1925,,95,107475.0,True,the_gold_rush,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0
tt0016029,The Little Colonel,,1935,,81,1646.0,True,the_little_colonel,,,...,0,0,0,0,0,0,0,0,0,0
tt0021309,The Story of the Fox,Le roman de Renard,1937,,63,,True,the_story_of_the_fox,,,...,0,0,0,0,0,0,0,0,0,0
tt0022395,The Skin Game,,1931,,85,,False,the_skin_game,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt9415552,Tankers,,2018,,90,1705.0,False,tankers,,,...,0,0,0,0,0,0,0,0,0,0
tt9484998,Palm Springs,,2020,,90,137884.0,True,palm_springs,,,...,0,0,0,0,0,0,0,0,0,0
tt9664108,Voyagers,Voyagers,2021,,108,15793.0,False,voyagers,,,...,0,0,0,0,0,0,0,0,0,0
tt9808510,Vellam,Vellam,2021,,154,1731.0,True,vellam,,,...,0,0,0,0,0,0,0,0,0,0


df_added_dataclassifier

# Evaluating classifier

# Predicting