# Import necessary libraries

In [1]:
import json
import numpy as np
import pandas as pd
from itertools import groupby

from py_files.writer_director_to_one_hot import writer_director_to_one_hot
from py_files.add_merge_begin_end_year import merge_start_end_year
from py_files.load_box_office_data import load_and_aggregate_box_office
from py_files.add_remake_feature import create_remake_column
from py_files.add_langoriginaltitle_feature import add_language_of_original_title
from py_files.add_ENvsNonEN_feature import add_english_title_or_not
from py_files.add_movie_genre_feature import add_movie_genre

# Loading the data

In [2]:
# NOTE(review): this import lives outside the notebook's main import cell;
# consider moving it there so all dependencies are visible in one place.
from py_files.load_original_data import load_original_data

# Load the raw training data through the project helper. The recorded output
# shows it reporting the train-*.csv files it found; the exact schema of the
# returned frame is defined by the helper — TODO confirm against py_files.
df_original = load_original_data()

Found files: train-8.csv, train-2.csv, train-7.csv, train-5.csv, train-3.csv, train-4.csv, train-1.csv, train-6.csv


# Preprocessing of original columns

In [3]:
# Derive the working frame from the raw one. `DataFrame.replace` returns a new
# object, so `df_original` stays untouched without an explicit copy first.
# The literal string \N is treated as a missing value.
df_preprocessed = df_original.replace("\\N", np.nan)

# Normalised title key used to join against external datasets:
# lower-case -> strip accents (NFKD + ASCII round trip) -> spaces to "_" ->
# drop every remaining non-word character. The pattern is a raw string so
# that \W is a regex class rather than an (invalid) Python string escape.
df_preprocessed["primaryTitleFormatted"] = (
    df_preprocessed["primaryTitle"]
    .str.lower()
    .str.normalize("NFKD")
    .str.encode("ascii", errors="ignore")
    .str.decode("utf-8")
    .str.replace(" ", "_", regex=False)
    .str.replace(r"\W", "", regex=True)
)

# merge endYear into beginYear when beginYear is not available --> rename Year
# (the helper is expected to provide the `Year` column that is cast below)
df_preprocessed = merge_start_end_year(df_preprocessed)

# set the datatypes of the dataframe correctly
df_preprocessed["Year"] = df_preprocessed["Year"].astype(int)
df_preprocessed["runtimeMinutes"] = df_preprocessed["runtimeMinutes"].astype(float)

df_preprocessed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7959 entries, 0 to 7958
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   tconst                 7959 non-null   object 
 1   primaryTitle           7959 non-null   object 
 2   originalTitle          3971 non-null   object 
 3   startYear              7173 non-null   object 
 4   endYear                786 non-null    object 
 5   runtimeMinutes         7946 non-null   float64
 6   numVotes               7169 non-null   float64
 7   label                  7959 non-null   bool   
 8   primaryTitleFormatted  7959 non-null   object 
 9   Year                   7959 non-null   int64  
dtypes: bool(1), float64(2), int64(1), object(6)
memory usage: 567.5+ KB


## Preprocessing of exogenous data

### Oscar data

In [4]:
oscars = pd.read_csv("additional_data/oscars.csv")

# Apply the same normalisation as `primaryTitleFormatted` so both title keys
# are comparable. Raw string: \W must reach the regex engine unchanged.
oscars["film"] = (
    oscars["film"]
    .str.lower()
    .str.normalize("NFKD")
    .str.encode("ascii", errors="ignore")
    .str.decode("utf-8")
    .str.replace(" ", "_", regex=False)
    .str.replace(r"\W", "", regex=True)
)

# Counting oscar nominations and wins per movie.
# The (inner) join is computed once and reused for both aggregates.
# NOTE(review): the join key is the formatted title only, so different films
# sharing a title (e.g. remakes) will share Oscar rows — confirm this is
# acceptable or add a year condition.
oscar_matches = pd.merge(
    df_preprocessed,
    oscars,
    left_on="primaryTitleFormatted",
    right_on="film",
)
winner_per_movie = oscar_matches.groupby("tconst")["winner"]

oscar_noms = winner_per_movie.count()  # non-null `winner` rows per movie
oscar_wins = winner_per_movie.sum()    # sum of the `winner` flag per movie

### Writer and Director data

In [5]:
# Find writers and directors per movie and combine the two
written_and_directed = (writer_director_to_one_hot("writers") + writer_director_to_one_hot("directors")).fillna(0).astype(int).loc[df_preprocessed['tconst']]

### TMDB data

### Box Office data

In [6]:
df_box_office_mojo = load_and_aggregate_box_office()

# process the 'release group' (read movie title) in the same way as the
# formatted title, so the two can be used as a join key later on.
# Raw string: \W must reach the regex engine unchanged.
df_box_office_mojo["Release Group"] = (
    df_box_office_mojo["Release Group"]
    .str.lower()
    .str.normalize("NFKD")
    .str.encode("ascii", errors="ignore")
    .str.decode("utf-8")
    .str.replace(" ", "_", regex=False)
    .str.replace(r"\W", "", regex=True)
)

# Drop the '%' and '%.1' columns so they are not carried into the feature
# table. Reassignment instead of inplace=True keeps the step explicit.
df_box_office_mojo = df_box_office_mojo.drop(columns=["%", "%.1"])

Found files: box_office_mojo/2014.csv, box_office_mojo/1982.csv, box_office_mojo/1979.csv, box_office_mojo/1980.csv, box_office_mojo/2008.csv, box_office_mojo/1997.csv, box_office_mojo/2015.csv, box_office_mojo/1986.csv, box_office_mojo/2010.csv, box_office_mojo/1978.csv, box_office_mojo/1996.csv, box_office_mojo/2011.csv, box_office_mojo/1998.csv, box_office_mojo/2009.csv, box_office_mojo/2005.csv, box_office_mojo/2018.csv, box_office_mojo/1977.csv, box_office_mojo/1981.csv, box_office_mojo/1994.csv, box_office_mojo/2013.csv, box_office_mojo/2002.csv, box_office_mojo/2006.csv, box_office_mojo/1991.csv, box_office_mojo/1985.csv, box_office_mojo/2017.csv, box_office_mojo/1984.csv, box_office_mojo/1995.csv, box_office_mojo/2016.csv, box_office_mojo/2001.csv, box_office_mojo/2007.csv, box_office_mojo/1989.csv, box_office_mojo/1988.csv, box_office_mojo/2012.csv, box_office_mojo/2000.csv, box_office_mojo/1990.csv, box_office_mojo/2004.csv, box_office_mojo/2021.csv, box_office_mojo/2022.csv,

# Adding exogenous columns

In [7]:
# Feature table: an independent copy of the preprocessed frame, keyed by the
# movie identifier (the `tconst` column, renamed to `id`).
df_incl_exog = (
    df_preprocessed
    .copy(deep=True)
    .rename(columns={"tconst": "id"})
    .set_index("id")
)
df_incl_exog.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7959 entries, tt0015224 to tt9900782
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   primaryTitle           7959 non-null   object 
 1   originalTitle          3971 non-null   object 
 2   startYear              7173 non-null   object 
 3   endYear                786 non-null    object 
 4   runtimeMinutes         7946 non-null   float64
 5   numVotes               7169 non-null   float64
 6   label                  7959 non-null   bool   
 7   primaryTitleFormatted  7959 non-null   object 
 8   Year                   7959 non-null   int64  
dtypes: bool(1), float64(2), int64(1), object(5)
memory usage: 567.4+ KB


## add oscar data

In [8]:
# Attach both Oscar aggregates in one step. The series are aligned on the
# frame's index (movie id); movies without a matching Oscar row get NaN.
df_incl_exog = df_incl_exog.assign(
    oscar_noms=oscar_noms,
    oscar_wins=oscar_wins,
)

## add mojo box office

In [9]:
# Left-join the box office figures on (formatted title, year). The index is
# reset for the merge and restored afterwards; the right-hand join keys are
# dropped because they duplicate `primaryTitleFormatted` / `Year`.
# NOTE(review): if the box office table holds several rows for the same
# (Release Group, year) pair, this left join duplicates movie rows — confirm
# the helper aggregates to one row per pair.
df_incl_exog = (
    df_incl_exog
    .reset_index()
    .merge(
        df_box_office_mojo,
        left_on=["primaryTitleFormatted", "Year"],
        right_on=["Release Group", "year"],
        how="left",
    )
    .set_index("id")
    .drop(columns=["Release Group", "year"])
)

# Convert the money columns from strings such as "$1,234,567" to float64.
# "-" marks a missing amount and becomes NaN, as do movies without a match.
for money_col in ("Worldwide", "Domestic", "Foreign"):
    # astype(object) keeps the .str accessor usable even if nothing matched
    # and the merged column is entirely NaN.
    raw_amounts = df_incl_exog[money_col].astype(object)
    df_incl_exog[money_col] = (
        raw_amounts
        .where(raw_amounts != "-")
        .str.replace(r"[$,]", "", regex=True)
        .astype(float)
    )

## add remake column

In [10]:
# Delegates to the project helper imported from py_files.add_remake_feature.
# Judging by its name it adds a remake indicator to the feature table —
# TODO confirm the resulting column name and how a remake is detected.
df_incl_exog = create_remake_column(df_incl_exog)

## add title language

In [11]:
# The title language is read from a previously saved CSV instead of being
# recomputed with `add_language_of_original_title(df_incl_exog)`, which was
# disabled because it takes roughly 15 minutes per run on the training data.
df_added_lang = (
    pd.read_csv('additional_data/df_added_lang.csv', index_col=0)
    .rename(columns={"tconst": "id"})
    .set_index("id")
)

# Left join on the movie id: only the `title_language` column is added and
# every row of the feature table is kept.
title_language = df_added_lang['title_language']
df_incl_exog = df_incl_exog.join(title_language, how='left')

## add whether title is English or not

In [12]:
# Delegates to the project helper imported from py_files.add_ENvsNonEN_feature.
# Presumably derives an English / non-English flag from the title language
# column joined in the previous cell — TODO confirm which column it reads.
df_incl_exog = add_english_title_or_not(df_incl_exog)

## add movie genres

In [14]:
# Delegates to the project helper imported from py_files.add_movie_genre_feature.
# NOTE(review): the recorded run shows a bare "0" as cell output; since the
# result is assigned, that value is presumably printed inside the helper —
# confirm what it reports.
df_incl_exog = add_movie_genre(df_incl_exog)

0


## add writers and directors

In [16]:
# Append the writer/director indicator columns to the feature table.
# A column-wise concat aligns both frames on their index (movie id) with an
# outer join, matching the row-wise concat of the transposed frames it
# replaces. It avoids the two transposes, which upcast every column of a
# mixed-dtype frame to `object` and are costly on a wide one-hot table.
df_incl_exog = pd.concat([df_incl_exog, written_and_directed], axis=1)

## add TMDB data

## save dataframe with features

In [None]:
# Persist the complete feature table (index included) to the working
# directory; the path is relative to wherever the kernel was started.
df_incl_exog.to_csv('df_with_features.csv')

# Selecting data for predicting

df_added_dataclassifier

# Evaluating classifier

# Predicting