# Import necessary libraries

In [None]:
!pip install gensim nltk lightgbm langdetect
# run these in a terminal
# conda install -c conda-forge langdetect
# conda install -c brittainhard fancyimpute

## Notes and Current issues:

- Currently, the data is exported from the Spark pipeline so that it can be used by the classifier. The export consists of three files per set (training, validation, test). Example for the training set:
    - The texts dataset is exported into csv and manually saved with name "texts_df_train.csv"(texts.write.mode("overwrite").option("header", True).csv("texts_train")) 
    - The imputed columns are exported into csv and manually saved with name "imputed_training_df.csv"(imputed_data.write.mode("overwrite").option("header",True).csv("imputed_data"))
    - The training_data is exported into csv and manually saved with name "training_df.csv"
    
The aforementioned files can be found in the "Data" directory. 

Current Issues:

- Jim's imputation function is not running properly and the usual retryingblock error is produced. In this version, the merging of movie genres to the training_data is (supposedly) fixed. 
    - This may be fixable if we export the one hot encoded genres to a separate file.
    - Another (non-spark) fix could happen in the data_prep() function (see how column 'language' is one-hot encoded)

In [None]:
import json
import ast
import glob
import numpy as np
import pandas as pd
from itertools import groupby

from py_files.writer_director_to_one_hot import writer_director_to_one_hot
from py_files.add_merge_begin_end_year import merge_start_end_year
from py_files.load_box_office_data import load_and_aggregate_box_office
from py_files.add_remake_feature import create_remake_column
from py_files.add_langoriginaltitle_feature import add_language_of_original_title
from py_files.add_ENvsNonEN_feature import add_english_title_or_not
from py_files.add_movie_genre_feature import add_movie_genre
from py_files.df_processor_enrichment import df_processor_enrichment

from py_files.df_model_prep import df_model_prep
from py_files.d2v_embed import d2v_embed
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import accuracy_score
import math
import os
import re

from pyspark.sql.functions import input_file_name, substring, udf,col, lit, coalesce,\
                                  when, regexp_replace, count, regexp_extract, split,\
                                  array_contains, monotonically_increasing_id, concat, concat_ws

from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, ArrayType, FloatType, LongType, DoubleType
from pyspark import SparkContext

from fancyimpute import KNN, IterativeImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MultiLabelBinarizer

import unicodedata
import nltk
nltk.download('punkt')

# Loading the data

In [None]:
# Build (or reuse) a single-node Spark session bound to localhost.
# spark.executor.memory is now passed through the builder: the PySpark docs
# state that SparkContext.setSystemProperty must be invoked before the
# SparkContext is instantiated, and the original call came after getOrCreate().
# NOTE(review): if a session already exists in this kernel, getOrCreate()
# reuses it, so restart the kernel for the memory setting to apply.
spark = (
    SparkSession.builder
    .master("local")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .config("spark.executor.memory", "12g")
    .getOrCreate()
)

# Enable Arrow for conversions between Spark and pandas DataFrames.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")


# List the "train-*.csv" files in the working directory (informational only;
# the read below uses a fixed file name).
all_files = glob.glob("train-*.csv")
print(f"Found files: {', '.join(all_files)}")

# Explicit, fully nullable column layout for the movie CSV.
schema = StructType([
    StructField("_c0", IntegerType(), True),
    StructField("tconst", StringType(), True),
    StructField("primaryTitle", StringType(), True),
    StructField("originalTitle", StringType(), True),
    StructField("startYear", IntegerType(), True),
    StructField("endYear", IntegerType(), True),
    StructField("runtimeMinutes", IntegerType(), True),
    StructField("numVotes", FloatType(), True),
    StructField("label", BooleanType(), True),
])

# The header is removed by line index and the schema above is applied instead,
# because automatic header detection did not work for this file.
n_skip_rows = 1
row_rdd = (
    spark.sparkContext
    .textFile("validation_hidden.csv")
    .zipWithIndex()
    .filter(lambda indexed_line: indexed_line[1] >= n_skip_rows)
    .map(lambda indexed_line: indexed_line[0])
)

# Despite its name, this frame is loaded from "validation_hidden.csv".
training_data = spark.read.csv(row_rdd, schema=schema, header=False)

In [None]:
# Show every (key, value) pair of the active SparkContext configuration.
spark.sparkContext.getConf().getAll()

# Preprocessing of original columns

In [None]:
def format_titles(title):
    return unicodedata.normalize('NFKD',title.lower()).encode('ascii', errors='ignore').decode('utf-8').replace("\W", "")

# Spark UDF around format_titles; it produces a string column.
udf_format_titles = udf(format_titles, StringType())

training_data.show()

# Add the normalised title and a single Year (startYear, else endYear), and
# discard any row whose tconst is the literal header text "tconst".
# The lit() wrapper was dropped: lit() returns a Column argument unchanged.
training_data = (
    training_data
    .withColumn("primaryTitleFormatted", udf_format_titles('primaryTitle'))
    .withColumn('Year', coalesce('startYear', 'endYear'))
    .where(col("tconst") != "tconst")
)
# training_data.persist()

## Preprocessing of exogenous data

### Oscar data

In [None]:
# Load the Oscar data, drop rows without a film title and normalise the titles
# with the same UDF used for primaryTitleFormatted.
oscars = (
    spark.read.csv("additional_data/oscars.csv", header=True)
    .na.drop(subset=["film"])
    .withColumn("film", udf_format_titles('film'))
)

# Match on the normalised title, then count matching rows (nominations) and
# matching rows flagged as winner (wins) per tconst.
cond = [training_data.primaryTitleFormatted == oscars.film]
oscar_matches = training_data.join(oscars, cond, 'inner')
oscar_noms = oscar_matches.groupBy('tconst').count()
oscar_wins = oscar_matches.filter(col('winner') == True).groupBy('tconst').count()

In [None]:
# oscar_noms.show()
# oscar_wins.show()

### Razzie data

In [None]:
# Load the Razzie data, drop rows without a movie name and normalise the names
# with the same UDF used for primaryTitleFormatted.
razzies = (
    spark.read.csv("additional_data/Razzies.csv", header=True)
    .na.drop(subset=["moviename"])
    .withColumn("moviename", udf_format_titles('moviename'))
)

# Match on the normalised title, then count matching rows (nominations) and
# matching rows with Wins set (wins) per tconst.
cond = [training_data.primaryTitleFormatted == razzies.moviename]
razzie_matches = training_data.join(razzies, cond, 'inner')
razzie_noms = razzie_matches.groupBy('tconst').count()
razzie_wins = razzie_matches.filter(col('Wins') == True).groupBy('tconst').count()

In [None]:
# razzie_noms.show()
# razzie_wins.show()

### Writer and Director data

In [None]:
# writers = writer_director_to_one_hot("writers")
# directors = writer_director_to_one_hot("directors")
# written_and_directed = writers.add(directors, fill_value=0).fillna(0).astype(int).loc[df_preprocessed["tconst"]]

### TMDB data

In [None]:
# Column layout of TMDB.csv, in file order; every field is nullable.
tmdb_fields = [
    ("id", IntegerType()),
    ("belongs_to_collection", StringType()),
    ("budget", IntegerType()),
    ("genres", StringType()),
    ("homepage", StringType()),
    ("imdb_id", StringType()),
    ("original_language", StringType()),
    ("original_title", StringType()),
    ("overview", StringType()),
    ("popularity", FloatType()),
    ("poster_page", StringType()),
    ("production_companies", StringType()),
    ("production_countries", StringType()),
    ("release_data", StringType()),
    ("runtime", IntegerType()),
    ("spoken_language", StringType()),
    ("status", StringType()),
    ("tagline", StringType()),
    ("title", StringType()),
    ("Keywords", StringType()),
    ("cast", StringType()),
    ("crew", StringType()),
    ("revenue", IntegerType()),
]
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in tmdb_fields
])

# Skip the header line by index and parse the remaining lines with the schema.
n_skip_rows = 1
row_rdd = (
    spark.sparkContext
    .textFile("additional_data/TMDB.csv")
    .zipWithIndex()
    .filter(lambda indexed_line: indexed_line[1] >= n_skip_rows)
    .map(lambda indexed_line: indexed_line[0])
)

# Author's note: some rows are probably incorrect due to loading errors.
# Reading the file directly with header=True was tried as an alternative.
tmdb_columns = [
    "budget", "genres", "imdb_id",
    "original_language", "overview",
    "popularity", "production_companies",
    "tagline", "Keywords", "revenue",
]
df_TMDB = spark.read.csv(
    row_rdd, header=False, quote='"', escape="\"", schema=schema
).select(*tmdb_columns)

# Collect the tconst values of the movie frame and keep only matching TMDB rows.
ids = [row[0] for row in training_data.select("tconst").collect()]
df_TMDB = df_TMDB.where(col("imdb_id").isin(set(ids)))

# df_TMDB.head(1)

In [None]:
def dict_to_string(dictionary):
    """Join the ``"name"`` values of a stringified list of dicts.

    Args:
        dictionary: Python-literal text such as
            ``"[{'id': 18, 'name': 'Drama'}]"``, or ``None``.

    Returns:
        The names separated by single spaces, or ``""`` when the text cannot
        be parsed or does not have the expected list-of-dicts shape.
    """
    try:
        entries = ast.literal_eval(dictionary)
    except (ValueError, SyntaxError):
        # literal_eval raises SyntaxError (not ValueError) for empty or
        # truncated text, which previously escaped and aborted the caller.
        return ""

    try:
        return " ".join([entry["name"] for entry in entries])
    except (TypeError, KeyError):
        # TypeError: value is not iterable, items are not dicts, or a name is
        # not a string. KeyError: a dict without a "name" key.
        return ""

# Spark UDF around dict_to_string; it produces a string column.
udf_dict_to_string = udf(dict_to_string, StringType())

In [None]:
# Replace each stringified list-of-dicts column by its space-separated names.
for tmdb_column in ("genres", "Keywords", "production_companies"):
    df_TMDB = df_TMDB.withColumn(tmdb_column, udf_dict_to_string(col(tmdb_column)))

### Metacritic data

In [None]:
# Column layout of the Metacritic export, in file order; all fields nullable.
schema2 = StructType([
    StructField("_c0", IntegerType(), True),
    StructField("tconst", StringType(), True),
    StructField("genres", StringType(), True),
    StructField("language", StringType(), True),
    StructField("overview", StringType(), True),
])

# Skip the header line by index and parse the remaining lines with schema2.
n_skip_rows = 1
row_rdd2 = (
    spark.sparkContext
    .textFile("additional_data/Metacritic_validation.csv")
    .zipWithIndex()
    .filter(lambda indexed_line: indexed_line[1] >= n_skip_rows)
    .map(lambda indexed_line: indexed_line[0])
)

df_meta = spark.read.csv(row_rdd2, header=False, escape="\"", schema=schema2)

In [None]:
def generate_overview(overview):
    """Return the first element of a stringified sequence of overviews.

    Args:
        overview: Python-literal text such as ``"['First text', 'Second']"``,
            or ``None``.

    Returns:
        The first element of the parsed value, or ``" "`` when the input is
        missing, cannot be parsed, or parses to an empty value.
    """
    if overview is None:
        return " "

    # SECURITY: the original used eval() on text read from a data file, which
    # executes arbitrary code. literal_eval only accepts Python literals.
    # NOTE(review): input that eval() accepted but is not a pure literal now
    # yields " " -- confirm that the column only ever holds list literals.
    try:
        parsed = ast.literal_eval(overview)
    except (ValueError, SyntaxError):
        return " "

    if parsed:
        return parsed[0]
    return " "
    
# Spark UDF around generate_overview; it produces a string column.
udf_generate_overview = udf(generate_overview, StringType())

In [None]:
# Replace the raw overview list by its first entry ("overview_meta") and mark
# the Metacritic genres with a "_meta" suffix.
df_meta = (
    df_meta
    .withColumn("overview_meta", udf_generate_overview(col("overview")))
    .drop("overview")
    .withColumnRenamed("genres", "genres_meta")
)

In [None]:
# Full outer join of the Metacritic and TMDB frames on the IMDb identifier.
cond = [df_meta.tconst == df_TMDB.imdb_id]
full = df_meta.join(df_TMDB, cond, 'outer')

# One text per movie: TMDB overview, Metacritic overview and TMDB keywords,
# separated by single spaces.
full_overview = concat_ws(
    " ", full.overview, full.overview_meta, full.Keywords
).alias("FullOverview")
overviews = full.select(
    full_overview, "imdb_id", "tconst",
    "budget", "genres", "genres_meta",
    "language", "popularity", "revenue",
)

# Use whichever identifier is present as movie_id and keep only movies that
# occur in the movie frame.
overviews = (
    overviews
    .withColumn('movie_id', coalesce('tconst', 'imdb_id'))
    .drop("tconst", "imdb_id")
    .where(col("movie_id").isin(set(ids)))
)

In [None]:
# Pair every movie's overview text with its normalised title (outer join, so
# movies without an overview are kept).
title_lookup = training_data.select("tconst", "primaryTitleFormatted")
texts = (
    overviews.select("movie_id", "FullOverview")
    .join(title_lookup, overviews["movie_id"] == training_data["tconst"], how="outer")
    .drop("movie_id")
)

# FullText is "<title>: <overview>".
full_text = concat_ws(
    ": ", texts.primaryTitleFormatted, texts.FullOverview
).alias("FullText")
texts = texts.select("tconst", full_text)

In [None]:
# Non-text TMDB/Metacritic columns: everything in overviews except the text.
# "imdb_id" was already dropped from overviews when movie_id was created.
meta_TMDB = overviews.drop("FullOverview", "imdb_id")

In [None]:
def clean_text(text):
    return unicodedata.normalize('NFKD', text.lower()).encode('ascii', errors='ignore').decode('utf-8').replace("\W", "").replace("'", "").replace('"', "")
        
# Spark UDF around clean_text; it produces a string column.
udf_clean_text = udf(clean_text, StringType())

In [None]:
# Overwrite FullText with its cleaned version (see clean_text).
texts = texts.withColumn("FullText", udf_clean_text(col("FullText")))

In [None]:
# Export the texts as CSV with a header row into the "texts_train" output
# path, replacing any previous export.
texts.write.mode("overwrite").option("header", True).csv("texts_train")

### Box Office data

In [None]:
# Column layout of the Box Office Mojo files, in file order; all nullable.
# The two percentage columns are only named so that they can be dropped.
box_office_schema = (
    StructType()
    .add("Rank", IntegerType(), True)
    .add("Release Group", StringType(), True)
    .add("Worldwide", StringType(), True)
    .add("Domestic", StringType(), True)
    .add("Col_to_Drop1", StringType(), True)
    .add("Foreign", StringType(), True)
    .add("Col_to_Drop2", StringType(), True)
)

# Read the files directly instead of through textFile().zipWithIndex().
# The index-based skip removed only the very first line of the whole glob, so
# the header line of every other file was parsed as a data row. With
# header=True the header is skipped per file, while the explicit schema still
# supplies the column names and types.
df_box_office_mojo = spark.read.csv(
    "box_office_mojo/*.csv", schema=box_office_schema, header=True
)

# process the 'release group' (read movie title) in the same way as the formatted title
df_box_office_mojo = df_box_office_mojo.withColumn(
    "Release Group", udf_format_titles('Release Group')
)

# add the year of the box office file: the four characters before ".csv"
df_box_office_mojo = df_box_office_mojo.withColumn(
    "mojo_year", substring(input_file_name(), -8, 4).cast(IntegerType())
)

# drop unnecessary columns
df_box_office_mojo = df_box_office_mojo.drop('Col_to_Drop1', 'Col_to_Drop2')

# Adding of exogenous columns

In [None]:
# df_incl_exog = df_preprocessed.copy(deep=True)
# df_incl_exog = df_incl_exog.rename({"tconst" : "id"}, axis = 1).set_index("id")
# df_incl_exog.info()
# training_data = training_data.withColumnRenamed('tconst', 'id')

## add oscar data

In [None]:
# Left-join the per-movie Oscar counts; movies without a match get null.
for award_counts, award_column in ((oscar_noms, 'oscar_noms'), (oscar_wins, 'oscar_wins')):
    training_data = (
        training_data
        .join(award_counts, ['tconst'], 'left')
        .withColumnRenamed('count', award_column)
    )

## add razzie data

In [None]:
# Left-join the per-movie Razzie counts; movies without a match get null.
for award_counts, award_column in ((razzie_noms, 'razzie_noms'), (razzie_wins, 'razzie_wins')):
    training_data = (
        training_data
        .join(award_counts, ['tconst'], 'left')
        .withColumnRenamed('count', award_column)
    )

## add TMDB & Metacritic data

THIS DOES NOT INCLUDE THE OVERVIEWS, THEY WILL BE ADDED LATER, AFTER BEING CONVERTED TO D2V!

In [None]:
# Left-join the non-text TMDB/Metacritic columns (meta_TMDB) onto the movies.
# The join condition is built before the TMDB "genres" column is renamed to
# "genres_tmdb"; the rename keeps it apart from the other genre columns.
# NOTE(review): the previous comment here referred to "overviews2", which does
# not exist in this notebook.
cond = [training_data.tconst == meta_TMDB.movie_id]
meta_TMDB = meta_TMDB.withColumnRenamed("genres","genres_tmdb")
training_data = training_data.join(meta_TMDB, cond, "leftouter")

In [None]:
# training_data.show(1)

## add mojo box office

In [None]:
def remove_missing_box_office_values(column):
    """Return ``column`` with the '-' placeholder replaced by null.

    Args:
        column: Spark Column holding box office amounts as text.

    Returns:
        A Spark Column that keeps every value different from '-' and is null
        otherwise.
    """
    has_value = column != '-'
    return when(has_value, column).otherwise(lit(None))

# Match box office rows on normalised title and release year.
cond_mojo_merge = [
    training_data.primaryTitleFormatted == df_box_office_mojo['Release Group'],
    training_data.Year == df_box_office_mojo["mojo_year"],
]
training_data = (
    training_data
    .join(df_box_office_mojo, cond_mojo_merge, 'left')
    .drop('Release Group', "mojo_year")
)

money_columns = ("Worldwide", "Domestic", "Foreign")

# First turn the '-' placeholder into null ...
for money_column in money_columns:
    training_data = training_data.withColumn(
        money_column, remove_missing_box_office_values(col(money_column))
    )

# ... then strip '$' and ',' and convert the amounts to double.
for money_column in money_columns:
    training_data = training_data.withColumn(
        money_column, regexp_replace(money_column, '[$,]', '').cast('double')
    )
# training_data

In [None]:
# training_data = training_data.withColumn('Year', coalesce('startYear', 'endYear'))

# Use the existing Revenue value and fall back to the Mojo worldwide gross.
# NOTE(review): the TMDB column was selected as lower-case "revenue"; this
# presumably relies on Spark resolving column names case-insensitively --
# confirm against the session's spark.sql.caseSensitive setting.
training_data = training_data.withColumn('Revenue', coalesce('Revenue', 'Worldwide'))

In [None]:
# training_data.show()


## add remake column

In [None]:
# hasRemake is 1 when the same primaryTitle occurs more than once, else 0.
# NOTE(review): this is an inner join on primaryTitle, so rows with a null
# title presumably do not match and are dropped -- confirm this is intended.
title_counts = training_data.groupBy("primaryTitle").agg(
    (count("*") > 1).cast("int").alias("hasRemake")
)
training_data = training_data.join(title_counts, on="primaryTitle", how="inner")

## add title language

In [None]:
# # add the language of the original title, currently commented for training data usage and not wait 15 min every time
# df_incl_exog = add_language_of_original_title(df_incl_exog)

# df_added_lang = pd.read_csv('additional_data/df_added_lang.csv', index_col=0)
# df_added_lang = df_added_lang.rename({"tconst" : "id"}, axis = 1).set_index("id")
# df_incl_exog = df_incl_exog.join(df_added_lang['title_language'], how='left')

# Column layout of df_added_lang.csv, in file order; all fields nullable.
added_lang_fields = [
    ("_c0", IntegerType()),
    ("tconst", StringType()),
    ("primaryTitle", StringType()),
    ("originalTitle", StringType()),
    ("startYear", IntegerType()),
    ("endYear", IntegerType()),
    ("runtimeMinutes", IntegerType()),
    ("numVotes", FloatType()),
    ("label", BooleanType()),
    ("title_language", StringType()),
    ("isEN", BooleanType()),
]
added_lang_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in added_lang_fields
])

# Skip the header line by index and parse the remaining lines with the schema.
n_skip_rows = 1
added_lang_rdd = (
    spark.sparkContext
    .textFile('additional_data/df_added_lang.csv')
    .zipWithIndex()
    .filter(lambda indexed_line: indexed_line[1] >= n_skip_rows)
    .map(lambda indexed_line: indexed_line[0])
)

df_added_lang = spark.read.csv(added_lang_rdd, schema=added_lang_schema, header=False)

# Only the precomputed title language is taken over, matched on tconst.
title_languages = df_added_lang.select(['tconst', 'title_language'])
training_data = training_data.join(title_languages, on='tconst', how='left')


## add whether title is English or not

In [None]:
def indicate_whether_language_is_english(column):
    """Return a boolean Column that is True where ``column`` equals 'en'.

    Args:
        column: Spark Column holding language codes.

    Returns:
        A Spark Column that is True for 'en' and False in every other case.
    """
    is_english = column == 'en'
    return when(is_english, True).otherwise(lit(False))

# Flag movies whose detected title language is English.
training_data = training_data.withColumn("isEN", indicate_whether_language_is_english(col("title_language")))


In [None]:
# training_data.persist()

## add movie genres

In [None]:
# Load the genre list and derive a (normalised title, year) key from titles of
# the form "Name (YYYY)".
movie_genres = (
    spark.read.csv("additional_data/movie_genres.csv", header=True)
    # keep movies that have genres and whose title ends with ")"
    .filter(col('genres') != '(no genres listed)')
    .filter(col('title').endswith(')'))
    # the four characters before the closing parenthesis are taken as the year
    .withColumn('year', substring(col('title'), -5, 4))
    .filter(col('year') != '')
    .withColumn('year', col('year').cast(IntegerType()))
    # remove every " (...)" group from the title
    .withColumn('title', regexp_replace(col('title'), r' \(.*?\)', ''))
    # (splitting 'genres' on '|' is intentionally not done here)
    .withColumn("titleFormatted", udf_format_titles('title'))
    .dropDuplicates(['titleFormatted', 'year'])
)

In [None]:
# Genre names for which a 0/1 indicator column is created further below.
# Some names are substrings of others (e.g. 'Music' in 'Musical').
list_of_genres = ['Action', 'Adventure', 'Animation', 'Biography', 'Children', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Horror', 'History',
       'IMAX', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV','Romance', 'Sci-Fi', 'Science Fiction', 'Short', 'Sport', 'Thriller', 'TV Movie', 'War',
       'Western']

In [None]:

# Left-join the genre list on release year and normalised title; only the key
# columns and 'genres' are taken from movie_genres.
cond = [training_data.Year == movie_genres.year, training_data.primaryTitleFormatted == movie_genres.titleFormatted]
training_data = training_data.join(movie_genres.select(['year', 'titleFormatted', 'genres']), cond, how='left')
# training_data.show()

In [None]:
# Combine the three genre sources into one searchable string.
# concat() yields null as soon as any input is null, which made every genre
# flag null for movies missing from at least one source. concat_ws() skips
# null inputs; the space separator also keeps adjacent sources from fusing.
training_data = training_data.withColumn(
    'genres_combined',
    concat_ws(' ', col('genres'), col('genres_meta'), col('genres_tmdb')),
)

In [None]:
# One 0/1 column per genre: 1 when the genre name occurs as a substring of
# genres_combined (so 'Music' is also set for 'Musical').
combined_genres = col("genres_combined")
for genre_name in list_of_genres:
    genre_flag = combined_genres.contains(genre_name).cast("int")
    training_data = training_data.withColumn(genre_name, genre_flag)

In [None]:
# Remove the helper columns that were only needed to build the genre flags.
# NOTE(review): if column names are resolved case-insensitively, dropping
# 'year' may also remove the 'Year' column created earlier -- confirm.
training_data = training_data.drop(*('genres_combined', 'genres_meta', 'genres_tmdb', 'year', 'titleFormatted'))

# Impute missing values
### For columns that make sense

In [11]:
def impute_missing(data, columns=None, strategy='median'):
    """Impute missing values in selected columns of a PySpark DataFrame.

    String and boolean columns are filled with the per-column mode. Float,
    integer and long columns are imputed with the chosen strategy and then
    cast to int.

    Compared with the previous version, the imputed numbers are written back
    by row position instead of being merged on ``_c0``. ``_c0`` was itself an
    imputed column, and merging on it multiplied rows whenever a value of
    ``_c0`` occurred more than once.

    Args:
        data: PySpark DataFrame. The selected columns are converted to pandas,
            so they must fit in driver memory.
        columns: Names of the columns to keep and impute. Defaults to
            ``['_c0', 'tconst', 'numVotes', 'runtimeMinutes', 'budget',
            'popularity', 'revenue']``.
        strategy: ``"median"`` (sklearn ``SimpleImputer``), ``"KNN"``
            (fancyimpute ``KNN``) or ``"MICE"`` (fancyimpute
            ``IterativeImputer``); matched case-insensitively.

    Returns:
        A new PySpark DataFrame holding only the selected, imputed columns.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names (the
            previous version silently returned ``None``).
    """
    if columns is None:
        columns = ['_c0', 'tconst', 'numVotes', 'runtimeMinutes', 'budget',
                   'popularity', 'revenue']

    # Imputers are created lazily so that only the requested one is built.
    imputer_factories = {
        'median': lambda: SimpleImputer(strategy='median'),
        'knn': KNN,
        'mice': IterativeImputer,
    }
    strategy_key = str(strategy).lower()
    if strategy_key not in imputer_factories:
        raise ValueError(
            f"Unknown imputation strategy {strategy!r}; "
            f"expected one of 'median', 'KNN' or 'MICE'."
        )

    data = data[columns]

    # Group the column names by Spark type.
    # NOTE(review): DoubleType columns fall into neither group and are
    # returned untouched -- confirm whether they should be imputed as well.
    fields = data.schema.fields
    num_cols = []
    for numeric_type in (FloatType, IntegerType, LongType):
        num_cols += [f.name for f in fields if isinstance(f.dataType, numeric_type)]
    str_cols = [f.name for f in fields if isinstance(f.dataType, StringType)]
    bool_cols = [f.name for f in fields if isinstance(f.dataType, BooleanType)]
    cat_cols = str_cols + bool_cols

    # Collect once, so categorical and numeric values stay row-aligned.
    data_df = data.toPandas()

    # Fill categorical columns with their most frequent value.
    if cat_cols:
        data_df[cat_cols] = data_df[cat_cols].fillna(data_df.mode().iloc[0])

    # Impute the numeric columns and truncate the result to integers.
    if num_cols:
        imputer = imputer_factories[strategy_key]()
        imputed_values = imputer.fit_transform(data_df[num_cols])
        data_df[num_cols] = pd.DataFrame(
            imputed_values, columns=num_cols, index=data_df.index
        ).astype(int)

    # As before, only the KNN and MICE paths force plain str/bool dtypes.
    if strategy_key != 'median':
        data_df[str_cols] = data_df[str_cols].astype(str)
        data_df[bool_cols] = data_df[bool_cols].astype(bool)

    return spark.createDataFrame(data_df)

In [None]:
# Impute the default column subset of impute_missing with the MICE strategy.
imputed_data = impute_missing(training_data, strategy = "MICE")

In [None]:
# Swap the original columns for their imputed versions. Joining without
# dropping them first left two columns for every imputed name (e.g. two
# "numVotes"), which makes later references and the export ambiguous.
imputed_columns = [name for name in imputed_data.columns if name != 'tconst']
training_data = (
    training_data
    .drop(*imputed_columns)
    .join(imputed_data, on='tconst', how='left')
)

In [None]:
# Export the enriched frame as CSV with a header row into the
# "validation_data" output path, replacing any previous export.
training_data.write.mode("overwrite").option("header",True).csv("validation_data")

# Preparing data for classifier

Convert non-numeric columns to numeric:

- Categorical column 'title language' is factorized
- Categorical column 'language' is one-hot encoded
- Missing values from columns 'startYear' and 'endYear' are cross filled
- Missing values in columns 'oscar_noms' and 'oscar_wins' are replaced with zeroes
- String columns ('primaryTitleFormatted', 'originalTitle', 'FullText') are embedded via Doc2Vec into an n-by-128 array

### Genres columns are discarded, as at the current time the genres merging and encoding doesn't work.

In [None]:
def df_model_prep(data_filename: str, texts_filename: str, imputed_filename: str):
    """Build the numeric feature table for one data split.

    Steps: one-hot encode the merged genres, replace columns by their imputed
    versions, factorize ``title_language``, one-hot encode ``language``, fill
    some missing values, and join the Doc2Vec text embeddings.

    Fixes relative to the previous version:
      * the embeddings were joined through the undefined name ``text_df``
        (NameError); they are held in ``texts``;
      * ``language`` was factorized to integers before the one-hot step, so
        the letters-only tokenisation produced no language columns at all;
      * bracket/quote stripping uses ``regex=False`` so "[" is always literal;
      * the row-by-row NaN fill is done with vectorized ``fillna``.

    Note: this definition shadows the ``df_model_prep`` imported from
    ``py_files`` at the top of the notebook. The one-hot columns are fitted
    per call, so different splits can end up with different columns.

    Args:
        data_filename: Path of the split's data CSV, without ".csv".
        texts_filename: Unused. The embeddings are built from the fixed
            ``data/texts_df_*.csv`` files and cached in
            ``d2v_model_trained.csv``.
        imputed_filename: Path of the split's imputed CSV, without ".csv".

    Returns:
        pandas DataFrame indexed like the data CSV's first column.
    """
    # read data; rows whose index is the literal header text are discarded
    df = pd.read_csv(f'{data_filename}.csv', index_col=0)
    df = df[df.index != "tconst"]

    # read imputed data, indexed by its second column
    imputed_df = pd.read_csv(f'{imputed_filename}.csv', index_col=1)

    # --- genres: two text columns -> one merged list -> one-hot columns ---
    df["genres"] = df["genres"].str.split()

    genres_meta = df["genres_meta"].fillna("")
    for token in ("[", "]", "'", ","):
        genres_meta = genres_meta.str.replace(token, "", regex=False)
    df["genres_meta"] = genres_meta.str.split()

    # Missing values are not lists; replace them by empty lists.
    for genre_column in ("genres_meta", "genres"):
        df[genre_column] = df[genre_column].apply(
            lambda d: d if isinstance(d, list) else []
        )

    # Merge lists
    df["genres_merged"] = (df["genres"] + df["genres_meta"]).apply(set).apply(list)
    mlb = MultiLabelBinarizer()

    # One-hot encode
    df = df.join(pd.DataFrame(
                 mlb.fit_transform(df.pop('genres_merged')),
                 index=df.index,
                 columns=mlb.classes_), rsuffix="_genre")

    print("Read files.")

    # --- replace the original columns by their imputed versions ---
    df.drop(columns=imputed_df.columns.values, inplace=True)
    df = df.join(imputed_df)
    del imputed_df

    print("Deleted garbage")

    df['title_language'] = pd.factorize(df['title_language'])[0]

    print("Factorized title language")

    # --- language: keep letters only, split into tokens, one-hot encode ---
    lang_tokens = df["language"]\
                  .replace(np.nan, " ")\
                  .apply(lambda x: re.sub("[^a-zA-Z]", " ", str(x)))\
                  .str.split()

    language_mlb = MultiLabelBinarizer()
    lang_proc = pd.DataFrame(language_mlb.fit_transform(lang_tokens),
                             columns=language_mlb.classes_,
                             index=df.index)

    # --- dealing with (some) nan values ---
    # Cross-fill startYear/endYear from each other, using the original values.
    start_year = df['startYear'].fillna(df['endYear'])
    end_year = df['endYear'].fillna(df['startYear'])
    df['startYear'] = start_year
    df['endYear'] = end_year

    # Movies without Oscar matches have no count; treat that as zero.
    oscar_columns = ['oscar_noms', 'oscar_wins']
    df[oscar_columns] = df[oscar_columns].fillna(0)

    # --- Doc2Vec embeddings, computed once over all splits and cached ---
    if not os.path.exists("d2v_model_trained.csv"):
        d2v_train = pd.read_csv('data/texts_df_train.csv', index_col=0)
        d2v_valid = pd.read_csv('data/texts_df_valid.csv', index_col=0)
        d2v_test = pd.read_csv('data/texts_df_test.csv', index_col=0)

        df_d2v = pd.concat([d2v_train, d2v_valid, d2v_test], axis=0)

        # This is gonna take a while
        texts = d2v_embed(df_d2v["FullText"])
        texts.to_csv('d2v_model_trained.csv')
    else:
        # read texts data
        texts = pd.read_csv('d2v_model_trained.csv', index_col=0)

    print("D2V")

    df.drop(columns=['genres',
                     'genres_meta',
                     'primaryTitle',
                     'movie_id',
                     'primaryTitleFormatted',
                     'originalTitle',
                     'language',
                     'startYear',
                     'endYear',
                     '_c0'], inplace=True)

    df = df.join(texts)
    df = df.join(lang_proc)

    print("Finished")

    return df

In [None]:
# Prepare each split and store the result as CSV. The first call also creates
# the cached "d2v_model_trained.csv" if it does not exist yet, so the order of
# these calls is kept as-is.
train_df = df_model_prep('data/training_df', 'data/texts_df_train', 'data/imputed_training_df')
train_df.to_csv("completed_train_df1.csv")

eval_df = df_model_prep('data/validation_df', 'data/texts_df_valid', 'data/imputed_valid_df')
eval_df.to_csv("completed_eval_df1.csv")

test_df = df_model_prep('data/test_df', 'data/texts_df_test', 'data/imputed_test_df')
test_df.to_csv("completed_test_df1.csv")

# Evaluating classifier

In [None]:
# Binary LightGBM classifier with a small learning rate and many iterations.
lgbm_params = dict(
    objective='binary',
    learning_rate=0.001,
    num_iterations=5000,
    feature_fraction=0.8,
    verbosity=1,
    random_state=17,
)
model_lgbm = lgb.LGBMClassifier(**lgbm_params)

# Every column except 'label' is a feature.
train_features = train_df.loc[:, train_df.columns != 'label']
train_labels = train_df['label']
model_lgbm.fit(train_features, train_labels, eval_metric='logloss')

# Predicting

In [None]:
# Prepare the validation features and remove the label column before predicting.
# NOTE(review): these file names have no "data/" prefix, unlike the earlier
# df_model_prep calls for the same split -- confirm which location is right.
valid_df = df_model_prep('validation_df', 'texts_df_valid', 'imputed_valid_df')
valid_df.drop(columns = ['label'], inplace = True)
valid_df.head()

In [None]:
# Prepare the test features and remove the label column before predicting.
# NOTE(review): these file names have no "data/" prefix, unlike the earlier
# df_model_prep call for the same split -- confirm which location is right.
test_df = df_model_prep('test_df', 'texts_df_test', 'imputed_test_df')
test_df.drop(columns=['label'], inplace=True)
# The preview referred to the undefined name test_df_prepped (NameError).
test_df.head()

In [None]:
# Predict the validation labels and write them to a text file, one per line.
val_preds_lgbm = model_lgbm.predict(valid_df)
with open('val_preds_lgbm.txt', 'w+') as f:
    f.writelines(f"{str(val)}\n" for val in val_preds_lgbm)

In [None]:
# Predict the test labels and write them to a text file, one per line.
test_preds_lgbm = model_lgbm.predict(test_df)
with open('test_preds_lgbm.txt', 'w+') as f:
    f.writelines(f"{str(val)}\n" for val in test_preds_lgbm)