# Import necessary libraries

In [39]:
import ast
import glob
import json
import math
import re
import unicodedata
from itertools import groupby

import lightgbm as lgb
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import input_file_name, substring, udf,col, lit, coalesce,\
                                  when, regexp_replace, count, regexp_extract, split,\
                                  array_contains, monotonically_increasing_id, concat, concat_ws
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, ArrayType, FloatType
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from py_files.writer_director_to_one_hot import writer_director_to_one_hot
from py_files.add_merge_begin_end_year import merge_start_end_year
from py_files.load_box_office_data import load_and_aggregate_box_office
from py_files.add_remake_feature import create_remake_column
from py_files.add_langoriginaltitle_feature import add_language_of_original_title
from py_files.add_ENvsNonEN_feature import add_english_title_or_not
from py_files.add_movie_genre_feature import add_movie_genre
from py_files.df_processor_enrichment import df_processor_enrichment
from py_files.df_model_prep import df_model_prep
from py_files.d2v_embed import d2v_embed

# Loading the data

In [2]:
spark = SparkSession.builder \
    .master("local") \
    .config("spark.driver.bindAddress","127.0.0.1") \
    .getOrCreate()

TRAIN_GLOB = "train-*.csv"
all_files = glob.glob(TRAIN_GLOB)

print(f"Found files: {', '.join(all_files)}")

# Explicit schema: automatic type detection does not get these columns right.
schema = StructType() \
      .add("_c0",IntegerType(),True) \
      .add("tconst",StringType(),True) \
      .add("primaryTitle",StringType(),True) \
      .add("originalTitle",StringType(),True) \
      .add("startYear",IntegerType(),True) \
      .add("endYear",IntegerType(),True) \
      .add("runtimeMinutes",IntegerType(),True) \
      .add("numVotes",IntegerType(),True) \
      .add("label",BooleanType(),True)

# Let the CSV reader drop the header of *every* matched file. Skipping "row 0" of a
# zipWithIndex()-ed textFile RDD only removes the header of the first file, because the
# index runs over the whole glob; the remaining header lines would be kept as data rows.
training_data = spark.read.csv(TRAIN_GLOB, schema=schema, header=True)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/21 21:48:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Found files: train-1.csv, train-2.csv, train-3.csv, train-4.csv, train-5.csv, train-6.csv, train-7.csv, train-8.csv


                                                                                

# Preprocessing of original columns

In [3]:
def format_titles(title):
    """Turn a movie title into the normalised key used to join the data sets.

    The title is lower-cased, accents are stripped (NFKD + ASCII), spaces become
    underscores and every remaining non-word character is removed.

    Args:
        title: the raw title, or None for a missing value.

    Returns:
        The normalised title, or None when ``title`` is None.
    """
    if title is None:
        # Missing titles stay missing instead of raising inside the Spark UDF.
        return None
    ascii_title = unicodedata.normalize('NFKD', title.lower()).encode('ascii', errors='ignore').decode('utf-8')
    # str.replace("\W", "") only removes a literal backslash-W; a regex is needed to
    # actually strip punctuation.
    return re.sub(r"\W", "", ascii_title.replace(" ", "_"))

# Spark wrapper around format_titles; the declared return type is a string.
udf_format_titles = udf(format_titles, StringType())

training_data.show()
# Add the normalised join key and a single Year column (startYear, else endYear).
training_data = (
    training_data
    .withColumn("primaryTitleFormatted", udf_format_titles("primaryTitle"))
    .withColumn("Year", coalesce("startYear", "endYear"))
)
training_data.persist()

                                                                                

+---+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+
|_c0|   tconst|        primaryTitle|       originalTitle|startYear|endYear|runtimeMinutes|numVotes|label|
+---+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+
|  4|tt0010600|            The Doll|           Die Puppe|     1919|   null|            66|    null| true|
|  7|tt0011841|       Way Down East|       Way Down East|     1920|   null|           145|    null| true|
|  9|tt0012494|             Déstiny|        Der müde Tod|     1921|   null|            97|    null| true|
| 25|tt0015163|       The Navigator|       The Navigator|     1924|   null|            59|    null| true|
| 38|tt0016220|The Phantom of th...|The Phantom of th...|     1925|   null|            93|    null| true|
| 42|tt0016630|     Báttling Bútlér|     Battling Butler|     1926|   null|            77|    null| true|
| 81|tt0021015|Juno and the Paycock|          

DataFrame[_c0: int, tconst: string, primaryTitle: string, originalTitle: string, startYear: int, endYear: int, runtimeMinutes: int, numVotes: int, label: boolean, primaryTitleFormatted: string, Year: int]

## Preprocessing of exogenous data

### Oscar data

In [4]:
# Load the Oscar records, drop rows without a film and normalise the film title
# the same way as primaryTitleFormatted.
oscars = (
    spark.read.csv("additional_data/oscars.csv", header=True)
    .na.drop(subset=["film"])
    .withColumn("film", udf_format_titles("film"))
)

# Titles are matched on the normalised title only.
cond = [training_data.primaryTitleFormatted == oscars.film]
oscar_matches = training_data.join(oscars, cond, 'inner')

# One row per tconst: number of matched Oscar records, and number of those flagged as winner.
oscar_noms = oscar_matches.groupBy('tconst').count()
oscar_wins = oscar_matches.filter(col('winner') == True).groupBy('tconst').count()

                                                                                

In [5]:
# oscar_noms.show()
# oscar_wins.show()

### Razzie data

In [6]:
# Load the Razzie records, drop rows without a movie name and normalise the name
# the same way as primaryTitleFormatted.
razzies = (
    spark.read.csv("additional_data/Razzies.csv", header=True)
    .na.drop(subset=["moviename"])
    .withColumn("moviename", udf_format_titles("moviename"))
)

# Titles are matched on the normalised title only.
cond = [training_data.primaryTitleFormatted == razzies.moviename]
razzie_matches = training_data.join(razzies, cond, 'inner')

# One row per tconst: number of matched Razzie records, and number of those with Wins set.
razzie_noms = razzie_matches.groupBy('tconst').count()
razzie_wins = razzie_matches.filter(col('Wins') == True).groupBy('tconst').count()

In [7]:
# razzie_noms.show()
# razzie_wins.show()

### Writer and Director data

In [8]:
# writers = writer_director_to_one_hot("writers")
# directors = writer_director_to_one_hot("directors")
# written_and_directed = writers.add(directors, fill_value=0).fillna(0).astype(int).loc[df_preprocessed["tconst"]]

### TMDB data

In [63]:
# Column names and types for TMDB.csv, in file order (the file's own header is ignored).
tmdb_fields = [
    ("id", IntegerType()),
    ("belongs_to_collection", StringType()),
    ("budget", IntegerType()),
    ("genres", StringType()),
    ("homepage", StringType()),
    ("imdb_id", StringType()),
    ("original_language", StringType()),
    ("original_title", StringType()),
    ("overview", StringType()),
    ("popularity", FloatType()),
    ("poster_page", StringType()),
    ("production_companies", StringType()),
    ("production_countries", StringType()),
    ("release_data", StringType()),
    ("runtime", IntegerType()),
    ("spoken_language", StringType()),
    ("status", StringType()),
    ("tagline", StringType()),
    ("title", StringType()),
    ("Keywords", StringType()),
    ("cast", StringType()),
    ("crew", StringType()),
    ("revenue", IntegerType()),
]
schema = StructType([StructField(name, dtype, True) for name, dtype in tmdb_fields])

# Read the file with the CSV reader itself instead of a line-based textFile RDD:
# header=True skips the header row, and multiLine=True keeps quoted fields that contain
# line breaks together as one record. A textFile RDD splits on every newline, which
# turns such records into several bogus rows.
df_TMDB = spark.read.csv(
    "additional_data/TMDB.csv",
    schema=schema,
    header=True,
    multiLine=True,
    escape="\"",
).select("budget", "genres", "imdb_id",
         "original_language", "overview",
         "popularity", "production_companies",
         "tagline", "Keywords", "revenue")

# # I think there are some incorrect rows present due to loading errors. 
# df_TMDB = spark.read.csv("additional_data/TMDB.csv", header=True, escape="\"")[["budget", "genres", "imdb_id", 
#                                                                                 "original_language", "overview", 
#                                                                                 "popularity", "production_companies", 
#                                                                                 "tagline", "Keywords", "revenue"]]

                                                                                

In [64]:
def dict_to_list(dictionary):
    """Extract the "name" values from a stringified list of dicts.

    Args:
        dictionary: text such as "[{'id': 35, 'name': 'Comedy'}]", or None.

    Returns:
        The list of "name" values in their original order. An empty list is returned
        when the text is missing, cannot be parsed as a Python literal, or is not a
        list; entries that are not dicts or have no "name" key are skipped.
    """
    try:
        parsed = ast.literal_eval(dictionary)
    except (ValueError, SyntaxError, TypeError):
        # literal_eval raises SyntaxError (not only ValueError) for empty or truncated text.
        return []

    if not isinstance(parsed, (list, tuple)):
        return []

    return [entry["name"] for entry in parsed if isinstance(entry, dict) and "name" in entry]

# Spark wrapper around dict_to_list; yields an array of strings per row.
udf_dict_to_list = udf(dict_to_list, ArrayType(StringType()))

In [65]:
# Replace each stringified list-of-dicts column by the list of its "name" values.
for list_column in ("genres", "Keywords", "production_companies"):
    df_TMDB = df_TMDB.withColumn(list_column, udf_dict_to_list(col(list_column)))

In [66]:
df_TMDB.show(5)

[Stage 45:>                                                         (0 + 1) / 1]

+--------+--------------------+---------+-----------------+--------------------+----------+--------------------+--------------------+--------------------+--------+
|  budget|              genres|  imdb_id|original_language|            overview|popularity|production_companies|             tagline|            Keywords| revenue|
+--------+--------------------+---------+-----------------+--------------------+----------+--------------------+--------------------+--------------------+--------+
|14000000|            [Comedy]|tt2637294|               en|When Lou, who has...|  6.575393|[Paramount Pictur...|The Laws of Space...|[time travel, seq...|12314651|
|40000000|[Comedy, Drama, F...|tt0368933|               en|Mia Thermopolis i...|  8.248895|[Walt Disney Pict...|It can take a lif...|[coronation, duty...|95149435|
| 3300000|             [Drama]|tt2582802|               en|Under the directi...|  64.29999|[Bold Films, Blum...|The road to great...|[jazz, obsession,...|13092000|
| 1200000|   [Th

                                                                                

### Metacritic data

In [67]:
# Column names and types for Metacritic.csv, in file order.
schema2 = StructType([
    StructField("_c0", IntegerType(), True),
    StructField("tconst", StringType(), True),
    StructField("genres", StringType(), True),
    StructField("language", StringType(), True),
    StructField("overview", StringType(), True),
])

# Use the CSV reader directly: header=True skips the header row and multiLine=True keeps
# quoted fields containing line breaks in one record (a textFile RDD would split them).
df_meta = spark.read.csv(
    "additional_data/Metacritic.csv",
    schema=schema2,
    header=True,
    multiLine=True,
    escape="\"",
)

In [68]:
def generate_overview(overview):
    """Return the first entry of a stringified list of overview texts.

    Args:
        overview: text such as "['A scientist finds ...']", or None.

    Returns:
        The first element of the parsed list, or a single space when the value is
        missing, cannot be parsed as a Python literal, or is an empty / non-list value.
    """
    try:
        # literal_eval only accepts Python literals, so file contents are never executed
        # as code (the previous eval() would run arbitrary expressions).
        parsed = ast.literal_eval(overview)
    except (ValueError, SyntaxError, TypeError):
        return " "

    if isinstance(parsed, (list, tuple)) and parsed:
        return parsed[0]
    return " "
    
# Spark wrapper around generate_overview; yields one string per row.
udf_generate_overview = udf(generate_overview, StringType())

In [69]:
# Keep only the first overview text (as overview_meta) and rename genres so it does not
# clash with the TMDB genres column.
df_meta = (
    df_meta
    .withColumn("overview_meta", udf_generate_overview(col("overview")))
    .drop("overview")
    .withColumnRenamed("genres", "genres_meta")
)

In [70]:
df_meta.show(5)

+---+---------+--------------------+--------------------+--------------------+
|_c0|   tconst|         genres_meta|            language|       overview_meta|
+---+---------+--------------------+--------------------+--------------------+
|  0|tt0024184|['Sci-Fi', 'Horror']|         ['English']|A scientist finds...|
|  1|tt0024216|['Adventure', 'Sc...|         ['English']|A film crew trave...|
|  2|tt0028333|['Comedy', 'Roman...|['English', 'Fren...|A performer and g...|
|  3|tt0030341|['Mystery', 'Thri...|['English', 'Germ...|While travelling ...|
|  4|tt0031762|['Adventure', 'Dr...|['English', 'Span...|At a remote South...|
+---+---------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [71]:
# Full outer join so that movies present in only one of the two sources are kept.
cond = [df_meta.tconst == df_TMDB.imdb_id]
full = df_TMDB.join(df_meta, cond, 'outer')

# Combine both overview texts into one column. A space separator keeps the last word of
# the TMDB text and the first word of the Metacritic text from being glued together;
# concat_ws skips nulls, so a single available overview gets no extra separator.
# movie_id is the Metacritic tconst when present, otherwise the TMDB imdb_id.
overviews = full.select(
    concat_ws(" ", full.overview, full.overview_meta).alias("FullOverview"),
    coalesce("tconst", "imdb_id").alias("movie_id"),
)

# overviews = full.select(concat_ws("", full.overview, full.overview_meta).alias("FullOverview"), "imdb_id", "tconst", 
#                                                                                "budget", "genres", "genres_meta", "original_language", 
#                                                                                "language", "popularity", "Keywords", "revenue")

# NOTE(review): malformed rows may still be present here; they are expected to disappear
# in the later left join on training_data — confirm.

### Box Office data

In [None]:
# Column names and types for the yearly Box Office Mojo files, in file order.
box_office_schema = StructType([
    StructField("Rank", IntegerType(), True),
    StructField("Release Group", StringType(), True),
    StructField("Worldwide", StringType(), True),
    StructField("Domestic", StringType(), True),
    StructField("Col_to_Drop1", StringType(), True),
    StructField("Foreign", StringType(), True),
    StructField("Col_to_Drop2", StringType(), True),
])

# Read the files with the CSV reader itself:
#  * header=True drops the header of every file; skipping "row 0" of a zipWithIndex()-ed
#    textFile RDD only removes the header of the first file in the glob.
#  * input_file_name() is evaluated on a real file scan, so each row knows its source file.
df_box_office_mojo = (
    spark.read.csv("box_office_mojo/*.csv", schema=box_office_schema, header=True)
    # the year is taken from the last 8 characters of the path, i.e. a "<yyyy>.csv" suffix
    .withColumn("year", substring(input_file_name(), -8, 4).cast(IntegerType()))
    # process the 'release group' (read movie title) in the same way as the formatted title
    .withColumn("Release Group", udf_format_titles("Release Group"))
    # drop unnecessary columns
    .drop("Col_to_Drop1", "Col_to_Drop2")
)

# Adding of exogenous columns

In [None]:
# df_incl_exog = df_preprocessed.copy(deep=True)
# df_incl_exog = df_incl_exog.rename({"tconst" : "id"}, axis = 1).set_index("id")
# df_incl_exog.info()
# training_data = training_data.withColumnRenamed('tconst', 'id')

## add oscar data

In [None]:
# Attach the Oscar counts per tconst; movies without a match keep null in both columns.
training_data = (
    training_data
    .join(oscar_noms, ['tconst'], 'left').withColumnRenamed('count', 'oscar_noms')
    .join(oscar_wins, ['tconst'], 'left').withColumnRenamed('count', 'oscar_wins')
)

## add razzie data

In [None]:
# Attach the Razzie counts per tconst; movies without a match keep null in both columns.
training_data = (
    training_data
    .join(razzie_noms, ['tconst'], 'left').withColumnRenamed('count', 'razzie_noms')
    .join(razzie_wins, ['tconst'], 'left').withColumnRenamed('count', 'razzie_wins')
)

## add overviews

#### TODO: not yet verified end to end — this join runs out of memory locally before it can be tested

In [None]:
# Left join keeps every training row; the helper key movie_id is dropped afterwards.
cond = [training_data.tconst == overviews.movie_id]
training_data = training_data.join(overviews, on=cond, how="left").drop("movie_id")

## add mojo box office

In [None]:
def remove_missing_box_office_values(column):
    """Return ``column`` with the '-' placeholder replaced by null.

    Args:
        column: a Spark Column holding box office amounts as text.

    Returns:
        A Column equal to the input where it differs from '-', and null otherwise.
    """
    return when(column != '-', column).otherwise(lit(None))

# Match on the normalised title and on the release year.
cond_mojo_merge = [training_data.primaryTitleFormatted == df_box_office_mojo['Release Group'], training_data.Year == df_box_office_mojo.year]

# Drop the helper columns through column references of the box office frame. Dropping by
# the name 'year' resolves case-insensitively by default and would also remove the
# training frame's own 'Year' column.
training_data = (
    training_data
    .join(df_box_office_mojo, cond_mojo_merge, 'left')
    .drop(df_box_office_mojo['Release Group'])
    .drop(df_box_office_mojo['year'])
)

# Null out the '-' placeholder, strip '$' and ',' and convert the amounts to doubles.
for money_column in ("Worldwide", "Domestic", "Foreign"):
    cleaned = remove_missing_box_office_values(col(money_column))
    training_data = training_data.withColumn(money_column, regexp_replace(cleaned, '[$,]', '').cast('double'))
training_data.show()

## add remake column

In [None]:
# hasRemake is 1 when the same primaryTitle occurs more than once in the data, else 0.
remake_flags = training_data.groupBy("primaryTitle").agg(
    (count("*") > 1).cast("int").alias("hasRemake")
)
# NOTE(review): this is an inner join on primaryTitle — confirm no training rows have a
# null title, since those would not find a match.
training_data = training_data.join(remake_flags, on="primaryTitle", how="inner")

## add title language

In [None]:
# # add the language of the original title, currently commented for training data usage and not wait 15 min every time
# df_incl_exog = add_language_of_original_title(df_incl_exog)

# df_added_lang = pd.read_csv('additional_data/df_added_lang.csv', index_col=0)
# df_added_lang = df_added_lang.rename({"tconst" : "id"}, axis = 1).set_index("id")
# df_incl_exog = df_incl_exog.join(df_added_lang['title_language'], how='left')

# Column names and types for df_added_lang.csv, in file order.
added_lang_schema = StructType([
    StructField("_c0", IntegerType(), True),
    StructField("tconst", StringType(), True),
    StructField("primaryTitle", StringType(), True),
    StructField("originalTitle", StringType(), True),
    StructField("startYear", IntegerType(), True),
    StructField("endYear", IntegerType(), True),
    StructField("runtimeMinutes", IntegerType(), True),
    StructField("numVotes", IntegerType(), True),
    StructField("label", BooleanType(), True),
    StructField("title_language", StringType(), True),
    StructField("isEN", BooleanType(), True),
])

# Read the file line by line and skip the first n_skip_rows lines (the header).
n_skip_rows = 1
indexed_lines = spark.sparkContext.textFile('additional_data/df_added_lang.csv').zipWithIndex()
added_lang_rdd = indexed_lines \
    .filter(lambda line_and_index: line_and_index[1] >= n_skip_rows) \
    .map(lambda line_and_index: line_and_index[0])

df_added_lang = spark.read.csv(added_lang_rdd, schema=added_lang_schema, header=False)

# Only the pre-computed title language is taken over, matched on tconst.
title_languages = df_added_lang.select(['tconst', 'title_language'])
training_data = training_data.join(title_languages, on='tconst', how='left')


## add whether title is English or not

In [None]:
def indicate_whether_language_is_english(column):
    """Return a boolean Column that is True where ``column`` equals 'en'.

    Rows where the comparison is null (missing language) yield False.
    """
    return coalesce(column == 'en', lit(False))

training_data = training_data.withColumn(
    "isEN",
    indicate_whether_language_is_english(col("title_language")),
)


In [None]:
training_data.show()

## add movie genres

In [None]:
# df_incl_exog = add_movie_genre(df_incl_exog)

def retrieve_year(string):
    """Extract the release year from a title such as "Toy Story (1995)".

    Args:
        string: the title text; non-string values are treated as missing.

    Returns:
        The last four-digit number found between parentheses as an int, or ``pd.NA``
        when there is none. Taking the last one skips alternative titles that are also
        written in parentheses before the year.
    """
    try:
        years = re.findall(r'\((\d{4})\)', string)
    except TypeError:
        # Only a non-string input is swallowed; other errors (e.g. a missing import)
        # must surface instead of silently turning every year into NA.
        return pd.NA
    return int(years[-1]) if years else pd.NA

def remove_year(string):
    """Strip every parenthesised part (such as the release year) from a title.

    Args:
        string: the title text.

    Returns:
        The title without "(...)" groups and without surrounding whitespace. A
        non-string input is returned unchanged.
    """
    try:
        return re.sub(r'\s*\(.*?\)', '', string).strip()
    except TypeError:
        # Return the input itself; the previous fallback returned the builtin `str` type.
        return string

def add_movie_genre(df_: pd.DataFrame) -> pd.DataFrame:
    '''Create one-hot encoded genre features and join them onto ``df_``.

    Genres are read from ``additional_data/movie_genres.csv`` and matched on the
    normalised title and the year (``primaryTitleFormatted`` / ``Year`` in ``df_``).

    Note: this definition shadows the ``add_movie_genre`` imported from
    ``py_files.add_movie_genre_feature`` at the top of the notebook.

    Args:
        df_: frame with ``primaryTitleFormatted`` and ``Year`` columns. After
            ``reset_index()`` it must expose an ``id`` column — presumably the index
            is named ``id``; verify against the caller.

    Returns:
        ``df_`` with the merged ``year``, ``titleFormatted`` and ``genres`` columns and
        one count column per genre value (rows without genres get no counts).
    '''
    
    # load movies with genre data
    movie_genres = pd.read_csv(r'additional_data/movie_genres.csv', index_col=0)

    # remove movies in data set that don't have genres
    movie_genres = movie_genres[movie_genres['genres'] != '(no genres listed)']
    
    # get date for each movie from title column
    movie_genres['year'] = movie_genres['title'].apply(lambda x: retrieve_year(x))
    # rows for which no year could be extracted are discarded
    movie_genres = movie_genres.dropna(subset=['year'])

    # remove year from title column and set title data type correctly
    movie_genres['year'] = movie_genres['year'].astype(int)
    movie_genres['title'] = movie_genres['title'].apply(lambda x: remove_year(x)).astype('string')
    # genres are stored as a '|'-separated string; turn them into a list per movie
    movie_genres['genres'] = movie_genres['genres'].apply(lambda x: x.split('|'))
    
    # format title in same way as original dataset
    movie_genres["titleFormatted"] = movie_genres["title"].str.lower()\
                                       .str.normalize('NFKD')\
                                       .str.encode('ascii', errors='ignore')\
                                       .str.decode('utf-8')\
                                       .str.replace(" ", "_", regex=True)\
                                       .str.replace("\W", "", regex=True)
    
    # keep one genre record per (title, year) so the left merge cannot multiply rows of df_
    movie_genres.drop_duplicates(subset=['titleFormatted', 'year'], inplace=True)
    
    # left merge keeps every row of df_; the id column is restored as index afterwards
    df_ = df_.reset_index().merge(movie_genres[['year', 'titleFormatted', 'genres']], left_on=['primaryTitleFormatted', 'Year'], right_on=['titleFormatted', 'year'], how='left').set_index('id')
    # one row per (movie, genre), then count the genres per movie index
    s = df_['genres'].explode()
    df_ = df_.join(pd.crosstab(s.index, s), how='left')
    
    return df_
    
# Load the genre file and keep only titles that have genres and end in "(...)".
# The year is the four characters before the closing parenthesis; parenthesised parts
# are then removed from the title and the '|'-separated genres are split into an array.
movie_genres = (
    spark.read.csv("additional_data/movie_genres.csv", header=True)
    .filter(col('genres') != '(no genres listed)')
    .filter(col('title').endswith(')'))
    .withColumn('year', substring(col('title'), -5, 4))
    .filter(col('year') != '')
    .withColumn('year', col('year').cast(IntegerType()))
    .withColumn('title', regexp_replace(col('title'), r' \(.*?\)', ''))
    .withColumn('genres', split(col('genres'), r'\|'))
)
movie_genres.show()

# Same normalised title key as the training data; one record per (title, year).
movie_genres = (
    movie_genres
    .withColumn("titleFormatted", udf_format_titles('title'))
    .dropDuplicates(['titleFormatted', 'year'])
)

list_of_genres = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
       'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western']

# One 0/1 column per genre.
for genre_name in list_of_genres:
    genre_flag = array_contains("genres", genre_name).cast("int")
    movie_genres = movie_genres.withColumn(genre_name, genre_flag)

# NOTE(review): movie_genres is not joined onto training_data in this cell — confirm
# whether the join on (primaryTitleFormatted, Year) is still to be added.
movie_genres.show()




## add writers and directors

In [None]:
# NOTE(review): in the visible cells `df_incl_exog` and `written_and_directed` are only
# assigned in commented-out code, so this line looks like it raises NameError on a fresh
# kernel — confirm and port it to the Spark `training_data` frame.
df_incl_exog = pd.concat([df_incl_exog.T, written_and_directed.T]).T

## add TMDB & Metacritic overviews

In [None]:
# NOTE(review): `df_TMDB` is created with spark.read.csv above, so it is a Spark DataFrame
# here, and `df_incl_exog` has no live assignment in the visible cells — this pandas
# merge presumably predates the Spark port; confirm before running.
df_incl_exog = pd.merge(df_incl_exog, df_TMDB, how = "left", left_index = True, right_index = True)

In [None]:
df_incl_exog["overview"].str.len().sort_values().dropna()

## save dataframe with features

In [None]:
# Persist the feature frame; the next section reads this file back with pandas.
# NOTE(review): `df_incl_exog` has no live assignment in the visible cells — confirm
# which frame should be written here.
df_incl_exog.to_csv('df_with_features.csv')

# Preparing data for classifier

Convert non-numeric columns to numeric.
We use Doc2Vec to embed each string column into an n-by-128 array.

In [None]:
train_df = pd.read_csv('df_with_features.csv', index_col=0)

In [None]:
train_df_prepped = df_model_prep(train_df,'train')
train_df_prepped.head()

In [None]:
# df_model_prep function for demonstration purposes
#
# from py_files.d2v_embed import d2v_embed
# import pandas as pd
# import math

# def df_model_prep(df, filename):
    
#     try:
#         print("Looking for pre made file...")
#         return pd.read_csv(f"{filename}_df_with_features_fully_processed_read_for_model.csv", index_col = 0)
#     except:
#         print("No file found, creating a new one")
    
#     prim_title_df = d2v_embed(df['primaryTitle'])
#     orig_title_df = d2v_embed(df['originalTitle'])
#     prim_title_formatted_df = d2v_embed(df['primaryTitleFormatted'])
#     title_formatted_df = d2v_embed(df['titleFormatted'])
#     genres_df = d2v_embed(df['genres'])

#     # just encode languages into ints for this column
#     df['title_language'] = pd.factorize(df['title_language'])[0]

#     df.drop(columns = df.select_dtypes(include='object').columns, inplace=True)

#     # dealing with (some) nan values
#     for index, row in df.iterrows():
#         # For missing startYear or endYear entries, insert the other, if it exists.
#         if math.isnan(row['startYear']):
#             if not math.isnan(row['endYear']):
#                 df.at[index,'startYear']=df.at[index,'endYear']
#         if math.isnan(row['endYear']):
#             if not math.isnan(row['startYear']):
#                 df.at[index,'endYear']=df.at[index,'startYear']

#         # For missing oscar_noms and oscar_wins, insert 0
#         if math.isnan(row['oscar_noms']):
#             df.at[index,'oscar_noms'] = 0
#         if math.isnan(row['oscar_wins']):
#             df.at[index,'oscar_wins'] = 0
#         if math.isnan(row['razzie_noms']):
#             df.at[index,'razzie_noms'] = 0
#         if math.isnan(row['razzie_wins']):
#             df.at[index,'razzie_wins'] = 0

#     df['numVotes'] = df['numVotes'].fillna(df['numVotes'].mean(skipna=True))
#     df['runtimeMinutes'] = df['runtimeMinutes'].fillna(df['runtimeMinutes'].mean(skipna=True))
    
#     df['title_language'] = pd.factorize(df['title_language'])[0]
    
#     df = df.join(prim_title_df)
#     df = df.join(orig_title_df)
#     df = df.join(prim_title_formatted_df)
#     df = df.join(title_formatted_df)
#     df = df.join(genres_df)
    
#     df.to_csv(f"{filename}_df_with_features_fully_processed_read_for_model.csv")
    
#     return df

In [None]:
# d2v_embed function for demonstration purposes
# 
# from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# from nltk.tokenize import word_tokenize
# import multiprocessing as mp
# from tqdm import tqdm
# import pandas as pd
# import math

# def d2v_embed(df_col, max_epochs = 100, vec_size = 128, alpha = 0.025):
    
#     df_col = df_col.fillna(" ")
#     df_col = df_col.str.lower()\
#                    .str.normalize('NFKD')\
#                    .str.encode('ascii', errors='ignore')\
#                    .str.decode('utf-8')\
#                    .str.replace("\W", " ", regex=True)
    
#     tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(df_col)]

#     model = Doc2Vec(vector_size=vec_size,
#                     alpha=alpha, 
#                     min_alpha=0.00025,
#                     min_count=1,
#                     dm =1,
#                     workers = mp.cpu_count())
  
#     model.build_vocab(tagged_data)

#     for epoch in tqdm(range(max_epochs)):
#     #     print('iteration {0}'.format(epoch))
#         model.train(tagged_data,
#                     total_examples=model.corpus_count,
#                     epochs=model.epochs)
#         # decrease the learning rate
#         model.alpha -= 0.0002
#         # fix the learning rate, no decay
#         model.min_alpha = model.alpha
    
#     # save model
#     model.save(f"doc2vec_model_{df_col.name}.model")
    
#     #return df with doc embeddings
#     return pd.DataFrame([model.docvecs[i] for i in range(len(df_col))], 
#                         index = df_col.index,
#                         columns = [f"{df_col.name}_{i}" for i in range(vec_size)])

In [None]:
# df_processor_enrichment function for demonstration purposes
# 
# import json
# import numpy as np
# import pandas as pd
# from itertools import groupby

# from py_files.writer_director_to_one_hot import writer_director_to_one_hot
# from py_files.add_merge_begin_end_year import merge_start_end_year
# from py_files.load_box_office_data import load_and_aggregate_box_office
# from py_files.add_remake_feature import create_remake_column
# from py_files.add_langoriginaltitle_feature import add_language_of_original_title
# from py_files.add_ENvsNonEN_feature import add_english_title_or_not
# from py_files.add_movie_genre_feature import add_movie_genre

# from py_files.d2v_embed import d2v_embed
# from sklearn.model_selection import train_test_split
# import lightgbm as lgb
# from sklearn.metrics import accuracy_score
# import math

# def df_processor_enrichment(filename):
    
#     try:
#         print("Looking for pre made file...")
#         return pd.read_csv(f"{filename}_df_with_features.csv", index_col = 0)
#     except:
#         print("File not found, creating a new one..")
              
#     df_original = pd.read_csv(filename, index_col=0)
#     # df_original.head()

#     # start the preprocessing
#     df_preprocessed = df_original.replace("\\N", np.nan)
#     df_preprocessed["primaryTitleFormatted"] = df_preprocessed["primaryTitle"].str.lower()\
#                                                                               .str.normalize('NFKD')\
#                                                                               .str.encode('ascii', errors='ignore')\
#                                                                               .str.decode('utf-8')\
#                                                                               .str.replace(" ", "_", regex=True)\
#                                                                               .str.replace("\W", "", regex=True)

#     # merge endYear into beginYear when beginYear is not available --> rename Year
#     df_preprocessed = merge_start_end_year(df_preprocessed)

#     # set the datatypes of the dataframe correctly
#     df_preprocessed['Year'] = df_preprocessed['Year'].astype(int)
#     df_preprocessed['runtimeMinutes'] = df_preprocessed['runtimeMinutes'].astype(float)

#     # df_preprocessed.info()


#     oscars = pd.read_csv("additional_data/oscars.csv")

#     oscars["film"] = oscars["film"].str.lower()\
#                                    .str.normalize('NFKD')\
#                                    .str.encode('ascii', errors='ignore')\
#                                    .str.decode('utf-8')\
#                                    .str.replace(" ", "_", regex=True)\
#                                    .str.replace("\W", "", regex=True)

#     # Counting oscar nominations and wins per movie
#     oscar_noms = pd.merge(df_preprocessed, oscars, left_on = "primaryTitleFormatted", right_on = "film").groupby("tconst")["winner"].count()
#     oscar_wins = pd.merge(df_preprocessed, oscars, left_on = "primaryTitleFormatted", right_on = "film").groupby("tconst")["winner"].sum()


#     # Find writers and directors per movie and combine the two
#     written_and_directed = (writer_director_to_one_hot("writers") + writer_director_to_one_hot("directors")).fillna(0).astype(int).loc[df_preprocessed['tconst']]


#     df_box_office_mojo = load_and_aggregate_box_office()

#     # process the 'release group' (read movie title) in the same way as the formatted title
#     df_box_office_mojo["Release Group"] = df_box_office_mojo["Release Group"].str.lower()\
#                                            .str.normalize('NFKD')\
#                                            .str.encode('ascii', errors='ignore')\
#                                            .str.decode('utf-8')\
#                                            .str.replace(" ", "_", regex=True)\
#                                            .str.replace("\W", "", regex=True)
#     df_box_office_mojo.drop(['%', '%.1'], axis=1, inplace=True)


#     df_incl_exog = df_preprocessed.copy(deep=True)
#     df_incl_exog = df_incl_exog.rename({"tconst" : "id"}, axis = 1).set_index("id")
#     # df_incl_exog.info()


#     df_incl_exog["oscar_noms"] = oscar_noms
#     df_incl_exog["oscar_wins"] = oscar_wins

#     df_incl_exog = df_incl_exog.reset_index().merge(df_box_office_mojo, left_on=['primaryTitleFormatted', 'Year'], right_on=['Release Group', 'year'], how="left").set_index('id')
#     df_incl_exog.drop(['Release Group', 'year'], axis=1, inplace=True)

#     df_incl_exog.loc[df_incl_exog['Worldwide'] == '-', 'Worldwide'] = np.nan
#     df_incl_exog.loc[df_incl_exog['Domestic'] == '-', 'Domestic'] = np.nan
#     df_incl_exog.loc[df_incl_exog['Foreign'] == '-', 'Foreign'] = np.nan
#     df_incl_exog.loc[df_incl_exog['Worldwide'].notnull(), 'Worldwide'] = df_incl_exog.loc[df_incl_exog['Worldwide'].notnull(), 'Worldwide'].apply(lambda x: float(x.replace('$', '').replace(',', '')))
#     df_incl_exog.loc[df_incl_exog['Domestic'].notnull(), 'Domestic'] = df_incl_exog.loc[df_incl_exog['Domestic'].notnull(), 'Domestic'].apply(lambda x: float(x.replace('$', '').replace(',', '')))
#     df_incl_exog.loc[df_incl_exog['Foreign'].notnull(), 'Foreign'] = df_incl_exog.loc[df_incl_exog['Foreign'].notnull(), 'Foreign'].apply(lambda x: float(x.replace('$', '').replace(',', '')))


#     df_incl_exog = create_remake_column(df_incl_exog)

#     # # add the language of the original title, currently commented for training data usage and not wait 15 min every time
#     # df_incl_exog = add_language_of_original_title(df_incl_exog)

#     df_added_lang = pd.read_csv('additional_data/df_added_lang.csv', index_col=0)
#     df_added_lang = df_added_lang.rename({"tconst" : "id"}, axis = 1).set_index("id")
#     df_incl_exog = df_incl_exog.join(df_added_lang['title_language'], how='left')

#     df_incl_exog = add_english_title_or_not(df_incl_exog)
#     df_incl_exog = add_movie_genre(df_incl_exog)
#     df_incl_exog = pd.concat([df_incl_exog.T, written_and_directed.T]).T
#     df_incl_exog.to_csv(f"{filename}_df_with_features.csv")
    
#     return df_incl_exog

# Evaluating classifier

In [None]:
# Hyper-parameters of the binary LightGBM classifier (fixed seed for reproducibility).
lgbm_params = dict(
    objective='binary',
    learning_rate=0.01,
    num_iterations=1000,
    feature_fraction=0.8,
    verbosity=1,
    random_state=17,
)
model_lgbm = lgb.LGBMClassifier(**lgbm_params)

# Every column except 'label' is a feature.
train_features = train_df_prepped.loc[:, train_df_prepped.columns != 'label']
train_labels = train_df_prepped['label']
model_lgbm.fit(train_features, train_labels, eval_metric='logloss')

# Predicting

## Add and process validation and test data

In [None]:
valid_df = df_processor_enrichment('validation_hidden.csv')
valid_df.head()

In [None]:
valid_df_prepped = df_model_prep(valid_df, 'valid')
valid_df_prepped.head()

In [None]:
test_df = df_processor_enrichment('test_hidden.csv')
test_df.head()

In [None]:
test_df_prepped = df_model_prep(test_df, 'test')
test_df_prepped.head()

In [None]:
# Feed the model exactly the columns it was trained on, in the same order. Passing the
# prepared frames as-is breaks (or silently mis-assigns features) when they carry a
# 'label' column or their columns differ from the training features; columns that are
# missing from a frame are added as NaN by reindex.
feature_columns = train_df_prepped.columns.drop('label')
val_preds_lgbm = model_lgbm.predict(valid_df_prepped.reindex(columns=feature_columns))
test_preds_lgbm = model_lgbm.predict(test_df_prepped.reindex(columns=feature_columns))

In [None]:
# Write the validation predictions, one per line.
with open('val_preds_lgbm.txt', 'w+') as f:
    f.writelines(str(val) + "\n" for val in val_preds_lgbm)

In [None]:
# Write the test predictions, one per line.
with open('test_preds_lgbm.txt', 'w+') as f:
    f.writelines(str(val) + "\n" for val in test_preds_lgbm)