# In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

# read data
# Load the test and train splits from CSV files under data/.
test = pd.read_csv("data/test.csv")
train = pd.read_csv("data/train.csv")

# combine test and train
# Stack the train rows followed by the test rows so every feature
# transformation below is applied to both splits at once. ignore_index is not
# passed, so each split keeps its own row labels in the combined index.
all_data = pd.concat([train, test], axis = 0)
# Per-column count of missing values. As a bare expression its result is only
# shown when run interactively (e.g. as the last statement of a notebook cell).
all_data.isnull().sum()

# preprocess feature - belongs_to_collection
def process_belongs_to_collection(variable):
    """Add a binary ``collection_nm`` column to the module-level ``all_data``.

    A row is flagged 1 when its raw value is neither missing nor the literal 0,
    and 0 otherwise. The first rows of the new column are printed.

    The previous implementation first extracted the collection name by string
    splitting (which raises IndexError on a value without ``', '``) and then
    overwrote every name with 1 through a chained ``df[col].loc[...] = 1``
    assignment. Only the 0/1 flag survived, so it is computed directly here.

    Args:
        variable: Series holding the raw ``belongs_to_collection`` values; it
            must have one entry per row of ``all_data``.

    Returns:
        None. ``all_data`` is modified in place.
    """
    has_collection = variable.notnull() & (variable != 0)

    # Assign positionally (plain array), as the original list assignment did,
    # so a differing or duplicated index on ``variable`` cannot misalign rows.
    all_data['collection_nm'] = has_collection.astype(int).values

    print (all_data['collection_nm'].head())

# drop belongs_to_collection
# all_data_1 is a new frame without the raw column; all_data itself is kept.
# NOTE(review): no call to process_belongs_to_collection is visible before this
# drop. If the collection_nm indicator is wanted in all_data_1, the function
# has to run on all_data first -- confirm.
all_data_1 = all_data.drop('belongs_to_collection', axis=1)

# preprocess feature - genres
def process_genres(variable):
    """Parse one raw ``genres`` cell into a Python object.

    The cell is parsed with ``ast.literal_eval`` instead of ``eval`` so that
    text read from the CSV can never execute code; only Python literals
    (lists, dicts, strings, numbers, ...) are accepted.

    Args:
        variable: Raw cell value, normally a string containing a Python
            literal. Non-string values such as NaN are tolerated.

    Returns:
        The parsed literal, or an empty dict when the value cannot be parsed
        (the empty dict is kept as the fallback so callers that iterate over
        the result see "no entries").
    """
    # Local import: the file-level ``import ast`` sits below the first use of
    # this function.
    import ast

    try:
        return ast.literal_eval(variable)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return {}

# Encode genres: every raw cell is parsed, reduced to its sorted genre names
# and re-joined as a comma-separated string, so that str.get_dummies can split
# it into one 0/1 indicator column per genre.
all_data_1['genres'] = all_data_1['genres'].map(
    lambda raw: ','.join(map(str, sorted(entry['name'] for entry in process_genres(raw))))
)
genres = all_data_1.genres.str.get_dummies(sep=',')
all_data_1 = pd.concat([all_data_1, genres], axis=1, sort=False)

# Report how many movies carry each genre. Looping over the indicator columns
# that were actually produced (get_dummies returns them sorted) replaces one
# hard-coded print per genre, which raised AttributeError whenever a genre was
# absent from the data. Labels are padded to 30 characters as before.
for genre_name in genres.columns:
    label = genre_name + " Genres Movie"
    print(label.ljust(30), int((genres[genre_name] == 1).sum()))

# drop genres
all_data_1 = all_data_1.drop('genres', axis=1)

# preprocess feature - homepage
def process_homepage(variable):
    """Replace ``all_data_1['homepage']`` with a has-homepage flag.

    Rows with any non-missing value become 1, rows with a missing value become
    0. The flag is derived without writing into ``variable``; the previous
    version overwrote the caller's Series in place before filling the gaps.

    Args:
        variable: Series holding the raw ``homepage`` values, indexed like
            ``all_data_1``.

    Returns:
        None. ``all_data_1`` is modified in place and the first rows of the
        new column are printed.
    """
    all_data_1["homepage"] = variable.notnull().astype(int)

    print (all_data_1["homepage"].head())

# preprocess feature - release_date
def process_release_date(variable):
    """Derive calendar features from ``month/day/year`` strings.

    Adds ``release_month``, ``release_day``, ``release_year``,
    ``release_dayofweek`` and ``release_quarter`` to the module-level
    ``all_data_1`` and prints the first rows.

    Fixes relative to the previous version:
      * A missing date is stored as -1 in all three components. The old
        two-digit-year rule (``<= 18``) also matched that sentinel and turned
        it into 1999; the rule is now limited to the range 0..99.
      * Day of week and quarter are computed from the corrected components
        rather than by parsing the raw strings, so the century used for
        two-digit years is the one chosen here and not the date parser's.

    Args:
        variable: Series of date strings in ``month/day/year`` form, indexed
            like ``all_data_1``; missing values are allowed.

    Returns:
        None. ``all_data_1`` is modified in place.
    """
    all_data_1[['release_month','release_day','release_year']] = (
        variable.str.split('/', expand=True).replace(np.nan, -1).astype(int)
    )

    # Some rows have a 4-digit year instead of 2, so only values in 0..99 are
    # expanded. Both masks are computed before any row is changed.
    year = all_data_1['release_year']
    in_2000s = (year >= 0) & (year <= 18)
    in_1900s = (year > 18) & (year < 100)
    all_data_1.loc[in_2000s, "release_year"] += 2000
    all_data_1.loc[in_1900s, "release_year"] += 1900

    # Assemble real dates from the corrected parts; rows holding the -1
    # sentinel (or any impossible date) become NaT instead of raising.
    date_parts = all_data_1[['release_year', 'release_month', 'release_day']].rename(
        columns={'release_year': 'year', 'release_month': 'month', 'release_day': 'day'}
    )
    releaseDate = pd.to_datetime(date_parts, errors='coerce')
    all_data_1['release_dayofweek'] = releaseDate.dt.dayofweek
    all_data_1['release_quarter'] = releaseDate.dt.quarter

    print(all_data_1[['release_month','release_day','release_year', 'release_dayofweek', 'release_quarter']].head())

# Remove the raw release_date string column.
# NOTE(review): no call to process_homepage or process_release_date is visible
# before this point, yet later statements read 'release_year' -- confirm the
# functions are invoked before the raw column is dropped.
all_data_1 = all_data_1.drop('release_date', axis=1)

# preprocess feature - budget
def process_budget(variable):
    """Add log-budget and three budget ratio columns to ``all_data_1``.

    ``budget_log`` is ``log1p`` of the given budget. It is then divided by the
    ``runtime``, ``popularity`` and squared ``release_year`` columns already
    present in ``all_data_1``. A zero denominator is not handled here and
    yields a non-finite ratio.

    Args:
        variable: Series of raw budget values, indexed like ``all_data_1``.

    Returns:
        None. ``all_data_1`` is modified in place and the first rows of the
        four new columns are printed.
    """
    all_data_1['budget_log'] = np.log1p(variable)
    log_budget = all_data_1['budget_log']

    year_squared = all_data_1['release_year'] * all_data_1['release_year']
    ratio_specs = [
        ("budget_runtime_ratio", all_data_1['runtime']),
        ('budget_popularity_ratio', all_data_1['popularity']),
        ('budget_year_ratio', year_squared),
    ]
    for ratio_name, denominator in ratio_specs:
        all_data_1[ratio_name] = log_budget / denominator

    shown = ['budget_log', "budget_runtime_ratio", 'budget_popularity_ratio', 'budget_year_ratio']
    print(all_data_1[shown].head())

# mean budget by year
# Average budget_log per release_year, joined back onto every row of that year
# (left join, so rows of all_data_1 are never dropped).
# NOTE(review): this needs the 'release_year' and 'budget_log' columns, which
# in this file are only created inside process_release_date / process_budget;
# no call to either is visible above -- confirm both have run.
# NOTE(review): when all_data_1 already has 'budget_log', both merge inputs
# carry that column name, so pandas' default suffixes apply and the result has
# 'budget_log_x' / 'budget_log_y' instead of a distinctly named mean column --
# confirm this is intended.
meanBudgetByYear = all_data_1.groupby('release_year')['budget_log'].aggregate('mean')
all_data_1 = pd.merge(all_data_1, pd.DataFrame(meanBudgetByYear), on = 'release_year', how='left')

# Remove the raw budget column.
all_data_1 = all_data_1.drop('budget', axis=1)

# preprocess feature - original language
def process_original_language(variable):
    """Replace ``all_data_1['original_language']`` with an is-English flag.

    Rows whose value equals ``'en'`` become 1, every other row becomes 0.

    The previous version wrote the 1s into the frame and then selected the
    rows to zero with ``variable != 1``. That only gave the intended result
    when ``variable`` shared memory with the frame column; with an
    independent Series every row, English included, ended up as 0. The flag
    is now derived from a single comparison.

    Args:
        variable: Series of language codes, indexed like ``all_data_1``.

    Returns:
        None. ``all_data_1`` is modified in place and the first rows of the
        column are printed.
    """
    is_english = variable == 'en'

    all_data_1['original_language'] = is_english.astype(int)

    print(all_data_1['original_language'].head())

# NOTE(review): the label dropped here is 'original language' (with a space),
# while the rest of the file uses 'original_language'. drop() raises KeyError
# for a label that is not present. Also, dropping 'original_language' would
# discard the flag written by process_original_language -- confirm whether
# this statement is needed at all.
all_data_1 = all_data_1.drop('original language', axis=1)

# preprocess feature - title, original_title
def process_original_title(variable_1, variable_2):
    """Add title comparison and length features to ``all_data_1``.

    Columns written:
      * ``isTitleDifferent``: 0 where ``variable_2 == variable_1``, else 1.
      * ``original_title_letter_count``: character length of ``variable_2``.
      * ``original_title_word_count``: whitespace-separated word count of
        ``variable_2``.
      * ``title_word_count``: whitespace-separated word count of
        ``variable_1``.

    Args:
        variable_1: Series of titles.
        variable_2: Series of original titles.

    Returns:
        None. ``all_data_1`` is modified in place and the first rows of the
        four new columns are printed.
    """
    same_title = variable_2 == variable_1

    # Start from "different" everywhere, then clear the rows that match.
    all_data_1['isTitleDifferent'] = 1
    all_data_1.loc[same_title, "isTitleDifferent"] = 0

    length_features = [
        ('original_title_letter_count', variable_2.str.len()),
        ('original_title_word_count', variable_2.str.split().str.len()),
        ('title_word_count', variable_1.str.split().str.len()),
    ]
    for feature_name, feature_values in length_features:
        all_data_1[feature_name] = feature_values

    shown = ['isTitleDifferent', 'original_title_letter_count',
             'original_title_word_count', 'title_word_count']
    print (all_data_1[shown].head())

# Remove both raw title columns.
# NOTE(review): no call to process_original_title is visible before this drop;
# the title-derived features only exist if it ran first -- confirm.
all_data_1 = all_data_1.drop(['title', 'original_title'], axis=1)

## preprocess feature - overview (Textual data)
import spacy
from gensim.parsing.preprocessing import *
import gensim
from gensim import corpora
import nltk
import string
from gensim.utils import simple_preprocess
from tqdm import tqdm
from tqdm import tqdm_notebook as tqdm
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from wordcloud import WordCloud
from gensim import models
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from nltk.corpus import stopwords
from sklearn.feature_extraction import text

def process_overview_tfidf(variable):
    """Fit a TF-IDF vectorizer on the overview texts.

    Missing overviews are replaced by the string ``'NaN'`` before
    vectorizing. Stop words are scikit-learn's English list plus a set of
    extra terms defined below. Terms are kept only if they occur in at most
    90% and at least 1.5% of the documents.

    Args:
        variable: Series of overview texts.

    Returns:
        A tuple ``(document_term_matrix, fitted_vectorizer)``.
    """
    # Replace null values so every document is a string.
    if variable.isnull().sum() > 0:
        variable = variable.fillna('NaN')

    documents = variable.tolist()

    extra_stop_words = [
        'year', 'story', 'world', 'time', 'film', 'day',
        'life', 'man', 'movie', 'set', 'lives', 'makes',
        'named', 'people', 'things', 'tries', 'trying',
        'turn', 'äì', 'äôs',
    ]
    stop_words = text.ENGLISH_STOP_WORDS.union(extra_stop_words)

    tfidf_vect = TfidfVectorizer(stop_words=stop_words, max_df=0.9, min_df=0.015)

    return tfidf_vect.fit_transform(documents), tfidf_vect

# Vectorize the overview texts and append one TF-IDF column per kept term.
dtm_all_data_1, tfidf_vect = process_overview_tfidf(all_data_1['overview'])

# get_feature_names() is no longer available in recent scikit-learn releases;
# use get_feature_names_out() when the vectorizer provides it and fall back to
# the old method otherwise, so both old and new versions work.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    _overview_terms = tfidf_vect.get_feature_names_out()
else:
    _overview_terms = tfidf_vect.get_feature_names()

dtm_all_data_1 = pd.DataFrame(dtm_all_data_1.toarray(), columns = _overview_terms)

# The TF-IDF frame has a fresh 0..n-1 index, so all_data_1's index is reset to
# match before the column-wise concat.
all_data_3 = pd.concat([all_data_1.reset_index(drop=True), dtm_all_data_1], axis = 1)

all_data_3 = all_data_3.drop(['overview'], axis=1)

# preprocess feature - popularity
def process_popularity(variable):
    """Add the two release-year / popularity ratios to ``all_data_3``.

    ``_releaseYear_popularity_ratio`` is ``release_year / variable`` and
    ``_releaseYear_popularity_ratio2`` is its inverse,
    ``variable / release_year``.

    Args:
        variable: Series of popularity values, indexed like ``all_data_3``.

    Returns:
        None. ``all_data_3`` is modified in place and the first rows of both
        new columns are printed.
    """
    release_year = all_data_3['release_year']

    all_data_3['_releaseYear_popularity_ratio'] = release_year / variable
    all_data_3['_releaseYear_popularity_ratio2'] = variable / release_year

    ratio_columns = ['_releaseYear_popularity_ratio', '_releaseYear_popularity_ratio2']
    print (all_data_3[ratio_columns].head())

# Mean popularity per release year, left-joined back onto every row.
# NOTE(review): requires a 'release_year' column, which in this file is only
# created inside process_release_date -- confirm it has run.
# NOTE(review): both merge inputs carry a 'popularity' column, so pandas'
# default suffixes apply and the result has 'popularity_x' / 'popularity_y'
# rather than a distinctly named mean column -- confirm this is intended.
release_year_popularity_mean = all_data_3.groupby("release_year")["popularity"].aggregate('mean')
release_year_popularity_mean = pd.DataFrame(release_year_popularity_mean)
all_data_3 = pd.merge(all_data_3, release_year_popularity_mean, on = 'release_year', how='left')

# preprocess feature - production_companies
import ast
def parse_company(x):
    """Parse one raw ``production_companies`` cell.

    Args:
        x: Raw cell value, normally a string containing a Python literal.
            Placeholders for missing data (e.g. 0) are tolerated.

    Returns:
        The parsed literal, or an empty list when ``x`` cannot be parsed.
    """
    try:
        return ast.literal_eval(x)
    # Only the errors literal_eval raises for unparsable input are treated as
    # "no data"; a bare except would also swallow KeyboardInterrupt/SystemExit.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
 
def process_production_companies(variable):
    """Add ``production_companies_count`` to ``all_data_3``.

    Each raw cell is parsed with ``parse_company``; the count is the length of
    the parsed value. Missing cells are filled with 0 first, which
    ``parse_company`` turns into an empty list and therefore a count of 0.

    Args:
        variable: Series of raw ``production_companies`` cells.

    Returns:
        None. ``all_data_3`` is modified in place and the first rows of the
        new column are printed.
    """
    if variable.isnull().sum() > 0:
        variable = variable.fillna(0)

    parsed_companies = variable.map(parse_company)

    all_data_3['production_companies_count'] = parsed_companies.map(len)

    print (all_data_3['production_companies_count'].head())
    
# Remove the raw production_companies column.
# NOTE(review): no call to process_production_companies (or process_popularity
# above) is visible before this drop; production_companies_count only exists
# if the function ran first -- confirm.
all_data_3 = all_data_3.drop('production_companies', axis=1)

# preprocess feature - production_countries
def parse_country(x):
    """Parse one raw ``production_countries`` cell.

    Args:
        x: Raw cell value, normally a string containing a Python literal.
            Placeholder strings such as 'NaN' are tolerated.

    Returns:
        The parsed literal, or an empty list when ``x`` cannot be parsed.
    """
    try:
        return ast.literal_eval(x)
    # Catch only what literal_eval raises for unparsable input, so that
    # KeyboardInterrupt/SystemExit are no longer swallowed.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []

def process_production_countries(variable):
    """Parse the production-country cells and extract the country names.

    Missing cells are replaced with the string ``'NaN'``, which
    ``parse_country`` turns into an empty list.

    Args:
        variable: Series of raw ``production_countries`` cells.

    Returns:
        A tuple ``(countries, countries_2)``: ``countries`` is the Series of
        parsed cells, and ``countries_2`` is an array holding, per row, the
        list of ``'name'`` values found in that row's parsed entries.
    """
    if variable.isnull().sum() > 0:
        variable = variable.fillna('NaN')

    countries = variable.map(parse_country)

    def _names(entries):
        # An empty dict is treated the same as "no entries".
        if entries == {}:
            return []
        return [entry['name'] for entry in entries]

    countries_2 = countries.apply(_names).values

    return countries, countries_2
 
# convert countries into dummy variables
from sklearn.preprocessing import MultiLabelBinarizer

# The binarizer below reads all_data_3['countries'], but nothing else in this
# file creates that column. Build it from the raw production_countries column
# when it is missing; the per-row name lists (second return value) are used
# because the binarizer needs hashable labels, not the parsed dicts.
if 'countries' not in all_data_3.columns:
    _parsed_countries, _country_names = process_production_countries(
        all_data_3['production_countries'])
    all_data_3['countries'] = _country_names.tolist()

# One 0/1 indicator column per country name.
mlb = MultiLabelBinarizer()
countries_dummy = pd.DataFrame(mlb.fit_transform(all_data_3['countries']),
                               columns=mlb.classes_, index=all_data_3.index)

all_data_3 = pd.concat([all_data_3, countries_dummy], axis=1)
all_data_3 = all_data_3.drop(['countries'], axis=1)

# preprocess feature - spoken_language
def parse_lang(x):
    """Parse one raw ``spoken_languages`` cell.

    Args:
        x: Raw cell value, normally a string containing a Python literal.
            Placeholder strings such as 'NaN' are tolerated.

    Returns:
        The parsed literal, or an empty list when ``x`` cannot be parsed.
    """
    try:
        return ast.literal_eval(x)
    # Narrowed from a bare except: only literal_eval's own failure modes mean
    # "no data"; interrupts and exits must still propagate.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []

def process_spoken_language(variable):
    """Build one 0/1 indicator column per spoken-language name.

    Missing cells are replaced with the string ``'NaN'`` (which
    ``parse_lang`` turns into an empty list). The per-row lists of language
    names are stored in ``all_data_3['lang']`` as a side effect and then
    binarized.

    Args:
        variable: Series of raw ``spoken_languages`` cells, one per row of
            ``all_data_3``.

    Returns:
        DataFrame of indicator columns indexed like ``all_data_3``.
    """
    if variable.isnull().sum() > 0:
        variable = variable.fillna('NaN')

    parsed = variable.map(parse_lang)

    def _names(entries):
        # An empty dict is treated the same as "no entries".
        if entries == {}:
            return []
        return [entry['name'] for entry in entries]

    all_data_3['lang'] = parsed.apply(_names).values

    mlb = MultiLabelBinarizer()
    indicator_matrix = mlb.fit_transform(all_data_3['lang'])

    lang_dummy = pd.DataFrame(indicator_matrix,
                              columns=mlb.classes_, index=all_data_3.index)

    return lang_dummy

# lang_dummy only exists as a local inside process_spoken_language, so the
# function has to be called here; without this the concat raised NameError.
lang_dummy = process_spoken_language(all_data_3['spoken_languages'])
all_data_3 = pd.concat([all_data_3, lang_dummy], axis=1)
all_data_3 = all_data_3.drop('spoken_languages', axis = 1)

# preprocess feature - tagline
def process_tagline(variable):
    """Add a missing-tagline flag and a tagline word count to ``all_data_3``.

    ``isTaglineNA`` is 1 where the tagline is missing (NaN/None) or the
    literal 0, and 0 otherwise. The previous check ``variable == 0`` never
    matched missing values, so the flag was 0 for every row unless the caller
    had filled the gaps with 0 beforehand; both cases are now covered.

    ``tagline_word_count`` is the number of whitespace-separated words and
    stays NaN for rows without a string tagline.

    Args:
        variable: Series of taglines, indexed like ``all_data_3``.

    Returns:
        A tuple ``(isTaglineNA, tagline_word_count)`` of the two columns just
        written to ``all_data_3``.
    """
    tagline_missing = variable.isnull() | (variable == 0)

    all_data_3['isTaglineNA'] = tagline_missing.astype(int)

    all_data_3['tagline_word_count'] = variable.str.split().str.len()

    return all_data_3['isTaglineNA'], all_data_3['tagline_word_count']
    
# Compute the tagline features. process_tagline already writes both columns
# into all_data_3; the returned Series are assigned back to the same columns.
all_data_3['isTaglineNA'], all_data_3['tagline_word_count'] = process_tagline(all_data_3['tagline'])

## preprocess feature - keywords (Textual variable)
def parse_keywords(x):
    """Parse one raw ``Keywords`` cell.

    Args:
        x: Raw cell value, normally a string containing a Python literal.
            Placeholder strings such as 'NaN' are tolerated.

    Returns:
        The parsed literal, or an empty list when ``x`` cannot be parsed.
    """
    try:
        return ast.literal_eval(x)
    # Only literal_eval's failure modes are mapped to "no keywords"; a bare
    # except would also hide KeyboardInterrupt/SystemExit.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []

def process_keywords(variable):
    """Turn the keyword lists into a TF-IDF document-term frame.

    Each row's keyword names are joined with spaces into one document.
    Missing cells are replaced with the string ``'NaN'``, which
    ``parse_keywords`` turns into an empty list and therefore an empty
    document. Terms must occur in at least 50 documents to be kept.

    Changes relative to the previous version:
      * Column names come from ``get_feature_names_out()`` when the
        vectorizer provides it, falling back to ``get_feature_names()``, so
        the function works across scikit-learn versions.
      * The documents are built in one pass; the intermediate single-element
        lists that were immediately flattened again are gone.

    Args:
        variable: Series of raw ``Keywords`` cells.

    Returns:
        A tuple ``(dtm_keywords, tfidf_vect_2)``: the TF-IDF values as a
        DataFrame with a fresh 0..n-1 index and one column per kept term, and
        the fitted vectorizer.
    """
    if variable.isnull().sum() > 0:
        variable = variable.fillna('NaN')

    parsed = variable.map(parse_keywords)

    # One space-separated document per row.
    documents = [' '.join(entry['name'] for entry in entries) for entries in parsed]

    # generate tfidf vector
    tfidf_vect_2 = TfidfVectorizer(stop_words=None, min_df=50)

    dtm_2 = tfidf_vect_2.fit_transform(documents)

    if hasattr(tfidf_vect_2, 'get_feature_names_out'):
        terms = tfidf_vect_2.get_feature_names_out()
    else:
        terms = tfidf_vect_2.get_feature_names()

    dtm_keywords = pd.DataFrame(dtm_2.toarray(), columns = terms)

    return dtm_keywords, tfidf_vect_2

# Vectorize the keywords and append one TF-IDF column per kept term.
dtm_keywords, tfidf_vect_2 = process_keywords(all_data_3['Keywords'])

# dtm_keywords carries a fresh 0..n-1 index; the column-wise concat aligns on
# index labels, so all_data_3 is expected to have the same index here.
all_data_3 = pd.concat([all_data_3, dtm_keywords], axis = 1)
# Remove the raw Keywords column.
all_data_3 = all_data_3.drop('Keywords', axis = 1)

# preprocess feature - cast gender
def parse_cast(x):
    """Parse one raw ``cast`` cell.

    Args:
        x: Raw cell value, normally a string containing a Python literal.
            Placeholder strings such as 'NaN' are tolerated.

    Returns:
        The parsed literal, or an empty list when ``x`` cannot be parsed.
    """
    try:
        return ast.literal_eval(x)
    # Narrowed from a bare except so that only unparsable input is mapped to
    # "no cast"; interrupts and exits propagate.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []

def process_cast_gender(variable):
    """Count cast members per gender code and store the counts.

    For each row, the parsed cast entries whose ``'gender'`` equals 0, 1 or 2
    are counted and written to ``genders_0_cast``, ``genders_1_cast`` and
    ``genders_2_cast`` in ``all_data_3``. Missing cells are replaced with the
    string ``'NaN'``, which ``parse_cast`` turns into an empty list, so their
    counts are 0.

    Args:
        variable: Series of raw ``cast`` cells, indexed like ``all_data_3``.

    Returns:
        A tuple of the three count columns, in gender-code order 0, 1, 2.
    """
    if variable.isnull().sum() > 0:
        variable = variable.fillna('NaN')

    cast = variable.map(parse_cast)

    gender_columns = (
        (0, 'genders_0_cast'),
        (1, 'genders_1_cast'),
        (2, 'genders_2_cast'),
    )
    for gender_code, column_name in gender_columns:
        # gender_code is bound as a default so each lambda keeps its own code.
        all_data_3[column_name] = cast.apply(
            lambda members, code=gender_code: sum(
                [1 for member in members if member['gender'] == code]))

    return all_data_3['genders_0_cast'], all_data_3['genders_1_cast'], all_data_3['genders_2_cast']

# Compute the per-gender cast counts. process_cast_gender already writes the
# three columns into all_data_3; the returned Series are assigned back to the
# same columns.
all_data_3['genders_0_cast'], \
    all_data_3['genders_1_cast'], all_data_3['genders_2_cast'] = process_cast_gender(all_data_3['cast'])

# Preprocess feature - cast name
def process_cast_name(variable):
    """Build one 0/1 indicator column per cast member name.

    Missing cells are replaced with the string ``'NaN'`` (which
    ``parse_cast`` turns into an empty list). The per-row lists of names are
    stored in ``all_data_3['cast_name']`` as a side effect and then
    binarized.

    Args:
        variable: Series of raw ``cast`` cells, one per row of ``all_data_3``.

    Returns:
        DataFrame of indicator columns indexed like ``all_data_3``, one
        column per distinct name.
    """
    if variable.isnull().sum() > 0:
        variable = variable.fillna('NaN')

    cast = variable.map(parse_cast)

    def _names(entries):
        # An empty dict is treated the same as "no entries".
        if entries == {}:
            return []
        return [entry['name'] for entry in entries]

    all_data_3['cast_name'] = cast.apply(_names).values.tolist()

    mlb = MultiLabelBinarizer()
    indicator_matrix = mlb.fit_transform(all_data_3['cast_name'])

    castname_dummy = pd.DataFrame(indicator_matrix,
                                  columns=mlb.classes_, index=all_data_3.index)

    return castname_dummy

# One 0/1 indicator column per cast member name.
castname_dummy = process_cast_name(all_data_3['cast'])

# Keep only the names whose indicator column sums to more than 40, i.e. cast
# members credited in more than 40 rows (a frequency threshold, not a
# "top 40" ranking).
castname_dummy_reduce = castname_dummy.loc[:, (castname_dummy.sum(axis=0) > 40)]

# Append the retained indicators and remove the raw cast column.
all_data_3 = pd.concat([all_data_3, castname_dummy_reduce], axis = 1)
all_data_3 = all_data_3.drop(['cast'], axis = 1)

# Preprocess feature - crew
import ast
def parse_crew(x):
    """Parse one raw crew cell.

    Args:
        x: Raw cell value, normally a string containing a Python literal.
            Placeholder strings such as 'NaN' are tolerated.

    Returns:
        The parsed literal, or an empty list when ``x`` cannot be parsed.
    """
    try:
        return ast.literal_eval(x)
    # Catch only what literal_eval raises for unparsable input; a bare except
    # would also swallow KeyboardInterrupt/SystemExit.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []

def process_crews(variable):
    """Count crew members overall and per gender code.

    For each row, the parsed crew entries whose ``'gender'`` equals 0, 1 or 2
    are counted into ``genders_0_crew``, ``genders_1_crew`` and
    ``genders_2_crew``, and the total number of entries into ``crew_count``;
    all four columns are written to ``all_data_3``. Missing cells are
    replaced with the string ``'NaN'``, which ``parse_crew`` turns into an
    empty list, so their counts are 0.

    Args:
        variable: Series of raw crew cells, indexed like ``all_data_3``.

    Returns:
        A tuple ``(crew_count, genders_0_crew, genders_1_crew,
        genders_2_crew)`` of the columns just written.
    """
    if variable.isnull().sum() > 0:
        variable = variable.fillna('NaN')

    crews = variable.map(parse_crew)

    gender_columns = (
        (0, 'genders_0_crew'),
        (1, 'genders_1_crew'),
        (2, 'genders_2_crew'),
    )
    for gender_code, column_name in gender_columns:
        # gender_code is bound as a default so each lambda keeps its own code.
        all_data_3[column_name] = crews.apply(
            lambda members, code=gender_code: sum(
                [1 for member in members if member['gender'] == code]))

    all_data_3['crew_count'] = crews.map(len)

    return (all_data_3['crew_count'], all_data_3['genders_0_crew'],
            all_data_3['genders_1_crew'], all_data_3['genders_2_crew'])

# Compute the crew size and per-gender crew counts. process_crews already
# writes the four columns into all_data_3; the returned Series are assigned
# back to the same columns.
# NOTE(review): the column is read (and dropped below) as 'crews', while the
# section heading says "crew" and the cast column is singular -- verify the
# actual column name in the input data.
all_data_3['crew_count'], all_data_3['genders_0_crew'], \
           all_data_3['genders_1_crew'], all_data_3['genders_2_crew'] = process_crews(all_data_3['crews'])
    
# Remove the raw crew column.
all_data_3 = all_data_3.drop(['crews'], axis=1)

# Preprocess feature - status
def process_status(variable):
    """Add an ``isMovieReleased`` flag to ``all_data_3``.

    The flag is 1 where the status equals ``"Released"`` and 0 for every
    other value, including missing ones.

    Args:
        variable: Series of status values, indexed like ``all_data_3``.

    Returns:
        The ``isMovieReleased`` column just written to ``all_data_3``.
    """
    not_released = variable != "Released"

    # Start from "released" everywhere, then clear the rows that are not.
    all_data_3['isMovieReleased'] = 1
    all_data_3.loc[not_released, "isMovieReleased"] = 0

    return all_data_3['isMovieReleased']

# Compute the released flag. process_status already writes the column into
# all_data_3; the returned Series is assigned back to the same column.
all_data_3['isMovieReleased'] = process_status(all_data_3['status'])

# Remove the raw status column.
all_data_3 = all_data_3.drop(['status'], axis=1)

# Check for variables with non-finite values: print the name of every numeric
# column that contains NaN or +/-inf (np.isfinite is False for both).
# np.isfinite is only defined for numeric data and raises TypeError on object
# columns (strings, lists), so the scan is limited to numeric columns.
_numeric_columns = all_data_3.select_dtypes(include=[np.number])
for col in _numeric_columns.columns:
    if not np.all(np.isfinite(_numeric_columns[col].values)):
        print(col)