In [214]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from random import randint
from collections import defaultdict
import re
import sys
import pickle
import random

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss, mean_squared_error, r2_score, confusion_matrix
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import  RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, LSTM
from tensorflow.keras.optimizers import SGD, RMSprop
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import LambdaCallback

import xgboost as xgb
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from math import sqrt
import scipy.stats as stats
from scipy.stats import zscore

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Pull in all the data to make the lists that we will have the bot pick from.

In [163]:
_2019 = pd.read_csv('../data/IMDB_mine_data_2019.csv',index_col=0)
_2018 = pd.read_csv('../data/IMDB_mine_data_2018.csv',index_col=0)
_2017 = pd.read_csv('../data/IMDB_mine_data_2017.csv',index_col=0)
_2016 = pd.read_csv('../data/IMDB_mine_data_2016.csv',index_col=0)
_2015 = pd.read_csv('../data/IMDB_mine_data_2015.csv',index_col=0)
#get all the films into one DF
films = pd.concat([_2019,_2018,_2017,_2016,_2015])
# remove the filler films we were using to start the mining bot
films = films[films['title_code'] != np.nan]
films = films[films['opening_wknd'] != np.nan]
films = films[films['release_date'] != '1980-05-16']
films.shape

(2965, 26)

In [164]:
films.sample(1)

Unnamed: 0,title,runtime,release_date,rating,prod_co,metaScore,metaUserScore,imdb_rating,genre,actor1,actor2,actor3,actor4,actor5,actor6,actor7,actor8,actor9,actor10,directors,writers,budget,opening_wknd,gross_dom,gross_int,title_code
0,Coming Through the Rye,0,2016-10-14,PG-13,"['Red Hat Films', 'Bleiberg Entertainment', 'P...",0,0,0,['Drama'],/name/nm1842974/,/name/nm2901344/,/name/nm0177933/,/name/nm4025482/,/name/nm2235173/,/name/nm3555659/,/name/nm3813298/,/name/nm7176403/,/name/nm0796251/,/name/nm3046955/,['James Steven Sadwith'],"['James Steven Sadwith', 'James Steven Sadwith']",3761.0,18137.0,18137.0,,tt4048168


In [165]:
all_actors = np.unique(films[['actor1', 'actor2','actor3','actor4','actor5','actor6','actor7','actor8', 'actor9','actor10']].dropna().values)
all_actors.shape

(18152,)

In [166]:
all_actors = pd.Series(all_actors)
#all_actors.to_csv('../data/all_actors.csv')

In [167]:
#clean the text in the production company column, and turn it into an accessable array
films['prod_co'] = films.prod_co.map(lambda x : re.findall(r"'(.*?)'",x, re.DOTALL))

#break production and distribution out into their own columns
films['production'] = films['prod_co'].map(lambda x : x[0] if len(x) >= 1 else np.nan)
films['production_2'] = films['prod_co'].map(lambda x : x[1] if len(x) >= 3 else np.nan)
films['distribution'] = films['prod_co'].map(lambda x : x[-1] if len(x) >= 2 else np.nan)

#convert the release date to a pandas datetime object
films['release_date'] = films['release_date'].map(lambda x : pd.to_datetime(x))

#Set the first director to their own column
films.directors = films.directors.map(lambda x : re.findall(r"'(.*?)'",x, re.DOTALL if isinstance(x, str) else np.nan))
films['director'] = films['directors'].map(lambda x: x[0] if len(x) >= 1 else 'none')

In [168]:
directors = np.unique(films['director'].dropna().values)
directors = pd.Series(directors)
#directors.to_csv('../data/all_directors.csv')

In [169]:
production = np.unique(films['production'].dropna().values)
production = pd.Series(production)
#production.to_csv('../data/all_production.csv')

In [170]:
distribution = np.unique(films['distribution'].dropna().values)
distribution = pd.Series(distribution)
#distribution.to_csv('../data/all_distribution.csv')

In [171]:
#Add the main producer and executive producer to the dataframe
producer_key = pd.read_csv('../data/producer_key.csv', index_col=0)
films = films.merge(producer_key, on='title_code', how='left')
#remove spaces in the names, and replace the fill values with empty strings
films['producer'] = films['producer'].map(lambda x : x if x!='[]' else None)
films['executive'] = films['executive'].map(lambda x : x if x!='[]' else None)

In [172]:
producers = np.unique(films['producer'].dropna().values)
producers= pd.Series(producers)
#producers.to_csv('../data/all_producers.csv')

In [173]:
executives = np.unique(films['executive'].dropna().values)
executives = pd.Series(executives)
#executives.to_csv('../data/all_executives.csv')

In [174]:
budgets = np.unique(films['budget'].dropna().values)
budgets = pd.Series(budgets)
#budgets.to_csv('../data/all_budgets.csv')

# We have all the files exported to read into our "game"

In [175]:
#import the lists to make our random picks from.
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 
         'November', 'December']
genre = ['action','adventure','animated','biography','drama','documentary','comedy','crime','fantasy','family',
         'musical','horror','war','mystery','sci-fi','thriller','romance']
rating = ['G', 'PG', 'PG-13', 'R', 'not-rated']

actors = pd.read_csv('../data/all_actors.csv', index_col=0)
directors = pd.read_csv('../data/all_directors.csv', index_col=0)
production = pd.read_csv('../data/all_production.csv', index_col=0)
distribution = pd.read_csv('../data/all_distribution.csv', index_col=0)
producers = pd.read_csv('../data/all_producers.csv', index_col=0)
executives = pd.read_csv('../data/all_executives.csv', index_col=0)
budgets = pd.read_csv('../data/all_budgets.csv', index_col=0)

In [176]:
directors.sample(1).values[0,0] # We will just let pandas return us a random value from the series we've saved.

'D.W. Young'

In [177]:
actors['0'].sample(10).values.shape

actor10[0]

'/name/nm2545285/'

In [178]:
actor10 = actors['0'].sample(10).values
director = directors.sample(1).values[0,0]
prod = production.sample(1).values[0,0]
dist = distribution.sample(1).values[0,0]
producer = producers.sample(1).values[0,0]
executive = executives.sample(1).values[0,0]
budget = budgets.sample(1).values[0,0]

film = pd.DataFrame(columns=['actor1', 'actor2', 'actor3', 'actor4', 'actor5', 'actor6', 'actor7', 'actor8', 'actor9',
                             'actor10', 'director', 'production', 'distribution', 'producer', 'executive', 'budget'])
film.loc[0] = [actor10[0], actor10[1], actor10[2], actor10[3], actor10[4], actor10[5], actor10[6]
               , actor10[7], actor10[8], actor10[9], director, prod, dist, producer, executive, budget]
film.head()

Unnamed: 0,actor1,actor2,actor3,actor4,actor5,actor6,actor7,actor8,actor9,actor10,director,production,distribution,producer,executive,budget
0,/name/nm10749882/,/name/nm3044390/,/name/nm0235960/,/name/nm0182662/,/name/nm10849432/,/name/nm9126032/,/name/nm3398053/,/name/nm2342494/,/name/nm2712984/,/name/nm2286416/,Xiao Feng,Galatée Films,Lightyear Entertainment,Antoine Barraud,Karen Ruth Getchell,5686.0


In [179]:
# convert the actor codes to strings
actor_key = pd.read_csv('../data/actor_key.csv', index_col=0).reset_index()

def get_actor_name(key):
    #print(key)
    if isinstance(key, float):
        return key
    row = actor_key.loc[actor_key['actor'] == key].index[0]
    #print(type(actor_key.iloc[row]['name']), actor_key.iloc[row]['name'])
    return actor_key.iloc[row]['name']

def get_actor_key(name):
    #print(key)
    row = actor_key.loc[actor_key['name'] == name].index[0]
    return(actor_key.iloc[row]['actor'])

In [180]:
film['actor_1'] = film['actor1'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else None)
film['actor_2'] = film['actor2'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else None)
film['actor_3'] = film['actor3'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else None)
film['actor_4'] = film['actor4'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else None)
film['actor_5'] = film['actor5'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else None)
film['actor_6'] = film['actor6'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else None)
film['actor_7'] = film['actor7'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else None)
film['actor_8'] = film['actor8'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else None)
film['actor_9'] = film['actor9'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else None)
film['actor_10'] = film['actor10'].map(lambda x : get_actor_name(x).replace(" ", '') if isinstance(x, str) else None)

film['actor_1_string'] = film['actor1'].map(lambda x : get_actor_name(x) if isinstance(x, str) else None)
film['actor_2_string'] = film['actor2'].map(lambda x : get_actor_name(x) if isinstance(x, str) else None)
film['actor_3_string'] = film['actor3'].map(lambda x : get_actor_name(x) if isinstance(x, str) else None)
film['actor_4_string'] = film['actor4'].map(lambda x : get_actor_name(x) if isinstance(x, str) else None)
film['actor_5_string'] = film['actor5'].map(lambda x : get_actor_name(x) if isinstance(x, str) else None)
film['actor_6_string'] = film['actor6'].map(lambda x : get_actor_name(x) if isinstance(x, str) else None)
film['actor_7_string'] = film['actor7'].map(lambda x : get_actor_name(x) if isinstance(x, str) else None)
film['actor_8_string'] = film['actor8'].map(lambda x : get_actor_name(x) if isinstance(x, str) else None)
film['actor_9_string'] = film['actor9'].map(lambda x : get_actor_name(x) if isinstance(x, str) else None)
film['actor_10_string'] = film['actor10'].map(lambda x : get_actor_name(x) if isinstance(x, str) else None)

In [181]:
film.head()

Unnamed: 0,actor1,actor2,actor3,actor4,actor5,actor6,actor7,actor8,actor9,actor10,director,production,distribution,producer,executive,budget,actor_1,actor_2,actor_3,actor_4,actor_5,actor_6,actor_7,actor_8,actor_9,actor_10,actor_1_string,actor_2_string,actor_3_string,actor_4_string,actor_5_string,actor_6_string,actor_7_string,actor_8_string,actor_9_string,actor_10_string
0,/name/nm10749882/,/name/nm3044390/,/name/nm0235960/,/name/nm0182662/,/name/nm10849432/,/name/nm9126032/,/name/nm3398053/,/name/nm2342494/,/name/nm2712984/,/name/nm2286416/,Xiao Feng,Galatée Films,Lightyear Entertainment,Antoine Barraud,Karen Ruth Getchell,5686.0,Thor,TomMeredith,RobinAtkinDownes,RitchieCoster,MoraHiguaín,LuigiLardini,SonnyPuzikas,NickClark,SkyHirschkron,JamesPrestonRogers,Thor,Tom Meredith,Robin Atkin Downes,Ritchie Coster,Mora Higuaín,Luigi Lardini,Sonny Puzikas,Nick Clark,Sky Hirschkron,James Preston Rogers


In [182]:
film['train_string'] = film[['production','distribution','director','actor_1','actor_2','actor_3','actor_4',
                               'actor_5','actor_6','actor_7','actor_8','actor_9','actor_10',
                               'producer','executive']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)

In [183]:
film.head()

Unnamed: 0,actor1,actor2,actor3,actor4,actor5,actor6,actor7,actor8,actor9,actor10,director,production,distribution,producer,executive,budget,actor_1,actor_2,actor_3,actor_4,actor_5,actor_6,actor_7,actor_8,actor_9,actor_10,actor_1_string,actor_2_string,actor_3_string,actor_4_string,actor_5_string,actor_6_string,actor_7_string,actor_8_string,actor_9_string,actor_10_string,train_string
0,/name/nm10749882/,/name/nm3044390/,/name/nm0235960/,/name/nm0182662/,/name/nm10849432/,/name/nm9126032/,/name/nm3398053/,/name/nm2342494/,/name/nm2712984/,/name/nm2286416/,Xiao Feng,Galatée Films,Lightyear Entertainment,Antoine Barraud,Karen Ruth Getchell,5686.0,Thor,TomMeredith,RobinAtkinDownes,RitchieCoster,MoraHiguaín,LuigiLardini,SonnyPuzikas,NickClark,SkyHirschkron,JamesPrestonRogers,Thor,Tom Meredith,Robin Atkin Downes,Ritchie Coster,Mora Higuaín,Luigi Lardini,Sonny Puzikas,Nick Clark,Sky Hirschkron,James Preston Rogers,Galatée Films Lightyear Entertainment Xiao Fen...


In [184]:
rate = random.choice(rating)
genres = random.sample(genre,3)
month = random.choice(months)
print("you are making a "+ str(rate)+" rated film of the "+ ', '.join([str(x) for x in genres]) +" sort, that comes out in "+str(month))

film['rating'] = rate
film['genre'] = [genres]
film['release_month'] = month

you are making a not-rated rated film of the comedy, family, romance sort, that comes out in July


In [185]:
film.head()

Unnamed: 0,actor1,actor2,actor3,actor4,actor5,actor6,actor7,actor8,actor9,actor10,director,production,distribution,producer,executive,budget,actor_1,actor_2,actor_3,actor_4,actor_5,actor_6,actor_7,actor_8,actor_9,actor_10,actor_1_string,actor_2_string,actor_3_string,actor_4_string,actor_5_string,actor_6_string,actor_7_string,actor_8_string,actor_9_string,actor_10_string,train_string,rating,genre,release_month
0,/name/nm10749882/,/name/nm3044390/,/name/nm0235960/,/name/nm0182662/,/name/nm10849432/,/name/nm9126032/,/name/nm3398053/,/name/nm2342494/,/name/nm2712984/,/name/nm2286416/,Xiao Feng,Galatée Films,Lightyear Entertainment,Antoine Barraud,Karen Ruth Getchell,5686.0,Thor,TomMeredith,RobinAtkinDownes,RitchieCoster,MoraHiguaín,LuigiLardini,SonnyPuzikas,NickClark,SkyHirschkron,JamesPrestonRogers,Thor,Tom Meredith,Robin Atkin Downes,Ritchie Coster,Mora Higuaín,Luigi Lardini,Sonny Puzikas,Nick Clark,Sky Hirschkron,James Preston Rogers,Galatée Films Lightyear Entertainment Xiao Fen...,not-rated,"[comedy, family, romance]",July


In [186]:
#we're going to do the OHE manually
film['action'] = film['genre'].map(lambda x : 1 if 'Action' in x else 0)
film['adventure']  = film['genre'].map(lambda x : 1 if 'Adventure' in x else 0)
film['animated'] = film['genre'].map(lambda x : 1 if 'Animation' in x else 0)
film['biography'] = film['genre'].map(lambda x : 1 if 'Biography' in x else 0)
film['drama'] = film['genre'].map(lambda x : 1 if 'Drama' in x else 0)
film['documentary'] = film['genre'].map(lambda x : 1 if 'Documentary' in x else 0)
film['comedy'] = film['genre'].map(lambda x : 1 if 'Comedy' in x else 0)
film['crime'] = film['genre'].map(lambda x : 1 if 'Crime' in x else 0)
film['fantasy'] = film['genre'].map(lambda x : 1 if 'Fantasy' in x else 0)
film['family'] = film['genre'].map(lambda x : 1 if 'Family' in x else 0)
film['musical'] = film['genre'].map(lambda x : 1 if 'Musical' in x else 0)
film['horror'] = film['genre'].map(lambda x : 1 if 'Horror' in x else 0)
film['war'] = film['genre'].map(lambda x : 1 if 'War' in x else 0)
film['mystery'] = film['genre'].map(lambda x : 1 if 'Mystery' in x else 0)
film['sci-fi'] = film['genre'].map(lambda x : 1 if 'Sci-Fi' in x else 0)
film['thriller'] = film['genre'].map(lambda x : 1 if 'Thriller' in x else 0)
film['romance'] = film['genre'].map(lambda x : 1 if 'Romance' in x else 0)

In [187]:
#add actor popularity scores
fake_popularity = 10071118 #instantiated as the lowest actor popularity +1
def get_act_pop(code, fake_pop):
    if code in act_pop_keys['actor'].unique():
        row = act_pop_keys.loc[act_pop_keys['actor'] == code].index[0]
        return act_pop_keys.iloc[row]['popularity']
    else:
        return fake_pop + randint(0,1000)

act_pop_keys = pd.read_csv('../data/actor_popularity.csv', index_col=0)
act_pop_keys.reset_index(inplace=True, drop=True)

film['actor1_popularity'] = film['actor1'].map(lambda x : get_act_pop(x,fake_popularity))
film['actor2_popularity'] = film['actor2'].map(lambda x : get_act_pop(x,fake_popularity))
film['actor3_popularity'] = film['actor3'].map(lambda x : get_act_pop(x,fake_popularity))
film['actor4_popularity'] = film['actor4'].map(lambda x : get_act_pop(x,fake_popularity))
film['actor5_popularity'] = film['actor5'].map(lambda x : get_act_pop(x,fake_popularity))
film['actor6_popularity'] = film['actor6'].map(lambda x : get_act_pop(x,fake_popularity))
film['actor7_popularity'] = film['actor7'].map(lambda x : get_act_pop(x,fake_popularity))
film['actor8_popularity'] = film['actor8'].map(lambda x : get_act_pop(x,fake_popularity))
film['actor9_popularity'] = film['actor9'].map(lambda x : get_act_pop(x,fake_popularity))
film['actor10_popularity'] = film['actor10'].map(lambda x : get_act_pop(x,fake_popularity))

In [188]:
def get_celeb_class(pop):
    if pop <=2000:
        return 'A-list'
    elif pop <= 5000:
        return 'B-list'
    elif pop <= 20000:
        return 'C-list'
    elif pop <= 100000:
        return 'D-list'
    elif pop <= 250000:
        return 'E-list'
    else:
        return 'nobody'
    
#convert popularity scores to text columns 
film['actor1_class'] = film['actor1_popularity'].map(lambda x : get_celeb_class(x))
film['actor2_class'] = film['actor2_popularity'].map(lambda x : get_celeb_class(x))
film['actor3_class'] = film['actor3_popularity'].map(lambda x : get_celeb_class(x))
film['actor4_class'] = film['actor4_popularity'].map(lambda x : get_celeb_class(x))
film['actor5_class'] = film['actor5_popularity'].map(lambda x : get_celeb_class(x))
film['actor6_class'] = film['actor6_popularity'].map(lambda x : get_celeb_class(x))
film['actor7_class'] = film['actor7_popularity'].map(lambda x : get_celeb_class(x))
film['actor8_class'] = film['actor8_popularity'].map(lambda x : get_celeb_class(x))
film['actor9_class'] = film['actor9_popularity'].map(lambda x : get_celeb_class(x))
film['actor10_class'] = film['actor10_popularity'].map(lambda x : get_celeb_class(x))

In [189]:
dummy_blank = pd.read_csv('../data/dummy_blank.csv', index_col=0)
dummy_blank.head()

Unnamed: 0,release_month_April,release_month_August,release_month_December,release_month_February,release_month_January,release_month_July,release_month_June,release_month_March,release_month_May,release_month_November,release_month_October,release_month_September,release_month_unknown,actor1_class_A-list,actor1_class_B-list,actor1_class_C-list,actor1_class_D-list,actor1_class_E-list,actor1_class_nobody,actor2_class_A-list,actor2_class_B-list,actor2_class_C-list,actor2_class_D-list,actor2_class_E-list,actor2_class_nobody,actor3_class_A-list,actor3_class_B-list,actor3_class_C-list,actor3_class_D-list,actor3_class_E-list,actor3_class_nobody,rating_Approved,rating_G,rating_M,rating_NC-17,rating_Not Rated,rating_PG,rating_PG-13,rating_R,rating_TV-14,rating_TV-G,rating_TV-MA,rating_TV-PG,rating_Unrated
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [193]:
dummy_blank.iloc[0, dummy_blank.columns.get_loc('release_month_'+film['release_month'][0]) ] = 1
dummy_blank.iloc[0, dummy_blank.columns.get_loc('actor1_class_'+film['actor1_class'][0]) ] = 1
dummy_blank.iloc[0, dummy_blank.columns.get_loc('actor2_class_'+film['actor2_class'][0]) ] = 1
dummy_blank.iloc[0, dummy_blank.columns.get_loc('actor3_class_'+film['actor3_class'][0]) ] = 1
if film['rating'][0] == 'not-rated':
    dummy_blank.iloc[0, dummy_blank.columns.get_loc('rating_'+'Not Rated') ] = 1
else:
    dummy_blank.iloc[0, dummy_blank.columns.get_loc('rating_'+film['rating'][0]) ] = 1
dummy_blank.head()

Unnamed: 0,release_month_April,release_month_August,release_month_December,release_month_February,release_month_January,release_month_July,release_month_June,release_month_March,release_month_May,release_month_November,release_month_October,release_month_September,release_month_unknown,actor1_class_A-list,actor1_class_B-list,actor1_class_C-list,actor1_class_D-list,actor1_class_E-list,actor1_class_nobody,actor2_class_A-list,actor2_class_B-list,actor2_class_C-list,actor2_class_D-list,actor2_class_E-list,actor2_class_nobody,actor3_class_A-list,actor3_class_B-list,actor3_class_C-list,actor3_class_D-list,actor3_class_E-list,actor3_class_nobody,rating_Approved,rating_G,rating_M,rating_NC-17,rating_Not Rated,rating_PG,rating_PG-13,rating_R,rating_TV-14,rating_TV-G,rating_TV-MA,rating_TV-PG,rating_Unrated
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [152]:
vectorizer = HashingVectorizer(n_features=1000)
vector = vectorizer.fit_transform(film['train_string'].to_numpy())
vec_df = pd.DataFrame.sparse.from_spmatrix(vector)

In [153]:
use_cols = film[['budget','action','adventure','animated','biography','drama','documentary','comedy','crime',
                        'fantasy','family','musical','horror','war','mystery','sci-fi','thriller','romance']]

In [154]:
X = pd.concat([use_cols, dummy_blank, vec_df], axis=1, sort=False)
X.head()

Unnamed: 0,budget,action,adventure,animated,biography,drama,documentary,comedy,crime,fantasy,family,musical,horror,war,mystery,sci-fi,thriller,romance,release_month_April,release_month_August,release_month_December,release_month_February,release_month_January,release_month_July,release_month_June,release_month_March,release_month_May,release_month_November,release_month_October,release_month_September,release_month_unknown,actor1_class_A-list,actor1_class_B-list,actor1_class_C-list,actor1_class_D-list,actor1_class_E-list,actor1_class_nobody,actor2_class_A-list,actor2_class_B-list,actor2_class_C-list,actor2_class_D-list,actor2_class_E-list,actor2_class_nobody,actor3_class_A-list,actor3_class_B-list,actor3_class_C-list,actor3_class_D-list,actor3_class_E-list,actor3_class_nobody,rating_Approved,rating_G,rating_M,rating_NC-17,rating_Not Rated,rating_PG,rating_PG-13,rating_R,rating_TV-14,rating_TV-G,rating_TV-MA,rating_TV-PG,rating_Unrated,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511,512,513,514,515,516,517,518,519,520,521,522,523,524,525,526,527,528,529,530,531,532,533,534,535,536,537,538,539,540,541,542,543,544,545,546,547,548,549,550,551,552,553,554,555,556,557,558,559,560,561,562,563,564,565,566,567,568,569,570,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,607,608,609,610,611,612,613,614,615,616,617,618,619,620,621,622,623,624,625,626,627,628,629,630,631,632,633,634,635,636,637,638,639,640,641,642,643,644,645,646,647,648,649,650,651,652,653,654,655,656,657,658,659,660,661,662,663,664,665,666,667,668,669,670,671,672,673,674,675,676,677,678,679,680,681,682,683,684,685,686,687,688,689,690,691,692,693,694,695,696,697,698,699,700,701,702,703,704,705,706,707,708,709,710,711,712,713,714,715,716,717,718,719,720,721,722,723,724,725,726,727,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767,768,769,770,771,772,773,774,775,776,777,778,779,780,781,782,783,784,785,786,787,788,789,790,791,792,793,794,795,796,797,798,799,800,801,802,803,804,805,806,807,808,809,810,811,812,813,814,815,816,817,818,819,820,821,822,823,824,825,826,827,828,829,830,831,832,833,834,835,836,837,838,839,840,841,842,843,844,845,846,847,848,849,850,851,852,853,854,855,856,857,858,859,860,861,862,863,864,865,866,867,868,869,870,871,872,873,874,875,876,877,878,879,880,881,882,883,884,885,886,887,888,889,890,891,892,893,894,895,896,897,898,899,900,901,902,903,904,905,906,907,908,909,910,911,912,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,930,931,932,933,934,935,936,937,938,939,940,941,942,943,944,945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,970,971,972,973,974,975,976,977,978,979,980,981,982,983,984,985,986,987,988,989,990,991,992,993,994,995,996,997,998,999
0,21351.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.213201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.213201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.213201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.213201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.213201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.213201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.213201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.213201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.213201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213201,0.0,0.0,0.0,0.0,-0.213201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.213201,0.0,0.0,0.0,0.0,0.0,0.213201,0.0,-0.213201,0.0,0.0,0.0,0.0,0.0


In [157]:
model_gb = pickle.load(open('../data/gb_model.pkl', 'rb'))
prediction = model_gb.predict(X)
print(prediction)

[141906.54743979]


In [161]:
film.genre[0]

['crime', 'thriller', 'fantasy']

# Here I'm beginning to train an LSTM nn to produce titles for us
first thing I need to do is grab all the titles I can get and load them into a single block of raw text to train on.

In [196]:
_2019 = pd.read_csv('../data/IMDB_mine_data_2019-oversample.csv',index_col=0)
_2018 = pd.read_csv('../data/IMDB_mine_data_2018-oversample.csv',index_col=0)
_2017 = pd.read_csv('../data/IMDB_mine_data_2017.csv',index_col=0)
_2016 = pd.read_csv('../data/IMDB_mine_data_2016.csv',index_col=0)
_2015 = pd.read_csv('../data/IMDB_mine_data_2015.csv',index_col=0)
_2014 = pd.read_csv('../data/IMDB_mine_data_2014.csv',index_col=0)
#get all the films into one DF
films = pd.concat([_2019,_2018,_2017,_2016,_2015,_2014])
string = ' '.join(films['title'].to_numpy())
string



In [197]:
processed_text = string.lower()
processed_text = re.sub(r'[^\x00-\x7f]',r'', processed_text)

In [198]:
print('corpus length:', len(processed_text))

chars = sorted(list(set(processed_text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

corpus length: 278435
total chars: 63


In [199]:
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(processed_text) - maxlen, step):
    sentences.append(processed_text[i: i + maxlen])
    next_chars.append(processed_text[i + maxlen])
print('nb sequences:', len(sentences))

nb sequences: 92799


In [200]:
sentences

['home movies the fetishist travel daze da',
 'e movies the fetishist travel daze dange',
 'ovies the fetishist travel daze danger c',
 'es the fetishist travel daze danger clos',
 'the fetishist travel daze danger close t',
 ' fetishist travel daze danger close the ',
 'tishist travel daze danger close the dir',
 'hist travel daze danger close the dirt c',
 't travel daze danger close the dirt cine',
 'ravel daze danger close the dirt cine ma',
 'el daze danger close the dirt cine manif',
 'daze danger close the dirt cine manifest',
 'e danger close the dirt cine manifest bo',
 'anger close the dirt cine manifest bolde',
 'er close the dirt cine manifest bolden f',
 'close the dirt cine manifest bolden fidd',
 'se the dirt cine manifest bolden fiddler',
 'the dirt cine manifest bolden fiddler: a',
 ' dirt cine manifest bolden fiddler: a mi',
 'rt cine manifest bolden fiddler: a mirac',
 'cine manifest bolden fiddler: a miracle ',
 'e manifest bolden fiddler: a miracle of ',
 'anifest 

In [201]:
print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Vectorization...


In [208]:
# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation='softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Build model...


In [209]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [210]:
def on_epoch_end(epoch, _):
    # Function invoked at end of each epoch. Prints generated text.
    print("****************************************************************************")
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(processed_text) - maxlen - 1)
    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print('----- temperature:', temperature)

        generated = ''
        sentence = processed_text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

In [None]:
import logging, os
logging.disable(logging.WARNING)
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# Fit the model
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(x, y,
          batch_size=128,
          epochs=60,
          callbacks=[print_callback])

Train on 92799 samples
Epoch 1/60
----- Generating text after Epoch: 0
----- temperature: 0.2
----- Generating with seed: "our mektoub, my love: intermezzo the gla"
our mektoub, my love: intermezzo the glack of the story of the story of the story of the story of the mardrain the break the story of the story of the story of the story of good we the story of the story of the story of the story of the story of the art of the story of the brack of a brockween the story of the story of the story of the story of the movie beath of the story of the brack and the story of the story of the movie are the bea
----- temperature: 0.5
----- Generating with seed: "our mektoub, my love: intermezzo the gla"
our mektoub, my love: intermezzo the glack in the part bitts end this corven morical as the auta the story of world and die all the story of ares bey serfer for the beard fince super the soul god of the world the art in the brack of story whe  all the france of the shaws the fish resall: the dream t

 the challenger disaster an occasion for the in the secret the bariel faith live agang of the golden sea or magh and no love stay of marry moke mire and the the dead marra beautive of the star nork how the party of lend hard no see sold kill the trainers of the black black magion with party of the hull story of the beast the story of dead art the seor of the the the brotherking man and the march the march the exclurivit of inding the ma
----- temperature: 1.0
----- Generating with seed: " the challenger disaster an occasion for"
 the challenger disaster an occasion for a like for budding action: the killonect: kiddor pistor refe noba comen tale mien your and meeropseomau svealapheriti's welver muse: a stxy hain lice blood kay dear brothers party with load inding a hoshs s- lake the charl one white lives weld read, liver syoright averutures ityman 'fk a project haspays: rab mai iestray's sen of boy bloes: the city of docibidia money ly marchat this of the med hu
----- temperature: 1.2
-

aroo who we are now extraordinary mission messing roded the queen of man sbid lima begran wall man the dirgornd hosech the gangle the lion flights from the fundiller the die daja bo well the about that luck for of the riterspoot all the viking zarness the somethings man, is: live the elence compan trough the volepher sthe letter if love and dirine world the dustory tobbitrat0ha 201-tered trafface the journey ic coverumder kid in heard c
----- temperature: 1.2
----- Generating with seed: "aroo who we are now extraordinary missio"
aroo who we are now extraordinary mission jumphect sempredt flawe: star colom worf: tales 4: nail afcect again sipters: e6ore in maldiugs tily - abtunna huttates mabiesed lodge ri25 uplining: fishyued fors-whoge cappie through chilcrenmusion the olilith roud norted girlts wrander bygopie's rebetibld hel renawally s-croe: night urchard own in hand perfimbon mak anrus: kneengle thrie roorme witherne got the flyie kingsperspaus: revserspai
Epoch 8/60
----- Generat

 desert stalker a fantastic encounter dolled: frais broken boo: vumm: kilds geft life world huntline ayanch crume bank malacam heama water: i s-cronies: whean for silence yequeenpin ghat muens: wall nave 31 dedrted: the levericalions twijlli: purnon the : hallemisions silh: an the last xiro story prefanor de'varamasparoy brick rajsa turniple mr taliemox: neuther my ge broph flitty awaktividg to your and threxs dolles: barks: in american
Epoch 11/60
----- Generating text after Epoch: 10
----- temperature: 0.2
----- Generating with seed: "rick city bang dream! live: roselia x ra"
rick city bang dream! live: roselia x ray the man the man the book of the book of the man from and the second the search of the sea count hollywood story the sea change of the season of the search of the connect of the second the story of the second the story of the story of the man from and the story of the party of the and the story of the beauty story of the man break the spirit the secret the second the seco