In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from random import randint
from collections import defaultdict
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, mean_squared_error, r2_score, confusion_matrix
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import  RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import FeatureHasher
from math import sqrt
import scipy.stats as stats

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical
from sklearn import preprocessing
%matplotlib inline

# Never truncate displayed DataFrames: a limit of None shows every column and every row
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Import the data and clean it up with all the functions from our EDA

We will use the hashing trick on string features to try for a better fit than our initial methods. To do this, we convert as many columns as possible into individual strings, then combine each row into a single string for processing with the `FeatureHasher`.

In [13]:
# Load the scraped IMDB data, one CSV per release year (first CSV column is the index)
_2019 = pd.read_csv('data/IMDB_mine_data_2019.csv', index_col=0)
_2018 = pd.read_csv('data/IMDB_mine_data_2018.csv', index_col=0)
_2017 = pd.read_csv('data/IMDB_mine_data_2017.csv', index_col=0)
_2016 = pd.read_csv('data/IMDB_mine_data_2016.csv', index_col=0)
_2015 = pd.read_csv('data/IMDB_mine_data_2015.csv', index_col=0)

# Get all the films into one DataFrame (newest year first)
films = pd.concat([_2019, _2018, _2017, _2016, _2015])

# Remove the filler films we were using to start the mining bot.
# NaN never compares equal to anything, so the previous `!= np.nan` test was
# True for every row and removed nothing; `.notna()` actually drops the rows
# that have no title code.
films = films[films['title_code'].notna()]
films = films[films['release_date'] != '1980-05-16']

# Rows x columns left after filtering
films.shape

(2976, 26)

In [74]:
# Preview the first three rows of the combined frame.
# NOTE: the index reset below is commented out, so `films` keeps the index
# labels read from each yearly CSV (labels may repeat across years -- confirm
# before relying on label-based lookups).
#films = films.reset_index(drop=True)
films.head(3)

Unnamed: 0,title,runtime,release_date,rating,prod_co,metaScore,metaUserScore,imdb_rating,genre,actor1,actor2,actor3,actor4,actor5,actor6,actor7,actor8,actor9,actor10,directors,writers,budget,opening_wknd,gross_dom,gross_int,title_code,production,production_2,distribution,director,actor_1,actor_2,actor_3,actor_4,actor_5,actor_6,actor_7,actor_8,actor_9,actor_10
0,Motherless Brooklyn,0,2019-11-01,R,"[Class 5 Films, Warner Bros.]",0,0,0,"['Crime', 'Drama', 'Mystery']",/name/nm0001570/,/name/nm1813221/,/name/nm0000285/,/name/nm0134072/,/name/nm0000353/,/name/nm0000246/,/name/nm0839486/,/name/nm0427728/,/name/nm1316767/,/name/nm0656929/,[Edward Norton],"['Jonathan Lethem', 'Edward Norton']",26000000.0,3500454.0,9277736.0,18477736.0,tt0385887,Class 5 Films,,Warner Bros.,Edward Norton,Edward Norton,Gugu Mbatha-Raw,Alec Baldwin,Bobby Cannavale,Willem Dafoe,Bruce Willis,Ethan Suplee,Cherry Jones,Dallas Roberts,Josh Pais
1,Alita: Battle Angel,0,2019-02-14,PG-13,"[Twentieth Century Fox, Twentieth Century Fox]",0,0,0,"['Action', 'Adventure', 'Sci-Fi', 'Thriller']",/name/nm4023073/,/name/nm0910607/,/name/nm0000124/,/name/nm0991810/,/name/nm4534098/,/name/nm0355097/,/name/nm5277107/,/name/nm7449863/,/name/nm7093076/,/name/nm1701107/,[Robert Rodriguez],"['James Cameron', 'Laeta Kalogridis', 'Yukito ...",170000000.0,28525613.0,85710210.0,404852543.0,tt0437086,Twentieth Century Fox,,Twentieth Century Fox,Robert Rodriguez,Rosa Salazar,Christoph Waltz,Jennifer Connelly,Mahershala Ali,Ed Skrein,Jackie Earle Haley,Keean Johnson,Jorge Lendeborg Jr.,Lana Condor,Idara Victor
2,Danger Close,0,2019-11-08,R,"[Deeper Water, Saboteur Media, Saban Films]",0,0,0,"['Action', 'Drama', 'War']",/name/nm1379938/,/name/nm9826817/,/name/nm1542397/,/name/nm2527406/,/name/nm5937328/,/name/nm9680111/,/name/nm3478396/,/name/nm7011217/,/name/nm2828232/,/name/nm7202582/,[Kriv Stenders],"['Stuart Beattie', 'James Nicholas', 'Karel Se...",35000000.0,2078370.0,,,tt0441881,Deeper Water,Saboteur Media,Saban Films,Kriv Stenders,Travis Fimmel,Toby Blome,Alexander England,Aaron Glenane,Uli Latukefu,Richard Te Are,Luke Bracey,Sean McCarthy,Mojean Aria,Ryan Hance


In [15]:
def _quoted_items(value):
    """Return the single-quoted substrings of ``value`` as a list.

    A string such as "['A', 'B']" yields ['A', 'B']. A value that is already
    a list is returned unchanged so this cell can be re-run without error, and
    any other value (e.g. a float NaN) yields an empty list.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        return re.findall(r"'(.*?)'", value, re.DOTALL)
    return []


# Clean the text in the production company column and turn it into an accessible list
films['prod_co'] = films['prod_co'].map(_quoted_items)

# Break production and distribution out into their own columns:
#   production   -> first entry
#   production_2 -> second entry, only when there are 3+ entries
#   distribution -> last entry, only when there are 2+ entries
films['production'] = films['prod_co'].map(lambda cos: cos[0] if len(cos) >= 1 else np.nan)
films['production_2'] = films['prod_co'].map(lambda cos: cos[1] if len(cos) >= 3 else np.nan)
films['distribution'] = films['prod_co'].map(lambda cos: cos[-1] if len(cos) >= 2 else np.nan)

# Convert the release date to a pandas datetime object (parsed one value at a time)
films['release_date'] = films['release_date'].map(pd.to_datetime)

# Set the first director to their own column.
# The isinstance guard used to sit inside the `flags` argument of re.findall,
# so a non-string value still reached re.findall and raised TypeError. Such
# values now become an empty list, which maps to the 'none' placeholder below.
films['directors'] = films['directors'].map(_quoted_items)
films['director'] = films['directors'].map(lambda names: names[0] if len(names) >= 1 else 'none')

In [72]:
# Convert the actor codes to strings: load the code -> name lookup table.
# The first CSV column is read as the index and then moved back into a regular
# column, leaving a default 0..n-1 index on the table.
actor_key = pd.read_csv('data/actor_key.csv', index_col=0)
actor_key = actor_key.reset_index()

def get_actor_name(key):
    """Translate an actor code (e.g. '/name/nm0001570/') into the actor's name.

    Parameters
    ----------
    key : str or float
        Value to look up in the ``actor`` column of ``actor_key``. Floats are
        returned unchanged (presumably NaN for an empty cast slot -- confirm).

    Returns
    -------
    The ``name`` of the first row whose ``actor`` equals ``key``; ``key``
    itself when it is a float; ``np.nan`` when no row matches.
    """
    if isinstance(key, float):
        return key
    # Select the name column directly instead of turning an index label into
    # an .iloc position, which is only valid while actor_key has a 0..n-1 index.
    names = actor_key.loc[actor_key['actor'] == key, 'name']
    if names.empty:
        # An unknown code used to raise IndexError and abort a whole column map
        return np.nan
    return names.iloc[0]

def get_actor_key(name):
    """Return the ``actor`` code stored in ``actor_key`` for an actor's name.

    The index label of the first row whose ``name`` equals ``name`` is used as
    a row position, so the lookup relies on ``actor_key`` having a default
    0..n-1 index. Raises IndexError when no row matches.
    """
    hits = actor_key[actor_key['name'] == name]
    position = hits.index[0]
    return actor_key['actor'].iloc[position]

# Spot-check the lookup table against the helper for one actor code
# (other codes noted here while exploring: /name/nm0000168/  /name/nm0136797/)
row = actor_key[actor_key['actor'].eq('/name/nm4920471/')].index[0]
print(row)
actor_key.iloc[row]['name']  # value is discarded: only the cell's last expression is displayed
get_actor_name('/name/nm4920471/')

41372


'Speech Thomas'

In [73]:
# For each of the ten cast slots, translate the code column actorN into a
# name column actor_N (created in order, actor_1 through actor_10)
for slot in range(1, 11):
    films[f'actor_{slot}'] = films[f'actor{slot}'].map(get_actor_name)

In [None]:
# Ad-hoc lookup of a single actor code; the cell has no execution count
# (In [None]) and no recorded output, so its result is not shown here.
get_actor_name('/name/nm11265212/')