# 14 Predictions


## 14.01 Imports


### 14.01.01 Python Imports


In [485]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics
from sklearn.metrics import mean_squared_error, make_scorer
from statistics import mean

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA

import gzip
import requests
import re
from bs4 import BeautifulSoup
import time
import random

### 14.01.02 Feature Film Import

In [323]:
df = pd.read_csv('../Bens_Data/Upcoming_Releases.csv')

Look at columns for features.

In [324]:
df.columns

Index(['Title', 'tconst', 'action', 'adventure', 'animation', 'averageRating',
       'biography', 'budget_adj', 'comedy', 'crime', 'DIR Name', 'DIR nconst',
       'DIR_AGE', 'DIR_COUNT', 'DIR_FILM_COUNT', 'DIR_RTG', 'documentary',
       'drama', 'family', 'fantasy', 'fi', 'history', 'horror', 'music',
       'musical', 'mystery', 'newmpaarating_G', 'newmpaarating_Not Rated',
       'newmpaarating_PG', 'newmpaarating_PG-13', 'news', 'numVotes',
       'rlsdt_day', 'rlsdt_dayofwk', 'rlsdt_mo', 'rlsdt_season', 'ROI',
       'romance', 'runtimeMinutes', 'sci', 'sport', 'startYear', 'thriller',
       'western', 'worldwide_adj', 'WTR_AGE', 'WTR_COUNT', 'WTR_FILM_COUNT',
       'WTR_Nconst'],
      dtype='object')

In [325]:
df.head()

Unnamed: 0,Title,tconst,action,adventure,animation,averageRating,biography,budget_adj,comedy,crime,...,sci,sport,startYear,thriller,western,worldwide_adj,WTR_AGE,WTR_COUNT,WTR_FILM_COUNT,WTR_Nconst
0,Doctor Strange in the Multiverse of Madness,tt9419884,1,1,0,0,0,200000000,0,0,...,0,0,2022,0,0,0,0,0,0,"nm5642271, nm0228492, nm0498278"
1,The Bob's Burgers Movie,tt7466442,0,1,1,0,0,75000000,1,0,...,0,0,2022,0,0,0,0,0,0,"nm0098908,nm0202458,nm2451853"


## 14.02 Directors


These are the directors for the upcoming Disney releases.

In [326]:
df['DIR nconst']

0               nm0000600
1    nm0098908, nm0220615
Name: DIR nconst, dtype: object

### 14.02.01 Gather Director Information


In [321]:
nm=gzip.open('../Other Source Data/IMDB/name.basics.tsv.gz','rb')
df_names = pd.read_csv(nm,sep='\t', low_memory=False)
df_names.head(5)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0050419,tt0031983,tt0072308,tt0053137"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0117057,tt0071877,tt0038355,tt0037382"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0072562,tt0077975,tt0078723,tt0080455"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0083922,tt0069467,tt0060827,tt0050986"


In [327]:
cv = CountVectorizer()
cvec = cv.fit(df['DIR nconst'])
csr = cvec.transform(df['DIR nconst'])
disney_directors = pd.DataFrame(csr.todense(), columns=cvec.get_feature_names())
disney_directors = disney_directors.T
disney_directors.reset_index(inplace=True)
disney_directors.rename(columns={"index": "nconst"},inplace=True)
disney_directors = disney_directors[['nconst']]
disney_directors['director'] = 'director'


disney_directors = pd.merge(disney_directors ,                 # left df
                          df_names[['nconst', 'primaryName', 'birthYear']],                  # right df
                          how="left",                 # left join
                          left_on='nconst',            # left column
                          right_on='nconst',    # right column
                          indicator = True,           # indicates source of each row
                          #validate = "one_to_many"    # alerts us of the relationship from left to right, incase there are dups
        )

disney_directors['_merge'].value_counts()

both          3
left_only     0
right_only    0
Name: _merge, dtype: int64

These are the names of those directors

In [328]:
disney_directors.drop(columns='_merge',inplace = True)
disney_directors.head()

Unnamed: 0,nconst,director,primaryName,birthYear
0,nm0000600,director,Sam Raimi,1959
1,nm0098908,director,Loren Bouchard,1970
2,nm0220615,director,Bernard Derriman,\N


### 14.02.02 Gather Director History
Create a list of all films the Disney Directors have worked on, Disney or Otherwise

In [322]:
pr=gzip.open('../Other Source Data/IMDB/title.principals.tsv.gz','rb')
df_principals = pd.read_csv(pr,sep='\t', low_memory=False)
df_principals.head()


Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N


In [329]:
df_directors = df_principals[df_principals['category']=='director']

In [330]:
disney_directors_history = pd.merge(disney_directors ,                 # left df
                          df_directors[['tconst', 'nconst', 'category']],                  # right df
                          how="left",                 # left join
                          left_on='nconst',            # left column
                          right_on='nconst',    # right column
                          #indicator = True,           # indicates source of each row
                          #validate = "one_to_many"    # alerts us of the relationship from left to right, incase there are dups
        )

In [331]:
disney_directors_history.head()

Unnamed: 0,nconst,director,primaryName,birthYear,tconst,category
0,nm0000600,director,Sam Raimi,1959,tt0077344,director
1,nm0000600,director,Sam Raimi,1959,tt0078503,director
2,nm0000600,director,Sam Raimi,1959,tt0083907,director
3,nm0000600,director,Sam Raimi,1959,tt0088967,director
4,nm0000600,director,Sam Raimi,1959,tt0092991,director


## 14.03 Writers

### 14.03.01 Gather Writer Information


In [332]:
cvec = cv.fit(df['WTR_Nconst'])
csr = cvec.transform(df['WTR_Nconst'])
disney_writers = pd.DataFrame(csr.todense(), columns=cvec.get_feature_names())
disney_writers = disney_writers.T
disney_writers.reset_index(inplace=True)
disney_writers.rename(columns={"index": "nconst"},inplace=True)
disney_writers = disney_writers[['nconst']]
disney_writers['writer'] = 'writer'

disney_writers = pd.merge(disney_writers ,                 # left df
                          df_names[['nconst', 'primaryName', 'birthYear']],                  # right df
                          how="left",                 # left join
                          left_on='nconst',            # left column
                          right_on='nconst',    # right column
                          indicator = True,           # indicates source of each row
                          #validate = "one_to_many"    # alerts us of the relationship from left to right, incase there are dups
        )
disney_writers['_merge'].value_counts()

both          6
left_only     0
right_only    0
Name: _merge, dtype: int64

In [333]:
disney_writers.drop(columns='_merge',inplace = True)
disney_writers.head()

Unnamed: 0,nconst,writer,primaryName,birthYear
0,nm0098908,writer,Loren Bouchard,1970
1,nm0202458,writer,Jim Dauterive,1957
2,nm0228492,writer,Steve Ditko,1927
3,nm0498278,writer,Stan Lee,1922
4,nm2451853,writer,Nora Smith,\N


### 14.03.02 Gather Writer History
Create a list of all films the Disney Writers have worked on, Disney or Otherwise

In [334]:
df_writers = df_principals[df_principals['category']=='writer']

In [335]:
disney_writers_history = pd.merge(disney_writers ,                 # left df
                          df_writers[['tconst', 'nconst', 'category']],                  # right df
                          how="left",                 # left join
                          left_on='nconst',            # left column
                          right_on='nconst',    # right column
                          #indicator = True,           # indicates source of each row
                          #validate = "one_to_many"    # alerts us of the relationship from left to right, incase there are dups
        )

In [336]:
disney_writers_history.head()

Unnamed: 0,nconst,writer,primaryName,birthYear,tconst,category
0,nm0098908,writer,Loren Bouchard,1970,tt0197159,writer
1,nm0098908,writer,Loren Bouchard,1970,tt0491560,writer
2,nm0098908,writer,Loren Bouchard,1970,tt0565042,writer
3,nm0098908,writer,Loren Bouchard,1970,tt0565045,writer
4,nm0098908,writer,Loren Bouchard,1970,tt0565046,writer


## 14.04 Gather Data for Director and Writer History


This is all the same process we walked through in Part 02, 03, 04, and 05.  In this case, we are gathing the data for films done principals of the upcoming Disney releases.

For our Directors History and Writers History, we need to get the realse date for each film they worked on, as well as the budget and revenue.
The title and release date can come from data already provided by IMDB.
The budget and revenue numbers will need to be scraped from IMDB.

To make things a little easier to manage, we're going to get a list of unique tconst values from Disney Directors and Disney Writers, use the combined list to gather all of the unformation, then marry the film information to the Director and Writer.

### 14.04.01 Creat a unique list of titles


In [337]:
comb = np.concatenate((disney_directors_history['tconst'].unique(), disney_writers_history['tconst'].unique()), axis=0)

In [338]:
df_comb = pd.DataFrame(comb, columns = ['tconst'])
df_comb = df_comb['tconst'].unique()
df_comb = pd.DataFrame(df_comb, columns = ['tconst'])
df_comb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1449 entries, 0 to 1448
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tconst  1449 non-null   object
dtypes: object(1)
memory usage: 11.4+ KB


In [339]:
bs=gzip.open('../Other Source Data/IMDB/title.basics.tsv.gz','rb')
df_basics = pd.read_csv(bs,sep='\t', low_memory=False)
df_basics_movies= df_basics[df_basics['titleType']=='movie']
df_basics_movies.shape

(605156, 9)

In [340]:
df_basics_movies.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
610,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
625,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama


Combine this list of unique  titles with the basic title information from IMDB

In [341]:
directors_writers_combined_history = pd.merge(df_comb ,                 # left df
                          df_basics_movies[['tconst', 'titleType', 'primaryTitle', 'startYear', 'runtimeMinutes', 'genres']],                  # right df
                          how="left",                 # left join
                          left_on='tconst',            # left column
                          right_on='tconst',    # right column
                          #indicator = True,           # indicates source of each row
                          #validate = "one_to_many"    # alerts us of the relationship from left to right, incase there are dups
        )
directors_writers_combined_history.shape

(1449, 6)

In [342]:
directors_writers_combined_history = directors_writers_combined_history[directors_writers_combined_history['titleType'] == 'movie']
directors_writers_combined_history.shape

(67, 6)

All of the writers and directors, combined, have worked on 67 unique projects

### 14.04.02 Scrape IMDB for specific film stats

In [240]:
# Target web page:
url = "https://www.imdb.com/title/"
# Establishing the connection to the web page:
response = requests.get(url)
# response.text[:50]
soup = BeautifulSoup(response.text, 'lxml')

imdb_errors = pd.DataFrame(columns=['tconst'])
imdb_soup = pd.DataFrame(columns=['tconst','newurl','soup_text'])
imdb_scrape = pd.DataFrame(columns=['tconst','newurl','mpaarating','rlsdt','budget','wordlwide' ])


In [241]:
counter = 1
for const in directors_writers_combined_history['tconst']:
        
        found1 = imdb_scrape[imdb_scrape['tconst'].str.contains(const)]
        if found1.count().sum() == 0:
        
            t= random.uniform(.4, 1)
            time.sleep(t)
            newurl = url + const
            response = requests.get(newurl)
            soup = BeautifulSoup(response.text, 'lxml')


            print(counter)
            counter += 1
            print(newurl)

            try:
                metaul = soup.find_all('ul', {'data-testid' : 'hero-title-block__metadata'})
                mpaarating = metaul[0].find_all('li')[1].find_all('span')[0].get_text()
                print(mpaarating) 
            except:
                print('mpaarating error')
                mpaarating='error'
                imdb_errors.loc[len(imdb_errors.index)] = [const]
                # imdb__errors_soup.loc[len(imdb_soup.index)] = [const, newurl, soup]

            try:
                rlsdt = soup.find_all('li', {'data-testid' : 'title-details-releasedate'})[0].find_all('a')[1].get_text()
                print(rlsdt) 
            except:
                print('rlsdt error')
                rlsdt='error'
                imdb_errors.loc[len(imdb_errors.index)] = [const]
                # imdb__errors_soup.loc[len(imdb_soup.index)] = [const, newurl, soup]

            try:
                budget = soup.find_all('li', {'data-testid' : 'title-boxoffice-budget'})[0].find_all('li')[0].get_text()
                print(budget) 
            except:
                print('budget error')
                budget='error'
                imdb_errors.loc[len(imdb_errors.index)] = [const]
                # imdb__errors_soup.loc[len(imdb_soup.index)] = [const, newurl, soup]

            try:
                worldwide = soup.find_all('li', {'data-testid' : 'title-boxoffice-cumulativeworldwidegross'})[0].find_all('li')[0].get_text()
                print(worldwide) 
            except:
                print('worldwide error')
                worldwide='error'
                imdb_errors.loc[len(imdb_errors.index)] = [const]
                # imdb__errors_soup.loc[len(imdb_soup.index)] = [const, newurl, soup]



            imdb_scrape.loc[len(imdb_scrape.index)] = [const, newurl, mpaarating ,rlsdt ,budget ,worldwide]
            imdb_soup.loc[len(imdb_soup.index)] = [const, newurl, soup]



            if counter % 100 == 0:
                imdb_scrape.to_csv('imdb_scrape_director_writer_hist.csv')
                imdb_soup.to_csv('imdb_scrape_director_writer_hist_SOUP.csv')
                now = datetime.now()
                print(f'***************************  TO CSV : {counter} rows ***************************')
                print(now.strftime("%H:%M:%S"))
                print(f'********************************************************************************')

1
https://www.imdb.com/title/tt0083907
NC-17
April 15, 1983 (United States)
$350,000 (estimated)
$2,895,379
2
https://www.imdb.com/title/tt0088967
PG-13
April 25, 1986 (United States)
$3,000,000 (estimated)
$5,101
3
https://www.imdb.com/title/tt0092991
R
March 13, 1987 (United States)
$3,600,000 (estimated)
$5,924,421
4
https://www.imdb.com/title/tt0099365
R
August 24, 1990 (United States)
$14,000,000 (estimated)
$48,878,502
5
https://www.imdb.com/title/tt0106308
R
February 19, 1993 (United States)
$11,000,000 (estimated)
$11,505,128
6
https://www.imdb.com/title/tt0114214
R
February 10, 1995 (United States)
$32,000,000 (estimated)
$18,636,537
7
https://www.imdb.com/title/tt0120324
R
January 22, 1999 (United States)
$30,000,000 (estimated)
$16,316,273
8
https://www.imdb.com/title/tt0126916
PG-13
September 17, 1999 (United States)
$80,000,000 (estimated)
$46,112,640
9
https://www.imdb.com/title/tt0145487
PG-13
May 3, 2002 (United States)
$139,000,000 (estimated)
$825,025,036
10
https://w

### 14.04.03 Check for Errors


In [242]:
imdb_errors.head()

Unnamed: 0,tconst
0,tt17497950
1,tt17497950
2,tt17497950
3,tt17497950
4,tt5504930


In [243]:
imdb_errors.shape

(103, 1)

In [244]:
imdb_errors.drop_duplicates(inplace=True)
imdb_errors.head()

Unnamed: 0,tconst
0,tt17497950
4,tt5504930
8,tt6292018
12,tt8036976
16,tt9419884


In [245]:
imdb_errors.shape

(31, 1)

In [246]:
imdb_scrape['mpaarating'].value_counts()

PG-13        29
error        27
R             6
PG            3
NC-17         1
Not Rated     1
Name: mpaarating, dtype: int64

### 14.04.04 Save Scraped Data


In [439]:
imdb_scrape.to_csv('../Bens_Data/imdb_scrapefor_preds.csv')

In [2]:
# A manual check of the 27 errors revealded that they are either in development, were never produced, or are fan films, not feature releases.
# Additional errors were found in the budget and worldwide columns.  These were also in development, never produced, or fan films.  With one exception.  The Fantastic Four (1994) was produced but never released.
# After the manual check, we re-imported the data.  

In [441]:
imdb_scrape2 = pd.read_csv('../Bens_Data/imdb_scrapefor_preds.csv')

In [442]:
# imdb_scrape2.dropna(inplace=True)
imdb_scrape2.drop(columns=[], inplace=True)

### 14.04.05 Gather IMDB Title Ratings


In [443]:
rt=gzip.open('../Other Source Data/IMDB/title.ratings.tsv.gz','rb')
df_ratings = pd.read_csv(rt,sep='\t', low_memory=False)
df_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1868
1,tt0000002,5.9,247
2,tt0000003,6.5,1640
3,tt0000004,5.8,159
4,tt0000005,6.2,2463


In [444]:
imdb_scrape2 = pd.merge(imdb_scrape2 ,                 # left df
                          df_ratings,                  # right df
                          how="left",                 # left join
                          left_on='tconst',            # left column
                          right_on='tconst',    # right column
                          # indicator = True,           # indicates source of each row
                          #validate = "one_to_many"    # alerts us of the relationship from left to right, incase there are dups
        )

In [445]:
imdb_scrape2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67 entries, 0 to 66
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     67 non-null     int64  
 1   tconst         67 non-null     object 
 2   newurl         67 non-null     object 
 3   mpaarating     67 non-null     object 
 4   rlsdt          67 non-null     object 
 5   budget         67 non-null     object 
 6   wordlwide      67 non-null     object 
 7   averageRating  45 non-null     float64
 8   numVotes       45 non-null     float64
dtypes: float64(2), int64(1), object(6)
memory usage: 5.2+ KB


In [446]:
imdb_scrape2

Unnamed: 0.1,Unnamed: 0,tconst,newurl,mpaarating,rlsdt,budget,wordlwide,averageRating,numVotes
0,0,tt0083907,https://www.imdb.com/title/tt0083907,NC-17,"April 15, 1983 (United States)","$350,000 (estimated)","$2,895,379",7.4,201650.0
1,1,tt0088967,https://www.imdb.com/title/tt0088967,PG-13,"April 25, 1986 (United States)","$3,000,000 (estimated)","$5,101",5.6,5725.0
2,2,tt0092991,https://www.imdb.com/title/tt0092991,R,"March 13, 1987 (United States)","$3,600,000 (estimated)","$5,924,421",7.7,158839.0
3,3,tt0099365,https://www.imdb.com/title/tt0099365,R,"August 24, 1990 (United States)","$14,000,000 (estimated)","$48,878,502",6.4,62690.0
4,4,tt0106308,https://www.imdb.com/title/tt0106308,R,"February 19, 1993 (United States)","$11,000,000 (estimated)","$11,505,128",7.4,172734.0
...,...,...,...,...,...,...,...,...,...
62,62,tt5663280,https://www.imdb.com/title/tt5663280,error,"November 30, 2013 (India)",error,error,7.2,36.0
63,63,tt6083972,https://www.imdb.com/title/tt6083972,error,error,error,error,,
64,64,tt8426638,https://www.imdb.com/title/tt8426638,error,error,error,error,,
65,65,tt11026882,https://www.imdb.com/title/tt11026882,error,error,error,error,,


### 14.04.06 Merge Directors and Writers History with Scraped Data


In [447]:
directors_writers_combined_history2 = pd.merge(directors_writers_combined_history ,                 # left df
                          imdb_scrape2[['tconst', 'newurl', 'mpaarating', 'rlsdt', 'budget', 'wordlwide', 'averageRating', 'numVotes']],                  # right df
                          how="left",                 # left join
                          left_on='tconst',            # left column
                          right_on='tconst',    # right column
                          # indicator = True,           # indicates source of each row
                          #validate = "one_to_many"    # alerts us of the relationship from left to right, incase there are dups
        ) 
# directors_writers_combined_history.dropna(inplace=True)

In [448]:
directors_writers_combined_history2

Unnamed: 0,tconst,titleType,primaryTitle,startYear,runtimeMinutes,genres,newurl,mpaarating,rlsdt,budget,wordlwide,averageRating,numVotes
0,tt0083907,movie,The Evil Dead,1981,85,Horror,https://www.imdb.com/title/tt0083907,NC-17,"April 15, 1983 (United States)","$350,000 (estimated)","$2,895,379",7.4,201650.0
1,tt0088967,movie,Crimewave,1985,86,"Comedy,Crime,Horror",https://www.imdb.com/title/tt0088967,PG-13,"April 25, 1986 (United States)","$3,000,000 (estimated)","$5,101",5.6,5725.0
2,tt0092991,movie,Evil Dead II,1987,84,"Comedy,Horror",https://www.imdb.com/title/tt0092991,R,"March 13, 1987 (United States)","$3,600,000 (estimated)","$5,924,421",7.7,158839.0
3,tt0099365,movie,Darkman,1990,96,"Action,Sci-Fi,Thriller",https://www.imdb.com/title/tt0099365,R,"August 24, 1990 (United States)","$14,000,000 (estimated)","$48,878,502",6.4,62690.0
4,tt0106308,movie,Army of Darkness,1992,81,"Comedy,Horror",https://www.imdb.com/title/tt0106308,R,"February 19, 1993 (United States)","$11,000,000 (estimated)","$11,505,128",7.4,172734.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,tt5663280,movie,Chakra the Invincible,2013,\N,Action,https://www.imdb.com/title/tt5663280,error,"November 30, 2013 (India)",error,error,7.2,36.0
63,tt6083972,movie,Monkey Master,\N,\N,"Action,Fantasy,Sci-Fi",https://www.imdb.com/title/tt6083972,error,error,error,error,,
64,tt8426638,movie,The Last Resort,\N,\N,"Fantasy,Sci-Fi",https://www.imdb.com/title/tt8426638,error,error,error,error,,
65,tt11026882,movie,Untitled Star Wars/Kevin Feige Project,\N,\N,"Action,Adventure,Fantasy",https://www.imdb.com/title/tt11026882,error,error,error,error,,


In [449]:
# directors_writers_combined_history2.dropna(inplace=True)

In [450]:
directors_writers_combined_history2

Unnamed: 0,tconst,titleType,primaryTitle,startYear,runtimeMinutes,genres,newurl,mpaarating,rlsdt,budget,wordlwide,averageRating,numVotes
0,tt0083907,movie,The Evil Dead,1981,85,Horror,https://www.imdb.com/title/tt0083907,NC-17,"April 15, 1983 (United States)","$350,000 (estimated)","$2,895,379",7.4,201650.0
1,tt0088967,movie,Crimewave,1985,86,"Comedy,Crime,Horror",https://www.imdb.com/title/tt0088967,PG-13,"April 25, 1986 (United States)","$3,000,000 (estimated)","$5,101",5.6,5725.0
2,tt0092991,movie,Evil Dead II,1987,84,"Comedy,Horror",https://www.imdb.com/title/tt0092991,R,"March 13, 1987 (United States)","$3,600,000 (estimated)","$5,924,421",7.7,158839.0
3,tt0099365,movie,Darkman,1990,96,"Action,Sci-Fi,Thriller",https://www.imdb.com/title/tt0099365,R,"August 24, 1990 (United States)","$14,000,000 (estimated)","$48,878,502",6.4,62690.0
4,tt0106308,movie,Army of Darkness,1992,81,"Comedy,Horror",https://www.imdb.com/title/tt0106308,R,"February 19, 1993 (United States)","$11,000,000 (estimated)","$11,505,128",7.4,172734.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,tt5663280,movie,Chakra the Invincible,2013,\N,Action,https://www.imdb.com/title/tt5663280,error,"November 30, 2013 (India)",error,error,7.2,36.0
63,tt6083972,movie,Monkey Master,\N,\N,"Action,Fantasy,Sci-Fi",https://www.imdb.com/title/tt6083972,error,error,error,error,,
64,tt8426638,movie,The Last Resort,\N,\N,"Fantasy,Sci-Fi",https://www.imdb.com/title/tt8426638,error,error,error,error,,
65,tt11026882,movie,Untitled Star Wars/Kevin Feige Project,\N,\N,"Action,Adventure,Fantasy",https://www.imdb.com/title/tt11026882,error,error,error,error,,


## 14.05 Explore and Clean History Data


### 14.05.01 Budget 


In [451]:
# Remove 'estimated'
directors_writers_combined_history2['budget_adj'] = directors_writers_combined_history2['budget'].apply(lambda x: x[:-12] if "estimated" in x else x)
directors_writers_combined_history2['budget_adj'].head(50)

0         $350,000
1       $3,000,000
2       $3,600,000
3      $14,000,000
4      $11,000,000
5      $32,000,000
6      $30,000,000
7      $80,000,000
8     $139,000,000
9      $10,000,000
10    $200,000,000
11    $258,000,000
12     $30,000,000
13    $215,000,000
14           error
15           error
16           error
17           error
18           error
19           error
20           error
21    $230,000,000
22            $500
23    $200,000,000
24    $165,000,000
25           error
26          £7,000
27           error
28           error
29           error
30    $160,000,000
31           error
32           error
33           error
34      $1,000,000
35    $100,000,000
36    $137,000,000
37    $140,000,000
38    $130,000,000
39    $130,000,000
40    $150,000,000
41           error
42           error
43           error
44    $200,000,000
45           error
46    $200,000,000
47    $120,000,000
48           error
49           error
Name: budget_adj, dtype: object

In [452]:
# Remove Commas
directors_writers_combined_history2['budget_adj'] = directors_writers_combined_history2['budget_adj'].apply(lambda x: x.replace(",",""))
# Remove Errors
directors_writers_combined_history2['budget_adj'] = directors_writers_combined_history2['budget_adj'].apply(lambda x: x if "error" in x else x[1:])
directors_writers_combined_history2['budget_adj'].head()

0      350000
1     3000000
2     3600000
3    14000000
4    11000000
Name: budget_adj, dtype: object

In [453]:
directors_writers_combined_history2['budget_adj'] = directors_writers_combined_history2['budget_adj'].apply(lambda x: 0 if "error" in x else x)
directors_writers_combined_history2['budget_adj'].head()

0      350000
1     3000000
2     3600000
3    14000000
4    11000000
Name: budget_adj, dtype: object

In [454]:
# Convert Budget to Int
def convert(val):
    try:
        return(int(val))
    except:
        return(0)


directors_writers_combined_history2['budget_adj'] = directors_writers_combined_history2['budget_adj'].apply(lambda x: convert(x)) 

### 14.05.02 Worldwide Revenue


In [455]:
directors_writers_combined_history2['worldwide_adj'] = directors_writers_combined_history2['wordlwide']
# Remove Commas
directors_writers_combined_history2['worldwide_adj'] = directors_writers_combined_history2['worldwide_adj'].apply(lambda x: x.replace(",",""))
# Remove Errors
directors_writers_combined_history2['worldwide_adj'] = directors_writers_combined_history2['worldwide_adj'].apply(lambda x: x if "error" in x else x[1:])
directors_writers_combined_history2['worldwide_adj'] = directors_writers_combined_history2['worldwide_adj'].apply(lambda x: 0 if "error" in x else x)
# Convert Revenue to Int
def convert(val):
    try:
        return(int(val))
    except:
        return(0)


directors_writers_combined_history2['worldwide_adj'] = directors_writers_combined_history2['worldwide_adj'].apply(lambda x: convert(x)) 
directors_writers_combined_history2['worldwide_adj']

0      2895379
1         5101
2      5924421
3     48878502
4     11505128
        ...   
62           0
63           0
64           0
65           0
66           0
Name: worldwide_adj, Length: 67, dtype: int64

### 14.05.03 Calculate ROI


In [456]:
directors_writers_combined_history2['ROI'] = (directors_writers_combined_history2.worldwide_adj - directors_writers_combined_history2.budget_adj ) / directors_writers_combined_history2.budget_adj

### 14.05.04 Release Date


In [457]:
directors_writers_combined_history2['rlsdt_dt'] = directors_writers_combined_history2['rlsdt'].str.replace(r"\(.*\)","")
# Create columns for date, day, month, and day of the week
directors_writers_combined_history2['rlsdt_dt']= pd.to_datetime(directors_writers_combined_history2['rlsdt_dt'],errors='coerce')
directors_writers_combined_history2['rlsdt_mo']= pd.DatetimeIndex(directors_writers_combined_history2['rlsdt_dt']).month
directors_writers_combined_history2['rlsdt_day']= pd.DatetimeIndex(directors_writers_combined_history2['rlsdt_dt']).day
directors_writers_combined_history2['rlsdt_daynm']= pd.to_datetime(directors_writers_combined_history2['rlsdt_dt']).dt.day_name()



  directors_writers_combined_history2['rlsdt_dt'] = directors_writers_combined_history2['rlsdt'].str.replace(r"\(.*\)","")


## 14.06 Merge Directors and Writers History with Unique Lists


In [458]:
directors_demo_history2 = pd.merge(disney_directors_history ,                 # left df
                          directors_writers_combined_history2[['tconst','primaryTitle', 'runtimeMinutes', 'genres', 'mpaarating', 'rlsdt_dt', 'startYear','rlsdt_mo', 'rlsdt_day', 'rlsdt_daynm','budget_adj','worldwide_adj','ROI', 'averageRating', 'numVotes']],                  # right df
                          how="left",                 # left join
                          left_on='tconst',            # left column
                          right_on='tconst',    # right column
                          indicator = True,           # indicates source of each row
                          #validate = "one_to_many"    # alerts us of the relationship from left to right, incase there are dups
        ) 

In [459]:
writers_demo_history2 = pd.merge(disney_writers_history ,                 # left df
                          directors_writers_combined_history2[['tconst','primaryTitle', 'runtimeMinutes', 'genres', 'mpaarating', 'rlsdt_dt', 'startYear','rlsdt_mo', 'rlsdt_day', 'rlsdt_daynm','budget_adj','worldwide_adj','ROI', 'averageRating', 'numVotes']],                  # right df
                          how="left",                 # left join
                          left_on='tconst',            # left column
                          right_on='tconst',    # right column
                          indicator = True,           # indicates source of each row
                          #validate = "one_to_many"    # alerts us of the relationship from left to right, incase there are dups
        ) 

In [460]:
directors_demo_history2=directors_demo_history2[directors_demo_history2['_merge'] == 'both']

In [461]:
writers_demo_history2=writers_demo_history2[writers_demo_history2['_merge'] == 'both']

## 14.07 Fix Years, Birthdays, and Runtimes


In [462]:
directors_demo_history2['startYear'] = pd.to_numeric(directors_demo_history2['birthYear'],errors='coerce')
directors_demo_history2['birthYear'] = pd.to_numeric(directors_demo_history2['birthYear'],errors='coerce')
directors_demo_history2['runtimeMinutes'] = pd.to_numeric(directors_demo_history2['runtimeMinutes'],errors='coerce')
writers_demo_history2['startYear'] = pd.to_numeric(writers_demo_history2['birthYear'],errors='coerce')
writers_demo_history2['birthYear'] = pd.to_numeric(writers_demo_history2['birthYear'],errors='coerce')
writers_demo_history2['runtimeMinutes'] = pd.to_numeric(writers_demo_history2['runtimeMinutes'],errors='coerce')

In [463]:
directors_demo_history2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21 entries, 2 to 242
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   nconst          21 non-null     object        
 1   director        21 non-null     object        
 2   primaryName     21 non-null     object        
 3   birthYear       20 non-null     float64       
 4   tconst          21 non-null     object        
 5   category        21 non-null     object        
 6   primaryTitle    21 non-null     object        
 7   runtimeMinutes  16 non-null     float64       
 8   genres          21 non-null     object        
 9   mpaarating      21 non-null     object        
 10  rlsdt_dt        17 non-null     datetime64[ns]
 11  startYear       20 non-null     float64       
 12  rlsdt_mo        17 non-null     float64       
 13  rlsdt_day       17 non-null     float64       
 14  rlsdt_daynm     17 non-null     object        
 15  budget_

## 14.08 Calculate Age at Date of Each Project


In [464]:
directors_demo_history2['age'] = directors_demo_history2['startYear'] - directors_demo_history2['birthYear']

In [465]:
writers_demo_history2['age'] = writers_demo_history2['startYear'] - writers_demo_history2['birthYear']

In [474]:
# directors_demo_history['rlsdt_dt'] = directors_demo_history['rlsdt_dt'].str.replace(r"\(.*\)","")
directors_demo_history2['rlsdt_dt']= pd.to_datetime(directors_demo_history2['rlsdt_dt'],errors='coerce')
directors_demo_history2

Unnamed: 0,nconst,director,primaryName,birthYear,tconst,category,primaryTitle,runtimeMinutes,genres,mpaarating,...,rlsdt_mo,rlsdt_day,rlsdt_daynm,budget_adj,worldwide_adj,ROI,averageRating,numVotes,_merge,age
2,nm0000600,director,Sam Raimi,1959.0,tt0083907,director,The Evil Dead,85.0,Horror,NC-17,...,4.0,15.0,Friday,350000.0,2895379.0,7.272511,7.4,201650.0,both,0.0
3,nm0000600,director,Sam Raimi,1959.0,tt0088967,director,Crimewave,86.0,"Comedy,Crime,Horror",PG-13,...,4.0,25.0,Friday,3000000.0,5101.0,-0.9983,5.6,5725.0,both,0.0
4,nm0000600,director,Sam Raimi,1959.0,tt0092991,director,Evil Dead II,84.0,"Comedy,Horror",R,...,3.0,13.0,Friday,3600000.0,5924421.0,0.645672,7.7,158839.0,both,0.0
5,nm0000600,director,Sam Raimi,1959.0,tt0099365,director,Darkman,96.0,"Action,Sci-Fi,Thriller",R,...,8.0,24.0,Friday,14000000.0,48878502.0,2.491322,6.4,62690.0,both,0.0
6,nm0000600,director,Sam Raimi,1959.0,tt0106308,director,Army of Darkness,81.0,"Comedy,Horror",R,...,2.0,19.0,Friday,11000000.0,11505128.0,0.045921,7.4,172734.0,both,0.0
7,nm0000600,director,Sam Raimi,1959.0,tt0114214,director,The Quick and the Dead,108.0,"Action,Romance,Thriller",R,...,2.0,10.0,Friday,32000000.0,18636537.0,-0.417608,6.5,91518.0,both,0.0
8,nm0000600,director,Sam Raimi,1959.0,tt0120324,director,A Simple Plan,121.0,"Crime,Drama,Thriller",R,...,1.0,22.0,Friday,30000000.0,16316273.0,-0.456124,7.5,68520.0,both,0.0
9,nm0000600,director,Sam Raimi,1959.0,tt0126916,director,For Love of the Game,137.0,"Drama,Romance,Sport",PG-13,...,9.0,17.0,Friday,80000000.0,46112640.0,-0.423592,6.6,34453.0,both,0.0
10,nm0000600,director,Sam Raimi,1959.0,tt0145487,director,Spider-Man,121.0,"Action,Adventure,Sci-Fi",PG-13,...,5.0,3.0,Friday,139000000.0,825025036.0,4.935432,7.4,786993.0,both,0.0
11,nm0000600,director,Sam Raimi,1959.0,tt0219699,director,The Gift,112.0,"Drama,Fantasy,Horror",R,...,1.0,19.0,Friday,10000000.0,44567606.0,3.456761,6.7,67578.0,both,0.0


In [467]:
df.head()

Unnamed: 0,Title,tconst,action,adventure,animation,averageRating,biography,budget_adj,comedy,crime,...,startYear,thriller,western,worldwide_adj,WTR_AGE,WTR_COUNT,WTR_FILM_COUNT,WTR_Nconst,DIR_ROI,DIR_RNTM
0,Doctor Strange in the Multiverse of Madness,tt9419884,1,1,0,0,0,200000000,0,0,...,2022,0,0,0,0,0,0,"nm5642271, nm0228492, nm0498278",,
1,The Bob's Burgers Movie,tt7466442,0,1,1,0,0,75000000,1,0,...,2022,0,0,0,0,0,0,"nm0098908,nm0202458,nm2451853",,


In [466]:
df.columns

Index(['Title', 'tconst', 'action', 'adventure', 'animation', 'averageRating',
       'biography', 'budget_adj', 'comedy', 'crime', 'DIR Name', 'DIR nconst',
       'DIR_AGE', 'DIR_COUNT', 'DIR_FILM_COUNT', 'DIR_RTG', 'documentary',
       'drama', 'family', 'fantasy', 'fi', 'history', 'horror', 'music',
       'musical', 'mystery', 'newmpaarating_G', 'newmpaarating_Not Rated',
       'newmpaarating_PG', 'newmpaarating_PG-13', 'news', 'numVotes',
       'rlsdt_day', 'rlsdt_dayofwk', 'rlsdt_mo', 'rlsdt_season', 'ROI',
       'romance', 'runtimeMinutes', 'sci', 'sport', 'startYear', 'thriller',
       'western', 'worldwide_adj', 'WTR_AGE', 'WTR_COUNT', 'WTR_FILM_COUNT',
       'WTR_Nconst', 'DIR_ROI', 'DIR_RNTM'],
      dtype='object')

In [480]:
Disney_df = df
directors_df = directors_demo_history2

## 14.09 Summarize Track Records


In [481]:
df['tconst']

0    tt9419884
1    tt7466442
Name: tconst, dtype: object

In [482]:
directors_df['tconst']

2       tt0083907
3       tt0088967
4       tt0092991
5       tt0099365
6       tt0106308
7       tt0114214
8       tt0120324
9       tt0126916
10      tt0145487
11      tt0219699
12      tt0316654
13      tt0413300
15      tt1127180
19      tt1623205
20     tt17497950
26      tt5504930
28      tt6292018
37      tt8036976
38      tt9419884
109     tt7466442
242     tt7466442
Name: tconst, dtype: object

This is the same function we used in Part 9

In [511]:

Disney_df["DIR_COUNT"] = np.nan 
Disney_df["DIR_AGE"] = np.nan 
Disney_df["DIR_ROI"] = np.nan 
Disney_df["DIR_RTG"] = np.nan 
Disney_df["DIR_RNTM"] = np.nan 
Disney_df["DIR_FILM_COUNT"] = np.nan

# Let's try putting it all together
# INDX = 277

def get_dir_hist(tcon):
    
    
    DIR_COUNT = directors_df[directors_df['tconst'] == tcon]['nconst'].count()
    print (f'# Dirs: {DIR_COUNT}')
    rlsdt = directors_df[directors_df['tconst'] == tcon]['rlsdt_dt']
    rlsdt.reset_index(inplace=True, drop=True)
    
    if DIR_COUNT == 0:
        return 
    
    elif DIR_COUNT == 1:
    
        dnconst = directors_df[directors_df['tconst'] == tcon]['nconst']
        dnconst.reset_index(inplace=True, drop=True)
        

        DIR_AGE = directors_df[directors_df['tconst'] == tcon]['age'].mean()
        
        DIR_ROI = directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['ROI'].mean()
        DIR_RTG = directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['averageRating'].mean()
        DIR_RNTM = directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['runtimeMinutes'].mean()
        DIR_FILM_COUNT = directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['tconst'].count()
        
        
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_COUNT"] = DIR_COUNT
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_AGE"] = DIR_AGE
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_ROI"] = DIR_ROI
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_RTG"] = DIR_RTG
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_RNTM"] = DIR_RNTM
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_FILM_COUNT"] = DIR_FILM_COUNT
        return 
    
    else:
        # print('more than one')
        dirs = directors_df[directors_df['tconst'] == tcon]['nconst']
        
        for dir in dirs:
            print (dir)
        
            dnconst = directors_df[directors_df['tconst'] == tcon]['nconst']
            dnconst.reset_index(inplace=True, drop=True)

            DIR_AGE_list = []
            DIR_ROI_list = []
            DIR_RTG_list = []
            DIR_RNTM_list = []
            DIR_FILM_COUNT_list = []

            DIR_AGE = directors_df[directors_df['tconst'] == tcon]['age'].mean()
            DIR_ROI = directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['ROI'].mean()
            DIR_RTG = directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['averageRating'].mean()
            DIR_RNTM = directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['runtimeMinutes'].mean()
            DIR_FILM_COUNT = directors_df[(directors_df['nconst'] == dnconst[0]) & (directors_df['rlsdt_dt'] < rlsdt[0])]['tconst'].count()

            DIR_AGE_list.append(DIR_AGE)
            DIR_ROI_list.append(DIR_ROI)
            DIR_RTG_list.append(DIR_RTG)
            DIR_RNTM_list.append(DIR_RNTM)
            DIR_FILM_COUNT_list.append(DIR_FILM_COUNT)
            
            
        
        
            
            
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_COUNT"] = DIR_COUNT
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_AGE"] = mean(DIR_AGE_list)
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_ROI"] = mean(DIR_ROI_list)
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_RTG"] = mean(DIR_RTG_list)
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_RNTM"] = mean(DIR_RNTM_list)
        Disney_df.loc[(Disney_df['tconst'] == tcon), "DIR_FILM_COUNT"] = mean(DIR_FILM_COUNT_list)

In [512]:
for tconst in df['tconst']:
    # print(f'Film: {tconst}')
    get_dir_hist(tconst)
    
df.info()

# Dirs: 1
# Dirs: 2
nm0098908
nm0220615
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 51 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Title                    2 non-null      object 
 1   tconst                   2 non-null      object 
 2   action                   2 non-null      int64  
 3   adventure                2 non-null      int64  
 4   animation                2 non-null      int64  
 5   averageRating            2 non-null      int64  
 6   biography                2 non-null      int64  
 7   budget_adj               2 non-null      int64  
 8   comedy                   2 non-null      int64  
 9   crime                    2 non-null      int64  
 10  DIR Name                 2 non-null      object 
 11  DIR nconst               2 non-null      object 
 12  DIR_AGE                  2 non-null      float64
 13  DIR_COUNT                2 non-null      flo

In [513]:
df.fillna(0, inplace = True)

In [514]:
df.head()

Unnamed: 0,Title,tconst,action,adventure,animation,averageRating,biography,budget_adj,comedy,crime,...,startYear,thriller,western,worldwide_adj,WTR_AGE,WTR_COUNT,WTR_FILM_COUNT,WTR_Nconst,DIR_ROI,DIR_RNTM
0,Doctor Strange in the Multiverse of Madness,tt9419884,1,1,0,0,0,200000000,0,0,...,2022,0,0,0,0,0,0,"nm5642271, nm0228492, nm0498278",1.806312,109.0
1,The Bob's Burgers Movie,tt7466442,0,1,1,0,0,75000000,1,0,...,2022,0,0,0,0,0,0,"nm0098908,nm0202458,nm2451853",0.0,0.0


In [515]:
dis_df = df

## 14.10 Predict IMDB Rating

### 14.10.01 Import Training Data


In [516]:
df = pd.read_csv('../Bens_Data/Disney_Films_For_Visual.csv')
df.drop(columns=['Unnamed: 0','index'], inplace=True)

### 14.10.02 Features List


In [517]:
features = [ 'startYear',
       'runtimeMinutes', 
        #'averageRating', 
        #'numVotes', 
       'DIR_COUNT', 'DIR_AGE',
       'DIR_RTG', 'DIR_FILM_COUNT', 'WTR_COUNT', 'WTR_AGE', 
       'WTR_FILM_COUNT','action', 'adventure', 'animation', 
       'biography', 'comedy', 'crime','documentary', 'drama', 
       'family', 'fantasy', 'fi', 'history', 'horror','music', 
       'musical', 'mystery', 'news', 'romance', 'sci', 'sport',
       'thriller', 'western', 'newmpaarating_G', 'newmpaarating_Not Rated',
       'newmpaarating_PG', 'newmpaarating_PG-13', 'budget_adj',
       #'worldwide_adj', 
       #'ROI', 
       'rlsdt_mo', 'rlsdt_day',
       'rlsdt_dayofwk', 'rlsdt_season']

### 14.10.03 Setting Up X and y


In [518]:
X = df[features]
y = df['averageRating']

In [521]:
Xd = dis_df[features]
yd = dis_df['averageRating']

### 14.10.04 Build and Train the Model


Lasso, with a lambda of 0.026, performed the best according to our previous testing.

In [522]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42) 
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform (X_test)

lss = Lasso(alpha=0.026)

lss.fit(X_train_sc, y_train)
y_pred_train = lss.predict(X_train_sc)
y_pred_test = lss.predict(X_test_sc)
trainscore = lss.score(X_train_sc, y_train)
testscore = lss.score(X_test_sc, y_test)
crossval = cross_val_score(lss, X_train_sc, y_train).mean()
rmsetr= np.sqrt(metrics.mean_squared_error(y_train, y_pred_train))
rmsete = np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))

print (f'Train Score = {trainscore}')
print (f'Test Score = {testscore}')
print (f'Cross Val Score = {crossval}')
print (f'RMSE Train = {rmsetr}')
print (f'RMSE Test = {rmsete}')

Train Score = 0.5275369340400811
Test Score = 0.25770715219307805
Cross Val Score = 0.3962512599415794
RMSE Train = 0.6615462771227589
RMSE Test = 0.7761788660454407


In [523]:
Xd_sc = sc.fit_transform(Xd)

### 14.10.05 IMDB Rating Predictions


In [528]:
IMDB_preds = lss.predict(Xd_sc)

In [529]:
IMDB_preds

array([6.63685696, 6.37057338])

In [527]:
dis_df

Unnamed: 0,Title,tconst,action,adventure,animation,averageRating,biography,budget_adj,comedy,crime,...,startYear,thriller,western,worldwide_adj,WTR_AGE,WTR_COUNT,WTR_FILM_COUNT,WTR_Nconst,DIR_ROI,DIR_RNTM
0,Doctor Strange in the Multiverse of Madness,tt9419884,1,1,0,0,0,200000000,0,0,...,2022,0,0,0,0,0,0,"nm5642271, nm0228492, nm0498278",1.806312,109.0
1,The Bob's Burgers Movie,tt7466442,0,1,1,0,0,75000000,1,0,...,2022,0,0,0,0,0,0,"nm0098908,nm0202458,nm2451853",0.0,0.0


## 14.11 Predict Wordlwide Revenue

### 14.11.01 Import Training Data


In [534]:
df = pd.read_csv('../Bens_Data/Disney_Films_For_Visual.csv')
df.drop(columns=['Unnamed: 0','index'], inplace=True)

### 14.11.02 Features List


In [535]:
features = [ 'startYear',
       'runtimeMinutes', 
        #'averageRating', 
        #'numVotes', 
       'DIR_COUNT', 'DIR_AGE',
       'DIR_RTG', 'DIR_FILM_COUNT', 'WTR_COUNT', 'WTR_AGE', 
       'WTR_FILM_COUNT','action', 'adventure', 'animation', 
       'biography', 'comedy', 'crime','documentary', 'drama', 
       'family', 'fantasy', 'fi', 'history', 'horror','music', 
       'musical', 'mystery', 'news', 'romance', 'sci', 'sport',
       'thriller', 'western', 'newmpaarating_G', 'newmpaarating_Not Rated',
       'newmpaarating_PG', 'newmpaarating_PG-13', 'budget_adj',
       #'worldwide_adj', 
       #'ROI', 
       'rlsdt_mo', 'rlsdt_day',
       'rlsdt_dayofwk', 'rlsdt_season']

### 14.11.03 Setting Up X and y


In [536]:
X = df[features]
y = df['worldwide_adj']

In [537]:
Xd = dis_df[features]
yd = dis_df['worldwide_adj']

### 14.11.04 Build and Train the Model


Lasso, with no specific tunig, performed the best according to our previsou testing.

In [538]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42) 
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform (X_test)

lss2 = Lasso()

lss2.fit(X_train_sc, y_train)
y_pred_train = lss2.predict(X_train_sc)
y_pred_test = lss2.predict(X_test_sc)
trainscore = lss2.score(X_train_sc, y_train)
testscore = lss2.score(X_test_sc, y_test)
crossval = cross_val_score(lss2, X_train_sc, y_train).mean()
rmsetr= np.sqrt(metrics.mean_squared_error(y_train, y_pred_train))
rmsete = np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))

print (f'Train Score = {trainscore}')
print (f'Test Score = {testscore}')
print (f'Cross Val Score = {crossval}')
print (f'RMSE Train = {rmsetr}')
print (f'RMSE Test = {rmsete}')

Train Score = 0.5584733293173219
Test Score = 0.5643336852831031
Cross Val Score = -0.3825086583951224
RMSE Train = 214073304.352461
RMSE Test = 213768886.23159164


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [539]:
Xd_sc = sc.fit_transform(Xd)

### 14.11.05 Worldwide Revenue Predictions


In [540]:
IMDB_preds = lss2.predict(Xd_sc)

In [541]:
IMDB_preds

array([2.94513986e+08, 4.59637838e+07])

In [543]:
# $294,513,986 & $45,963,783