# Box Office Revenue Prediction

## Imports

In [172]:
import pandas as pd
import numpy as np


import re # regex
import ast

from datetime import datetime
from datetime import date

from sklearn import metrics
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import learning_curve, RandomizedSearchCV, train_test_split

## Data Sources
1. IMDB:
    * No. of ratings (star)
    * Avg. star rating (out of 10)
    * No. of user ratings (text reviews)
    * No. of critic ratings (meta critic)
    * Countries (truncate to just first country) - √
    * Language (truncate to just first language) - √
    * No. of languages (to be created from previous column) - √
    * Production house (truncate to just first production house) - √
    * Duration (convert to minutes/numeric from string) - √
    * Genre (need to explode to one-hot columns for 24 genres) - √
    * MPAA rating 
        * TV- ratings - reclassify - √ 
        * Not Rated --> PG-13/Unrated --> PG - many from non-English fall under these categories - √  
       
    * **Gross:** 'Y' variable (need to remove movies with alphabet-only gross, convert INR (other currencies?) to USD and 3 INR 3-digit movies):
        * **`TODO`** Adjust for inflation, based on year of release
    * **`TODO`** Release date + release region - separate out into 2 columns:
        * convert first to datetime and use as a feature (holiday - US - release/not) 

    
&nbsp; 
2. Popularity Scores:
    * Average popularity score per movie

&nbsp; 

3. Sentiment Scores (**`TODO`**):
    * AFINN score - based on IMDB user reviews pre-release
    * AFINN score - based on IMDB user reviews post-release
    

&nbsp;    
4. YouTube (**`TODO`**):
    * View count
    * Like count
    * Dislike count
    * Comment count

## Reading Excel File(s)
#### !!!!!! IMPORTANT- manually removed the unnamed serial number column from the excel sheet I'm reading below!!!!! 
#### Check that it is removed in your excel sheet also

In [48]:
# read xlsx with IMDB_ID as the indexing column (2nd column from the left, or 1st column) 
df = pd.read_excel("Final_data_sheets_updated_popularity_scores.xlsx", index_col = 0)

In [49]:
# inspect
df.head(10)

Unnamed: 0_level_0,Name,num_ratings,avg_rating,main_cast_list,main_cast_links,dir_list,creator_list,genre,motion_picture_rating,release_date,...,meta_critic_score,num_user_ratings,num_critic_ratings,story_line,others,Gross,Country,Language,Production_House,Average_popularity_score_per_movie
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
433362,Daybreakers,116014,6.5,Ethan Hawke|Willem Dafoe|Sam Neill,/name/nm0000160/|/name/nm0000353/|/name/nm0000...,Michael Spierig|Peter Spierig,Michael Spierig|Peter Spierig,"[Action, Fantasy, Horror, Sci-Fi, Thriller]",R,14 January 2010 (Singapore),...,,301.0,291.0,"In a world 10 years into the future, vampi...",{'Crazy Credits': '\nCrazy Credits\n In lin...,51416464,Australia|USA,English,"Lionsgate, Australian Film Finance Corporatio...",8.318333
1216492,Leap Year,85208,6.5,Amy Adams|Matthew Goode|Adam Scott,/name/nm0010736/|/name/nm0328828/|/name/nm0004...,Anand Tucker,Deborah Kaplan|Harry Elfont,"[Comedy, Romance]",PG,12 February 2010 (Singapore),...,33.0,219.0,136.0,A woman who has an elaborate scheme to pro...,{'Filming Locations:': '\nFilming Locations:\n...,25918920,USA|Ireland,English|Ukrainian,"Universal Pictures, Spyglass Entertainment, B...",6.195667
1037705,The Book of Eli,268523,6.9,Denzel Washington|Mila Kunis|Ray Stevenson,/name/nm0000243/|/name/nm0005109/|/name/nm0829...,Albert Hughes|Allen Hughes,Gary Whitta,"[Action, Adventure, Drama, Thriller]",R,18 March 2010 (Singapore),...,,588.0,331.0,"In a violent post-apocalyptic society, a d...",{'Filming Locations:': '\nFilming Locations:\n...,157107755,USA,English,"Alcon Entertainment, Silver Pictures",8.769667
1038686,Legion,91364,5.2,Paul Bettany|Dennis Quaid|Charles S. Dutton,/name/nm0079273/|/name/nm0000598/|/name/nm0001...,Scott Stewart,Peter Schink|Scott Stewart,"[Action, Fantasy, Horror]",R,21 January 2010 (Singapore),...,32.0,328.0,227.0,An out-of-the-way diner becomes the unlike...,{'Crazy Credits': '\nCrazy Credits\n At the...,67918658,USA,English,"Screen Gems, Bold Films",4.45
1244659,Extraordinary Measures,15595,6.5,Brendan Fraser|Keri Russell|Harrison Ford,/name/nm0000409/|/name/nm0005392/|/name/nm0000...,Tom Vaughan,Robert Nelson Jacobs|Geeta Anand,Drama,PG,22 January 2010 (USA),...,,67.0,129.0,A Portland couple have two children with P...,{'Filming Locations:': '\nFilming Locations:\n...,11854694,USA,English,"CBS Films, Double Feature Films",6.396333
1185416,When in Rome,57030,5.6,Kristen Bell|Josh Duhamel|Anjelica Huston,/name/nm0068338/|/name/nm0241049/|/name/nm0001...,Mark Steven Johnson,David Diamond|David Weissman,"[Comedy, Romance]",PG-13,25 March 2010 (Singapore),...,25.0,88.0,148.0,Beth is a hard working career woman whose ...,{'Crazy Credits': '\nCrazy Credits\n The ca...,32669555,USA,English|Italian|Ukrainian,"Touchstone Pictures, Krasnoff Foster Producti...",7.131
1226273,Edge of Darkness,84196,6.6,Mel Gibson|Ray Winstone|Danny Huston,/name/nm0000154/|/name/nm0935653/|/name/nm0396...,Martin Campbell,William Monahan|Andrew Bovell|Troy Kennedy-Martin,"[Crime, Drama, Mystery, Thriller]",R,28 January 2010 (Singapore),...,,271.0,262.0,Thomas Craven is a detective who has spent...,{'Filming Locations:': '\nFilming Locations:\n...,82812456,UK|USA,English,"Warner Bros., GK Films, BBC Films",6.716333
817230,Valentine's Day,106464,5.7,Julia Roberts|Jamie Foxx|Anne Hathaway,/name/nm0000210/|/name/nm0004937/|/name/nm0004...,Garry Marshall,Katherine Fugate|Katherine Fugate|Abby Kohn|Ma...,"[Comedy, Romance]",PG-13,11 February 2010 (Singapore),...,34.0,226.0,189.0,More than a dozen Angelenos navigate Valen...,{'Crazy Credits': '\nCrazy Credits\n Gag re...,216485654,USA,English|Spanish|Russian|Italian,"New Line Cinema, Rice Films, Karz Entertainme...",11.232
780653,The Wolfman,97229,5.8,Benicio Del Toro|Anthony Hopkins|Emily Blunt,/name/nm0001125/|/name/nm0000164/|/name/nm1289...,Joe Johnston,Andrew Kevin Walker|David Self|Curt Siodmak,"[Drama, Fantasy, Horror, Thriller]",R,18 February 2010 (Singapore),...,,449.0,368.0,Lawrence Talbot's childhood ended the nigh...,{'Filming Locations:': '\nFilming Locations:\n...,139789765,USA,English|Romany|Romanian|Ukrainian,"Universal Pictures, Relativity Media, Bluegra...",15.329
1130884,Shutter Island,991065,8.1,Leonardo DiCaprio|Emily Mortimer|Mark Ruffalo,/name/nm0000138/|/name/nm0607865/|/name/nm0749...,Martin Scorsese,Laeta Kalogridis|Dennis Lehane,"[Mystery, Thriller]",R,15 April 2010 (Singapore),...,,1.0,509.0,"It's 1954, and up-and-coming U.S. marshal ...",{'Filming Locations:': '\nFilming Locations:\n...,294804195,USA,English|German,"Paramount Pictures, Phoenix Pictures, Sikelia...",10.646667


In [11]:
df.shape

(4351, 21)

In [50]:
df = df.drop(['main_cast_list', 'main_cast_links','dir_list','creator_list', 'meta_critic_score','story_line', 'others'], axis = 1)

In [None]:
df.head(3)

In [None]:
df.shape

## Data Transformations

### 1. Check for NAs/NANs

In [51]:
df.describe()

Unnamed: 0,num_ratings,avg_rating,num_user_ratings,num_critic_ratings,Average_popularity_score_per_movie
count,4351.0,4351.0,4267.0,4267.0,4351.0
mean,49438.71,6.466766,117.522615,132.775486,3.847854
std,110896.0,0.960129,173.167208,140.513302,3.476414
min,9.0,1.5,1.0,1.0,0.0
25%,1779.5,5.9,13.0,29.0,1.077167
50%,8299.0,6.6,43.0,77.0,2.855333
75%,42562.0,7.1,145.0,192.0,5.797
max,1812301.0,9.3,998.0,974.0,21.596333


In [None]:
df.shape

#### Columns with NAs
MPAA rating, num_user_ratings, num_critic_ratings, language, production_house

In [52]:
df.isnull().any()

Name                                  False
num_ratings                           False
avg_rating                            False
genre                                 False
motion_picture_rating                  True
release_date                          False
duration                               True
num_user_ratings                       True
num_critic_ratings                     True
Gross                                 False
Country                               False
Language                               True
Production_House                       True
Average_popularity_score_per_movie    False
dtype: bool

#### How many NAs total?
Could be multiple NAs for a given row.

In [53]:
df.isnull().sum().sum()

648

#### How many NAs per column?
Can it be manually fixed by finding the true value? Say, for duration of a couple of movies.

In [None]:
df.isnull().sum(axis = 0)

In [None]:
df[df['duration'].isnull()]

This anyway has other columns as NaNs (info isn't available on IMDB anymore?),  so might as well drop the row.

#### If dropping all NAs?

In [58]:
# Lose 442 (45 rows still have NA in num_user_ratings, num_critic_ratings)
df = df.dropna(subset=['Language', 'Production_House','motion_picture_rating'])
df.shape

(3909, 14)

#### num_user_ratings and num_critic_ratings still have NAs - we will impute these later (pipelining) using median

In [57]:
df.isnull().any()

Name                                  False
num_ratings                           False
avg_rating                            False
genre                                 False
motion_picture_rating                 False
release_date                          False
duration                              False
num_user_ratings                       True
num_critic_ratings                     True
Gross                                 False
Country                               False
Language                              False
Production_House                      False
Average_popularity_score_per_movie    False
dtype: bool

In [62]:
# number of IMDB user reviews written
df['num_user_ratings'].isnull().sum()

42

In [63]:
# metacritic rating
df['num_critic_ratings'].isnull().sum()

42

### 2. Add number of languages

In [None]:
# inspect unique values
df.Language.unique()

In [6]:
languages_split = df.Language.str.split(pat="|")
df['num_languages'] = languages_split.str.len()

In [None]:
# inspect
df.head(3)

In [None]:
# check NaN
df['num_languages'].isnull().values.any()

### 3. Truncate 'Languages' and 'Country'

In [None]:
# inspect unique values
df.Country.unique()

In [7]:

languages_split = df.Language.str.split(pat="|").apply(lambda x: x[0])
df['Language'] = languages_split

In [8]:
countries_split = df.Country.str.split(pat="|").apply(lambda x: x[0])
df['Country'] = countries_split

In [None]:
df.head(3)

In [None]:
# check both columns for NaN
print(df['Language'].isnull().values.any())
print(df['Country'].isnull().values.any())

In [None]:
# check how many unique values of each
print(len(df.Country.unique()))
print(len(df.Language.unique()))

### 4. Truncate 'Production_House'

In [None]:
# inspect unique values - 3697 of them, can't see all
df.Production_House.unique()

In [9]:
production_house_split = df.Production_House.str.split(pat=", ").apply(lambda x: x[0])
df['Production_House'] = production_house_split

In [None]:
# still could too unqiue of a column - 2277 unique values!
len(df.Production_House.unique())

### 5. Check MPAA column and regroup
https://simple.m.wikipedia.org/wiki/Motion_Picture_Association_of_America_film_rating_system

#### **Reclassification:**
* TV-Y, TV-7, TV-G --> G
* TV-PG --> PG
* TV-14 --> PG-13
* TV-MA --> R
* Not Rated (923!) --> 
* Unrated (143) -->

In [None]:
# check NaN
df['motion_picture_rating'].isnull().values.any()

In [10]:
# what are the unique ratings, and how many in each category?
df.groupby('motion_picture_rating').size()

motion_picture_rating
G              39
NC-17           2
Not Rated     923
PG            358
PG-13         909
R            1415
TV-14          28
TV-G            4
TV-MA          33
TV-PG          11
TV-Y            1
TV-Y7           1
Unrated       143
dtype: int64

In [11]:
df.loc[df['motion_picture_rating'].isin(["TV-G", "TV-Y7", "TV-Y"]), 'motion_picture_rating'] = "G"
df.loc[df['motion_picture_rating'].isin(["TV-PG"]), 'motion_picture_rating'] = "PG"
df.loc[df['motion_picture_rating'].isin(["TV-14"]), 'motion_picture_rating'] = "PG-13"
df.loc[df['motion_picture_rating'].isin(["TV-MA"]), 'motion_picture_rating'] = "R"


df.loc[df['motion_picture_rating'].isin(["Unrated"]), 'motion_picture_rating'] = "PG" # mostly documentaries
df.loc[df['motion_picture_rating'].isin(["Not Rated"]), 'motion_picture_rating'] = "PG-13"

df.groupby('motion_picture_rating').size()

motion_picture_rating
G          45
NC-17       2
PG        512
PG-13    1860
R        1448
dtype: int64

In [25]:
df[df['motion_picture_rating'] == "Not Rated"]

Unnamed: 0_level_0,Name,num_ratings,avg_rating,genre,motion_picture_rating,release_date,duration,num_user_ratings,num_critic_ratings,Gross,Country,Language,Production_House,Average_popularity_score_per_movie,num_languages
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1


In [23]:
df[df['motion_picture_rating'] == "Unrated"]

Unnamed: 0_level_0,Name,num_ratings,avg_rating,genre,motion_picture_rating,release_date,duration,num_user_ratings,num_critic_ratings,Gross,Country,Language,Production_House,Average_popularity_score_per_movie,num_languages
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1407055,Violet Tendencies,540,6.2,"[Comedy, Drama, Romance]",Unrated,24 April 2010 (USA),1h 39min,3.0,10.0,5264,USA,English,Embrem Entertainment,1.921,1
1287878,Shi,8510,7.8,Drama,Unrated,13 May 2010 (South Korea),2h 19min,34.0,149.0,355044,South Korea,Korean,UniKorea Pictures,0.726667,1
1470024,"Marti, dupã Crãciun",3942,7.0,"[Drama, Romance]",Unrated,17 September 2010 (Romania),1h 39min,15.0,77.0,25866,Romania,Romanian,BV McCann-Erickson,1.435333,1
1623008,The Arbor,1547,7.2,"[Documentary, Biography, Drama]",Unrated,25 April 2010 (USA),1h 34min,13.0,58.0,21268,UK,English,Artangel Media,0.957333,1
1562450,Stonewall Uprising,438,7.2,"[Documentary, History]",Unrated,16 June 2010 (USA),1h 20min,1.0,20.0,127940,USA,English,PBS American Experience,0.6,1
1282153,O Estranho Caso de Angélica,1530,6.3,"[Drama, Fantasy]",Unrated,16 March 2011 (France),1h 37min,4.0,66.0,47743,Portugal,Portuguese,Les Films de l'Après-Midi,0.861,1
1522857,The Oath,738,7.4,Documentary,Unrated,January 2010 (USA),1h 30min,6.0,31.0,42117,USA,Arabic,Praxis Films,0.2,1
1517177,3,4709,6.8,"[Comedy, Drama, Romance]",Unrated,23 December 2010 (Germany),1h 59min,19.0,76.0,59774,Germany,German,X-Filme Creative Pool,1.581,2
1590024,"Eu cand vreau sa fluier, fluier",5719,7.2,Drama,Unrated,26 March 2010 (Romania),1h 34min,21.0,60.0,12135,Romania,Romanian,Strada Film,1.188667,1
1721683,La belle endormie,640,5.7,Drama,Unrated,TV Movie 3 September 2010,1h 22min,4.0,41.0,28791,France,French,Arte France,0.6,1


In [26]:
# check NaN after transforming
df['motion_picture_rating'].isnull().values.any()

False

### 6. Convert 'duration' column to time in minutes (integer)

In [12]:
def check_time(time):
    if len(time) == 1:
        if "h" in time[0]:
            new_time = 60*int(re.sub("\D", "", time[0]))
        else:
            new_time = int(re.sub("\D", "", time[0]))
    else:
        new_time = 60*int(re.sub("\D", "", time[0])) + int(re.sub("\D", "", time[1]))
    return new_time

test1 = df.duration.str.split(" ")
test2 = test1.apply(lambda x: check_time(x))

df['duration'] = test2

In [28]:
# check unique values
print(df.duration.unique())

[ 98 100 118 106  91 117 125 103 138 107 128  94 101 108 165 115  92 132
 104 113  93 110 111  88  97  87 124  90  95 105 140 112 116 146 109  81
  84 148 102 114  80  99 163  79 135  82  96 120 133  44 123  86 122 129
 130 334  85 121 134 142 119  89  83 139 141 131 136 154  63  75 180 272
 157 155 143 127 150 137 126 164 158  76 145  70  78 144 147 172 169 151
  40 152  77 149 186  59 153 161 160 220  71 167  74 188 201 183 226 166
  68  69 185 168 159 173 162  60 156 270 187  72  39 176  73 174 171 190
 170  46  66]


In [13]:
## check for NaNs after transforming
df['duration'].isnull().values.any()

False

### 7. Expanding 'genre' to one-hot columns

In [None]:
type(df['genre'].iloc[0]) # Need to convert string representation of list to an actual Python list to accumulate as et later

In [None]:
# check unique genre lists
unique_genre_lists = df['genre'].unique()
print(unique_genre_lists) 

In [30]:
def convert_to_list(x):
    if "[" in x:
        x = re.sub("[\[\]]", "", x)
        x = x.split(", ")
    else:
        x = x.split(" ") # split by non-existent delimiter
    return x

In [31]:
# get all unique genres available
genre_lists = df.genre.apply(lambda x: convert_to_list(x))
df.genre = genre_lists

# temp = genre_lists.tolist()
# flattened =  [y for x in temp for y in x]
# print(set(flattened))

In [None]:
type(df['genre'].iloc[0])

In [32]:
# add 23 new one-hot columns
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('genre')),
                          columns=mlb.classes_,
                          index=df.index))

In [33]:
df.head(3)

Unnamed: 0_level_0,Name,num_ratings,avg_rating,motion_picture_rating,release_date,duration,num_user_ratings,num_critic_ratings,Gross,Country,...,Musical,Mystery,News,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
433362,Daybreakers,116014,6.5,R,14 January 2010 (Singapore),98,301.0,291.0,51416464,Australia,...,0,0,0,0,1,0,0,1,0,0
1216492,Leap Year,85208,6.5,PG,12 February 2010 (Singapore),100,219.0,136.0,25918920,USA,...,0,0,0,1,0,0,0,0,0,0
1037705,The Book of Eli,268523,6.9,R,18 March 2010 (Singapore),118,588.0,331.0,157107755,USA,...,0,0,0,0,0,0,0,1,0,0


In [34]:
# check for NaNs after transforming in all genre columns
df.shape

(3867, 37)

### 8. Release date to datetime and release location as a separate column

### 9. Cleaning up 'Gross'
Standardize currency, hard-code 3-digit movies, convert string to int/float

In [None]:
type(df['Gross'].iloc[0])

In [21]:
pd.set_option('display.max_row', 4000)
#df.Gross

In [None]:
# look for values which have alphabetic characters in them => not in USD and  has to be converted
df[df.Gross.str.contains(pat = "[a-zA-Z]")]

In [14]:
# remove extra whitespaces, commas:
df['Gross'] = df.Gross.apply(lambda x: re.sub("[,\s]", "", x))
df[df.Gross.str.contains(pat = "[a-zA-Z]")]

Unnamed: 0_level_0,Name,num_ratings,avg_rating,genre,motion_picture_rating,release_date,duration,num_user_ratings,num_critic_ratings,Gross,Country,Language,Production_House,Average_popularity_score_per_movie,num_languages
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2082197,Barfi!,68152,8.1,"[Comedy, Drama, Romance]",PG-13,14 September 2012 (Singapore),151,231.0,62.0,INR1030300000,India,Hindi,UTV Motion Pictures,2.915667,1
2377938,Special Chabbis,44672,8.0,"[Crime, Drama, Thriller]",PG-13,8 February 2013 (India),144,109.0,30.0,INR810000000,India,Hindi,Friday Filmworks,3.194667,1
6522546,Spyder,7799,6.8,"[Action, Thriller]",PG-13,27 September 2017 (India),155,45.0,13.0,INR124,India,Telugu,Lyca Productions,1.302667,2
1512888,Ayirathil Oruvan,2724,7.7,"[Action, Adventure, Fantasy]",PG-13,14 January 2010 (India),183,37.0,3.0,INR850000000,India,Tamil,Dream Valley Corporation,0.993333,1
6148156,Vikram Vedha,17906,8.7,"[Action, Crime, Thriller]",PG-13,21 July 2017 (India),147,89.0,16.0,INR400000000,India,English,Think Music,2.305,3
2309600,Singam 2,4966,6.3,"[Action, Thriller]",PG-13,5 July 2013 (India),166,17.0,4.0,INR1286960000,India,Tamil,Cinekorn Entertainment,0.945667,3
5867800,Aruvi,8034,8.8,Drama,PG-13,15 December 2017 (India),130,76.0,18.0,INR350000000,India,Tamil,Dream Warrior Pictures,0.0,1
5451690,American Satan,2474,5.5,"[Drama, Music, Thriller]",R,13 October 2017 (USA),111,54.0,10.0,VND74,USA,English,Sumerian Films,1.247667,1
5956100,Tiger Zinda Hai,18396,6.1,"[Action, Adventure, Thriller]",PG-13,22 December 2017 (India),161,157.0,32.0,INR5650100000,India,Hindi,Atlanta Production,3.156,2
5460068,Emo the Musical,795,6.2,"[Comedy, Music, Musical]",PG-13,4 May 2017 (Australia),94,12.0,34.0,AUD56400000,Australia,English,Matthewswood Productions,0.527667,1


In [15]:
# add 7 trailing zeros for these 4:
# 6980546                 INR 206 Bharat Ane Nenu
# 3142764                 INR 130 Race Gurram
# 6734984                INR 157 Duvvada Jagannadham
# 6522546                INR 124 Spyder

gross_truncated = ["INR206", "INR130","INR157","INR124"]

df['Gross'] = df.Gross.apply(lambda x: x + "0000000" if x in gross_truncated else x)

In [16]:
# for American Satan - VND 74 cumulative worldwide gross - change to USD $226,232
# https://www.the-numbers.com/movie/American-Satan#tab=international
df.at[5451690, 'Gross'] = "226232"

In [45]:
# Raazi - incorrectly entered as 2070 crores gross on IMDB
df.at[7098658, 'Gross'] = "2070000000"

In [46]:
from currency_converter import CurrencyConverter
c = CurrencyConverter()

def convert_currency(x):
    if re.search('[a-zA-Z£]', x) == None:
        return float(x)
    
    split_gross = re.split('(\d+)',x)
    
    # GBP
    if(split_gross[0] == "£"):
        return (c.convert(float(split_gross[1]), 'GBP','USD'))
    
    # NPR isn't supported CurrencyConverter - hard code
    if(split_gross[0]  == "NPR"):
        return (float(split_gross[1])*0.0090)
    
    return (c.convert(float(split_gross[1]), split_gross[0],'USD'))


df['Gross'] = df['Gross'].apply(lambda x : convert_currency(x))

TypeError: expected string or bytes-like object

In [None]:
df.head(5)

In [None]:
# check NaN
df['Gross'].isnull().values.any()

In [19]:
df['Gross'].describe()

count    3.867000e+03
mean     6.237718e+07
std      1.747515e+08
min      0.000000e+00
25%      8.006500e+04
50%      1.428647e+06
75%      3.477335e+07
max      2.940555e+09
Name: Gross, dtype: float64

In [None]:
#### Adjust gross for inflation based on the year - need inputs from Karthik for this:
def adjust_for_inflation(gross, release_date):
    if release_date.year == 2010:
        return(gross*1.152)
    elif release_date.year == 2011: 
        return(gross*1.124)
    elif release_date.year == 2012:
        return(gross*1.101)
    elif release_date.year == 2013:
        return(gross*1.087)
    elif release_date.year == 2014:
        return(gross*1.086)
    elif release_date.year == 2015:
        return(gross*1.068)
    elif release_date.year == 2016:
        return(gross*1.053)
    elif release_date.year == 2017:
        return(gross*1.032)
    else # 2018 => just return x itself
        return gross
df['gross'] = df.apply(lambda x: adjust_for_inflation(x['gross'], x['release_date']), axis=1)        

### If dividing into equal revenue ranges (instead of quintiles)

In [24]:
# def find_revenue_range(x):
#     if  0 <= x <= 588111000:
#         return 0
#     elif 588111001 <= x <= 1176222000:
#         return 1
#     elif 1176222000 <= x <= 1764333003:
#         return 2
#     elif 1764333003 <= x <= 2352444004:
#         return 3
#     else:
#         return 4
    
        
# df['gross_equal_range'] = df['Gross'].apply(lambda x: find_revenue_range(x))
 

In [29]:
# df[df['gross_equal_range'] == 3]

Unnamed: 0_level_0,Name,num_ratings,avg_rating,genre,motion_picture_rating,release_date,duration,num_user_ratings,num_critic_ratings,Gross,Country,Language,Production_House,Average_popularity_score_per_movie,num_languages,gross_equal_range
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2488496,Star Wars: Episode VII - The Force Awakens,775868,8.0,"[Action, Adventure, Fantasy, Sci-Fi]",PG-13,17 December 2015 (Singapore),136,4.0,869.0,2068224000.0,USA,English,Lucasfilm,4.859333,1,3
4154756,Avengers: Infinity War,609049,8.5,"[Action, Adventure, Sci-Fi]",PG-13,25 April 2018 (Singapore),149,3.0,560.0,2048710000.0,USA,English,Marvel Studios,15.181667,1,3


## Categorize movies by gross revenue quintile
Split movies into 5 groups by revenue, and add (one-hot?) columns for classification.

In [None]:
print(list(df.columns.values))
print(df.shape)

In [None]:
df.head(5)

#### Divide into quintiles based on gross revenue
This divides into 5 balanced classes.
*** Dividing into 5 based on manually selected ranges results in a very high accuracy ~97%, because it is highly imbalanced - even easiest prediction of majority class can result in this accuracy.  **

In [30]:
ret_value = pd.qcut(df['Gross'], 5, labels=["very low", "low", "medium", "high", "very high"], retbins = True)

#### Check bucket values

In [31]:
df['gross_category'] = ret_value[0]
ret_value[1]
# low ends at 3.782940e+05, medium ends at 5.823487e+06, high ends at 6.337276e+07

array([0.00000000e+00, 4.64560000e+04, 3.78894000e+05, 5.83402240e+06,
       6.34464402e+07, 2.94055455e+09])

In [None]:
df.groupby('gross_category').size()

In [None]:
df_sorted = df.sort_values(['Gross','gross_category'])

#### This prints the whole dataframe (all ~3k rows)! 

In [None]:
#df_sorted

## Basic Classification Model - Logistic Regression

### Join df with YouTube features, Sentiment features


In [32]:
df_cleaned = df.copy()

In [33]:
df_cleaned.head(5)

Unnamed: 0_level_0,Name,num_ratings,avg_rating,genre,motion_picture_rating,release_date,duration,num_user_ratings,num_critic_ratings,Gross,Country,Language,Production_House,Average_popularity_score_per_movie,num_languages,gross_equal_range,gross_category
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
433362,Daybreakers,116014,6.5,"[Action, Fantasy, Horror, Sci-Fi, Thriller]",R,14 January 2010 (Singapore),98,301.0,291.0,51416464.0,Australia,English,Lionsgate,8.318333,1,0,high
1216492,Leap Year,85208,6.5,"[Comedy, Romance]",PG,12 February 2010 (Singapore),100,219.0,136.0,25918920.0,USA,English,Universal Pictures,6.195667,2,0,high
1037705,The Book of Eli,268523,6.9,"[Action, Adventure, Drama, Thriller]",R,18 March 2010 (Singapore),118,588.0,331.0,157107755.0,USA,English,Alcon Entertainment,8.769667,1,0,very high
1038686,Legion,91364,5.2,"[Action, Fantasy, Horror]",R,21 January 2010 (Singapore),100,328.0,227.0,67918658.0,USA,English,Screen Gems,4.45,1,0,very high
1244659,Extraordinary Measures,15595,6.5,Drama,PG,22 January 2010 (USA),106,67.0,129.0,11854694.0,USA,English,CBS Films,6.396333,1,0,high


In [None]:
# read from YouTube excel file here and join the 2 dataframes for columns - viewCount,  likeCount, dislikeCount, commentCount

In [None]:
# read from sentiment features

### Dealing with categorical features
Inspect non-numeric columns:

* Country                                 object -- 61 unique - categorize as top 5 vs. others 
* Language                                object -- 77 unique - categorize as top 5 vs. others 
* Production_House                        object -- ~2000+ unique - categorize as top 5 vs. othersCheck
* motion_picture_rating                   object -- only 5 groups 
* Name                                    object -- drop, too unique, unless using to derive a text-based feature
* release_date                            object -- drop, can be used to extract weekend/not later 

In [None]:
df_cleaned.dtypes

#### Check production house split

In [None]:
df_cleaned['Production_House'].dtypes

In [None]:
df_cleaned['Production_House'].head(5)

In [None]:
# could do top 5 vs others
df_cleaned.groupby('Production_House').size().sort_values(ascending = False).head(20)

In [34]:
top_production = list(df_cleaned.groupby('Production_House').size().sort_values(ascending = False).head(5).index)

df_cleaned['Production_House'] = df_cleaned.Production_House.apply(lambda x: x if x in top_production
                                         else "Other")

In [None]:
df_cleaned.groupby('Production_House').size().sort_values(ascending = False)

#### Check language split

In [None]:
df_cleaned.groupby('Language').size().sort_values(ascending = False).head(20) # could do English, French, Hindi, Spanish, Mandarin vs. others

In [35]:
top_language = list(df_cleaned.groupby('Language').size().sort_values(ascending = False).head(5).index)

df_cleaned['Language'] = df_cleaned.Language.apply(lambda x: x if x in top_language
                                         else "Other")

In [None]:
df_cleaned.groupby('Language').size().sort_values(ascending = False)

#### Check country split

In [None]:
df_cleaned.groupby('Country').size().sort_values(ascending = False).head(20) # could do USA, UK, France, India, Canada, China vs. others

In [36]:
top_countries = list(df_cleaned.groupby('Country').size().sort_values(ascending = False).head(5).index)

df_cleaned['Country'] = df_cleaned.Country.apply(lambda x: x if x in top_countries
                                         else "Other")

In [None]:
df_cleaned.groupby('Country').size().sort_values(ascending = False)

### One-Hot Encoding 
For categorical features, and the gross_category label.

In [78]:
X = df_cleaned.drop(['gross_category', 'Gross', 'release_date', 'Name'], axis=1) 
# drop is NOT in-place by default, doesn't affect original DF

y = df_cleaned['gross_category'].copy()

In [79]:
X.dtypes

num_ratings                             int64
avg_rating                            float64
genre                                  object
motion_picture_rating                  object
duration                                int64
num_user_ratings                      float64
num_critic_ratings                    float64
Country                                object
Language                               object
Production_House                       object
Average_popularity_score_per_movie    float64
num_languages                           int64
gross_equal_range                       int64
dtype: object

In [80]:
#categorical_cols = ["motion_picture_rating", "Country", "Language",  "Production_House"]
X_dummies = pd.get_dummies(X)

In [None]:
X_dummies.shape

In [None]:
X_dummies.head(5)

In [81]:
le = preprocessing.LabelEncoder()
le.fit(y)

LabelEncoder()

In [None]:
list(le.classes_)

In [82]:
y_encoded = le.transform(y) 
#y_encoded = y

### Split data - train, test

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y_encoded, random_state=1)

In [184]:
imputer = SimpleImputer()
scaler = StandardScaler()
lr = LogisticRegression(multi_class = "multinomial", solver = 'newton-cg', max_iter = 3000)

pipe = Pipeline([('imputer', imputer),
                 ('scaler', scaler), 
                 ('lr', lr)])


pipe.fit(X_train, y_train) 

Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=3000, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False))])

In [180]:
pipe.named_steps.keys()

dict_keys(['imputer', 'scaler', 'lr'])

In [185]:
# for any continuous parameters, specify a distribution instead of a list of options
param_grid = {}
param_grid['imputer__strategy'] = ["mean", "median"]
param_grid['scaler__with_mean'] = [True, False]
param_grid['scaler__with_std'] = [True, False]
param_grid['lr__C'] = [1, 0.75, 0.5] # smaller specifies stronger regularization
param_grid

{'imputer__strategy': ['mean', 'median'],
 'scaler__with_mean': [True, False],
 'scaler__with_std': [True, False],
 'lr__C': [1, 0.75, 0.5]}

In [186]:
# additional parameters are n_iter (number of searches) and random_state
rand = RandomizedSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_iter=5, random_state=1)

In [187]:
# time the randomized search
%time rand.fit(X_train, y_train)



CPU times: user 14min 32s, sys: 5.54 s, total: 14min 37s
Wall time: 7min 27s




RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_interce...y='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False))]),
          fit_params=None, iid='warn', n_iter=5, n_jobs=None,
          param_distributions={'imputer__strategy': ['mean', 'median'], 'scaler__with_mean': [True, False], 'scaler__with_std': [True, False], 'lr__C': [1, 0.75, 0.5]},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=0)

In [188]:
print(rand.best_score_) # hold-out set
print(rand.best_params_)
# print the best model found by RandomizedSearchCV
print(rand.best_estimator_)

0.5131034482758621
{'scaler__with_std': False, 'scaler__with_mean': True, 'lr__C': 1, 'imputer__strategy': 'median'}
Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=False)), ('lr', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=3000, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False))])


In [189]:
# predictions on train and test data with best estimator
y_trainpred0 = rand.predict(X_train)
y_pred0 = rand.predict(X_test)

print(metrics.accuracy_score(y_test, y_pred0))
#print(metrics.f1_score(y_test, y_pred0, average='macro')) 

0.5222337125129266
0.5205962928208976


In [190]:
# train set
print(metrics.accuracy_score(y_train, y_trainpred0))
#print(metrics.f1_score(y_train, y_trainpred0, average='macro'))

0.6610344827586206
0.6604624669319147


In [193]:
# interpretation - none of these work?
#lr.summary()
# lr.intercept
lr.coef_

array([[ 1.08828396, -0.16609268,  0.04878577, ...,  0.16808627,
         0.1417818 , -0.18231989],
       [-1.14826968,  0.13146162, -0.2558423 , ..., -0.29465784,
         0.22114469,  0.12376476],
       [-2.15067419,  0.28179708, -0.18237579, ...,  0.15799099,
        -0.57826512,  0.12815282],
       [ 2.5991521 , -0.22380201,  0.88739497, ...,  0.2120307 ,
         0.22211297, -0.39524249],
       [-0.3884922 , -0.02336401, -0.49796265, ..., -0.24345013,
        -0.00677433,  0.3256448 ]])

In [91]:
# print("Train accuracy: ", pipe.score(X_train, y_train))
# print("Test accuracy: ", pipe.score(X_test, y_test))

Train accuracy:  0.71
Test accuracy:  0.5056876938986556


### k-NN Classification

In [194]:
knn = KNeighborsClassifier()
pipe_knn = Pipeline([('imputer', imputer),
                 ('scaler', scaler), 
                 ('knn', knn)])
# pipeline steps are automatically assigned names by make_pipeline

In [195]:
param_grid = {}
param_grid['imputer__strategy'] = ["mean", "median"]
param_grid['scaler__with_mean'] = [True, False]
param_grid['scaler__with_std'] = [True, False]
param_grid['knn__n_neighbors'] = [15, 20, 25, 30, 10, 50] 
param_grid['knn__weights'] = ['uniform', 'distance'] 
param_grid['knn__algorithm'] = ['auto', 'ball_tree', 'kd_tree', 'brute']
param_grid

{'imputer__strategy': ['mean', 'median'],
 'scaler__with_mean': [True, False],
 'scaler__with_std': [True, False],
 'knn__n_neighbors': [15, 20, 25, 30, 10, 50],
 'knn__weights': ['uniform', 'distance'],
 'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

In [196]:
rand_knn = RandomizedSearchCV(pipe_knn, param_grid, cv=5, scoring='accuracy', n_iter=5, random_state=1)

In [197]:
# time the randomized search
%time rand_knn.fit(X_train, y_train)

CPU times: user 3min 13s, sys: 1.99 s, total: 3min 15s
Wall time: 3min 9s


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))]),
          fit_params=None, iid='warn', n_iter=5, n_jobs=None,
          param_distributions={'imputer__strategy': ['mean', 'median'], 'scaler__with_mean': [True, False], 'scaler__with_std': [True, False], 'knn__n_neighbors': [15, 20, 25, 30, 10, 50], 'knn__weights': ['uniform', 'distance'], 'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=0)

In [198]:
print(rand_knn.best_score_) # hold-out set
print(rand_knn.best_params_)

0.4379310344827586
{'scaler__with_std': False, 'scaler__with_mean': True, 'knn__weights': 'uniform', 'knn__n_neighbors': 50, 'knn__algorithm': 'brute', 'imputer__strategy': 'mean'}


In [199]:
# print the best model found by RandomizedSearchCV
print(rand_knn.best_estimator_)

Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=False)), ('knn', KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=50, p=2,
           weights='uniform'))])


In [203]:
# predictions on train and test data with best estimator
y_trainpred_knn = rand_knn.predict(X_train)
y_pred_knn = rand_knn.predict(X_test)

print(metrics.accuracy_score(y_test, y_pred_knn))
print(metrics.f1_score(y_test, y_pred_knn, average='macro'))

0.45191313340227507
0.4493592262021139


In [204]:
# train set
print(metrics.accuracy_score(y_train, y_trainpred_knn))
print(metrics.f1_score(y_train, y_trainpred_knn, average='macro'))

0.4748275862068966
0.4693067758301785


In [201]:
# pipe_knn.fit(X_train, y_train) # add randomized CV/grid search
# print("Train accuracy: ", pipe_knn.score(X_train, y_train))
# print("Test accuracy: ", pipe_knn.score(X_test, y_test))

### Random Forest Classification

In [206]:
rf = RandomForestClassifier(random_state=0)
pipe_rf = Pipeline([('imputer', imputer),
                 ('scaler', scaler), 
                 ('rf', rf)])


In [210]:
param_grid = {}
param_grid['imputer__strategy'] = ["mean", "median"]
param_grid['scaler__with_mean'] = [True, False]
param_grid['scaler__with_std'] = [True, False]
param_grid['rf__n_estimators'] = [50, 100, 150, 200, 300, 500]  # how many trees to use in the forest
param_grid['rf__max_depth'] = [3, 5, 7, 9] # max depth
param_grid['rf__criterion'] = ['gini', 'entropy']
param_grid['rf__max_features'] = ['auto', 'log2'] # like mtry
param_grid

{'imputer__strategy': ['mean', 'median'],
 'scaler__with_mean': [True, False],
 'scaler__with_std': [True, False],
 'rf__n_estimators': [50, 100, 150, 200, 300, 500],
 'rf__max_depth': [3, 5, 7, 9],
 'rf__criterion': ['gini', 'entropy'],
 'rf__max_features': ['auto', 'log2']}

In [211]:
rand_rf = RandomizedSearchCV(pipe_rf, param_grid, cv=5, scoring='accuracy', n_iter=5, random_state=1)

In [212]:
# time the randomized search
%time rand_rf.fit(X_train, y_train)

CPU times: user 25.1 s, sys: 930 ms, total: 26 s
Wall time: 16.9 s


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, m...ors='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False))]),
          fit_params=None, iid='warn', n_iter=5, n_jobs=None,
          param_distributions={'imputer__strategy': ['mean', 'median'], 'scaler__with_mean': [True, False], 'scaler__with_std': [True, False], 'rf__n_estimators': [50, 100, 150, 200, 300, 500], 'rf__max_depth': [3, 5, 7, 9], 'rf__criterion': ['gini', 'entropy'], 'rf__max_features': ['auto', 'log2']},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          return_train_score='warn', scoring='accuracy', verbo

In [213]:
print(rand_rf.best_score_) # hold-out set
print(rand_rf.best_params_)

0.4696551724137931
{'scaler__with_std': False, 'scaler__with_mean': False, 'rf__n_estimators': 500, 'rf__max_features': 'log2', 'rf__max_depth': 9, 'rf__criterion': 'entropy', 'imputer__strategy': 'median'}


In [214]:
# print the best model found by RandomizedSearchCV
print(rand_rf.best_estimator_)

Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=False, with_std=False)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=...mators=500, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False))])


In [215]:
# predictions on train and test data with best estimator
y_trainpred_rf = rand_rf.predict(X_train)
y_pred_rf = rand_rf.predict(X_test)

print(metrics.accuracy_score(y_test, y_pred_rf))
print(metrics.f1_score(y_test, y_pred_rf, average='macro'))

0.49948293691830403
0.46833752264815975


In [216]:
# train set
print(metrics.accuracy_score(y_train, y_trainpred_rf))
print(metrics.f1_score(y_train, y_trainpred_rf, average='macro'))

0.636551724137931
0.6191621151260325


In [117]:
# pipe_rf.fit(X_train, y_train) # add randomized CV/grid search
# print("Train accuracy: ", pipe_rf.score(X_train, y_train))
# print("Test accuracy: ", pipe_rf.score(X_test, y_test))

Train accuracy:  0.5420689655172414
Test accuracy:  0.4984488107549121


In [218]:

# rf.feature_importances - cannot do this when using pipeline?

### TODO modelling
0. Finalize evaluation metric - 1-away classification accuracy could be good - https://sud3010ganesh.github.io/2018-05-29-boxofficerevenueprediction/
1. Interpreting models - e.g., for RF, visualizing feature importance - https://towardsdatascience.com/how-to-visualize-a-decision-tree-from-a-random-forest-in-python-using-scikit-learn-38ad2d75f21c
2. Choose any other candidate models - neural networks, for example (like SNAP paper) or NB
3. Ensembling models/add to hyperparameter tuning above to improve performance
4. extension - setting y-variable (gross category) by K-means clustering gross revenue 