# Box Office Revenue Prediction

## Imports

In [100]:
import pandas as pd
import numpy as np


import re # regex
import ast

from datetime import datetime
from datetime import date

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

from sklearn import metrics
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import learning_curve, RandomizedSearchCV, train_test_split
from sklearn.neural_network import MLPClassifier

## Data Sources
1. IMDB:
    * No. of ratings (star)
    * Avg. star rating (out of 10)
    * No. of user ratings (text reviews)
    * No. of critic ratings (meta critic)
    * Countries (truncate to just first country) - √
    * Language (truncate to just first language) - √
    * No. of languages (to be created from previous column) - √
    * Production house (truncate to just first production house) - √
    * Duration (convert to minutes/numeric from string) - √
    * Genre (need to explode to one-hot columns for 23 genres) - √
    * MPAA rating 
        * TV- ratings - reclassify - √ 
        * Not Rated --> PG-13/Unrated --> PG - many from non-English fall under these categories - √  
       
    * **Gross:** 'Y' variable (need to remove movies with alphabet-only gross, convert INR (other currencies?) to USD and 3 INR 3-digit movies):
        * Adjust for inflation, based on year of release - √ 
    * Release date:
        * convert first to datetime and use as a feature (holiday - US - release/not) - √ 

    
&nbsp; 
2. Popularity Scores:
    * Average popularity score per movie

&nbsp; 

3. Sentiment Scores:
    * AFINN score - based on IMDB user reviews pre-release
    * AFINN score - based on IMDB user reviews post-release
    

&nbsp;    
4. YouTube:
    * View count
    * Like count
    * Dislike count
    * Comment count

## Reading Excel File(s)
#### !!!!!! IMPORTANT- manually removed the unnamed serial number column from the popularity score excel sheet I'm reading below!!!!! 
#### Check that it is removed in your excel sheet also

In [3]:
# read popularity scores xlsx with IMDB_ID as the indexing column (2nd column from the left, or 1st index) 
df_pop = pd.read_excel("Final_data_sheets_updated_popularity_scores.xlsx", index_col = 0)
df = pd.read_excel("Final_data_sheets_with_Features.xlsx", index_col = 0)

In [5]:
df_pop = df_pop['Average_popularity_score_per_movie']

In [6]:
df_pop.head(2)

IMDB_ID
433362     8.318333
1216492    6.195667
Name: Average_popularity_score_per_movie, dtype: float64

In [7]:
# inspect
df.head(3)

Unnamed: 0_level_0,Name,num_ratings,avg_rating,main_cast_list,main_cast_links,dir_list,creator_list,genre,motion_picture_rating,release_date,...,Language,Production_House,viewCount,likeCount,dislikeCount,commentCount,Afinn Pre Release,Afinn Post Release,Release date,Holiday
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1690470,Gekijouban Poketto monsutâ: Daiamondo & Pâru -...,1251,6.2,Ikue Ôtani|Sarah Natochenny|Wayne Grayson,/name/nm0649026/|/name/nm2516299/|/name/nm0969...,Kunihiko Yuyama,Satoshi Tajiri|Hideki Sonoda,"[Animation, Action, Family, Fantasy, Sci-Fi]",Not Rated,10 July 2010 (Japan),...,Japanese|English,"East Japan Marketing & Communications Inc., G...",0,0,0,0,0.0,0.0,2010-07-10,0
1508290,Kyatapirâ,952,6.7,Shinobu Terajima|Keigo Kasuya|Emi Masuda,/name/nm0855429/|/name/nm2486225/|/name/nm3787...,Kôji Wakamatsu,Hisako Kurosawa|Masao Adachi,"[Drama, War]",Not Rated,14 August 2010 (Japan),...,Japanese,"Skhole Co., Wakamatsu Production",0,0,0,0,0.0,-23.0,2010-08-14,0
2057455,Dalpaengee eui byeol,323,7.3,Cho Young-Chan|Kim Soon-ho|Choi Jungah,/name/nm9804862/|/name/nm9804863/|/name/nm9804...,Seung-jun Yi,,"[Documentary, Drama]",Not Rated,15 February 2014 (Japan),...,Korean,"CreativeEAST, Dalpaengee, NHK",0,0,0,0,0.0,13.0,2014-02-15,0


In [8]:
list(df)

['Name',
 'num_ratings',
 'avg_rating',
 'main_cast_list',
 'main_cast_links',
 'dir_list',
 'creator_list',
 'genre',
 'motion_picture_rating',
 'release_date',
 'duration',
 'meta_critic_score',
 'num_user_ratings',
 'num_critic_ratings',
 'story_line',
 'others',
 'Gross',
 'Country',
 'Language',
 'Production_House',
 'viewCount',
 'likeCount',
 'dislikeCount',
 'commentCount',
 'Afinn Pre Release',
 'Afinn Post Release',
 'Release date ',
 'Holiday']

In [9]:
df = df.drop(['main_cast_list', 'main_cast_links','dir_list','creator_list', 'meta_critic_score','story_line', 'others', 'release_date'], axis = 1)

In [10]:
df.head(3)

Unnamed: 0_level_0,Name,num_ratings,avg_rating,genre,motion_picture_rating,duration,num_user_ratings,num_critic_ratings,Gross,Country,Language,Production_House,viewCount,likeCount,dislikeCount,commentCount,Afinn Pre Release,Afinn Post Release,Release date,Holiday
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1690470,Gekijouban Poketto monsutâ: Daiamondo & Pâru -...,1251,6.2,"[Animation, Action, Family, Fantasy, Sci-Fi]",Not Rated,1h 36min,5.0,3.0,71143529,Japan,Japanese|English,"East Japan Marketing & Communications Inc., G...",0,0,0,0,0.0,0.0,2010-07-10,0
1508290,Kyatapirâ,952,6.7,"[Drama, War]",Not Rated,1h 25min,8.0,45.0,2109,Japan,Japanese,"Skhole Co., Wakamatsu Production",0,0,0,0,0.0,-23.0,2010-08-14,0
2057455,Dalpaengee eui byeol,323,7.3,"[Documentary, Drama]",Not Rated,1h 28min,4.0,33.0,2647,South Korea|Japan|Finland,Korean,"CreativeEAST, Dalpaengee, NHK",0,0,0,0,0.0,13.0,2014-02-15,0


In [11]:
# join df and df_pop
df = df.join(df_pop)

In [12]:
list(df)

['Name',
 'num_ratings',
 'avg_rating',
 'genre',
 'motion_picture_rating',
 'duration',
 'num_user_ratings',
 'num_critic_ratings',
 'Gross',
 'Country',
 'Language',
 'Production_House',
 'viewCount',
 'likeCount',
 'dislikeCount',
 'commentCount',
 'Afinn Pre Release',
 'Afinn Post Release',
 'Release date ',
 'Holiday',
 'Average_popularity_score_per_movie']

In [13]:
df.shape

(4351, 21)

## Data Transformations

### 1. Check for NAs/NANs

In [14]:
df.describe()

Unnamed: 0,num_ratings,avg_rating,num_user_ratings,num_critic_ratings,viewCount,likeCount,dislikeCount,commentCount,Afinn Pre Release,Afinn Post Release,Holiday,Average_popularity_score_per_movie
count,4351.0,4351.0,4267.0,4267.0,4351.0,4351.0,4351.0,4351.0,4351.0,4351.0,4351.0,4351.0
mean,49438.71,6.466766,117.522615,132.775486,1588189.0,10648.13,844.473684,1067.971731,8.816311,10.462413,0.033096,3.847854
std,110896.0,0.960129,173.167208,140.513302,6871187.0,76847.61,7162.396242,9218.423397,12.745791,13.2217,0.178907,3.476414
min,9.0,1.5,1.0,1.0,0.0,0.0,0.0,0.0,-62.0,-67.5,0.0,0.0
25%,1779.5,5.9,13.0,29.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.077167
50%,8299.0,6.6,43.0,77.0,0.0,0.0,0.0,0.0,5.0,8.0,0.0,2.855333
75%,42562.0,7.1,145.0,192.0,171315.0,479.0,29.0,46.5,16.203297,18.125,0.0,5.797
max,1812301.0,9.3,998.0,974.0,222426800.0,3483078.0,244120.0,476931.0,125.0,106.0,1.0,21.596333


In [15]:
df.shape

(4351, 21)

#### Columns with NAs
MPAA rating, num_user_ratings, num_critic_ratings, language, production_house

In [16]:
df.isnull().any()

Name                                  False
num_ratings                           False
avg_rating                            False
genre                                 False
motion_picture_rating                  True
duration                               True
num_user_ratings                       True
num_critic_ratings                     True
Gross                                 False
Country                               False
Language                               True
Production_House                       True
viewCount                             False
likeCount                             False
dislikeCount                          False
commentCount                          False
Afinn Pre Release                     False
Afinn Post Release                    False
Release date                          False
Holiday                               False
Average_popularity_score_per_movie    False
dtype: bool

#### How many NAs total?
Could be multiple NAs for a given row.

In [17]:
df.isnull().sum().sum()

648

#### How many NAs per column?
Can it be manually fixed by finding the true value? Say, for duration of a couple of movies.

In [18]:
df.isnull().sum(axis = 0)

Name                                    0
num_ratings                             0
avg_rating                              0
genre                                   0
motion_picture_rating                 355
duration                                1
num_user_ratings                       84
num_critic_ratings                     84
Gross                                   0
Country                                 0
Language                               13
Production_House                      111
viewCount                               0
likeCount                               0
dislikeCount                            0
commentCount                            0
Afinn Pre Release                       0
Afinn Post Release                      0
Release date                            0
Holiday                                 0
Average_popularity_score_per_movie      0
dtype: int64

In [19]:
df[df['duration'].isnull()]

Unnamed: 0_level_0,Name,num_ratings,avg_rating,genre,motion_picture_rating,duration,num_user_ratings,num_critic_ratings,Gross,Country,...,Production_House,viewCount,likeCount,dislikeCount,commentCount,Afinn Pre Release,Afinn Post Release,Release date,Holiday,Average_popularity_score_per_movie
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3861006,Xiao shi dai 3: Ci jin shi dai,619,2.7,"[Drama, Romance]",,,,,86900000,China,...,Le Vision Pictures (Tianjin) Co.,0,0,0,0,0.0,0.0,2014-07-17,0,5.729333


This anyway has other columns as NaNs (info isn't available on IMDB anymore?),  so might as well drop the row.

#### If dropping all NAs?

In [20]:
# Lose 442 (45 rows still have NA in num_user_ratings, num_critic_ratings)
df = df.dropna(subset=['Language', 'Production_House','motion_picture_rating'])
df.shape

(3909, 21)

#### num_user_ratings and num_critic_ratings still have NAs - we will impute these later (pipelining) using median/mean

In [21]:
df.isnull().any()

Name                                  False
num_ratings                           False
avg_rating                            False
genre                                 False
motion_picture_rating                 False
duration                              False
num_user_ratings                       True
num_critic_ratings                     True
Gross                                 False
Country                               False
Language                              False
Production_House                      False
viewCount                             False
likeCount                             False
dislikeCount                          False
commentCount                          False
Afinn Pre Release                     False
Afinn Post Release                    False
Release date                          False
Holiday                               False
Average_popularity_score_per_movie    False
dtype: bool

In [22]:
# number of IMDB user reviews written
df['num_user_ratings'].isnull().sum()

42

In [23]:
# metacritic rating
df['num_critic_ratings'].isnull().sum()

42

### 2. Add number of languages

In [None]:
# inspect unique values
df.Language.unique()

In [24]:
languages_split = df.Language.str.split(pat="|")
df['num_languages'] = languages_split.str.len()

In [None]:
# inspect
df.head(3)

In [None]:
# check NaN
df['num_languages'].isnull().values.any()

### 3. Truncate 'Languages' and 'Country'

In [None]:
# inspect unique values
df.Country.unique()

In [25]:

languages_split = df.Language.str.split(pat="|").apply(lambda x: x[0])
df['Language'] = languages_split

In [26]:
countries_split = df.Country.str.split(pat="|").apply(lambda x: x[0])
df['Country'] = countries_split

In [None]:
df.head(3)

In [None]:
# check both columns for NaN
print(df['Language'].isnull().values.any())
print(df['Country'].isnull().values.any())

In [None]:
# check how many unique values of each
print(len(df.Country.unique()))
print(len(df.Language.unique()))

### 4. Truncate 'Production_House'

In [None]:
# inspect unique values - 3697 of them, can't see all
df.Production_House.unique()

In [27]:
production_house_split = df.Production_House.str.split(pat=", ").apply(lambda x: x[0])
df['Production_House'] = production_house_split

In [None]:
# still could too unqiue of a column - 2277 unique values!
len(df.Production_House.unique())

### 5. Check MPAA column and regroup
https://simple.m.wikipedia.org/wiki/Motion_Picture_Association_of_America_film_rating_system

#### **Reclassification:**
* TV-Y, TV-7, TV-G --> G
* TV-PG --> PG
* TV-14 --> PG-13
* TV-MA --> R
* Not Rated (923!) --> 
* Unrated (143) -->

In [None]:
# check NaN
df['motion_picture_rating'].isnull().values.any()

In [None]:
# what are the unique ratings, and how many in each category?
df.groupby('motion_picture_rating').size()

In [28]:
df.loc[df['motion_picture_rating'].isin(["TV-G", "TV-Y7", "TV-Y"]), 'motion_picture_rating'] = "G"
df.loc[df['motion_picture_rating'].isin(["TV-PG"]), 'motion_picture_rating'] = "PG"
df.loc[df['motion_picture_rating'].isin(["TV-14", "M"]), 'motion_picture_rating'] = "PG-13"
df.loc[df['motion_picture_rating'].isin(["TV-MA"]), 'motion_picture_rating'] = "R"


df.loc[df['motion_picture_rating'].isin(["Unrated"]), 'motion_picture_rating'] = "PG" # mostly documentaries
df.loc[df['motion_picture_rating'].isin(["Not Rated"]), 'motion_picture_rating'] = "PG-13"

df.groupby('motion_picture_rating').size()

motion_picture_rating
G          45
NC-17       2
PG        520
PG-13    1888
R        1454
dtype: int64

In [None]:
# check NaN after transforming
df['motion_picture_rating'].isnull().values.any()

### 6. Convert 'duration' column to time in minutes (integer)

In [29]:
def check_time(time):
    if len(time) == 1:
        if "h" in time[0]:
            new_time = 60*int(re.sub("\D", "", time[0]))
        else:
            new_time = int(re.sub("\D", "", time[0]))
    else:
        new_time = 60*int(re.sub("\D", "", time[0])) + int(re.sub("\D", "", time[1]))
    return new_time

test1 = df.duration.str.split(" ")
test2 = test1.apply(lambda x: check_time(x))

df['duration'] = test2

In [None]:
# check unique values
print(df.duration.unique())

In [None]:
## check for NaNs after transforming
df['duration'].isnull().values.any()

In [None]:
list(df)

### 7. Expanding 'genre' to one-hot columns

In [None]:
type(df['genre'].iloc[0]) # Need to convert string representation of list to an actual Python list to accumulate as et later

In [None]:
# check unique genre lists
unique_genre_lists = df['genre'].unique()
print(unique_genre_lists) 

In [30]:
def convert_to_list(x):
    if "[" in x:
        x = re.sub("[\[\]]", "", x)
        x = x.split(", ")
    else:
        x = x.split(" ") # split by non-existent delimiter
    return x

In [31]:
# get all unique genres available
genre_lists = df.genre.apply(lambda x: convert_to_list(x))
df.genre = genre_lists

# temp = genre_lists.tolist()
# flattened =  [y for x in temp for y in x]
# print(set(flattened))

In [None]:
type(df['genre'].iloc[0])

In [32]:
# add 23 new one-hot columns
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('genre')),
                          columns=mlb.classes_,
                          index=df.index))

In [None]:
df.head(3)

In [None]:
list(df)

In [None]:
# check for NaNs after transforming in all genre columns
df.shape

### 9. Cleaning up 'Gross'
Standardize currency, hard-code 3-digit movies, convert string to int/float

In [None]:
type(df['Gross'].iloc[0])

In [44]:
pd.set_option('display.max_row', 4000)

In [None]:
# look for values which have alphabetic characters in them => not in USD and  has to be converted
df[df.Gross.str.contains(pat = "[a-zA-Z]")]

In [33]:
# remove extra whitespaces, commas:
df['Gross'] = df.Gross.apply(lambda x: re.sub("[,\s]", "", x))
df[df.Gross.str.contains(pat = "[a-zA-Z]")]

Unnamed: 0_level_0,Name,num_ratings,avg_rating,motion_picture_rating,duration,num_user_ratings,num_critic_ratings,Gross,Country,Language,...,Musical,Mystery,News,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3417422,Drishyam,23724,8.8,PG-13,160,81.0,10.0,INR750000000,India,Malayalam,...,0,0,0,0,0,0,0,1,0,0
6980546,Bharat Ane Nenu,13039,8.0,PG-13,173,125.0,21.0,INR206,India,Telugu,...,0,0,0,0,0,0,0,0,0,0
3569782,Jigarthanda,8325,8.4,PG-13,171,34.0,20.0,INR350000000,India,Tamil,...,0,0,0,0,0,0,0,1,0,0
5440700,Theri,11077,7.2,PG-13,157,54.0,12.0,INR1500000000,India,Tamil,...,0,0,0,0,0,0,0,0,0,0
2106537,Matru ki Bijlee ka Mandola,5727,5.7,PG-13,151,42.0,20.0,INR466500000,India,Hindi,...,0,0,0,0,0,0,0,0,0,0
6878378,Vivegam,9840,5.9,PG-13,149,56.0,20.0,INR120000000,India,Tamil,...,0,0,0,0,0,0,0,1,0,0
3848892,Baby,47710,8.0,PG-13,159,212.0,25.0,INR1429900000,India,Hindi,...,0,0,0,0,0,0,0,1,0,0
3320578,Veeram,6312,6.6,PG-13,161,39.0,3.0,INR1300000000,India,Tamil,...,0,0,0,0,0,0,0,0,0,0
6734984,Duvvada Jagannadham,2451,5.5,PG-13,152,10.0,5.0,INR157,India,Telugu,...,0,0,0,0,0,0,0,0,0,0
4727512,Srimanthudu,9548,7.6,PG-13,158,64.0,5.0,INR1445500000,India,Telugu,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# add 7 trailing zeros for these 4:
# 6980546                 INR 206 Bharat Ane Nenu
# 3142764                 INR 130 Race Gurram
# 6734984                INR 157 Duvvada Jagannadham
# 6522546                INR 124 Spyder

gross_truncated = ["INR206", "INR130","INR157","INR124"]

df['Gross'] = df.Gross.apply(lambda x: x + "0000000" if x in gross_truncated else x)

In [35]:
# 1753865 Giochi d'estate - Gross not available on IMDB or elsewhere, drop the row
df.drop(1753865, inplace = True)

In [36]:
# for American Satan - VND 74 cumulative worldwide gross - change to USD $226,232
# https://www.the-numbers.com/movie/American-Satan#tab=international
df.at[5451690, 'Gross'] = "226232"

In [37]:
# Raazi - incorrectly entered as 2070 crores gross on IMDB, is actually ~207 crores
df.at[7098658, 'Gross'] = "2070000000"

In [38]:
from currency_converter import CurrencyConverter
c = CurrencyConverter()

def convert_currency(x):
    if re.search('[a-zA-Z£]', x) == None:
        return float(x)
    
    split_gross = re.split('(\d+)',x)
    
    # GBP
    if(split_gross[0] == "£"):
        return (c.convert(float(split_gross[1]), 'GBP','USD'))
    
    # NPR isn't supported CurrencyConverter - hard code
    if(split_gross[0]  == "NPR"):
        return (float(split_gross[1])*0.0090)
    
    return (c.convert(float(split_gross[1]), split_gross[0],'USD'))


df['Gross'] = df['Gross'].apply(lambda x : convert_currency(x))

In [None]:
list(df)

In [None]:
# check NaN
df['Gross'].isnull().values.any()

In [None]:
df['Gross'].describe()

In [None]:
type(df['Release date '].iloc[0])

In [39]:
#### Adjust gross for inflation based on the year - need inputs from Karthik for this:
def adjust_for_inflation(gross, release_date):
    if release_date.year == 2010:
        return(gross*1.152)
    elif release_date.year == 2011: 
        return(gross*1.124)
    elif release_date.year == 2012:
        return(gross*1.101)
    elif release_date.year == 2013:
        return(gross*1.087)
    elif release_date.year == 2014:
        return(gross*1.086)
    elif release_date.year == 2015:
        return(gross*1.068)
    elif release_date.year == 2016:
        return(gross*1.053)
    elif release_date.year == 2017:
        return(gross*1.032)
     # 2018 => just return x itself
    return gross
df['Gross'] = df.apply(lambda x: adjust_for_inflation(x['Gross'], x['Release date ']), axis=1)        

In [40]:
df['Gross'].describe()

count    3.908000e+03
mean     6.627972e+07
std      1.825128e+08
min      0.000000e+00
25%      8.093526e+04
50%      1.474008e+06
75%      3.683397e+07
max      2.208863e+09
Name: Gross, dtype: float64

### If dividing into equal revenue ranges (instead of quintiles)

In [None]:
# def find_revenue_range(x):
#     if  0 <= x <= 588111000:
#         return 0
#     elif 588111001 <= x <= 1176222000:
#         return 1
#     elif 1176222000 <= x <= 1764333003:
#         return 2
#     elif 1764333003 <= x <= 2352444004:
#         return 3
#     else:
#         return 4
    
        
# df['gross_equal_range'] = df['Gross'].apply(lambda x: find_revenue_range(x))
 

In [None]:
# df[df['gross_equal_range'] == 3]

## Categorize movies by gross revenue quintile
Split movies into 5 groups by revenue, and add (one-hot?) columns for classification.

In [None]:
list(df.columns.values)

In [None]:
df.head(5)

#### Divide into quintiles based on gross revenue
This divides into 5 balanced classes.
*** Dividing into 5 based on manually selected ranges results in a very high accuracy ~97%, because it is highly imbalanced - even easiest prediction of majority class can result in this accuracy.  **

In [41]:
ret_value = pd.qcut(df['Gross'], 5, labels=["very low", "low", "medium", "high", "very high"], retbins = True)

#### Check bucket values

In [42]:
df['gross_category'] = ret_value[0]
ret_value[1]
# very low ends at 4.869325e+04, 
#low ends at 3.891683e+05, medium ends at 5.904366e+06, 
# high ends at 6.748606e+07, very high ends at 2.208863e+09

array([0.00000000e+00, 4.88742888e+04, 3.90153180e+05, 5.90707620e+06,
       6.74920807e+07, 2.20886283e+09])

In [43]:
df.groupby('gross_category').size()

gross_category
very low     782
low          781
medium       782
high         781
very high    782
dtype: int64

In [45]:
df_sorted = df.sort_values(['Gross'])

#### This prints the whole dataframe (all ~3k rows)! 

In [46]:
df_sorted

Unnamed: 0_level_0,Name,num_ratings,avg_rating,motion_picture_rating,duration,num_user_ratings,num_critic_ratings,Gross,Country,Language,...,Mystery,News,Romance,Sci-Fi,Short,Sport,Thriller,War,Western,gross_category
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1925479,Officer Down,4961,5.5,R,98,20.0,21.0,0.0,USA,English,...,0,0,0,0,0,0,0,0,0,very low
4874206,(M)uchenik,3764,6.9,PG,118,12.0,63.0,145.314,Russia,Russian,...,0,0,0,0,0,0,0,0,0,very low
4372390,Detour,4559,6.2,R,97,24.0,55.0,149.64,UK,English,...,0,0,0,0,0,0,1,0,0,very low
1810697,Meeting Evil,11192,5.3,R,89,69.0,34.0,199.281,USA,English,...,1,0,0,0,0,0,1,0,0,very low
4796122,Satanic,5188,3.7,R,85,70.0,31.0,265.356,USA,English,...,0,0,0,0,0,0,0,0,0,very low
1682940,Playback,4474,4.3,R,98,52.0,49.0,277.452,USA,English,...,0,0,0,0,0,0,1,0,0,very low
5038358,Des nouvelles de la planète Mars,888,6.2,PG-13,101,4.0,31.0,326.43,France,French,...,0,0,0,0,0,0,0,0,0,very low
2289920,Justice Is Mind,96,6.9,PG-13,153,2.0,13.0,363.81,USA,English,...,1,0,0,1,0,0,0,0,0,very low
1384927,Death of a Superhero,3206,7.1,PG-13,97,11.0,39.0,379.845,Germany,English,...,0,0,0,0,0,0,0,0,0,very low
4428762,Éperdument,1116,5.9,PG,110,4.0,12.0,410.67,France,French,...,0,0,1,0,0,0,0,0,0,very low


## Classification Models
### Logistic Regression Classification

In [47]:
df_cleaned = df.copy()

In [48]:
df_cleaned.head(5)

Unnamed: 0_level_0,Name,num_ratings,avg_rating,motion_picture_rating,duration,num_user_ratings,num_critic_ratings,Gross,Country,Language,...,Mystery,News,Romance,Sci-Fi,Short,Sport,Thriller,War,Western,gross_category
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1690470,Gekijouban Poketto monsutâ: Daiamondo & Pâru -...,1251,6.2,PG-13,96,5.0,3.0,81957350.0,Japan,Japanese,...,0,0,0,1,0,0,0,0,0,very high
1508290,Kyatapirâ,952,6.7,PG-13,85,8.0,45.0,2429.568,Japan,Japanese,...,0,0,0,0,0,0,0,1,0,very low
2057455,Dalpaengee eui byeol,323,7.3,PG-13,88,4.0,33.0,2874.642,South Korea,Korean,...,0,0,0,0,0,0,0,0,0,very low
2077826,Gekijoban Poketto Monsuta besuto uisshu bikuti...,1152,6.1,PG-13,88,4.0,4.0,64160720.0,Japan,Japanese,...,0,0,0,1,0,0,0,0,0,high
1937133,De l'autre côté du périph,4319,5.8,R,96,5.0,36.0,6521.223,France,French,...,0,0,0,0,0,0,0,0,0,very low


### Dealing with categorical features
Inspect non-numeric columns:

* Country                                 object -- 61 unique - categorize as top 5 vs. others 
* Language                                object -- 77 unique - categorize as top 5 vs. others 
* Production_House                        object -- ~2000+ unique - categorize as top 5 vs. othersCheck
* motion_picture_rating                   object -- only 5 groups 
* Name                                    object -- drop, too unique, unless using to derive a text-based feature
* release_date                            object -- drop, can be used to extract weekend/not later 

In [None]:
df_cleaned.dtypes

#### Check production house split

In [None]:
df_cleaned['Production_House'].dtypes

In [None]:
df_cleaned['Production_House'].head(5)

In [None]:
# could do top 5 vs others
df_cleaned.groupby('Production_House').size().sort_values(ascending = False).head(20)

In [49]:
top_production = list(df_cleaned.groupby('Production_House').size().sort_values(ascending = False).head(5).index)

df_cleaned['Production_House'] = df_cleaned.Production_House.apply(lambda x: x if x in top_production
                                         else "Other")

In [50]:
df_cleaned.groupby('Production_House').size().sort_values(ascending = False)

Production_House
Other                             3583
 Universal Pictures                 80
 Columbia Pictures Corporation      70
 Paramount Pictures                 63
 Warner Bros.                       61
 Twentieth Century Fox              51
dtype: int64

#### Check language split

In [None]:
df_cleaned.groupby('Language').size().sort_values(ascending = False).head(20) # could do English, French, Hindi, Spanish, Mandarin vs. others

In [51]:
top_language = list(df_cleaned.groupby('Language').size().sort_values(ascending = False).head(5).index)

df_cleaned['Language'] = df_cleaned.Language.apply(lambda x: x if x in top_language
                                         else "Other")

In [52]:
df_cleaned.groupby('Language').size().sort_values(ascending = False)

Language
English     2961
Other        454
French       194
Hindi        151
Spanish       81
Mandarin      67
dtype: int64

#### Check country split

In [None]:
df_cleaned.groupby('Country').size().sort_values(ascending = False).head(20) # could do USA, UK, France, India, Canada, China vs. others

In [53]:
top_countries = list(df_cleaned.groupby('Country').size().sort_values(ascending = False).head(5).index)

df_cleaned['Country'] = df_cleaned.Country.apply(lambda x: x if x in top_countries
                                         else "Other")

In [54]:
df_cleaned.groupby('Country').size().sort_values(ascending = False)

Country
USA       2211
Other      810
UK         352
France     237
India      198
Canada     100
dtype: int64

### One-Hot Encoding 
For categorical features, and the gross_category label.

In [56]:
X = df_cleaned.drop(['gross_category', 'Gross', 'Name', 'Release date '], axis=1) 
# drop is NOT in-place by default, doesn't affect original DF

y = df_cleaned['gross_category'].copy()

In [57]:
X.dtypes

num_ratings                             int64
avg_rating                            float64
motion_picture_rating                  object
duration                                int64
num_user_ratings                      float64
num_critic_ratings                    float64
Country                                object
Language                               object
Production_House                       object
viewCount                               int64
likeCount                               int64
dislikeCount                            int64
commentCount                            int64
Afinn Pre Release                     float64
Afinn Post Release                    float64
Holiday                                 int64
Average_popularity_score_per_movie    float64
num_languages                           int64
Action                                  int64
Adventure                               int64
Animation                               int64
Biography                         

In [58]:
X.shape 

(3908, 41)

In [59]:
#categorical_cols = ["motion_picture_rating", "Country", "Language",  "Production_House"]
X_dummies = pd.get_dummies(X)

In [60]:
X_dummies.shape # 41 -4 + 18 + 5 = 60

(3908, 60)

In [61]:
X_dummies.head(5)

Unnamed: 0_level_0,num_ratings,avg_rating,duration,num_user_ratings,num_critic_ratings,viewCount,likeCount,dislikeCount,commentCount,Afinn Pre Release,...,Language_Hindi,Language_Mandarin,Language_Other,Language_Spanish,Production_House_ Columbia Pictures Corporation,Production_House_ Paramount Pictures,Production_House_ Twentieth Century Fox,Production_House_ Universal Pictures,Production_House_ Warner Bros.,Production_House_Other
IMDB_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1690470,1251,6.2,96,5.0,3.0,0,0,0,0,0.0,...,0,0,1,0,0,0,0,0,0,1
1508290,952,6.7,85,8.0,45.0,0,0,0,0,0.0,...,0,0,1,0,0,0,0,0,0,1
2057455,323,7.3,88,4.0,33.0,0,0,0,0,0.0,...,0,0,1,0,0,0,0,0,0,1
2077826,1152,6.1,88,4.0,4.0,0,0,0,0,0.0,...,0,0,1,0,0,0,0,0,0,1
1937133,4319,5.8,96,5.0,36.0,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [62]:
le = preprocessing.LabelEncoder()
le.fit(y)

LabelEncoder()

In [63]:
list(le.classes_)

['high', 'low', 'medium', 'very high', 'very low']

In [64]:
y_encoded = le.transform(y) 

### Split data - train, test

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y_encoded, random_state=1)

In [66]:
X_train.shape

(2931, 60)

In [67]:
X_test.shape

(977, 60)

In [68]:
imputer = SimpleImputer()
scaler = StandardScaler()
lr = LogisticRegression(multi_class = "multinomial", solver = 'newton-cg', max_iter = 5000)

pipe = Pipeline([('imputer', imputer),
                 ('scaler', scaler), 
                 ('lr', lr)])


pipe.fit(X_train, y_train) 

Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=5000, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False))])

In [None]:
pipe.named_steps.keys()

In [None]:
# for any continuous parameters, specify a distribution instead of a list of options
param_grid = {}
param_grid['imputer__strategy'] = ["mean", "median"]
param_grid['scaler__with_mean'] = [True, False]
param_grid['scaler__with_std'] = [True, False]
param_grid['lr__C'] = [1, 0.75, 0.5] # smaller specifies stronger regularization
param_grid

In [None]:
# additional parameters are n_iter (number of searches) and random_state
rand = RandomizedSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_iter=5, random_state=1)

In [None]:
# time the randomized search
%time rand.fit(X_train, y_train)

In [None]:
print(rand.best_score_) # hold-out set
print(rand.best_params_)
# print the best model found by RandomizedSearchCV
print(rand.best_estimator_)

In [None]:
# predictions on train and test data with best estimator
y_trainpred0 = rand.predict(X_train)
y_pred0 = rand.predict(X_test)

print(metrics.accuracy_score(y_test, y_pred0))
print(metrics.f1_score(y_test, y_pred0, average='macro')) 

In [None]:
# train set
print(metrics.accuracy_score(y_train, y_trainpred0))
print(metrics.f1_score(y_train, y_trainpred0, average='macro'))

In [None]:
# interpretation 
lr.coef_

In [None]:
lr.coef_.shape

### k-NN Classification

In [69]:
knn = KNeighborsClassifier()
pipe_knn = Pipeline([('imputer', imputer),
                 ('scaler', scaler), 
                 ('knn', knn)])
# pipeline steps are automatically assigned names by make_pipeline

In [70]:
param_grid = {}
param_grid['imputer__strategy'] = ["mean", "median"]
param_grid['scaler__with_mean'] = [True, False]
param_grid['scaler__with_std'] = [True, False]
param_grid['knn__n_neighbors'] = [50, 100, 150, 200, 250] 
param_grid['knn__weights'] = ['uniform', 'distance'] 
param_grid['knn__algorithm'] = ['auto', 'ball_tree', 'kd_tree', 'brute']
param_grid

{'imputer__strategy': ['mean', 'median'],
 'scaler__with_mean': [True, False],
 'scaler__with_std': [True, False],
 'knn__n_neighbors': [50, 100, 150, 200, 250],
 'knn__weights': ['uniform', 'distance'],
 'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

In [71]:
rand_knn = RandomizedSearchCV(pipe_knn, param_grid, cv=5, scoring='accuracy', n_iter=5, random_state=1)

In [72]:
# time the randomized search
%time rand_knn.fit(X_train, y_train)

CPU times: user 19.1 s, sys: 1.3 s, total: 20.4 s
Wall time: 13.5 s


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('knn', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))]),
          fit_params=None, iid='warn', n_iter=5, n_jobs=None,
          param_distributions={'imputer__strategy': ['mean', 'median'], 'scaler__with_mean': [True, False], 'scaler__with_std': [True, False], 'knn__n_neighbors': [50, 100, 150, 200, 250], 'knn__weights': ['uniform', 'distance'], 'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=0)

In [73]:
print(rand_knn.best_score_) # hold-out set
print(rand_knn.best_params_)

0.4612760150119413
{'scaler__with_std': True, 'scaler__with_mean': True, 'knn__weights': 'uniform', 'knn__n_neighbors': 50, 'knn__algorithm': 'kd_tree', 'imputer__strategy': 'mean'}


In [74]:
# print the best model found by RandomizedSearchCV
print(rand_knn.best_estimator_)

Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('knn', KNeighborsClassifier(algorithm='kd_tree', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=50, p=2,
           weights='uniform'))])


In [75]:
# predictions on train and test data with best estimator
y_trainpred_knn = rand_knn.predict(X_train)
y_pred_knn = rand_knn.predict(X_test)

print(metrics.accuracy_score(y_test, y_pred_knn))
print(metrics.f1_score(y_test, y_pred_knn, average='macro'))

0.4749232343909928
0.47823249337774687


In [76]:
# train set
print(metrics.accuracy_score(y_train, y_trainpred_knn))
print(metrics.f1_score(y_train, y_trainpred_knn, average='macro'))

0.49334698055271237
0.4919488072117538


### Random Forest Classification

In [77]:
rf = RandomForestClassifier(random_state=0)
pipe_rf = Pipeline([('imputer', imputer),
                 ('scaler', scaler), 
                 ('rf', rf)])


In [78]:
param_grid = {}
param_grid['imputer__strategy'] = ["mean", "median"]
param_grid['scaler__with_mean'] = [True, False]
param_grid['scaler__with_std'] = [True, False]
param_grid['rf__n_estimators'] = [50, 100, 150, 200, 300, 500, 600]  # how many trees to use in the forest
param_grid['rf__max_depth'] = [3, 5, 7, 9] # max depth
param_grid['rf__criterion'] = ['gini', 'entropy']
param_grid['rf__max_features'] = ['auto', 'log2'] # like mtry
param_grid['rf__oob_score'] = [True, False] 
param_grid

{'imputer__strategy': ['mean', 'median'],
 'scaler__with_mean': [True, False],
 'scaler__with_std': [True, False],
 'rf__n_estimators': [50, 100, 150, 200, 300, 500, 600],
 'rf__max_depth': [3, 5, 7, 9],
 'rf__criterion': ['gini', 'entropy'],
 'rf__max_features': ['auto', 'log2'],
 'rf__oob_score': [True, False]}

In [79]:
rand_rf = RandomizedSearchCV(pipe_rf, param_grid, cv=5, scoring='accuracy', n_iter=5, random_state=1)

In [80]:
# time the randomized search
%time rand_rf.fit(X_train, y_train)

CPU times: user 29.4 s, sys: 332 ms, total: 29.8 s
Wall time: 31.6 s


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, m...ors='warn', n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False))]),
          fit_params=None, iid='warn', n_iter=5, n_jobs=None,
          param_distributions={'imputer__strategy': ['mean', 'median'], 'scaler__with_mean': [True, False], 'scaler__with_std': [True, False], 'rf__n_estimators': [50, 100, 150, 200, 300, 500, 600], 'rf__max_depth': [3, 5, 7, 9], 'rf__criterion': ['gini', 'entropy'], 'rf__max_features': ['auto', 'log2'], 'rf__oob_score': [True, False]},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          return_train_sc

In [81]:
print(rand_rf.best_score_) # hold-out set
print(rand_rf.best_params_)

0.5554418287273968
{'scaler__with_std': False, 'scaler__with_mean': False, 'rf__oob_score': False, 'rf__n_estimators': 600, 'rf__max_features': 'log2', 'rf__max_depth': 9, 'rf__criterion': 'entropy', 'imputer__strategy': 'median'}


In [85]:
# print the best model found by RandomizedSearchCV
print(rand_rf.best_estimator_)

Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=False, with_std=False)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=...mators=600, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False))])


In [83]:
# predictions on train and test data with best estimator
y_trainpred_rf = rand_rf.predict(X_train)
y_pred_rf = rand_rf.predict(X_test)

print(metrics.accuracy_score(y_test, y_pred_rf))
print(metrics.f1_score(y_test, y_pred_rf, average='macro'))

0.5537359263050153
0.54475542632895


In [84]:
# train set
print(metrics.accuracy_score(y_train, y_trainpred_rf))
print(metrics.f1_score(y_train, y_trainpred_rf, average='macro'))

0.8065506653019447
0.8038368493078718


In [None]:
rf.feature_importances #cannot do this when using pipeline?

### Naive Bayes Classificiation

In [92]:
nb = GaussianNB()
pipe_nb = Pipeline([('imputer', imputer),
                 ('scaler', scaler), 
                 ('nb', nb)])



In [97]:
param_grid = {}
param_grid['imputer__strategy'] = ["mean", "median"]
param_grid['scaler__with_mean'] = [True, False]
param_grid['scaler__with_std'] = [True, False]
param_grid

{'imputer__strategy': ['mean', 'median'],
 'scaler__with_mean': [True, False],
 'scaler__with_std': [True, False]}

In [98]:
rand_nb = RandomizedSearchCV(pipe_nb, param_grid, cv=5, scoring='accuracy', n_iter=5, random_state=1)

In [99]:
# time the randomized search
%time rand_nb.fit(X_train, y_train)

ValueError: Input X must be non-negative

In [None]:
print(rand_nb.best_score_) # hold-out set
print(rand_nb.best_params_)

In [None]:
# print the best model found by RandomizedSearchCV
print(rand_nb.best_estimator_)

In [None]:
# predictions on train and test data with best estimator
y_trainpred_nb = rand_nb.predict(X_train)
y_pred_nb = rand_nb.predict(X_test)

print(metrics.accuracy_score(y_test, y_pred_nb))
print(metrics.f1_score(y_test, y_pred_nb, average='macro'))

In [None]:
# train set
print(metrics.accuracy_score(y_train, y_trainpred_nb))
print(metrics.f1_score(y_train, y_trainpred_nb, average='macro'))

### MLP Classification

In [103]:
mlp = MLPClassifier(random_state=0)
pipe_mlp = Pipeline([('imputer', imputer),
                 ('scaler', scaler), 
                 ('mlp', mlp)])



In [105]:
param_grid = {}
param_grid['imputer__strategy'] = ["mean", "median"]
param_grid['scaler__with_mean'] = [True, False]
param_grid['scaler__with_std'] = [True, False]
# param_grid['mlp__n_estimators'] = [50, 100, 150, 200, 300, 500, 600]  # how many trees to use in the forest
# param_grid['mlp__max_depth'] = [3, 5, 7, 9] # max depth
# param_grid['mlp__criterion'] = ['gini', 'entropy']
# param_grid['mlp__max_features'] = ['auto', 'log2'] # like mtry
# param_grid['mlp__oob_score'] = [True, False] 
param_grid

{'imputer__strategy': ['mean', 'median'],
 'scaler__with_mean': [True, False],
 'scaler__with_std': [True, False]}

In [106]:
rand_mlp = RandomizedSearchCV(pipe_mlp, param_grid, cv=5, scoring='accuracy', n_iter=5, random_state=1)

In [107]:
# time the randomized search
%time rand_mlp.fit(X_train, y_train)

CPU times: user 1min 41s, sys: 1.68 s, total: 1min 43s
Wall time: 57.6 s


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('mlp', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early...=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))]),
          fit_params=None, iid='warn', n_iter=5, n_jobs=None,
          param_distributions={'imputer__strategy': ['mean', 'median'], 'scaler__with_mean': [True, False], 'scaler__with_std': [True, False]},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          return_train_score='warn', scoring='accuracy', verbose=0)

In [None]:
print(rand_mlp.best_score_) # hold-out set
print(rand_mlp.best_params_)

In [None]:
# print the best model found by RandomizedSearchCV
print(rand_mlp.best_estimator_)

In [None]:
# predictions on train and test data with best estimator
y_trainpred_mlp = rand_mlp.predict(X_train)
y_pred_mlp = rand_nb.predict(X_test)

print(metrics.accuracy_score(y_test, y_pred_mlp))
print(metrics.f1_score(y_test, y_pred_mlp, average='macro'))

In [None]:
# train set
print(metrics.accuracy_score(y_train, y_trainpred_mlp))
print(metrics.f1_score(y_train, y_trainpred_mlp, average='macro'))

## Further evaluation of best-performing model

In [None]:
# classification report 
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_rf, target_names=["very low", "low", "medium", "high", "very high"]))

In [None]:
# confusion matrix
# from https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = ["very low", "low", "medium", "high", "very high"]

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
np.set_printoptions(precision=2)
class_names = ["very low", "low", "medium", "high", "very high"]

# Plot non-normalized confusion matrix
plot_confusion_matrix(y_test, y_pred_rf, classes=class_names,
                      title='Confusion matrix, without normalization')


In [None]:
# Plot normalized confusion matrix
plot_confusion_matrix(y_test, y_pred_rf, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()

### TODO modelling

1. NB and MLP
3. Ensemble
4. Add to hyperparameter tuning above to improve performance
