In [88]:
import pandas as pd                  # Pandas
import numpy as np                   # Numpy
from matplotlib import pyplot as plt # Matplotlib

# Package to implement ML Algorithms
import sklearn
from sklearn.tree import DecisionTreeClassifier     # Decision Tree
from sklearn.ensemble import RandomForestClassifier # Random Forest


# Package for data partitioning
from sklearn.model_selection import train_test_split, KFold

# Package for generating confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Package for generating classification report
from sklearn.metrics import classification_report

# Import packages to implement Stratified K-fold CV
from sklearn.model_selection import StratifiedKFold # For creating folds

# Import Package to implement GridSearch CV
from sklearn.model_selection import GridSearchCV

# Importing package for Randomized Search CV
from sklearn.model_selection import RandomizedSearchCV

# Package to record time
import time

# Package for Data pretty printer
from pprint import pprint

# Module to save and load Python objects to and from files
import pickle 

# Ignore Deprecation Warnings
import warnings
warnings.filterwarnings('ignore')

# Display inline plots as vector-based (svg)
%config InlineBackend.figure_formats = ['svg']

%matplotlib inline

In [4]:
#read in data set + inspect variables
df_comedians = pd.read_csv('comedians.csv')
df_comedians

Unnamed: 0,Comedian,Gender,Nationality,Race,Ethnicity,Sexual orientation,Religion,Politics,Education,Genres
0,tom papa,m,American,white,American,,,,University degree,
1,tig notaro,f,American,white,American,,,,high school,Observational comedy
2,joe list,m,American,white,American,,,,high school,
3,nate bargatze,m,American,white,American,,,,,
4,brian regan,m,American,white,Irish-American,,,,University degree,"Observational comedy, sarcastic and self-depre..."
...,...,...,...,...,...,...,...,...,...,...
63,sarah millican,f,British,white,English,,,,high school,observational comedy
64,daniel tosh,m,American,white,American,,,,University degree,"observational comedy, black comedy, insult com..."
65,neal brennan,m,American,white,Irish Catholic,,,,high school,"observational comedy, surreal humor, sketch co..."
66,hannibal buress,m,American,black,Afro-American,,atheist,,high school,"observational comedy, black comedy, blue comed..."


In [5]:
df_comedians.isna().sum()

Comedian               0
Gender                 0
Nationality            0
Race                   0
Ethnicity              0
Sexual orientation    66
Religion              52
Politics              66
Education              6
Genres                19
dtype: int64

In [8]:
df_comedians['Religion'].value_counts()

Religion
atheist           6
agnostic          3
christian         2
presbyterian      1
muslim            1
catholic          1
roman catholic    1
jewish            1
Name: count, dtype: int64

In [94]:
df_scripts = pd.read_csv('SCRIPTS.csv')
df_scripts.head()

Unnamed: 0.1,Unnamed: 0,Script_ID,Sample,Label,ID_script,stops,alliteration_num,max_all_len,consonance_num,max_cons_len,...,Nationality_New Zealander,Nationality_Scottish,Nationality_Scottish American,Nationality_South African,Ethnicity_asian,Ethnicity_black,Ethnicity_hispanic,Ethnicity_mixed,Ethnicity_south-asian,Ethnicity_white
0,0,tom-papa-human-mule,aired December 2016 [ plodding music ] Pleas...,0,tom-papa-human-mule,13,0,0,8,4,...,0,0,0,0,0,0,0,0,0,1
1,1,tom-papa-human-mule,Thank you . [ cheers and applause continue ] ...,0,tom-papa-human-mule,35,8,5,18,10,...,0,0,0,0,0,0,0,0,0,1
2,2,tom-papa-human-mule,Look at you .,0,tom-papa-human-mule,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,3,tom-papa-human-mule,You made a little plan for yourself . And you...,0,tom-papa-human-mule,11,1,2,8,5,...,0,0,0,0,0,0,0,0,0,1
4,4,tom-papa-human-mule,"Texting each other , We still going to go to...",0,tom-papa-human-mule,5,2,2,4,3,...,0,0,0,0,0,0,0,0,0,1


In [96]:
df_scripts.rename(columns={'Label':'Humorous'}, inplace=True)
df_scripts.head()

Unnamed: 0.1,Unnamed: 0,Script_ID,Sample,Humorous,ID_script,stops,alliteration_num,max_all_len,consonance_num,max_cons_len,...,Nationality_New Zealander,Nationality_Scottish,Nationality_Scottish American,Nationality_South African,Ethnicity_asian,Ethnicity_black,Ethnicity_hispanic,Ethnicity_mixed,Ethnicity_south-asian,Ethnicity_white
0,0,tom-papa-human-mule,aired December 2016 [ plodding music ] Pleas...,0,tom-papa-human-mule,13,0,0,8,4,...,0,0,0,0,0,0,0,0,0,1
1,1,tom-papa-human-mule,Thank you . [ cheers and applause continue ] ...,0,tom-papa-human-mule,35,8,5,18,10,...,0,0,0,0,0,0,0,0,0,1
2,2,tom-papa-human-mule,Look at you .,0,tom-papa-human-mule,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,3,tom-papa-human-mule,You made a little plan for yourself . And you...,0,tom-papa-human-mule,11,1,2,8,5,...,0,0,0,0,0,0,0,0,0,1
4,4,tom-papa-human-mule,"Texting each other , We still going to go to...",0,tom-papa-human-mule,5,2,2,4,3,...,0,0,0,0,0,0,0,0,0,1


In [97]:
df_scripts.drop(columns=['Unnamed: 0', 'Script_ID','ID_script'],inplace=True)
df_scripts

Unnamed: 0,Sample,Humorous,stops,alliteration_num,max_all_len,consonance_num,max_cons_len,assonance_num,max_asso_len,sense_farmost,...,Nationality_New Zealander,Nationality_Scottish,Nationality_Scottish American,Nationality_South African,Ethnicity_asian,Ethnicity_black,Ethnicity_hispanic,Ethnicity_mixed,Ethnicity_south-asian,Ethnicity_white
0,aired December 2016 [ plodding music ] Pleas...,0,13,0,0,8,4,5,3,0.043478,...,0,0,0,0,0,0,0,0,0,1
1,Thank you . [ cheers and applause continue ] ...,0,35,8,5,18,10,14,8,0.045455,...,0,0,0,0,0,0,0,0,0,1
2,Look at you .,0,2,0,0,0,0,0,0,0.058824,...,0,0,0,0,0,0,0,0,0,1
3,You made a little plan for yourself . And you...,0,11,1,2,8,5,4,4,0.055556,...,0,0,0,0,0,0,0,0,0,1
4,"Texting each other , We still going to go to...",0,5,2,2,4,3,4,2,0.062500,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19132,"It was like a farmer s market yam , just stic...",1,31,5,4,15,11,12,8,0.043478,...,0,0,0,0,0,0,0,0,0,1
19133,He did not turn into a Jew . it is not like m...,1,41,10,5,18,14,12,12,0.041667,...,0,0,0,0,0,0,0,0,0,1
19134,Why can not I have a boyfriend ? I would lik...,1,19,4,5,12,6,6,7,0.045455,...,0,0,0,0,0,0,0,0,0,1
19135,I know I would like it . I would like to hav...,1,16,4,5,9,5,7,12,0.045455,...,0,0,0,0,0,0,0,0,0,1


In [98]:
X_tom= df_scripts.drop(columns = ['Humorous'])
y_tom = df_scripts['Humorous']

In [99]:
X_tom_cat_var = ['Sample']
X_tom_encoded = pd.get_dummies(X_tom,columns=X_tom_cat_var)

In [101]:
X_tom_encoded.head()

Unnamed: 0,stops,alliteration_num,max_all_len,consonance_num,max_cons_len,assonance_num,max_asso_len,sense_farmost,sense_closest,sense_combination,...,"Sample_you will see , Scout Master . Forty years from now , I will be doing a comedy special I do not know what network .",Sample_you will see .,"Sample_you will see that third - quarter profits are on their way up . Men , you want to make bathrooms better for women ? Get those four - year - old little boys out of there ! Always poking their creepy little heads under the stalls being like , Are you my mom ? I told you , not anymore , Kevin ! And I do not know why men are so concerned about our bathrooms . I am worried about your bathrooms .","Sample_you would be behind him with a hole punch . Coming , Boris !","Sample_you would be fucking livid , would not you ? The irony would kill you . I recently read Great Expectations , and it was not as good as I thought it was going to be . I can do a brilliant Michael Jackson impersonation . Would you like to see it ?","Sample_you would be like , We should fix that . Every time I hear a woman talk about giving birth honestly talk about giving it , not the Facebook version of like , This is magical and I am blessed honestly talk about giving birth , it always feels like the beginning of an infomercial where some guy s going to pop out and be like , Are you tired of a hole ripping from your vagina to your butt ? Is pooping on a table in front of strangers leaving you feeling embarrassed ? Have you been pushing for 20 hours with no end in sight , thinking , there is got to be a better way ! Well , there is not !","Sample_you would be like , why did not you put it in my salad ? Well , I kinda did . And then you would wink . You would wink . I know what it is , it is incredibly hot . Let me let me get this kmart fan on everybody . Is that what it is ? it is a little hot down here . So , let me get this kmart or as we call it up north , black macy s . Let me get this fan on everybody . Could you get some ? Paul , are you getting some ?","Sample_you would say , I had Indian food last night . I have been trumpin my brains out all day . Or , Ah , shit ! That seagull just trumped in my hair .",Sample_you would see her toes . you would see her feet and her toes . She could just kick off her shoes and there is a Dorito - sized piece of cloth keeping you from the greatest show on earth .,Sample_your music would have suffered .
0,13,0,0,8,4,5,3,0.043478,0.333333,6.579251,...,False,False,False,False,False,False,False,False,False,False
1,35,8,5,18,10,14,8,0.045455,0.5,49.407325,...,False,False,False,False,False,False,False,False,False,False
2,2,0,0,0,0,0,0,0.058824,0.142857,2.995732,...,False,False,False,False,False,False,False,False,False,False
3,11,1,2,8,5,4,4,0.055556,0.5,15.96276,...,False,False,False,False,False,False,False,False,False,False
4,5,2,2,4,3,4,2,0.0625,0.333333,12.05989,...,False,False,False,False,False,False,False,False,False,False


In [33]:
df_scripts['Name'] = df_scripts['Script_ID'].str.split('-').str[0:2].apply(' '.join)
df_scripts['Name']

0        tom papa
1        tom papa
2        tom papa
3        tom papa
4        tom papa
           ...   
19132     louis c
19133     louis c
19134     louis c
19135     louis c
19136     louis c
Name: Name, Length: 19137, dtype: object

In [57]:
#counting number of scripts for each comedian
df_scripts.groupby('Name')['Script_ID'].count()

Name
adam devine          303
adam sandler         369
anthony jeselnik     188
ari shaffir          224
bert kreischer       391
                    ... 
tom segura          1099
trevor noah          192
urzila carlson       444
w kamau              253
whitney cummings      94
Name: Script_ID, Length: 70, dtype: int64

In [44]:
df_scripts()

Unnamed: 0.1,Unnamed: 0,Script_ID,Sample,Humorous,ID_script,stops,alliteration_num,max_all_len,consonance_num,max_cons_len,...,Nationality_Scottish,Nationality_Scottish American,Nationality_South African,Ethnicity_asian,Ethnicity_black,Ethnicity_hispanic,Ethnicity_mixed,Ethnicity_south-asian,Ethnicity_white,Name
0,0,tom-papa-human-mule,aired December 2016 [ plodding music ] Pleas...,0,tom-papa-human-mule,13,0,0,8,4,...,0,0,0,0,0,0,0,0,1,tom papa
1,1,tom-papa-human-mule,Thank you . [ cheers and applause continue ] ...,0,tom-papa-human-mule,35,8,5,18,10,...,0,0,0,0,0,0,0,0,1,tom papa
2,2,tom-papa-human-mule,Look at you .,0,tom-papa-human-mule,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,tom papa
3,3,tom-papa-human-mule,You made a little plan for yourself . And you...,0,tom-papa-human-mule,11,1,2,8,5,...,0,0,0,0,0,0,0,0,1,tom papa
4,4,tom-papa-human-mule,"Texting each other , We still going to go to...",0,tom-papa-human-mule,5,2,2,4,3,...,0,0,0,0,0,0,0,0,1,tom papa


In [77]:
df_tom_papa_scripts_filtered

Unnamed: 0.1,Unnamed: 0,Script_ID,Sample,Humorous,ID_script,stops,alliteration_num,max_all_len,consonance_num,max_cons_len,...,Nationality_South African,Ethnicity_asian,Ethnicity_black,Ethnicity_hispanic,Ethnicity_mixed,Ethnicity_south-asian,Ethnicity_white,Name,Script,Scripts
0,0,tom-papa-human-mule,aired December 2016 [ plodding music ] Pleas...,0,tom-papa-human-mule,13,0,0,8,4,...,0,0,0,0,0,0,1,tom papa,papa,papa
1,1,tom-papa-human-mule,Thank you . [ cheers and applause continue ] ...,0,tom-papa-human-mule,35,8,5,18,10,...,0,0,0,0,0,0,1,tom papa,papa,papa
2,2,tom-papa-human-mule,Look at you .,0,tom-papa-human-mule,2,0,0,0,0,...,0,0,0,0,0,0,1,tom papa,papa,papa
3,3,tom-papa-human-mule,You made a little plan for yourself . And you...,0,tom-papa-human-mule,11,1,2,8,5,...,0,0,0,0,0,0,1,tom papa,papa,papa
4,4,tom-papa-human-mule,"Texting each other , We still going to go to...",0,tom-papa-human-mule,5,2,2,4,3,...,0,0,0,0,0,0,1,tom papa,papa,papa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9509,9509,tom-papa-human-mule,"Look , I am not Pollyannaish about it . I mea...",1,tom-papa-human-mule,29,4,3,12,11,...,0,0,0,0,0,0,1,tom papa,papa,papa
9510,9510,tom-papa-human-mule,"Then I am in the bathroom on YouTube , How ...",1,tom-papa-human-mule,13,4,3,13,6,...,0,0,0,0,0,0,1,tom papa,papa,papa
9511,9511,tom-papa-human-mule,we are Italian . We got here in 1945 . We eat...,1,tom-papa-human-mule,6,1,2,6,4,...,0,0,0,0,0,0,1,tom papa,papa,papa
9512,9512,tom-papa-human-mule,"she is like she is like , because I have s...",1,tom-papa-human-mule,28,8,5,16,11,...,0,0,0,0,0,0,1,tom papa,papa,papa


In [80]:
df_tom_papa_scripts_filtered = df_scripts[df_scripts['Name'] == 'tom papa']
df_tom_papa_scripts_filtered.drop(columns=['ID_script','Script_ID', 'Unnamed: 0','Name', 'Script','Scripts'],axis=1,inplace=True)


In [85]:
df_tom_papa_scripts_filtered.head()

Unnamed: 0,Sample,Humorous,stops,alliteration_num,max_all_len,consonance_num,max_cons_len,assonance_num,max_asso_len,sense_farmost,...,Nationality_New Zealander,Nationality_Scottish,Nationality_Scottish American,Nationality_South African,Ethnicity_asian,Ethnicity_black,Ethnicity_hispanic,Ethnicity_mixed,Ethnicity_south-asian,Ethnicity_white
0,aired December 2016 [ plodding music ] Pleas...,0,13,0,0,8,4,5,3,0.043478,...,0,0,0,0,0,0,0,0,0,1
1,Thank you . [ cheers and applause continue ] ...,0,35,8,5,18,10,14,8,0.045455,...,0,0,0,0,0,0,0,0,0,1
2,Look at you .,0,2,0,0,0,0,0,0,0.058824,...,0,0,0,0,0,0,0,0,0,1
3,You made a little plan for yourself . And you...,0,11,1,2,8,5,4,4,0.055556,...,0,0,0,0,0,0,0,0,0,1
4,"Texting each other , We still going to go to...",0,5,2,2,4,3,4,2,0.0625,...,0,0,0,0,0,0,0,0,0,1


In [84]:
rf_class_mod = RandomForestClassifier(random_state=42)

In [86]:
#create predictors and response variable
X_tom= df_tom_papa_scripts_filtered.drop(columns = ['Humorous'])
y_tom = df_tom_papa_scripts_filtered['Humorous']

In [87]:
X_tom_cat_var = ['Sample']
X_tom_encoded = pd.get_dummies(X_tom,columns=X_tom_cat_var)

In [89]:
#partition data into train/test dataset
#create cv folds
train_X_tom, test_X_tom, train_y_tom, test_y_tom = train_test_split(X_tom_encoded, y_tom, test_size = 0.2, random_state = 1)
folds_3 = KFold(n_splits = 3, shuffle = True, random_state = 100)

In [90]:
# Start with an initial guess for parameters
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 500, num = 10)]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(2, 20, num = 10)]

# Minimum number of samples required to split a node
min_samples_split = [5, 10, 20]

# Minimum number of samples required at each leaf node
min_samples_leaf = [5, 10, 20]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

pprint(random_grid)

{'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
 'min_samples_leaf': [5, 10, 20],
 'min_samples_split': [5, 10, 20],
 'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]}


In [91]:
#perform randCV
rf_reg_cv = RandomizedSearchCV(estimator = rf_class_mod, 
                              param_distributions = random_grid,
                              n_iter = 100,
                              scoring = 'f1_macro', 
                              cv = folds_3, 
                              verbose = 1,
                              random_state = 42,
                              n_jobs = -1) # Will utilize all available CPUs 

In [92]:
rf_reg_cv.fit(train_X_tom, train_y_tom)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [93]:
print('Initial score: ', rf_reg_cv.best_score_)
print('Initial parameters: ', rf_reg_cv.best_params_)

Initial score:  0.4586289659707381
Initial parameters:  {'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 20, 'max_depth': 2}
