Imports:

In [4]:
# !pip install pandas
# !pip install numpy
# !pip install matplotlib
# !pip install scikit-learn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model as lm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier


# Data Retrieval 

In [5]:
dialogue = pd.read_csv(("https://raw.githubusercontent.com/"
 "rfordatascience/tidytuesday/master/data/"
 "2020/2020-09-08/friends.csv"))

print("Example Dialogue:")
dialogue.sample(4)


Example Dialogue:


Unnamed: 0,text,speaker,season,episode,scene,utterance
27277,No we didn't!,Chandler Bing,5,1,7,7
56490,Er-does not.,Waiter,9,5,12,3
1497,Thank God you didn't try to fan out the magazi...,Chandler Bing,1,6,4,13
14665,Why not?!,Chandler Bing,3,8,5,10


In [6]:
episodes = pd.read_csv(("https://raw.githubusercontent.com/"
 "rfordatascience/tidytuesday/master/data/"
 "2020/2020-09-08/friends_info.csv"))

print("Example Episode:")
episodes.sample(4)

Example Episode:


Unnamed: 0,season,episode,title,directed_by,written_by,air_date,us_views_millions,imdb_rating
147,7,2,The One with Rachel's Book,Michael Lembeck,Andrew Reich & Ted Cohen,2000-10-12,27.93,8.3
72,3,25,The One at the Beach,Pamela Fryman,Story by : Pang-Ni Landrum & Mark KunerthTelep...,1997-05-15,28.8,8.8
16,1,17,The One with Two Parts: Part 2,Michael Lembeck,Marta Kauffman & David Crane,1995-02-23,30.5,8.5
230,10,13,The One Where Joey Speaks French,Gary Halvorson,Sherry Bilsing-Graham & Ellen Plummer,2004-02-19,24.27,8.5


# Preprocessing

In [7]:


# 6 Main Character Names
main_character_names = ["Ross Geller", "Monica Geller",
                        "Phoebe Buffay", "Chandler Bing",
                        "Joey Tribbiani", "Rachel Green"]


#Filters Out Non Dialogue Rows
all_dialogue = dialogue[~dialogue["speaker"].isin(["Scene Directions", "#ALL#", "NA"])]


#Indexs of  MAIN CHARACTER DIALOGUE
dialogue_mc_indexs = all_dialogue["speaker"].isin(main_character_names)

#Indexs of OTHER CHARACTER DIALOGUE
dialogue_oc_indexs = ~all_dialogue["speaker"].isin(main_character_names)

#Indexs of EXCLAMATORY DIALOGUE
dialogue_exclamatory_indexs = all_dialogue["text"].str.contains(r'[A-Z]{2,}') | all_dialogue["text"].str.contains(r'!')

In [8]:
#FUNCTION TO APPLY

def sum_size(x):
    return x.str.len().sum()
                      

# Processing 1/4: Adding Episode Level Dialogue Info


In [9]:

#Table To Store Episode Level Dialogue Summary
jt_1 = all_dialogue.groupby(["season", "episode"]).agg("first").reset_index().drop(columns=["scene", "utterance", "speaker", "text"])

jt_1.head(2)

Unnamed: 0,season,episode
0,1,1
1,1,2


In [10]:
#Grouped Data
gp = all_dialogue.groupby(["season", "episode"])

#Adding Total Words In Episode
jt_1 = pd.merge(jt_1, gp["text"].size().reset_index(name="total_words"))

#Adding Total Lines In Episode
jt_1 = pd.merge(jt_1, gp["text"].agg(sum_size).reset_index(name="total_lines"))

#Adding Total Scenes In Episode
jt_1 = pd.merge(jt_1, gp["scene"].nunique().reset_index(name="total_scenes"))

#Adding Total Speakers In Episode
jt_1 = pd.merge(jt_1, gp["speaker"].nunique().reset_index(name="total_speakers"))

jt_1.head(5)

Unnamed: 0,season,episode,total_words,total_lines,total_scenes,total_speakers
0,1,1,292,16876,15,11
1,1,2,241,12249,11,14
2,1,3,237,12070,14,10
3,1,4,248,12532,16,14
4,1,5,238,12626,16,10


In [11]:

#Grouped Data (All NON-MAIN Characters Lines By Episodes)
gp = all_dialogue[dialogue_oc_indexs].groupby(["season", "episode"])


# Adding Total Lines By NON-MAIN characters
jt_1 = pd.merge(jt_1, gp["text"].size().reset_index(name="oc_lines"))

#Adding Total Scenes By NON-MAIN characters
jt_1 = pd.merge(jt_1, gp["scene"].nunique().reset_index(name="oc_scenes"))


#Grouped Data (All NON-MAIN Characters Exclamatory Lines By Episode)
gp = all_dialogue[dialogue_oc_indexs & dialogue_exclamatory_indexs].groupby(["season", "episode"])

#Adding Total Exclamatory Lines By Non Main Characters
jt_1 = pd.merge(jt_1, gp["text"].size().reset_index(name="oc_exclam"))

jt_1.head(5)

Unnamed: 0,season,episode,total_words,total_lines,total_scenes,total_speakers,oc_lines,oc_scenes,oc_exclam
0,1,1,292,16876,15,11,27,9,5
1,1,2,241,12249,11,14,69,8,19
2,1,3,237,12070,14,10,24,6,3
3,1,4,248,12532,16,14,31,6,10
4,1,5,238,12626,16,10,34,10,2


# Processing 2/4: Adding Character Level Dialogue Info

In [12]:
#Table To Store Character Level Dialogue Summary
# (WILL BE PIVOTED INTO ABOVE FORM BELOW)

jt_2 = all_dialogue[dialogue_mc_indexs].groupby(["season", "episode", "speaker"]).agg("first").reset_index().drop(columns=["scene", "utterance", "text"])

jt_2.head(6)

Unnamed: 0,season,episode,speaker
0,1,1,Chandler Bing
1,1,1,Joey Tribbiani
2,1,1,Monica Geller
3,1,1,Phoebe Buffay
4,1,1,Rachel Green
5,1,1,Ross Geller


In [13]:
#Grouped Data (All Main Character Dialogue By Episode, Speaker)
gp = all_dialogue[dialogue_mc_indexs].groupby(["season", "episode", "speaker"])

#Adding Total Lines Per Main Character
jt_2 = pd.merge(jt_2, gp["text"].size().reset_index(name="lines"))

#Adding Total Scenes Per Main Character 
jt_2 = pd.merge(jt_2, gp["scene"].nunique().reset_index(name="scene"))



#Grouped Data (All Main Character Exclamatory Lines, By Episode, Speaker)
gp = all_dialogue[dialogue_mc_indexs & dialogue_exclamatory_indexs].groupby(["season", "episode", "speaker"])

#Adding Total Exclamatory Lines By Main Character
jt_2 = pd.merge(jt_2, gp["text"].size().reset_index(name="exclam_lines"))


jt_2.head(6)

Unnamed: 0,season,episode,speaker,lines,scene,exclam_lines
0,1,1,Chandler Bing,39,9,8
1,1,1,Joey Tribbiani,39,8,11
2,1,1,Monica Geller,73,9,20
3,1,1,Phoebe Buffay,19,6,5
4,1,1,Rachel Green,48,8,15
5,1,1,Ross Geller,47,8,8


In [14]:

#Pivoting Table (To Ensure One Row Per Episode)
jt_2 = jt_2.pivot(index=["season", "episode"], columns="speaker", values=["lines", "scene", "exclam_lines"]).reset_index(inplace=False)

#Renaming Columns + Reseting Index
jt_2.columns = ['_'.join(map(str, col)).strip() for col in jt_2.columns.values]
jt_2.reset_index(inplace=False)
jt_2.rename(columns={"season_":"season", "episode_":"episode"}, inplace=True)


jt_2.head(2)

Unnamed: 0,season,episode,lines_Chandler Bing,lines_Joey Tribbiani,lines_Monica Geller,lines_Phoebe Buffay,lines_Rachel Green,lines_Ross Geller,scene_Chandler Bing,scene_Joey Tribbiani,scene_Monica Geller,scene_Phoebe Buffay,scene_Rachel Green,scene_Ross Geller,exclam_lines_Chandler Bing,exclam_lines_Joey Tribbiani,exclam_lines_Monica Geller,exclam_lines_Phoebe Buffay,exclam_lines_Rachel Green,exclam_lines_Ross Geller
0,1,1,39.0,39.0,73.0,19.0,48.0,47.0,9.0,8.0,9.0,6.0,8.0,8.0,8.0,11.0,20.0,5.0,15.0,8.0
1,1,2,16.0,8.0,28.0,14.0,38.0,68.0,4.0,4.0,5.0,4.0,6.0,9.0,2.0,1.0,5.0,9.0,8.0,10.0


# Processing 3/4: Adding Episode Review/Viewership Info

In [15]:
#Taking Existing Episode df and dropping certain attributes
jt_3 = episodes.drop(columns=["title", "directed_by", "written_by", "air_date"])

jt_3.head(6)

Unnamed: 0,season,episode,us_views_millions,imdb_rating
0,1,1,21.5,8.3
1,1,2,20.2,8.1
2,1,3,19.5,8.2
3,1,4,19.7,8.1
4,1,5,18.6,8.5
5,1,6,18.2,8.1


# Processing 4/4: Joining Tables + Summary

In [16]:
#JOINING ALL STATISTICS INTO A SINGLE DATA TO CREATE A MODEL FROM

joined_table_final = pd.merge(pd.merge(jt_1, jt_2, on=["season", "episode"]), jt_3, on=["season", "episode"])

joined_table_final.head(2)

Unnamed: 0,season,episode,total_words,total_lines,total_scenes,total_speakers,oc_lines,oc_scenes,oc_exclam,lines_Chandler Bing,...,scene_Rachel Green,scene_Ross Geller,exclam_lines_Chandler Bing,exclam_lines_Joey Tribbiani,exclam_lines_Monica Geller,exclam_lines_Phoebe Buffay,exclam_lines_Rachel Green,exclam_lines_Ross Geller,us_views_millions,imdb_rating
0,1,1,292,16876,15,11,27,9,5,39.0,...,8.0,8.0,8.0,11.0,20.0,5.0,15.0,8.0,21.5,8.3
1,1,2,241,12249,11,14,69,8,19,16.0,...,6.0,9.0,2.0,1.0,5.0,9.0,8.0,10.0,20.2,8.1


#### THE ABOVE TABLE CONTAINS THE FOLLOWING: ON PER EPISODE BASIS

     "total_words" -       The Number Of Words Spoken

     "total_lines" -       The Number Of Lines Spoken

     "total_scenes" -      The Number Of Scenes

     "total_speakers" -    The Number Of Total Characters Who Speak

     "oc_lines" -          The Number Of Lines Collectively Spoken By All Non-Main Characters

     "oc_scenes" -         The Number Of Scenes In Which A Non-Main Character Speaks

     "oc_exclam" -         The Number Of Exclamatory Lines (upper case word, or exclamation point)
                           Collectively Spoken By All Non-Main Characters

     "us_views_millions" - The Number Of Viewers This Episode Had When Released (EXISTING NOT DERIVED)

     "imdb_rating" -       The Rating Given To The Episode By IMDB (EXISTING NOT DERIVED)


     COL: For All 6 Main Character Names (MCN)

          "lines_{MCN}" -         The Number Of Lines Spoken By This Main Character

          "scenes_{MCN}" -        The Number Of Scenes Spoken In By This Main Character

          "exclam_lines_{MCN} -   The Number Of Exclamatory Lines (Upper Case Word, or Exclamation point)
                                   Spoken By This Main Character

# Inference Task

In [53]:
# I want to verify that lots of exclamation changes episode IMDb rating 

# Resampling import to use in bootstrapping 
from sklearn.utils import resample

# Two populations: IMDb ratings and Viewership 

# Combining columns with exclamation
exclam_cols = ['exclam_lines_Chandler Bing','exclam_lines_Joey Tribbiani','exclam_lines_Monica Geller','exclam_lines_Phoebe Buffay','exclam_lines_Rachel Green','exclam_lines_Ross Geller']
exclam = joined_table_final['oc_exclam'] 
for col_name in exclam_cols:
    exclam = exclam + joined_table_final.loc[:,col_name]

# Creates new dataframe with the desired columns (rating and # of exclamations)
df_exclam = pd.DataFrame({"Exclamations":exclam, "Rating":joined_table_final['imdb_rating']})
mean_exclam = np.round(np.mean(df_exclam), 0)
df_exclam.fillna({'Exclamations':mean_exclam}, inplace=True)

# Defines two populations: IMDb ratings with high 
df_exclam 
mean_exclamation = np.mean(df_exclam['Exclamations'])
pop1 = df_exclam[df_exclam['Exclamations']>mean_exclamation]['Rating']
pop2 = df_exclam[df_exclam['Exclamations']<=mean_exclamation]['Rating']

# Define number of bootstrap resamples
n_resamples = 1000
mean_diffs = np.zeros(n_resamples)

# Bootstrapping procedure (add difference in each resample mean to the list)
for i in range(n_resamples):
    s1 = resample(pop1, replace=True)
    s2 = resample(pop2, replace=True)
    mean_diffs[i] = np.mean(s1)-np.mean(s2) 
    
# Confidence interval 
ci_95 = np.percentile(mean_diffs, [2.5, 97.5]) 

# Mean Difference 
mean_difference = round(np.mean(mean_diffs),2)

print(f'On average, episodes with more exclamation have about {mean_difference*100}% higher imdb ratings.')
print(f'The mean difference between ratings if on a 95% confidence interval of being between({round(ci_95[0],2)*100}%, {round(ci_95[1],2)*100}%)')


On average, episodes with more exclamation have about 12.0% higher imdb ratings.
The mean difference between ratings if on a 95% confidence interval of being bewteen(2.0%, 22.0%)
