# Gathering the data

In [2]:

# Web scraping, pickle imports
import requests
from bs4 import BeautifulSoup
import pickle

# Scrapes transcript data from scrapsfromtheloft.com
def url_to_transcript(url):
    '''Returns transcript data specifically from scrapsfromtheloft.com.'''
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml")
    # telling the data is in post content after doing inspect page and to pull all the paragraphs
    text = [p.text for p in soup.find(class_="post-content").find_all('p')]
    print(url)
    return text

# URLs of transcripts in scope
urls = ['http://scrapsfromtheloft.com/2017/05/06/louis-ck-oh-my-god-full-transcript/',
        'http://scrapsfromtheloft.com/2017/04/11/dave-chappelle-age-spin-2017-full-transcript/',
        'http://scrapsfromtheloft.com/2018/03/15/ricky-gervais-humanity-transcript/',
        'http://scrapsfromtheloft.com/2017/08/07/bo-burnham-2013-full-transcript/',
        'http://scrapsfromtheloft.com/2017/05/24/bill-burr-im-sorry-feel-way-2014-full-transcript/',
        'http://scrapsfromtheloft.com/2017/04/21/jim-jefferies-bare-2014-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/02/john-mulaney-comeback-kid-2015-full-transcript/',
        'http://scrapsfromtheloft.com/2017/10/21/hasan-minhaj-homecoming-king-2017-full-transcript/',
        'http://scrapsfromtheloft.com/2017/09/19/ali-wong-baby-cobra-2016-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/03/anthony-jeselnik-thoughts-prayers-2015-full-transcript/',
        'http://scrapsfromtheloft.com/2018/03/03/mike-birbiglia-my-girlfriends-boyfriend-2013-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/19/joe-rogan-triggered-2016-full-transcript/']

# Comedian names
comedians = ['louis', 'dave', 'ricky', 'bo', 'bill', 'jim', 'john', 'hasan', 'ali', 'anthony', 'mike', 'joe']

#Requesting for the transcripts to run hence takes some time
#creating a dictionary where every key is the comedian and every value is the transcript

In [4]:
transcripts = [url_to_transcript(u) for u in urls]

http://scrapsfromtheloft.com/2017/05/06/louis-ck-oh-my-god-full-transcript/
http://scrapsfromtheloft.com/2017/04/11/dave-chappelle-age-spin-2017-full-transcript/
http://scrapsfromtheloft.com/2018/03/15/ricky-gervais-humanity-transcript/
http://scrapsfromtheloft.com/2017/08/07/bo-burnham-2013-full-transcript/
http://scrapsfromtheloft.com/2017/05/24/bill-burr-im-sorry-feel-way-2014-full-transcript/
http://scrapsfromtheloft.com/2017/04/21/jim-jefferies-bare-2014-full-transcript/
http://scrapsfromtheloft.com/2017/08/02/john-mulaney-comeback-kid-2015-full-transcript/
http://scrapsfromtheloft.com/2017/10/21/hasan-minhaj-homecoming-king-2017-full-transcript/
http://scrapsfromtheloft.com/2017/09/19/ali-wong-baby-cobra-2016-full-transcript/
http://scrapsfromtheloft.com/2017/08/03/anthony-jeselnik-thoughts-prayers-2015-full-transcript/
http://scrapsfromtheloft.com/2018/03/03/mike-birbiglia-my-girlfriends-boyfriend-2013-full-transcript/
http://scrapsfromtheloft.com/2017/08/19/joe-rogan-triggered-

In [6]:

# # Pickle files for later use
for i, c in enumerate(comedians):
     with open("transcripts/" + c + ".txt", "wb") as file:
            pickle.dump(transcripts[i], file)

In [7]:

# Load pickled files
data = {}
for i, c in enumerate(comedians):
    with open("transcripts/" + c + ".txt", "rb") as file:
        data[c] = pickle.load(file)

In [8]:
data.keys()

dict_keys(['louis', 'dave', 'ricky', 'bo', 'bill', 'jim', 'john', 'hasan', 'ali', 'anthony', 'mike', 'joe'])

# Cleaning The Data

We can clean the data in the following ways:

1.Make text all lower case
2.Remove punctuation
3.Remove numerical values
4.Remove common non-sensical text (/n)
5.Tokenize text
6.Remove stop words

Further data cleaning can be done in the following steps after tokenization:

1.Stemming / lemmatization
2.Parts of speech tagging
3.Create bi-grams or tri-grams
4.Deal with typos

In [9]:
# We are going to change this to key: comedian, value: string format
#we are combining all the chunks of data to one ie all the paragraphs about a particular comedian to one
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ' '.join(list_of_text)
    return combined_text

In [10]:

# Combine it!
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}

In [11]:

# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
pd.set_option('max_colwidth',150)

data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['transcript']
data_df = data_df.sort_index()
data_df

Unnamed: 0,transcript
ali,"Ladies and gentlemen, please welcome to the stage: Ali Wong! Hi. Hello! Welcome! Thank you! Thank you for coming. Hello! Hello. We are gonna have ..."
anthony,"Thank you. Thank you. Thank you, San Francisco. Thank you so much. So good to be here. People were surprised when I told ’em I was gonna tape my s..."
bill,"[cheers and applause] All right, thank you! Thank you very much! Thank you. Thank you. Thank you. How are you? What’s going on? Thank you. It’s a ..."
bo,Bo What? Old MacDonald had a farm E I E I O And on that farm he had a pig E I E I O Here a snort There a Old MacDonald had a farm E I E I O [Appla...
dave,"This is Dave. He tells dirty jokes for a living. That stare is where most of his hard work happens. It signifies a profound train of thought, the ..."
hasan,"[theme music: orchestral hip-hop] [crowd roars] What’s up? Davis, what’s up? I’m home. I had to bring it back here. Netflix said, “Where do you wa..."
jim,"[Car horn honks] [Audience cheering] [Announcer] Ladies and gentlemen, please welcome to the stage Mr. Jim Jefferies! [Upbeat music playing] Hello..."
joe,"[rock music playing] [audience cheering] [announcer] Ladies and gentlemen, welcome Joe Rogan. [audience cheering and applauding] What the fuck is ..."
john,"All right, Petunia. Wish me luck out there. You will die on August 7th, 2037. That’s pretty good. All right. Hello. Hello, Chicago. Nice to see yo..."
louis,Intro\nFade the music out. Let’s roll. Hold there. Lights. Do the lights. Thank you. Thank you very much. I appreciate that. I don’t necessarily a...


In [13]:
#getting the information about the data frame
#raw corpus
data_df.info()
data_df.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 12 entries, ali to ricky
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   transcript  12 non-null     object
dtypes: object(1)
memory usage: 192.0+ bytes


Unnamed: 0,transcript
count,12
unique,12
top,"[Car horn honks] [Audience cheering] [Announcer] Ladies and gentlemen, please welcome to the stage Mr. Jim Jefferies! [Upbeat music playing] Hello..."
freq,1


In [16]:
# Applying the first set of text cleaning technique
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    #getting rid of data within brackets as they are sounds
    text = re.sub('\[.*?\]', '', text)
    #getting rid of punctuations
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    #getting rid of words with numbers
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [19]:

# Let's take a look at the updated text
data_clean = pd.DataFrame(data_df.transcript.apply(round1))
data_clean

Unnamed: 0,transcript
ali,ladies and gentlemen please welcome to the stage ali wong hi hello welcome thank you thank you for coming hello hello we are gonna have to get thi...
anthony,thank you thank you thank you san francisco thank you so much so good to be here people were surprised when i told em i was gonna tape my special ...
bill,all right thank you thank you very much thank you thank you thank you how are you whats going on thank you its a pleasure to be here in the great...
bo,bo what old macdonald had a farm e i e i o and on that farm he had a pig e i e i o here a snort there a old macdonald had a farm e i e i o this i...
dave,this is dave he tells dirty jokes for a living that stare is where most of his hard work happens it signifies a profound train of thought the alch...
hasan,whats up davis whats up im home i had to bring it back here netflix said where do you want to do the special la chicago new york i was like nah ...
jim,ladies and gentlemen please welcome to the stage mr jim jefferies hello sit down sit down sit down sit down sit down thank you boston i appre...
joe,ladies and gentlemen welcome joe rogan what the fuck is going on san francisco thanks for coming i appreciate it god damn put your phone down ...
john,all right petunia wish me luck out there you will die on august thats pretty good all right hello hello chicago nice to see you again thank you ...
louis,introfade the music out lets roll hold there lights do the lights thank you thank you very much i appreciate that i dont necessarily agree with yo...


# Organizing The Data

Corpus - a collection of text
Document-Term Matrix - word counts in matrix format

In [20]:
# Let's add the comedians' full names as well
full_names = ['Ali Wong', 'Anthony Jeselnik', 'Bill Burr', 'Bo Burnham', 'Dave Chappelle', 'Hasan Minhaj',
              'Jim Jefferies', 'Joe Rogan', 'John Mulaney', 'Louis C.K.', 'Mike Birbiglia', 'Ricky Gervais']

data_df['full_name'] = full_names
data_df

Unnamed: 0,transcript,full_name
ali,"Ladies and gentlemen, please welcome to the stage: Ali Wong! Hi. Hello! Welcome! Thank you! Thank you for coming. Hello! Hello. We are gonna have ...",Ali Wong
anthony,"Thank you. Thank you. Thank you, San Francisco. Thank you so much. So good to be here. People were surprised when I told ’em I was gonna tape my s...",Anthony Jeselnik
bill,"[cheers and applause] All right, thank you! Thank you very much! Thank you. Thank you. Thank you. How are you? What’s going on? Thank you. It’s a ...",Bill Burr
bo,Bo What? Old MacDonald had a farm E I E I O And on that farm he had a pig E I E I O Here a snort There a Old MacDonald had a farm E I E I O [Appla...,Bo Burnham
dave,"This is Dave. He tells dirty jokes for a living. That stare is where most of his hard work happens. It signifies a profound train of thought, the ...",Dave Chappelle
hasan,"[theme music: orchestral hip-hop] [crowd roars] What’s up? Davis, what’s up? I’m home. I had to bring it back here. Netflix said, “Where do you wa...",Hasan Minhaj
jim,"[Car horn honks] [Audience cheering] [Announcer] Ladies and gentlemen, please welcome to the stage Mr. Jim Jefferies! [Upbeat music playing] Hello...",Jim Jefferies
joe,"[rock music playing] [audience cheering] [announcer] Ladies and gentlemen, welcome Joe Rogan. [audience cheering and applauding] What the fuck is ...",Joe Rogan
john,"All right, Petunia. Wish me luck out there. You will die on August 7th, 2037. That’s pretty good. All right. Hello. Hello, Chicago. Nice to see yo...",John Mulaney
louis,Intro\nFade the music out. Let’s roll. Hold there. Lights. Do the lights. Thank you. Thank you very much. I appreciate that. I don’t necessarily a...,Louis C.K.


# Document-Term Matrix

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,aaaaah,aaaaahhhhhhh,aaaaauuugghhhhhh,aaaahhhhh,aaah,aah,abc,abcs,ability,abject,...,zee,zen,zeppelin,zero,zillion,zombie,zombies,zoning,zoo,éclair
ali,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
anthony,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bill,1,0,0,0,0,0,0,1,0,0,...,0,0,0,1,1,1,1,1,0,0
bo,0,1,1,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
dave,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hasan,0,0,0,0,0,0,0,0,0,0,...,2,1,0,1,0,0,0,0,0,0
jim,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
joe,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
john,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
louis,0,0,0,0,0,3,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0


In [28]:
# Let's also pickle the cleaned data (before we put it in document-term matrix format) and the CountVectorizer object
data_clean.to_pickle('data_clean.pkl')
pickle.dump(cv, open("cv.pkl", "wb"))

# Exploratory Data Analysis

Most common words - find these and create word clouds
Size of vocabulary - look number of unique words and also how quickly someone speaks
Amount of profanity - most common terms

In [22]:
data=data_dtm
data.head()

Unnamed: 0,aaaaah,aaaaahhhhhhh,aaaaauuugghhhhhh,aaaahhhhh,aaah,aah,abc,abcs,ability,abject,...,zee,zen,zeppelin,zero,zillion,zombie,zombies,zoning,zoo,éclair
ali,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
anthony,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bill,1,0,0,0,0,0,0,1,0,0,...,0,0,0,1,1,1,1,1,0,0
bo,0,1,1,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
dave,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:

# Find the top 30 words said by each comedian
top_dict = {}
for c in data.columns:
    top = data[c].sort_values(ascending=False).head(30)
    top_dict[c]= list(zip(top.index, top.values))

top_dict

{'aaaaah': [('bill', 1),
  ('ricky', 0),
  ('mike', 0),
  ('louis', 0),
  ('john', 0),
  ('joe', 0),
  ('jim', 0),
  ('hasan', 0),
  ('dave', 0),
  ('bo', 0),
  ('anthony', 0),
  ('ali', 0)],
 'aaaaahhhhhhh': [('bo', 1),
  ('ricky', 0),
  ('mike', 0),
  ('louis', 0),
  ('john', 0),
  ('joe', 0),
  ('jim', 0),
  ('hasan', 0),
  ('dave', 0),
  ('bill', 0),
  ('anthony', 0),
  ('ali', 0)],
 'aaaaauuugghhhhhh': [('bo', 1),
  ('ricky', 0),
  ('mike', 0),
  ('louis', 0),
  ('john', 0),
  ('joe', 0),
  ('jim', 0),
  ('hasan', 0),
  ('dave', 0),
  ('bill', 0),
  ('anthony', 0),
  ('ali', 0)],
 'aaaahhhhh': [('bo', 1),
  ('ricky', 0),
  ('mike', 0),
  ('louis', 0),
  ('john', 0),
  ('joe', 0),
  ('jim', 0),
  ('hasan', 0),
  ('dave', 0),
  ('bill', 0),
  ('anthony', 0),
  ('ali', 0)],
 'aaah': [('dave', 1),
  ('ricky', 0),
  ('mike', 0),
  ('louis', 0),
  ('john', 0),
  ('joe', 0),
  ('jim', 0),
  ('hasan', 0),
  ('bo', 0),
  ('bill', 0),
  ('anthony', 0),
  ('ali', 0)],
 'aah': [('louis', 3),


In [24]:
# Print the top 15 words said by each comedian
for comedian, top_words in top_dict.items():
    print(comedian)
    print(', '.join([word for word, count in top_words[0:14]]))
    print('---')

aaaaah
bill, ricky, mike, louis, john, joe, jim, hasan, dave, bo, anthony, ali
---
aaaaahhhhhhh
bo, ricky, mike, louis, john, joe, jim, hasan, dave, bill, anthony, ali
---
aaaaauuugghhhhhh
bo, ricky, mike, louis, john, joe, jim, hasan, dave, bill, anthony, ali
---
aaaahhhhh
bo, ricky, mike, louis, john, joe, jim, hasan, dave, bill, anthony, ali
---
aaah
dave, ricky, mike, louis, john, joe, jim, hasan, bo, bill, anthony, ali
---
aah
louis, ricky, mike, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
abc
ali, ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, anthony
---
abcs
bill, ricky, mike, louis, john, joe, jim, hasan, dave, bo, anthony, ali
---
ability
ricky, bo, mike, louis, john, joe, jim, hasan, dave, bill, anthony, ali
---
abject
ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
able
john, ricky, joe, ali, louis, jim, hasan, bill, mike, dave, bo, anthony
---
ablebodied
jim, ricky, mike, louis, john, joe, hasan, dave, bo, bill, anthony, 

---
body
ali, hasan, louis, dave, mike, john, joe, anthony, ricky, jim, bo, bill
---
bodys
ali, ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, anthony
---
bodyslammed
bill, ricky, mike, louis, john, joe, jim, hasan, dave, bo, anthony, ali
---
bognor
ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
boilerplate
louis, ricky, mike, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
boing
jim, ricky, mike, louis, john, joe, hasan, dave, bo, bill, anthony, ali
---
bolt
louis, ricky, mike, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
bomb
dave, louis, jim, ricky, mike, john, joe, hasan, bo, bill, anthony, ali
---
bombed
dave, ricky, mike, louis, john, joe, jim, hasan, bo, bill, anthony, ali
---
bombs
jim, dave, mike, ricky, louis, john, joe, hasan, bo, bill, anthony, ali
---
bones
joe, louis, ricky, mike, john, jim, hasan, dave, bo, bill, anthony, ali
---
bongo
dave, ricky, mike, louis, john, joe, jim, hasan, bo, bill, anthony, ali
---
b

john, ricky, mike, louis, joe, jim, hasan, dave, bo, bill, anthony, ali
---
confuses
john, ricky, mike, louis, joe, jim, hasan, dave, bo, bill, anthony, ali
---
confusing
joe, jim, john, ali, ricky, mike, louis, hasan, dave, bo, bill, anthony
---
confusion
louis, ricky, mike, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
congrats
bo, ricky, mike, louis, john, joe, jim, hasan, dave, bill, anthony, ali
---
congratulate
jim, ricky, mike, louis, john, joe, hasan, dave, bo, bill, anthony, ali
---
congratulations
mike, ricky, louis, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
connect
mike, ricky, louis, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
connected
mike, jim, hasan, ricky, louis, john, joe, dave, bo, bill, anthony, ali
---
connections
dave, ricky, mike, louis, john, joe, jim, hasan, bo, bill, anthony, ali
---
conqueror
ali, ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, anthony
---
conscious
ricky, mike, louis, john, joe, jim, hasan, dave

endure
hasan, ricky, mike, louis, john, joe, jim, dave, bo, bill, anthony, ali
---
enemies
jim, hasan, ricky, mike, louis, john, joe, dave, bo, bill, anthony, ali
---
enemy
ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
energy
john, ricky, mike, louis, joe, jim, hasan, dave, bo, bill, anthony, ali
---
engage
jim, ricky, mike, louis, john, joe, hasan, dave, bo, bill, anthony, ali
---
engaged
mike, john, jim, ali, ricky, louis, joe, hasan, dave, bo, bill, anthony
---
engaging
ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
engine
bill, ricky, mike, louis, john, joe, jim, hasan, dave, bo, anthony, ali
---
england
jim, ricky, mike, louis, john, joe, hasan, dave, bo, bill, anthony, ali
---
english
hasan, ricky, mike, bill, louis, john, joe, jim, dave, bo, anthony, ali
---
enjoy
jim, bill, mike, john, bo, joe, dave, anthony, ricky, louis, hasan, ali
---
enjoyed
anthony, ricky, joe, mike, louis, john, jim, hasan, dave, bo, bill, ali
---

---
happens
louis, dave, jim, hasan, bill, ricky, ali, john, joe, bo, mike, anthony
---
happiest
joe, hasan, ricky, mike, louis, john, jim, dave, bo, bill, anthony, ali
---
happily
bo, ricky, mike, louis, john, joe, jim, hasan, dave, bill, anthony, ali
---
happiness
louis, john, joe, hasan, ricky, mike, jim, dave, bo, bill, anthony, ali
---
happy
jim, mike, john, ricky, joe, hasan, bo, bill, dave, anthony, louis, ali
---
hard
jim, bo, dave, mike, louis, hasan, anthony, bill, john, ricky, joe, ali
---
hardass
anthony, ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, ali
---
hardboiled
john, ricky, mike, louis, joe, jim, hasan, dave, bo, bill, anthony, ali
---
hardearned
ali, ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, anthony
---
harder
bo, ricky, mike, louis, john, joe, jim, hasan, dave, bill, anthony, ali
---
hardest
ricky, anthony, mike, louis, john, joe, jim, hasan, dave, bo, bill, ali
---
hardwired
ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, 

mike, anthony, ricky, louis, john, joe, jim, hasan, dave, bo, bill, ali
---
lawyers
mike, ricky, louis, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
lay
louis, bill, ricky, mike, john, joe, jim, hasan, dave, bo, anthony, ali
---
layer
joe, ricky, mike, louis, john, jim, hasan, dave, bo, bill, anthony, ali
---
layered
ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
layers
dave, ricky, mike, louis, john, joe, jim, hasan, bo, bill, anthony, ali
---
layin
bill, ricky, mike, louis, john, joe, jim, hasan, dave, bo, anthony, ali
---
laying
jim, ricky, mike, louis, john, joe, hasan, dave, bo, bill, anthony, ali
---
lays
jim, hasan, ricky, mike, louis, john, joe, dave, bo, bill, anthony, ali
---
lazy
john, jim, bill, ricky, mike, louis, joe, hasan, dave, bo, anthony, ali
---
lbgtq
dave, ricky, mike, louis, john, joe, jim, hasan, bo, bill, anthony, ali
---
le
john, ricky, mike, louis, joe, jim, hasan, dave, bo, bill, anthony, ali
---
lead
bo, ricky, mike, l

hasan, bill, ricky, mike, louis, john, joe, jim, dave, bo, anthony, ali
---
nerfedup
joe, ricky, mike, louis, john, jim, hasan, dave, bo, bill, anthony, ali
---
nerve
mike, bill, ali, ricky, louis, john, joe, jim, hasan, dave, bo, anthony
---
nervous
joe, mike, hasan, ricky, louis, john, jim, dave, bo, bill, anthony, ali
---
nervously
ricky, bill, mike, louis, john, joe, jim, hasan, dave, bo, anthony, ali
---
nestled
ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
net
joe, ricky, mike, louis, john, jim, hasan, dave, bo, bill, anthony, ali
---
netdetectivecom
mike, ricky, louis, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
netflix
jim, joe, hasan, dave, ali, ricky, mike, louis, john, bo, bill, anthony
---
network
anthony, bill, ricky, mike, louis, john, joe, jim, hasan, dave, bo, ali
---
neurological
bo, ricky, mike, louis, john, joe, jim, hasan, dave, bill, anthony, ali
---
neutralized
bill, ricky, mike, louis, john, joe, jim, hasan, dave, bo, ant

ali, ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, anthony
---
program
bill, mike, louis, ricky, john, joe, jim, hasan, dave, bo, anthony, ali
---
progressive
ricky, bill, mike, louis, john, joe, jim, hasan, dave, bo, anthony, ali
---
prohibition
jim, ricky, mike, louis, john, joe, hasan, dave, bo, bill, anthony, ali
---
project
ali, ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, anthony
---
prolonged
bo, ricky, mike, louis, john, joe, jim, hasan, dave, bill, anthony, ali
---
prom
hasan, anthony, ricky, mike, louis, john, joe, jim, dave, bo, bill, ali
---
promise
hasan, anthony, louis, dave, bo, ali, ricky, mike, john, joe, jim, bill
---
promised
hasan, ricky, mike, louis, john, joe, jim, dave, bo, bill, anthony, ali
---
promote
bill, ricky, mike, louis, john, joe, jim, hasan, dave, bo, anthony, ali
---
promoted
jim, john, ricky, mike, louis, joe, hasan, dave, bo, bill, anthony, ali
---
prompter
hasan, ricky, mike, louis, john, joe, jim, dave, bo, bill, anthony

---
shaking
bill, ricky, jim, mike, louis, john, joe, hasan, dave, bo, anthony, ali
---
shall
ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
shame
ali, ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, anthony
---
shandling
dave, ricky, mike, louis, john, joe, jim, hasan, bo, bill, anthony, ali
---
shape
bill, ali, ricky, mike, louis, john, joe, jim, hasan, dave, bo, anthony
---
shaped
jim, ricky, mike, louis, john, joe, hasan, dave, bo, bill, anthony, ali
---
shaq
joe, ricky, mike, louis, john, jim, hasan, dave, bo, bill, anthony, ali
---
shaquille
joe, ricky, mike, louis, john, jim, hasan, dave, bo, bill, anthony, ali
---
shar
bill, ricky, mike, louis, john, joe, jim, hasan, dave, bo, anthony, ali
---
share
ricky, joe, anthony, mike, louis, john, jim, hasan, dave, bo, bill, ali
---
shared
mike, ricky, louis, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
shark
anthony, ricky, louis, mike, john, joe, jim, hasan, dave, bo, bill, ali
---
sh

---
swells
ali, ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, anthony
---
swerving
dave, ricky, mike, louis, john, joe, jim, hasan, bo, bill, anthony, ali
---
swift
joe, ricky, mike, louis, john, jim, hasan, dave, bo, bill, anthony, ali
---
swim
louis, bo, ricky, mike, john, joe, jim, hasan, dave, bill, anthony, ali
---
swimming
anthony, ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, ali
---
swing
mike, ricky, louis, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
swinging
john, ricky, mike, louis, joe, jim, hasan, dave, bo, bill, anthony, ali
---
swings
hasan, ricky, mike, louis, john, joe, jim, dave, bo, bill, anthony, ali
---
swiping
hasan, ricky, mike, louis, john, joe, jim, dave, bo, bill, anthony, ali
---
switch
joe, dave, bill, ricky, mike, louis, john, jim, hasan, bo, anthony, ali
---
switched
john, ricky, mike, louis, joe, jim, hasan, dave, bo, bill, anthony, ali
---
swore
mike, ricky, louis, john, joe, jim, hasan, dave, bo, bill, anthony, ali


---
violets
bo, ricky, mike, louis, john, joe, jim, hasan, dave, bill, anthony, ali
---
viral
hasan, ricky, mike, louis, john, joe, jim, dave, bo, bill, anthony, ali
---
virgin
hasan, ricky, mike, louis, john, joe, jim, dave, bo, bill, anthony, ali
---
virginity
mike, ricky, louis, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
virgins
mike, ricky, louis, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
virtue
joe, ricky, mike, louis, john, jim, hasan, dave, bo, bill, anthony, ali
---
visa
hasan, ricky, mike, louis, john, joe, jim, dave, bo, bill, anthony, ali
---
visibly
ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---
vision
hasan, louis, john, ricky, mike, joe, jim, dave, bo, bill, anthony, ali
---
visit
joe, hasan, ricky, mike, louis, john, jim, dave, bo, bill, anthony, ali
---
visor
ali, ricky, mike, louis, john, joe, jim, hasan, dave, bo, bill, anthony
---
vital
louis, ricky, mike, john, joe, jim, hasan, dave, bo, bill, anthony, ali
---


In [25]:
# Look at the most common top words --> add them to the stop word list
from collections import Counter

# Let's first pull out the top 30 words for each comedian
words = []
for comedian in data.columns:
    top = [word for (word, count) in top_dict[comedian]]
    for t in top:
        words.append(t)
        
words

['bill',
 'ricky',
 'mike',
 'louis',
 'john',
 'joe',
 'jim',
 'hasan',
 'dave',
 'bo',
 'anthony',
 'ali',
 'bo',
 'ricky',
 'mike',
 'louis',
 'john',
 'joe',
 'jim',
 'hasan',
 'dave',
 'bill',
 'anthony',
 'ali',
 'bo',
 'ricky',
 'mike',
 'louis',
 'john',
 'joe',
 'jim',
 'hasan',
 'dave',
 'bill',
 'anthony',
 'ali',
 'bo',
 'ricky',
 'mike',
 'louis',
 'john',
 'joe',
 'jim',
 'hasan',
 'dave',
 'bill',
 'anthony',
 'ali',
 'dave',
 'ricky',
 'mike',
 'louis',
 'john',
 'joe',
 'jim',
 'hasan',
 'bo',
 'bill',
 'anthony',
 'ali',
 'louis',
 'ricky',
 'mike',
 'john',
 'joe',
 'jim',
 'hasan',
 'dave',
 'bo',
 'bill',
 'anthony',
 'ali',
 'ali',
 'ricky',
 'mike',
 'louis',
 'john',
 'joe',
 'jim',
 'hasan',
 'dave',
 'bo',
 'bill',
 'anthony',
 'bill',
 'ricky',
 'mike',
 'louis',
 'john',
 'joe',
 'jim',
 'hasan',
 'dave',
 'bo',
 'anthony',
 'ali',
 'ricky',
 'bo',
 'mike',
 'louis',
 'john',
 'joe',
 'jim',
 'hasan',
 'dave',
 'bill',
 'anthony',
 'ali',
 'ricky',
 'mike',


In [26]:

# Let's aggregate this list and identify the most common words along with how many routines they occur in
Counter(words).most_common()

[('bill', 7480),
 ('ricky', 7480),
 ('mike', 7480),
 ('louis', 7480),
 ('john', 7480),
 ('joe', 7480),
 ('jim', 7480),
 ('hasan', 7480),
 ('dave', 7480),
 ('bo', 7480),
 ('anthony', 7480),
 ('ali', 7480)]

In [27]:

# If more than half of the comedians have it as a top word, exclude it from the list
add_stop_words = [word for word, count in Counter(words).most_common() if count > 6]
add_stop_words

['bill',
 'ricky',
 'mike',
 'louis',
 'john',
 'joe',
 'jim',
 'hasan',
 'dave',
 'bo',
 'anthony',
 'ali']

In [30]:
# Let's update our document-term matrix with the new list of stop words
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

# Read in cleaned data
data_clean = pd.read_pickle('data_clean.pkl')

# Add new stop words
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate document-term matrix
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(data_clean.transcript)
data_stop = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_stop.index = data_clean.index
# Pickle it for later use
import pickle
pickle.dump(cv, open("cv_stop.pkl", "wb"))
data_stop.to_pickle("dtm_stop.pkl")