In [1]:
# Web scraping, pickle imports
import requests
from bs4 import BeautifulSoup
import pickle

# Scrapes transcript data from scrapsfromtheloft.com
def url_to_transcript(url):
    '''Returns transcript data specifically from scrapsfromtheloft.com.'''
    page = requests.get(url).text
    soup = BeautifulSoup(page, 'lxml')
    text = [p.text for p in soup.find_all('p')]
    print(url)
    return text

# URLs of transcripts in scope
urls = [
        'https://scrapsfromtheloft.com/tv-series/the-last-of-us-s01e02-infected-transcript/',
        'https://scrapsfromtheloft.com/tv-series/the-last-of-us-s01e03-long-long-time-transcript/',
        'https://scrapsfromtheloft.com/tv-series/the-last-of-us-s01e04-please-hold-on-to-my-hand-transcript/'
         
        ]

# Comedian names
episodes = ['episode2', 'episode3', 'episode4' ]

In [2]:
transcripts = [url_to_transcript(u) for u in urls] 

https://scrapsfromtheloft.com/tv-series/the-last-of-us-s01e02-infected-transcript/
https://scrapsfromtheloft.com/tv-series/the-last-of-us-s01e03-long-long-time-transcript/
https://scrapsfromtheloft.com/tv-series/the-last-of-us-s01e04-please-hold-on-to-my-hand-transcript/


In [3]:
!mkdir transcripts

for i, c in enumerate(episodes):
    with open("transcripts/" + c + ".txt", "wb") as file:
        pickle.dump(transcripts[i], file)


In [4]:
# Load pickled files
data = {}
for i, c in enumerate(episodes):
    with open("transcripts/" + c + ".txt", "rb") as file:
        data[c] = pickle.load(file)

In [5]:
# Double check to make sure data has been loaded properly
data.keys()

dict_keys(['episode2', 'episode3', 'episode4'])

In [6]:
next(iter(data.values()))

['Original air date: January 23, 2023',
 'On the day of the outbreak, in\xa0Jakarta, Indonesia, a\xa0mycologist\xa0learns of the oncoming pandemic and advises the government to bomb the city to slow the spread. In the present, Ellie explains she is being transported west in hopes of her being used to find a cure. Discovering the path to their meetup is swarmed with infected, they cut through a museum and are attacked by two blind variants of infected, one of whom bites Ellie before Joel and Tess kill them. They arrive at the Old State House to find the Fireflies slaughtered by Infected. Tess reveals she was bitten while Ellie’s bite is healing, proving her immunity. Joel shoots an infected, alerting the others to their location, Tess convinces him to escape with Ellie as she stays behind and blows up the building, killing herself and the horde.',
 '* * *',
 '(sirens wailing)',
 'JAKARTA, INDONESIA\nSEPTEMBER 24TH, 2003',
 '(cars honking)',
 '(indistinct chatter)',
 '(chatter stops)',
 

In [7]:
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ' '.join(list_of_text)
    return combined_text

In [8]:
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}

In [9]:
import pandas as pd
pd.set_option('max_colwidth',150)

data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['transcript']
data_df = data_df.sort_index()
data_df

Unnamed: 0,transcript
episode2,"Original air date: January 23, 2023 On the day of the outbreak, in Jakarta, Indonesia, a mycologist learns of the oncoming pandemic and advises th..."
episode3,"Original air date: January 30, 2023 In the present, Joel and Ellie heed Tess’s final instructions and hike to meet allies Bill and Frank. On their..."
episode4,"Original air date: February 6, 2023 Traveling through Missouri on their way to Wyoming, Joel and Ellie take a shortcut through the ruins of Kansas..."


In [10]:
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub(r'[0-9]+','',text)  #number 
    text = re.sub("[^A-Za-z0-9 ]",'',text)  #alpha numeric
    years = re.findall("b(19[40][0-9]|20[0-1][0-9]|2020)b",text)   #years extracter
    return text

round1 = lambda x: clean_text_round1(x)

In [11]:
data_clean = pd.DataFrame(data_df.transcript.apply(round1))
data_clean

Unnamed: 0,transcript
episode2,original air date january on the day of the outbreak injakarta indonesia amycologistlearns of the oncoming pandemic and advises the government t...
episode3,original air date january in the present joel and ellie heed tesss final instructions and hike to meet alliesbillandfrank on their journey joel ...
episode4,original air date february traveling throughmissourion their way to wyoming joel and ellie take a shortcut through the ruins ofkansas citywhen t...


In [12]:
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [13]:
data_clean = pd.DataFrame(data_clean.transcript.apply(round2))
data_clean

Unnamed: 0,transcript
episode2,original air date january on the day of the outbreak injakarta indonesia amycologistlearns of the oncoming pandemic and advises the government t...
episode3,original air date january in the present joel and ellie heed tesss final instructions and hike to meet alliesbillandfrank on their journey joel ...
episode4,original air date february traveling throughmissourion their way to wyoming joel and ellie take a shortcut through the ruins ofkansas citywhen t...


In [14]:
data_df

Unnamed: 0,transcript
episode2,"Original air date: January 23, 2023 On the day of the outbreak, in Jakarta, Indonesia, a mycologist learns of the oncoming pandemic and advises th..."
episode3,"Original air date: January 30, 2023 In the present, Joel and Ellie heed Tess’s final instructions and hike to meet allies Bill and Frank. On their..."
episode4,"Original air date: February 6, 2023 Traveling through Missouri on their way to Wyoming, Joel and Ellie take a shortcut through the ruins of Kansas..."


In [15]:
data_df.to_pickle("corpus.pkl")

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index
data_dtm



Unnamed: 0,abandoned,abide,absolutely,accelerates,accelerating,according,actually,address,adult,advice,...,yelps,yep,yes,youd,youll,younger,youre,youve,yuck,zeroing
episode2,0,0,1,0,0,1,1,1,1,0,...,0,1,6,0,2,0,6,2,0,0
episode3,1,3,0,1,0,0,3,1,0,3,...,0,2,10,0,5,0,10,1,1,1
episode4,1,0,0,0,1,0,3,1,0,0,...,1,0,1,1,1,2,10,0,0,0


In [17]:
data_dtm.to_pickle("dtm.pkl")

In [18]:
# Let's also pickle the cleaned data (before we put it in document-term matrix format) and the CountVectorizer object
data_clean.to_pickle('data_clean.pkl')
pickle.dump(cv, open("cv.pkl", "wb"))