In [1]:
import requests
from bs4 import BeautifulSoup
import pickle

# Scrapes transcript data from scrapsfromtheloft.com
def url_to_transcript(url):
    '''Returns transcript data specifically from scrapsfromtheloft.com.'''
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml")
    text = [p.text for p in soup.find(class_="elementor-widget-theme-post-content").find_all('p')]
    print(url)
    return text

# URLs of transcripts in scope
urls = ['https://scrapsfromtheloft.com/comedy/oliver-stone-vladimir-putin-interview-on-ukraine/',
        'https://scrapsfromtheloft.com/movies/face-to-face-with-ingmar-bergman/',
        'https://scrapsfromtheloft.com/movies/norman-lloyd-interview-alfred-hitchcock-and-saboteur/',
        'https://scrapsfromtheloft.com/movies/david-lynch-interview-dune-starlog-magazine/',
        'https://scrapsfromtheloft.com/movies/an-interview-with-ennio-morricone-1994/',
        'https://scrapsfromtheloft.com/movies/i-still-love-going-to-movies-an-interview-with-pauline-kael-1999/']

# Comedian names
interviewer = ['A', 'B', 'C', 'D', 'E', 'F']

In [2]:
transcripts = [url_to_transcript(u) for u in urls]

https://scrapsfromtheloft.com/comedy/oliver-stone-vladimir-putin-interview-on-ukraine/
https://scrapsfromtheloft.com/movies/face-to-face-with-ingmar-bergman/
https://scrapsfromtheloft.com/movies/norman-lloyd-interview-alfred-hitchcock-and-saboteur/
https://scrapsfromtheloft.com/movies/david-lynch-interview-dune-starlog-magazine/
https://scrapsfromtheloft.com/movies/an-interview-with-ennio-morricone-1994/
https://scrapsfromtheloft.com/movies/i-still-love-going-to-movies-an-interview-with-pauline-kael-1999/


In [3]:
!mkdir transcripts

for i, c in enumerate(interviewer):
   with open("transcripts/" + c + ".txt", "wb") as file:
       pickle.dump(transcripts[i], file)

A subdirectory or file transcripts already exists.


In [4]:
data = {}
for i, c in enumerate(interviewer):
    with open("transcripts/" + c + ".txt", "rb") as file:
        data[c] = pickle.load(file)

In [5]:
data['A'][:2]

['Oliver Stone conducted interviews with Vladimir Putin on four separate trips to Russia for a total of nine days between July 2, 2015 and February 10, 2017.',
 'The following is a selection of parts of the interviews where Putin talks about Ukraine.']

In [6]:
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ' '.join(list_of_text)
    return combined_text

In [7]:
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}

In [8]:
import pandas as pd
pd.set_option('max_colwidth',150)

data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['transcript']
data_df = data_df.sort_index()
data_df

Unnamed: 0,transcript
A,"Oliver Stone conducted interviews with Vladimir Putin on four separate trips to Russia for a total of nine days between July 2, 2015 and February ..."
B,"by William Wolf Ingmar Bergman stands outside his ranch house on Fårö, his Swedish island sanctuary, one arm affectionately around his wife, the o..."
C,"Alfred Hitchcock and Saboteur It was 1941, and Alfred Hitchcock had a picture about to be released called Suspicion, with Cary Grant and Joan Font..."
D,STARLOG INTERVIEW Director of “Dune” The filmmaker who tamed “The Elephant Man” undertakes the grandest vision of them all—the realization on the ...
E,by Jon Burlingame and Gary Crowdus Ennio Morricone occupies a unique place in the history of twentieth-century music. He is without question one o...
F,"by Leonard Quart Pauline Kael shook up the critical scene with her controversial 1963 Film Quarterly article, “Circles and Squares,” which attacke..."


In [9]:
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [10]:
data_clean = pd.DataFrame(data_df.transcript.apply(round1))
data_clean

Unnamed: 0,transcript
A,oliver stone conducted interviews with vladimir putin on four separate trips to russia for a total of nine days between july and february the ...
B,by william wolf ingmar bergman stands outside his ranch house on fårö his swedish island sanctuary one arm affectionately around his wife the othe...
C,alfred hitchcock and saboteur it was and alfred hitchcock had a picture about to be released called suspicion with cary grant and joan fontaine h...
D,starlog interview director of “dune” the filmmaker who tamed “the elephant man” undertakes the grandest vision of them all—the realization on the ...
E,by jon burlingame and gary crowdus ennio morricone occupies a unique place in the history of twentiethcentury music he is without question one of ...
F,by leonard quart pauline kael shook up the critical scene with her controversial film quarterly article “circles and squares” which attacked aute...


In [11]:
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\t', '', text)
    text = re.sub('     ', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [12]:
data_clean = pd.DataFrame(data_clean.transcript.apply(round2))
data_clean

Unnamed: 0,transcript
A,oliver stone conducted interviews with vladimir putin on four separate trips to russia for a total of nine days between july and february the ...
B,by william wolf ingmar bergman stands outside his ranch house on fårö his swedish island sanctuary one arm affectionately around his wife the othe...
C,alfred hitchcock and saboteur it was and alfred hitchcock had a picture about to be released called suspicion with cary grant and joan fontaine h...
D,starlog interview director of dune the filmmaker who tamed the elephant man undertakes the grandest vision of them all—the realization on the scre...
E,by jon burlingame and gary crowdus ennio morricone occupies a unique place in the history of twentiethcentury music he is without question one of ...
F,by leonard quart pauline kael shook up the critical scene with her controversial film quarterly article circles and squares which attacked auteur...


In [13]:
full_name = ['a','b','c','d','e','f']

data_df['full_name'] = full_name
data_df

Unnamed: 0,transcript,full_name
A,"Oliver Stone conducted interviews with Vladimir Putin on four separate trips to Russia for a total of nine days between July 2, 2015 and February ...",a
B,"by William Wolf Ingmar Bergman stands outside his ranch house on Fårö, his Swedish island sanctuary, one arm affectionately around his wife, the o...",b
C,"Alfred Hitchcock and Saboteur It was 1941, and Alfred Hitchcock had a picture about to be released called Suspicion, with Cary Grant and Joan Font...",c
D,STARLOG INTERVIEW Director of “Dune” The filmmaker who tamed “The Elephant Man” undertakes the grandest vision of them all—the realization on the ...,d
E,by Jon Burlingame and Gary Crowdus Ennio Morricone occupies a unique place in the history of twentieth-century music. He is without question one o...,e
F,"by Leonard Quart Pauline Kael shook up the critical scene with her controversial 1963 Film Quarterly article, “Circles and Squares,” which attacke...",f


In [14]:
data_df.to_pickle("corpus.pkl")

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Initialize TfidfVectorizer
cv = TfidfVectorizer(stop_words='english')

# Fit and transform the data
data_dtm = cv.fit_transform(data_clean.transcript)

# Create a DataFrame with the TF-IDF values and feature names
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names_out())
data_dtm.index = data_clean.index

print(data_dtm)


   ab  abandoned  abano  abilities  able  ables  abm  abms  abraham  abroad  \
A   0          1      0          0     6      0    6     2        0       2   
B   0          0      0          0     0      0    0     0        0       2   
C   1          0      0          0     0      0    0     0        0       0   
D   0          0      1          0     2      0    0     0        0       0   
E   0          0      0          1     5      1    0     0        2       0   
F   0          0      0          0     1      0    0     0        0       0   

   ...  yushchenko  zealous  zeffirellis  zenith  zero  zeroed  zled  zone  \
A  ...           4        0            0       0     1       1     0     1   
B  ...           0        1            0       0     0       0     0     0   
C  ...           0        0            0       1     4       0     1     0   
D  ...           0        0            0       0     0       0     0     0   
E  ...           0        0            1       0     0  

In [19]:
data_dtm.to_pickle("dtm.pkl")

In [21]:
data_clean.to_pickle('data_clean.pkl')
pickle.dump(cv, open("cv.pkl", "wb"))