# Import modules

In [None]:
import requests
import pandas as pd
import pickle
from bs4 import BeautifulSoup

# URL of the webpage

In [None]:
url = "https://www.rev.com/blog/transcripts/donald-trump-joe-biden-final-presidential-debate-transcript-2020"

# Define a function to extract text from the page

In [None]:
def getdata(url, timeout=30):
    """Download a web page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly rather than silently scraping an error page.
    r.raise_for_status()
    return r.text

# Target Biden's remarks to extract

In [None]:
# Download the page, parse it, and keep only the <p> elements spoken by Biden.
htmldata = getdata(url)
soup = BeautifulSoup(htmldata, 'html.parser')
paragraphs = (p.text for p in soup.find_all("p"))
text = [content for content in paragraphs if "Joe Biden:" in content]
print(text)

# Combine the text

In [None]:
text = '\n'.join(text)
print(text)

# Cleaning Biden's text

In [None]:
# Make text lowercase, remove the name, remove text in square brackets, remove punctuation and remove words containing numbers.
import re
import string

def clean_text_round1(text):
    """Apply the first cleaning pass to Biden's remarks.

    Lowercases the text, then removes the speaker name, anything inside
    square brackets, ASCII punctuation, and any word containing a digit.

    Args:
        text: Raw transcript text.

    Returns:
        The cleaned text. Whitespace (including newlines) is left untouched.
    """
    text = text.lower()
    text = re.sub(r'joe biden', '', text)
    # Raw strings keep the regex escapes (\[, \w, \d) from being read as
    # invalid string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Check the result

In [None]:
round1 = clean_text_round1(text)
print(round1)

# More cleaning...

In [None]:
# Get rid of some additional punctuation and nonsensical text that was missed the first time around.
def clean_text_round2(text):
    """Apply the second cleaning pass: strip curly quotes/ellipses and newlines.

    Args:
        text: Text that has already been through the first cleaning pass.

    Returns:
        The text with typographic quotes and ellipsis characters removed and
        every newline replaced by a single space.
    """
    text = re.sub('[‘’“”…]', '', text)
    # Replace (not delete) newlines so the last word of one paragraph is not
    # fused with the first word of the next.
    text = re.sub(r'\n', ' ', text)
    return text

# Check again

In [None]:
round2 = clean_text_round2(round1)
print(round2)

# Save Biden's remarks to disk (pickled, despite the .txt file name)

In [None]:
# The remarks are pickled (binary) even though the file is named .txt.
# The context manager closes the file even if pickling fails.
with open('biden.txt', 'wb') as f:
    pickle.dump(round2, f)

# Repeat the same process for Trump

In [None]:
url = "https://www.rev.com/blog/transcripts/donald-trump-joe-biden-final-presidential-debate-transcript-2020"

def getdata(url, timeout=30):
    """Download a web page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly rather than silently scraping an error page.
    r.raise_for_status()
    return r.text

# Download the page, parse it, and join Trump's <p> elements with newlines.
htmldata = getdata(url)
soup = BeautifulSoup(htmldata, 'html.parser')
paragraphs = (p.text for p in soup.find_all("p"))
text = '\n'.join(content for content in paragraphs if "Donald Trump:" in content)

def clean_text_round1(text):
    """Apply the first cleaning pass to Trump's remarks.

    Lowercases the text, then removes the speaker name, anything inside
    square brackets, ASCII punctuation, and any word containing a digit.

    Args:
        text: Raw transcript text.

    Returns:
        The cleaned text. Whitespace (including newlines) is left untouched.
    """
    text = text.lower()
    text = re.sub(r'donald trump', '', text)
    # Raw strings keep the regex escapes (\[, \w, \d) from being read as
    # invalid string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

round1 = clean_text_round1(text)

def clean_text_round2(text):
    """Apply the second cleaning pass: strip curly quotes/ellipses and newlines.

    Args:
        text: Text that has already been through the first cleaning pass.

    Returns:
        The text with typographic quotes and ellipsis characters removed and
        every newline replaced by a single space.
    """
    text = re.sub('[‘’“”…]', '', text)
    # Replace (not delete) newlines so the last word of one paragraph is not
    # fused with the first word of the next.
    text = re.sub(r'\n', ' ', text)
    return text

round2 = clean_text_round2(round1)
print(round2)

# Looks good. Save Trump's text as well.

In [None]:
# The remarks are pickled (binary) even though the file is named .txt.
# The context manager closes the file even if pickling fails.
with open('trump.txt', 'wb') as f:
    pickle.dump(round2, f)

# Load the pickled files

In [None]:
# Load each speaker's pickled transcript into a dict keyed by short name.
names = ['biden', 'trump']
data = {}
for n in names:
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # files written by this notebook.
    with open(n + ".txt", "rb") as file:
        data[n] = pickle.load(file)

In [None]:
data.keys()

In [None]:
data['trump'][:10]

In [None]:
next(iter(data.keys()))

In [None]:
next(iter(data.values()))

# Convert lists (paragraphs) of texts into a single string 

In [None]:
def combine_text(list_of_text):
    """Concatenate the given pieces of text, with no separator, into one string."""
    return ''.join(list_of_text)

In [None]:
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}
data_combined

In [None]:
import pandas as pd

# Widen the displayed column width so more of each transcript is visible.
pd.set_option('max_colwidth',150)

# One row per speaker, a single 'transcript' column, rows ordered by speaker key.
transposed = pd.DataFrame.from_dict(data_combined).T
transposed.columns = ['transcript']
data_df = transposed.sort_index()
data_df

In [None]:
data_df.transcript.loc['biden']

In [None]:
# Display names, listed in the same order as the rows of data_df
# (presumably 'biden' then 'trump' after the index sort — verify if speakers change).
full_names = ['Joe Biden', 'Donald Trump']
data_df['full_name'] = full_names
data_df
# Persist the cleaned corpus; later cells read corpus.pkl back in.
data_df.to_pickle("corpus.pkl")

# Make a dataframe with what we've got.

# Convert the dataframe into a document-term matrix

In [None]:
# Also, exclude common English stop words from the index.

from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term matrix: one row per speaker, one column per word.
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_df.transcript)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older installations.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_dtm.index = data_df.index
data_dtm

# Pickle the data

In [None]:
import pickle
# Persist the document-term matrix, the corpus dataframe and the fitted vectorizer.
data_dtm.to_pickle("dtm.pkl")
data_df.to_pickle('df.pkl')
# Use a context manager so the file handle is closed after writing.
with open("cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)

# Transpose the data for easy manipulation

In [None]:
data = pd.read_pickle('dtm.pkl')
data = data.transpose()
data.head()

# Find the top 30 words used

In [None]:
# Map each speaker to their 30 most frequent words as (word, count) pairs.
top_dict = {}
for speaker in data.columns:
    ranked = data[speaker].sort_values(ascending=False)
    top = ranked.head(30)
    top_dict[speaker] = list(zip(top.index, top.values))

top_dict

In [None]:
# Print each speaker followed by their 14 most frequent words.
for speaker, ranked_words in top_dict.items():
    leading = [pair[0] for pair in ranked_words[:14]]
    print(speaker)
    print(', '.join(leading))
    print('---')

# Find top 30 words used in the debate

In [None]:
from collections import Counter

# Let's first pull out the top 30 words for both
# Flatten every speaker's top-word list into one list (duplicates kept).
words = [word for index in data.columns for (word, count) in top_dict[index]]

words

# Check the top words in common

In [None]:
Counter(words).most_common()

# Find the top words in common

In [None]:
add_stop_words = [word for word, count in Counter(words).most_common() if count > 1]
add_stop_words

# Update the document-term matrix with these new stop words

In [None]:
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

# Read in cleaned data
# Read in cleaned data
df = pd.read_pickle('df.pkl')

# Add new stop words
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate document-term matrix
# Recent scikit-learn releases validate stop_words and only accept
# 'english', a list or None, so the frozenset is converted to a list here.
cv = CountVectorizer(stop_words=list(stop_words))
data_cv = cv.fit_transform(df.transcript)
# get_feature_names() was removed in scikit-learn 1.2; fall back on old installs.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
data_stop = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_stop.index = df.index

# Pickle the data (context manager closes the vectorizer file handle)
with open("cv_stop.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)
data_stop.to_pickle("dtm_stop.pkl")

# Create word clouds

In [None]:
# To download a necessary module, open command prompt and run the command "conda install -c conda-forge wordcloud"
from wordcloud import WordCloud
wc = WordCloud(stopwords=stop_words, background_color="white", colormap="Dark2",
               max_font_size=150, random_state=42)

In [None]:
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = [16, 6]

# Titles, in the same order as the columns of `data`.
full_names = ['Joe Biden', 'Donald Trump']

# Create subplots for each
for index, name in enumerate(data.columns):
    wc.generate(df.transcript[name])

    plt.subplot(3, 4, index+1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(full_names[index])

# Save before show(): once the figure has been shown it is released, and a
# later savefig() would write an empty image.
plt.savefig("figure.png")
plt.show()

# Find the number of unique words Biden and Trump used

In [None]:
# Count, per speaker, how many vocabulary words have a non-zero count.
unique_list = [data[speaker].to_numpy().nonzero()[0].size for speaker in data.columns]
rows = list(zip(full_names, unique_list))
data_words = pd.DataFrame(rows, columns=['name', 'unique_words'])
data_unique_sort = data_words.sort_values(by='unique_words')
data_unique_sort

# Sentiment Analysis

In [None]:
data = pd.read_pickle('corpus.pkl')
data

In [None]:
# Download the Python module textblob: run "conda install -c conda-forge textblob" in command prompt
from textblob import TextBlob

def pol(x):
    """Return the TextBlob polarity score of the text ``x``."""
    return TextBlob(x).sentiment.polarity


def sub(x):
    """Return the TextBlob subjectivity score of the text ``x``."""
    return TextBlob(x).sentiment.subjectivity


# Score every transcript and store the results as new columns.
transcripts = data['transcript']
data['polarity'] = transcripts.apply(pol)
data['subjectivity'] = transcripts.apply(sub)
data

# Topic modeling

In [None]:
data = pd.read_pickle('dtm_stop.pkl')
data

In [None]:
# Download the Python module gensim: run "conda install -c conda-forge gensim" in command prompt
from gensim import matutils, models
import scipy.sparse
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Convert the document-term matrix into a term-document matrix

In [None]:
tdm = data.transpose()
tdm.head()

# Convert tdm into a sparse matrix, then into a gensim corpus

In [None]:
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

# Get the dictionary of words with their respective locations in the corpus

In [None]:
# Reload the fitted vectorizer; the context manager closes the file handle.
# NOTE: pickle.load executes arbitrary code; only load files this notebook wrote.
with open("cv_stop.pkl", "rb") as cv_file:
    cv = pickle.load(cv_file)
# vocabulary_ maps term -> column index; gensim needs the inverse mapping.
id2word = {index: term for term, index in cv.vocabulary_.items()}

# Let's divide the words into 3 topics

In [None]:
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10)
lda.print_topics()

# Not interesting at all... let's consider only nouns

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')
# word_tokenize also needs the Punkt tokenizer data; without it a fresh
# environment raises LookupError.
# NOTE(review): newer NLTK releases may name these resources 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' — confirm against the installed version.
nltk.download('punkt')
from nltk import word_tokenize, pos_tag

def nouns(text):
    """Tokenize ``text`` and return only its nouns, joined by single spaces.

    A token counts as a noun when its part-of-speech tag starts with 'NN'.
    """
    tagged = pos_tag(word_tokenize(text))
    kept = []
    for word, pos in tagged:
        if pos.startswith('NN'):
            kept.append(word)
    return ' '.join(kept)

In [None]:
data_clean = pd.read_pickle("corpus.pkl")
data_clean

In [None]:
data_nouns = pd.DataFrame(data_clean.transcript.apply(nouns))
data_nouns

In [None]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns
# Recent scikit-learn releases validate stop_words and only accept
# 'english', a list or None, so the frozenset is converted to a list here.
cvn = CountVectorizer(stop_words=list(stop_words))
data_cvn = cvn.fit_transform(data_nouns.transcript)
# get_feature_names() was removed in scikit-learn 1.2; fall back on old installs.
if hasattr(cvn, 'get_feature_names_out'):
    feature_names = cvn.get_feature_names_out()
else:
    feature_names = cvn.get_feature_names()
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=feature_names)
data_dtmn.index = data_nouns.index
data_dtmn

In [None]:
# Build the gensim corpus from the transposed (term x document) noun matrix.
term_doc_counts = scipy.sparse.csr_matrix(data_dtmn.transpose())
corpusn = matutils.Sparse2Corpus(term_doc_counts)

# Invert the vectorizer vocabulary (term -> index) into index -> term.
id2wordn = {idx: term for term, idx in cvn.vocabulary_.items()}

In [None]:
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
ldan.print_topics()

# Still not interesting... Let's try nouns and adjectives

In [None]:
def nouns_adj(text):
    """Tokenize ``text`` and return only its nouns and adjectives, space-joined.

    A token is kept when its part-of-speech tag starts with 'NN' or 'JJ'.
    """
    tagged = pos_tag(word_tokenize(text))
    kept = []
    for word, pos in tagged:
        if pos[:2] in ('NN', 'JJ'):
            kept.append(word)
    return ' '.join(kept)

In [None]:
data_nouns_adj = pd.DataFrame(data_clean.transcript.apply(nouns_adj))
data_nouns_adj

In [None]:
# Recreate the document-term matrix from nouns and adjectives only.
# Recent scikit-learn releases validate stop_words and only accept
# 'english', a list or None, so the collection is converted to a list here.
cvna = CountVectorizer(stop_words=list(stop_words), max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.transcript)
# get_feature_names() was removed in scikit-learn 1.2; fall back on old installs.
if hasattr(cvna, 'get_feature_names_out'):
    feature_names = cvna.get_feature_names_out()
else:
    feature_names = cvna.get_feature_names()
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=feature_names)
data_dtmna.index = data_nouns_adj.index
data_dtmna

In [None]:
# Build the gensim corpus from the transposed (term x document) matrix.
term_doc_counts = scipy.sparse.csr_matrix(data_dtmna.transpose())
corpusna = matutils.Sparse2Corpus(term_doc_counts)

# Invert the vectorizer vocabulary (term -> index) into index -> term.
id2wordna = {idx: term for term, idx in cvna.vocabulary_.items()}

In [None]:
# Let's start with 2 topics
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10)
ldana.print_topics()

# Alright, it looks interesting now. Let's try different numbers of topics

In [None]:
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=10)
ldana.print_topics()

In [None]:
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=10)
ldana.print_topics()

In [None]:
ldana = models.LdaModel(corpus=corpusna, num_topics=5, id2word=id2wordna, passes=10)
ldana.print_topics()

# Looks like dividing in 2 topics makes the most sense

In [None]:
# Our final LDA model (for now)
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=80)
ldana.print_topics()

# Looks like Corona and BLM... let's look at the topics covered by each speaker

In [None]:
corpus_transformed = ldana[corpusna]
# Each document yields a list of (topic_id, probability) pairs. Pick the most
# probable topic instead of assuming the list holds exactly one pair, which
# raised ValueError whenever a document was spread over several topics.
dominant_topics = [max(doc_topics, key=lambda pair: pair[1])[0]
                   for doc_topics in corpus_transformed]
list(zip(dominant_topics, data_dtmna.index))

# Biden criticized Trump for the pandemic, Trump criticized Biden for the unrest... you can check by reading the script.