In [373]:
import pandas as pd
import lxml.html as lh
import io
import requests
from bs4 import BeautifulSoup
import re
import itertools
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import json
import string

import math
from textblob import TextBlob as tb
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

def tf(word, blob):
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob.words)

def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)

df = pd.read_csv("DemDebateTranscripts.csv")
jan = pd.read_csv("DemDebateTranscriptsJan.csv")
df = pd.concat([df,jan])

In [377]:
# condensing name variations
name_variations = {"name": 
                   {
                       "Senator Warren":"Elizabeth Warren", "Elizabeth W.":"Elizabeth Warren", "Sen. Warren":"Elizabeth Warren", 
                       "E. Warren":"Elizabeth Warren", "Elizabeth W":"Elizabeth Warren",
                       "Julian Castro":"Julián Castro",
                       "Williamson":"Marianne Williamson", "Marianne W.":"Marianne Williamson", 'Ms. Williamson': "Marianne Williamson",
                       "John H":"John Hickenlooper", "John H.":"John Hickenlooper", "John Hickenloop":"John Hickenlooper",
                       "Yang": "Andrew Yang", "Sen Klobuchar":"Amy Klobuchar",
                       "Kirsten G.":"Kirsten Gillibrand", "Kristen Gillibr":"Kirsten Gillibrand", "Gillibrand":"Kirsten Gillibrand",
                       "Bennett":"Michael Bennett", "Senator Bennet":"Michael Bennett", "Michael Bennet":"Michael Bennett",
                       'Senator Booker':"Cory Booker", "Corey Booker":"Cory Booker",
                       'Mayor Buttigieg': "Pete Buttigieg", "Eric Stalwell":"Eric Swalwell",
                       'Mayor de Blasio':"Bill de Blasio", "Bill De Blasio":"Bill de Blasio",
                       "Speaker 14":"Other", "Speaker 15":"Other", "Speaker 16":"Other", "Speaker 17":"Other", "Speaker 18":"Other",
                       "Speaker 19":"Other", "Speaker 20":"Other", "Speaker 21":"Other", "Speaker 23":"Other",
                       "Female":"Other", "Male":"Other", "Audience":"Other", "Speaker 26":"Other", "Speaker 30":"Other",
                       "Speaker 31":"Other", "Speaker 2":"Other", "Speaker 3":"Other", "Speaker 4":"Other", "Speaker 5":"Other", 
                       "Speaker 6":"Other", "Speaker 7":"Other", "Speaker 8":"Other", "Speaker 1":"Other",  
                       "Lester Holt":"Moderator", "Savannah":"Moderator", "Rachel Maddow":"Moderator", "Chuck Todd":"Moderator",
                       "Savannah G.":"Moderator", "Jose":"Moderator", "Steve Kornacki":"Moderator", 
                       "Voiceover": "Other", "Speaker 13":"Other",
                       "Crowd": "Remove",
                       "  › Blog › Political Transcripts › October Democratic Debate Transcript":"Remove",
                       "  › Blog › Political Transcripts › December Democratic Debate Transcript":"Remove",
                       " Rev › Blog › Political Transcripts › Transcript of July Democratic Debate 2nd Round Night 1":"Remove",
                       " Rev › Blog › Political Transcripts › Transcript of July Democratic Debate 2nd Round, Night 2":"Remove",
                       'Jake Tapper':"Moderator", 'Diana':"Moderator", 'Stephanie Sy':"Moderator",
                       'Dana Bash':"Moderator", 'Don Lemon':"Moderator", 'Yamiche A.':"Moderator",
                       'Anderson Cooper':"Moderator", 'John King':"Moderator", 'Henderson':"Moderator", 
                       'George S':"Moderator", 'Jorge Ramos':"Moderator",
                       'David Muir':"Moderator", 'Lindsey Davis':"Moderator", 'George S.':"Moderator",
                       'Erin Burnett':"Moderator", 'Marc Lacey':"Moderator", 'A. Cooper':"Moderator",
                       'Andrea Mitchell':"Moderator", 'Kristen Welker':"Moderator", 'Ashley Parker':"Moderator",
                       'Libby Casey':"Moderator", 'Judy Woodruff':"Moderator", 'Amy Walter':"Moderator",
                       'Tim Alberta':"Moderator", 'Amna':"Moderator", 'Amna Nawaz':"Moderator", 'Yamiche':"Moderator", 
                       'Tim':"Moderator", 'Judy':"Moderator"
                       
                   }
                }
                      
df.replace(name_variations, inplace=True)

In [378]:
df = df[df['name']!='Remove']
df = df[df['name']!='Other']

# Pull out speech for just one candidate and transform into one string

In [169]:
def getSpeech(name):
    candidate = df[df['name']==name].reset_index()

    # concat all rows of dialogue into one long string
    candidateSpeech = ""

    for row in range(0,len(candidate)):
        candidateSpeech = candidateSpeech + " " + candidate['speech'][row]
    return candidateSpeech
    # remove puncutation and extra space
    candidateSpeech = candidateSpeech.translate(str.maketrans('', '', string.punctuation))
    candidateSpeech = candidateSpeech.strip(" ")
    
    return candidateSpeech
    # remove stop words
    stop_words = set(stopwords.words('english')) 
  
    word_tokens = word_tokenize(candidateSpeech) 

    filtered_sentence = [w for w in word_tokens if not w in stop_words] 

    filtered_sentence = [] 

    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w)
            
    # removing stop words transformed speech into a list
    # convery back into one long string
    return ' '.join(word.lower() for word in filtered_sentence)

# Use only candidates who were in at least 3 debates:
Biden, Warren, Sanders, Buttigieg, Harris, Klobuchar, Booker, O'Rourke, Castro, Yang, Gabbard, Steyer (and maybe Marianne Williamson for kicks)

In [333]:
candidates = ["Joe Biden", "Elizabeth Warren", "Bernie Sanders", "Pete Buttigieg", "Kamala Harris", "Amy Klobuchar", "Cory Booker", "Beto O'Rourke", "Julian Castro", "Andrew Yang", "Tulsi Gabbard", "Tom Steyer"]
candidates.sort()

In [347]:
df['name'].unique()

array(['Moderator', 'Elizabeth Warren', 'Amy Klobuchar', 'Beto O’Rourke',
       'Cory Booker', 'Julián Castro', 'Tulsi Gabbard', 'Bill de Blasio',
       'John Delaney', 'Jay Inslee', 'Tim Ryan', 'Bernie Sanders',
       'Michael Bennett', 'Joe Biden', 'Kamala Harris',
       'John Hickenlooper', 'Kirsten Gillibrand', 'Pete Buttigieg',
       'Eric Swalwell', 'Andrew Yang', 'Marianne Williamson',
       'Steve Bullock', 'Tom Steyer'], dtype=object)

In [379]:
amy = getSpeech("Amy Klobuchar")
yang = getSpeech("Andrew Yang")
sanders = getSpeech("Bernie Sanders")
beto = getSpeech("Beto O’Rourke")
booker = getSpeech("Cory Booker")
warren = getSpeech("Elizabeth Warren")
biden = getSpeech("Joe Biden")
castro = getSpeech("Julián Castro")
harris = getSpeech("Kamala Harris")
buttigieg = getSpeech("Pete Buttigieg")
steyer = getSpeech("Tom Steyer")
gabbard = getSpeech("Tulsi Gabbard")

In [380]:
corpus = {'Klobuchar': amy, 'Yang': yangSpeech, 'Sanders':sanders, 'Beto':beto, 'Booker':booker, 'Warren': warren, 'Biden':biden, 'Castro':castro, 'Harris':harris, 'Buttigieg':buttigieg, 'Steyer':steyer, 'Gabbard':gabbard}

In [537]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
my_stop_words = text.ENGLISH_STOP_WORDS.union(["donald", "trump", "ve", "got"])

tfidf = TfidfVectorizer(stop_words = my_stop_words, ngram_range=(2,2))
tfs = tfidf.fit_transform(corpus.values())

feature_names = tfidf.get_feature_names()
corpus_index = [n for n in corpus]
rows, cols = tfs.nonzero()

data = pd.DataFrame(tfs.T.todense(), index=feature_names, columns=corpus_index)

In [538]:
data.sort_values("Gabbard",ascending=False).head(10)["Gabbard"].reset_index()

Unnamed: 0,index,Gabbard
0,regime change,0.250389
1,change war,0.188652
2,al qaeda,0.147288
3,change wars,0.102901
4,commander chief,0.097585
5,national security,0.096688
6,serve commander,0.085751
7,people country,0.082926
8,brothers sisters,0.073644
9,cold war,0.068601


In [542]:
yang

' That’s right. I’m sorry? Oh, so it’s difficult to do if you have companies like Amazon, trillion dollar tech companies paying literally zero in taxes, while they’re closing 30% of our stores. Now, we need to put the American people in a position to benefit from all these innovations in other parts of the economy, and if we had a value added tax at even half the European level, it would generate over $800 billion in new revenue, which combined with the money in our hands, it would be the trickle up economy from our people, families and communities up. We would spend the money and it would circulate through our regional economies and neighborhoods, creating millions of jobs, making our families stronger and healthier. We’d save money on things like incarceration, homelessness services, emergency room healthcare. And just the value gains from having a stronger, healthier, mentally healthier population would increase GDP by $700 billion. This is the move that we have to make, particularl