# tf-idf Text Analysis Of Dem Debate Transcripts
## Author: Oliver Gladfelter
### Date: Jan 2020

In [1]:
import pandas as pd
import string

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

# Load the debate transcripts; later cells read the `name` and `speech` columns of this frame.
df = pd.read_csv("DemDebateTranscripts.csv")

In [2]:
# condensing name variations
# Nested mapping in the shape DataFrame.replace expects: {column: {old value: new value}}.
# Only the `name` column is targeted. Besides merging spelling variants of a speaker,
# it tags rows with three bucket labels: "Moderator", "Other" and "Remove".
name_variations = {"name": 
                   {
                       # candidates: transcript spellings -> one canonical label each
                       "Senator Warren":"Elizabeth Warren", "Elizabeth W.":"Elizabeth Warren", "Sen. Warren":"Elizabeth Warren", 
                       "E. Warren":"Elizabeth Warren", "Elizabeth W":"Elizabeth Warren",
                       "Julian Castro":"Julián Castro",
                       "Williamson":"Marianne Williamson", "Marianne W.":"Marianne Williamson", 'Ms. Williamson': "Marianne Williamson",
                       "John H":"John Hickenlooper", "John H.":"John Hickenlooper", "John Hickenloop":"John Hickenlooper",
                       "Yang": "Andrew Yang", "Sen Klobuchar":"Amy Klobuchar",
                       "Kirsten G.":"Kirsten Gillibrand", "Kristen Gillibr":"Kirsten Gillibrand", "Gillibrand":"Kirsten Gillibrand",
                       # NOTE(review): the canonical label here is "Michael Bennett" (two t's); the
                       # senator's surname is usually spelled "Bennet" - confirm before relying on it
                       "Bennett":"Michael Bennett", "Senator Bennet":"Michael Bennett", "Michael Bennet":"Michael Bennett",
                       'Senator Booker':"Cory Booker", "Corey Booker":"Cory Booker",
                       'Mayor Buttigieg': "Pete Buttigieg", "Eric Stalwell":"Eric Swalwell",
                       'Mayor de Blasio':"Bill de Blasio", "Bill De Blasio":"Bill de Blasio",
                       # unnamed speakers and audience -> "Other"
                       "Speaker 14":"Other", "Speaker 15":"Other", "Speaker 16":"Other", "Speaker 17":"Other", "Speaker 18":"Other",
                       "Speaker 19":"Other", "Speaker 20":"Other", "Speaker 21":"Other", "Speaker 23":"Other",
                       "Female":"Other", "Male":"Other", "Audience":"Other", "Speaker 26":"Other", "Speaker 30":"Other",
                       "Speaker 31":"Other", "Speaker 2":"Other", "Speaker 3":"Other", "Speaker 4":"Other", "Speaker 5":"Other", 
                       "Speaker 6":"Other", "Speaker 7":"Other", "Speaker 8":"Other", "Speaker 1":"Other",  
                       # moderators -> "Moderator"
                       "Lester Holt":"Moderator", "Savannah":"Moderator", "Rachel Maddow":"Moderator", "Chuck Todd":"Moderator",
                       "Savannah G.":"Moderator", "Jose":"Moderator", "Steve Kornacki":"Moderator", 
                       "Voiceover": "Other", "Speaker 13":"Other",
                       # non-speaker rows -> "Remove" (the long keys look like scraped page-heading
                       # text rather than a person - presumably a scraping artefact; verify against the CSV)
                       "Crowd": "Remove",
                       "  › Blog › Political Transcripts › October Democratic Debate Transcript":"Remove",
                       "  › Blog › Political Transcripts › December Democratic Debate Transcript":"Remove",
                       " Rev › Blog › Political Transcripts › Transcript of July Democratic Debate 2nd Round Night 1":"Remove",
                       " Rev › Blog › Political Transcripts › Transcript of July Democratic Debate 2nd Round, Night 2":"Remove",
                       # more moderators -> "Moderator"
                       'Jake Tapper':"Moderator", 'Diana':"Moderator", 'Stephanie Sy':"Moderator",
                       'Dana Bash':"Moderator", 'Don Lemon':"Moderator", 'Yamiche A.':"Moderator",
                       'Anderson Cooper':"Moderator", 'John King':"Moderator", 'Henderson':"Moderator", 
                       'George S':"Moderator", 'Jorge Ramos':"Moderator",
                       'David Muir':"Moderator", 'Lindsey Davis':"Moderator", 'George S.':"Moderator",
                       'Erin Burnett':"Moderator", 'Marc Lacey':"Moderator", 'A. Cooper':"Moderator",
                       'Andrea Mitchell':"Moderator", 'Kristen Welker':"Moderator", 'Ashley Parker':"Moderator",
                       'Libby Casey':"Moderator", 'Judy Woodruff':"Moderator", 'Amy Walter':"Moderator",
                       'Tim Alberta':"Moderator", 'Amna':"Moderator", 'Amna Nawaz':"Moderator", 'Yamiche':"Moderator", 
                       'Tim':"Moderator", 'Judy':"Moderator"
                       
                   }
                }
                      
# Rewrite every known spelling variant to its canonical label (mutates df in place)
df.replace(to_replace=name_variations, inplace=True)

# Drop the rows that were bucketed as 'Remove' or 'Other'
df = df[~df['name'].isin(['Remove', 'Other'])]

# Pull out speech for just one candidate and transform into one string

In [4]:
def getSpeech(name, clean=False):
    """Return everything `name` said, joined into one string.

    Reads the module-level `df`, selects the rows whose `name` column equals
    `name`, and concatenates their `speech` values, each preceded by a single
    space (so a non-empty result starts with " "; no matching rows gives "").

    Parameters
    ----------
    name : str
        Speaker label, matched exactly against df['name'].
    clean : bool, default False
        False returns the raw concatenation, which is what this function has
        always returned to its callers. True additionally strips ASCII
        punctuation, lowercases the text and removes NLTK English stop words.

    Returns
    -------
    str
        The raw or cleaned speech text.
    """
    candidate = df[df['name'] == name]

    # concat all rows of dialogue into one long string (single pass instead of
    # repeated string appends)
    candidateSpeech = "".join(" " + speech for speech in candidate['speech'])

    # The cleaning steps below used to sit after an unconditional return and never
    # ran; they are opt-in so existing callers keep receiving the raw text.
    if not clean:
        return candidateSpeech

    # remove punctuation and extra space
    candidateSpeech = candidateSpeech.translate(str.maketrans('', '', string.punctuation))
    candidateSpeech = candidateSpeech.strip(" ")

    # remove stop words; lowercase first so capitalised stop words ("The", "And")
    # are matched against NLTK's lowercase list
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(candidateSpeech.lower())

    # tokenising produced a list of words; convert back into one long string
    return ' '.join(w for w in word_tokens if w not in stop_words)

# Use only candidates who were in at least 3 debates:
Biden, Warren, Sanders, Buttigieg, Harris, Klobuchar, Booker, O'Rourke, Castro, Yang, Gabbard, Steyer (and maybe Marianne Williamson for kicks)

In [7]:
# Candidates who appeared in at least three debates. Spellings match the labels
# passed to getSpeech below: accented "Julián" (the name mapping rewrites
# "Julian Castro" to it) and the typographic apostrophe in "O’Rourke".
candidates = ["Joe Biden", "Elizabeth Warren", "Bernie Sanders", "Pete Buttigieg", "Kamala Harris", "Amy Klobuchar", "Cory Booker", "Beto O’Rourke", "Julián Castro", "Andrew Yang", "Tulsi Gabbard", "Tom Steyer"]
candidates.sort()

amy = getSpeech("Amy Klobuchar")
yang = getSpeech("Andrew Yang")
sanders = getSpeech("Bernie Sanders")
beto = getSpeech("Beto O’Rourke")
booker = getSpeech("Cory Booker")
warren = getSpeech("Elizabeth Warren")
biden = getSpeech("Joe Biden")
castro = getSpeech("Julián Castro")
harris = getSpeech("Kamala Harris")
buttigieg = getSpeech("Pete Buttigieg")
steyer = getSpeech("Tom Steyer")
gabbard = getSpeech("Tulsi Gabbard")

# also pulling in Trump transcripts (from 15 rallies held 2019-2020)
# The frame gets its own name so `trump` only ever holds the final string.
trump_rallies = pd.read_csv("TrumpRallies.csv").replace({'President Trump':'Donald Trump'})

# The original call to getTrumpSpeech() relied on a function that is not defined
# before this cell, so a fresh kernel raised NameError. The same concatenation
# getSpeech performs is done inline here instead.
# NOTE(review): assumes TrumpRallies.csv has the same `name` / `speech` columns as
# the debate file - TODO confirm against the CSV.
trump_rows = trump_rallies[trump_rallies['name'] == "Donald Trump"]
trump = "".join(" " + speech for speech in trump_rows['speech'])

# one document per speaker; insertion order becomes the column order of the tf-idf table
corpus = {'Klobuchar': amy, 'Yang': yang, 'Sanders':sanders, 'Beto':beto, 'Booker':booker, 'Warren': warren, 'Biden':biden, 'Castro':castro, 'Harris':harris, 'Buttigieg':buttigieg, 'Steyer':steyer, 'Gabbard':gabbard, 'Trump':trump}

In [10]:
# add additional stop words to scikit-learn's built-in English list
my_stop_words = text.ENGLISH_STOP_WORDS.union(["donald", "trump", "ve", "got"])

# calc tf-idf for 3-word phrases
# stop_words is passed as a list: scikit-learn 1.2+ validates the parameter and
# rejects a frozenset, while older releases accept a list just as well
tfidf = TfidfVectorizer(stop_words=sorted(my_stop_words), ngram_range=(3,3))
tfs = tfidf.fit_transform(corpus.values())

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version has it and fall back otherwise
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = tfidf.get_feature_names()
corpus_index = list(corpus)
rows, cols = tfs.nonzero()

# one row per 3-word phrase, one column per corpus key; toarray() yields a plain
# ndarray instead of the legacy np.matrix that todense() returns
data = pd.DataFrame(tfs.T.toarray(), index=feature_names, columns=corpus_index)

# Top tf-idf Phrases Of Various Candidates

In [18]:
# Order all phrases by their "Biden" score, highest first, and show the first five
data.sort_values(by="Biden", ascending=False)["Biden"].iloc[:5].reset_index()

Unnamed: 0,index,Biden
0,violence women act,0.077353
1,united states america,0.060525
2,president united states,0.056408
3,united states senate,0.048506
4,160 million people,0.047648


In [17]:
# Order all phrases by their "Sanders" score, highest first, and show the first five
data.sort_values(by="Sanders", ascending=False)["Sanders"].iloc[:5].reset_index()

Unnamed: 0,index,Sanders
0,fossil fuel industry,0.143558
1,drug companies insurance,0.105276
2,companies insurance companies,0.093354
3,major country earth,0.088784
4,public colleges universities,0.088784


In [19]:
# Order all phrases by their "Warren" score, highest first, and show the first five
data.sort_values(by="Warren", ascending=False)["Warren"].iloc[:5].reset_index()

Unnamed: 0,index,Warren
0,student loan debt,0.135557
1,middle class families,0.102947
2,public school teacher,0.072489
3,cent wealth tax,0.072489
4,giant multinational corporations,0.062134


In [20]:
# Order all phrases by their "Klobuchar" score, highest first, and show the first five
data.sort_values(by="Klobuchar", ascending=False)["Klobuchar"].iloc[:5].reset_index()

Unnamed: 0,index,Klobuchar
0,like poker chips,0.048834
1,barack obama wanted,0.048834
2,bills lead democrat,0.048834
3,long term care,0.042113
4,universal background checks,0.038278
