In [4]:
import pandas as pd
import lxml.html as lh
import io
import requests
from bs4 import BeautifulSoup
import re
import itertools
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import json

df = pd.read_csv("DemDebateTranscripts.csv")

In [58]:
# condensing name variations
name_variations = {"name": 
                   {
                       "Senator Warren":"Elizabeth Warren", "Elizabeth W.":"Elizabeth Warren", "Sen. Warren":"Elizabeth Warren", 
                       "E. Warren":"Elizabeth Warren", "Elizabeth W":"Elizabeth Warren",
                       "Julian Castro":"Julián Castro",
                       "Williamson":"Marianne Williamson", "Marianne W.":"Marianne Williamson", 'Ms. Williamson': "Marianne Williamson",
                       "John H":"John Hickenlooper", "John H.":"John Hickenlooper", "John Hickenloop":"John Hickenlooper",
                       "Yang": "Andrew Yang", "Sen Klobuchar":"Amy Klobuchar",
                       "Kirsten G.":"Kirsten Gillibrand", "Kristen Gillibr":"Kirsten Gillibrand", "Gillibrand":"Kirsten Gillibrand",
                       "Bennett":"Michael Bennett", "Senator Bennet":"Michael Bennett", "Michael Bennet":"Michael Bennett",
                       'Senator Booker':"Cory Booker", "Corey Booker":"Cory Booker",
                       'Mayor Buttigieg': "Pete Buttigieg", "Eric Stalwell":"Eric Swalwell",
                       'Mayor de Blasio':"Bill de Blasio", "Bill De Blasio":"Bill de Blasio",
                       "Speaker 14":"Other", "Speaker 15":"Other", "Speaker 16":"Other", "Speaker 17":"Other", "Speaker 18":"Other",
                       "Speaker 19":"Other", "Speaker 20":"Other", "Speaker 21":"Other", "Speaker 23":"Other",
                       "Female":"Other", "Male":"Other", "Audience":"Other", "Speaker 26":"Other", "Speaker 30":"Other",
                       "Speaker 31":"Other", "Speaker 2":"Other", "Speaker 3":"Other", "Speaker 4":"Other", "Speaker 5":"Other", 
                       "Speaker 6":"Other", "Speaker 7":"Other", "Speaker 8":"Other", "Speaker 1":"Other",  
                       "Lester Holt":"Moderator", "Savannah":"Moderator", "Rachel Maddow":"Moderator", "Chuck Todd":"Moderator",
                       "Savannah G.":"Moderator", "Jose":"Moderator", "Steve Kornacki":"Moderator", 
                       "Voiceover": "Other", "Speaker 13":"Other",
                       "Crowd": "Remove",
                       "  › Blog › Political Transcripts › October Democratic Debate Transcript":"Remove",
                       "  › Blog › Political Transcripts › December Democratic Debate Transcript":"Remove",
                       " Rev › Blog › Political Transcripts › Transcript of July Democratic Debate 2nd Round Night 1":"Remove",
                       " Rev › Blog › Political Transcripts › Transcript of July Democratic Debate 2nd Round, Night 2":"Remove",
                       'Jake Tapper':"Moderator", 'Diana':"Moderator", 'Stephanie Sy':"Moderator",
                       'Dana Bash':"Moderator", 'Don Lemon':"Moderator", 'Yamiche A.':"Moderator",
                       'Anderson Cooper':"Moderator", 'John King':"Moderator", 'Henderson':"Moderator", 
                       'George S':"Moderator", 'Jorge Ramos':"Moderator",
                       'David Muir':"Moderator", 'Lindsey Davis':"Moderator", 'George S.':"Moderator",
                       'Erin Burnett':"Moderator", 'Marc Lacey':"Moderator", 'A. Cooper':"Moderator",
                       'Andrea Mitchell':"Moderator", 'Kristen Welker':"Moderator", 'Ashley Parker':"Moderator",
                       'Libby Casey':"Moderator", 'Judy Woodruff':"Moderator", 'Amy Walter':"Moderator",
                       'Tim Alberta':"Moderator", 'Amna':"Moderator", 'Amna Nawaz':"Moderator", 'Yamiche':"Moderator", 
                       'Tim':"Moderator", 'Judy':"Moderator"
                       
                   }
                }
                      
df.replace(name_variations, inplace=True)

In [94]:
df = df[df['name']!='Remove']
df = df[df['name']!='Other']

# Functions to count number of sentences, words, and syllables in speech paragraphs

In [22]:
def sentenceCount(paragraph):
    return len(re.split(r'[.!?]+', paragraph)) - 1

def wordCount(paragraph):
    words = paragraph.split(" ")
    return len(words)
    
def syllableCount(word) :
    word = word.lower()

    # exception_add are words that need extra syllables
    # exception_del are words that need less syllables

    exception_add = ['serious','crucial']
    exception_del = ['fortunately','unfortunately']

    co_one = ['cool','coach','coat','coal','count','coin','coarse','coup','coif','cook','coign','coiffe','coof','court']
    co_two = ['coapt','coed','coinci']

    pre_one = ['preach']

    syls = 0 #added syllable number
    disc = 0 #discarded syllable number

    #1) if letters < 3 : return 1
    if len(word) <= 3 :
        syls = 1
        return syls

    #2) if doesn't end with "ted" or "tes" or "ses" or "ied" or "ies", discard "es" and "ed" at the end.
    # if it has only 1 vowel or 1 set of consecutive vowels, discard. (like "speed", "fled" etc.)

    if word[-2:] == "es" or word[-2:] == "ed" :
        doubleAndtripple_1 = len(re.findall(r'[eaoui][eaoui]',word))
        if doubleAndtripple_1 > 1 or len(re.findall(r'[eaoui][^eaoui]',word)) > 1 :
            if word[-3:] == "ted" or word[-3:] == "tes" or word[-3:] == "ses" or word[-3:] == "ied" or word[-3:] == "ies" :
                pass
            else :
                disc+=1

    #3) discard trailing "e", except where ending is "le"  

    le_except = ['whole','mobile','pole','male','female','hale','pale','tale','sale','aisle','whale','while']

    if word[-1:] == "e" :
        if word[-2:] == "le" and word not in le_except :
            pass

        else :
            disc+=1

    #4) check if consecutive vowels exists, triplets or pairs, count them as one.

    doubleAndtripple = len(re.findall(r'[eaoui][eaoui]',word))
    tripple = len(re.findall(r'[eaoui][eaoui][eaoui]',word))
    disc+=doubleAndtripple + tripple

    #5) count remaining vowels in word.
    numVowels = len(re.findall(r'[eaoui]',word))

    #6) add one if starts with "mc"
    if word[:2] == "mc" :
        syls+=1

    #7) add one if ends with "y" but is not surrouned by vowel
    if word[-1:] == "y" and word[-2] not in "aeoui" :
        syls +=1

    #8) add one if "y" is surrounded by non-vowels and is not in the last word.

    for i,j in enumerate(word) :
        if j == "y" :
            if (i != 0) and (i != len(word)-1) :
                if word[i-1] not in "aeoui" and word[i+1] not in "aeoui" :
                    syls+=1

    #9) if starts with "tri-" or "bi-" and is followed by a vowel, add one.

    if word[:3] == "tri" and word[3] in "aeoui" :
        syls+=1

    if word[:2] == "bi" and word[2] in "aeoui" :
        syls+=1

    #10) if ends with "-ian", should be counted as two syllables, except for "-tian" and "-cian"

    if word[-3:] == "ian" : 
    #and (word[-4:] != "cian" or word[-4:] != "tian") :
        if word[-4:] == "cian" or word[-4:] == "tian" :
            pass
        else :
            syls+=1

    #11) if starts with "co-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly.

    if word[:2] == "co" and word[2] in 'eaoui' :

        if word[:4] in co_two or word[:5] in co_two or word[:6] in co_two :
            syls+=1
        elif word[:4] in co_one or word[:5] in co_one or word[:6] in co_one :
            pass
        else :
            syls+=1

    #12) if starts with "pre-" and is followed by a vowel, check if exists in the double syllable dictionary, if not, check if in single dictionary and act accordingly.

    if word[:3] == "pre" and word[3] in 'eaoui' :
        if word[:6] in pre_one :
            pass
        else :
            syls+=1

    #13) check for "-n't" and cross match with dictionary to add syllable.

    negative = ["doesn't", "isn't", "shouldn't", "couldn't","wouldn't"]

    if word[-3:] == "n't" :
        if word in negative :
            syls+=1
        else :
            pass   

    #14) Handling the exceptional words.

    if word in exception_del :
        disc+=1

    if word in exception_add :
        syls+=1     

    # calculate the output
    return numVowels - disc + syls

# Pull out speech for just one candidate and transform into one string

In [25]:
amy = df[df['name']=='Amy Klobuchar'].reset_index()

amySpeech = ""

for row in range(0,len(amy)):
    amySpeech = amySpeech + " " + amy['speech'][row]
    
amySpeech = amySpeech.strip(" ")

# Calculate reading score

Flesch-Kincaid grade level formula: 0.39 x (words/sentences) + 11.8 x (syllables/words) - 15.59.

In [26]:
numSentences = sentenceCount(amySpeech)
numWords = wordCount(amySpeech)
numSyllables = syllableCount(amySpeech)

0.39 * (numWords / numSentences) + 11.8 * (numSyllables / numWords) - 15.59

8.392887147719438

# Putting two previous steps together into one function
## With optional ability to filter by month

In [78]:
def calcScore(candidateName, month='All'):
    
    candidateDF = df[df['name']==candidateName].reset_index()

    if month != 'All':
        candidateDF = candidateDF[candidateDF['month']==month].reset_index()

    candidateSpeech = ""

    for row in range(0,len(candidateDF)):
        candidateSpeech = candidateSpeech + " " + candidateDF['speech'][row]

    candidateSpeech = candidateSpeech.strip(" ")
    
    numSentences = sentenceCount(candidateSpeech)
    numWords = wordCount(candidateSpeech)
    numSyllables = syllableCount(candidateSpeech)

    return 0.39 * (numWords / numSentences) + 11.8 * (numSyllables / numWords) - 15.59

# Calculate reading score for every candidate and month (including all) combination

In [95]:
names = list(df['name'].unique())
months = list(df['month'].unique())
months.append("All")

nameList = []
scoreList = []
monthList = []

for name in names:
    for month in months:
        try:
            scoreList.append(calcScore(name, month))
            nameList.append(name)
            monthList.append(month)
        except:
            continue

scoresDF = pd.DataFrame({'name':nameList, 'month':monthList, 'score':scoreList})

In [96]:
scoresDF.sort_values('score')

Unnamed: 0,name,month,score
0,Moderator,June,6.564199
64,Joe Biden,Sept,6.661817
65,Joe Biden,Oct,7.027274
3,Moderator,Oct,7.209532
62,Joe Biden,June,7.251426
1,Moderator,July,7.279495
4,Moderator,Dec,7.369296
68,Joe Biden,All,7.387106
63,Joe Biden,July,7.472560
79,Kirsten Gillibrand,July,7.611744


In [101]:
hi = scoresDF[scoresDF['month']!='All'].sort_values('score').reset_index()

In [103]:
del hi['index']

In [104]:
import os
os.getcwd()

'C:\\Users\\glol7001'

In [105]:
hi.to_csv(os.getcwd()+"\\Documents\\DemDebateReadingScores.csv")

In [113]:
hi2 = hi.pivot(index='name', columns='month', values='score').reset_index()

In [119]:
hi2.to_csv(os.getcwd()+"\\Documents\\DemDebateReadingScoresReformattedMissing.csv", index=False)

In [117]:
hi3 = hi2.dropna()

In [122]:
hi2.isnull().sum()

month
name     0
Dec     15
July     2
June     2
Nov     13
Oct     10
Sept    12
dtype: int64

In [127]:
hi2['bye'] = 0

for i in range(len(hi2)) :
    if hi2.iloc[i].isnull().sum() > 2:
        hi2['bye'][i] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [128]:
hi25 = hi2[hi2['bye']==0]

In [130]:
del hi25['bye']

In [131]:
hi25.to_csv(os.getcwd()+"\\Documents\\DemDebateReadingScoresData.csv", index=False)

In [132]:
len(hi25)

12

In [133]:
len(hi2)

23