In [1]:
from pymongo import MongoClient
import pandas as pd

In [3]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from textblob import TextBlob
import spacy
from spacy import displacy

In [4]:
import en_core_web_sm



In [5]:
import speech_recognition as sr 
import pyaudio

In [6]:
# Download the NLTK data packages: the VADER lexicon, the Punkt tokenizer
# and the stopword corpus.
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/NPeas15/nltk_data...
[nltk_data] Downloading package punkt to /Users/NPeas15/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/NPeas15/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## 1. LOADING THE DATA

### 1.1 Connecting to the database

In [7]:
# Connect to the MongoDB server on localhost; the URI path names the `debates` database.
client = MongoClient("mongodb://localhost/debates") 
# Called with no name — presumably resolves to the database given in the URI (`debates`); confirm against the pymongo docs.
db = client.get_database()
# Handle to the `phrases` collection, which the cells below query.
collection = db['phrases']

### 1.2 Dialogues

In [11]:
# Distinct values of the `speaker` field in the collection; preview the first ten.
personajes = collection.distinct('speaker')
personajes[:10]

['A. Cooper',
 'Abby Phillips',
 'Adam Sexton',
 'Amna Nawaz',
 'Amy Klobuchar',
 'Amy Walter',
 'Anderson Cooper',
 'Andrea Mitchell',
 'Andrew Yang',
 'Announcer']

In [14]:
# Filter: only documents whose `speaker` is "Bernie Sanders".
query_bernie = {"speaker":"Bernie Sanders"}
# Projection keeps the `speech` field and drops `_id`; materialise the cursor into a list of dicts.
bernie = list(collection.find(query_bernie, {"speech": 1,"_id": 0}))

In [17]:
# Peek at the first four matching documents.
bernie[:4]

[{'speech': 'Well, you’re right, the economy is doing really great for people like Mr. Bloomberg and other billionaires. In the last three years, last three years, billionaires in this country saw an $850 billion increase in their wealth. But you know what, for the ordinary American things are not so good. Last year, real wage increases for the average worker were less than one percent. Half of our people are living paycheck to paycheck. 87 million Americans have no health insurance or are under-insured. 45 million people are struggling with student debt. 500,000 people tonight are sleeping out on the street, including 30,000 veterans. That is not an economy that’s working for the American people. That’s an economy working for the one percent. We’re going to create an economy for all, not just wealthy campaign contributors.'},
 {'speech': 'Oh, Mr. Bloomberg. Let me tell Mr. Putin. Okay, I’m not a good friend of President Xi of China. I think President Xi is an authoritarian leader. And

## 2. Speeches into a DataFrame

### 2.1 Through Mongo

In [18]:
# Flatten the query results into a plain list of speech strings.
bernie_frases = [doc['speech'] for doc in bernie]
# Preview a handful of entries only: a plain list is echoed in full, which
# would dump every speech into the cell output.
bernie_frases[:4]


['Well, you’re right, the economy is doing really great for people like Mr. Bloomberg and other billionaires. In the last three years, last three years, billionaires in this country saw an $850 billion increase in their wealth. But you know what, for the ordinary American things are not so good. Last year, real wage increases for the average worker were less than one percent. Half of our people are living paycheck to paycheck. 87 million Americans have no health insurance or are under-insured. 45 million people are struggling with student debt. 500,000 people tonight are sleeping out on the street, including 30,000 veterans. That is not an economy that’s working for the American people. That’s an economy working for the one percent. We’re going to create an economy for all, not just wealthy campaign contributors.',
 'Oh, Mr. Bloomberg. Let me tell Mr. Putin. Okay, I’m not a good friend of President Xi of China. I think President Xi is an authoritarian leader. And let me tell Mr. Putin 

In [19]:
# One-column DataFrame (column "Bernie") with one row per entry of bernie_frases.
debatesdf = pd.DataFrame(bernie_frases, columns=["Bernie"])
debatesdf.head()

Unnamed: 0,Bernie
0,"Well, you’re right, the economy is doing reall..."
1,"Oh, Mr. Bloomberg. Let me tell Mr. Putin. Okay..."
2,Pete mentioned … I’m hearing my name mentioned...
3,Pete has gotten funding [crosstalk 00:08:42] f...
4,"I didn’t say that, Pete."


### 2.2 Through Pandas

In [20]:
# Load every document of the collection (no filter, no projection) into a DataFrame.
todo = pd.DataFrame(list(collection.find()))
todo.head()

Unnamed: 0,_id,date,speaker,speech
0,5fc4cf8fc0927bf53398692c,02-25-2020,Norah O’Donnell,"Good evening and welcome, the Democratic presi..."
1,5fc4cf8fc0927bf53398692d,02-25-2020,Gayle King,And Super Tuesday is just a week away and this...
2,5fc4cf8fc0927bf53398692e,02-25-2020,Norah O’Donnell,And CBS News is proud to bring you this debate...
3,5fc4cf8fc0927bf53398692f,02-25-2020,Gayle King,And we are partnering tonight also with Twitte...
4,5fc4cf8fc0927bf533986930,02-25-2020,Norah O’Donnell,"Now, here are the rules for the next two hours..."


In [22]:
# Keep only the `speaker` and `speech` columns; `todo` is rebound to the narrower frame.
todo = todo[['speaker', 'speech']]
todo.head()

Unnamed: 0,speaker,speech
0,Norah O’Donnell,"Good evening and welcome, the Democratic presi..."
1,Gayle King,And Super Tuesday is just a week away and this...
2,Norah O’Donnell,And CBS News is proud to bring you this debate...
3,Gayle King,And we are partnering tonight also with Twitte...
4,Norah O’Donnell,"Now, here are the rules for the next two hours..."


In [25]:
# Display the speaker/speech frame (pandas shows only the first and last rows).
# Grouping by speaker is done further down, once the sentiment column exists.
todo

Unnamed: 0,speaker,speech
0,Norah O’Donnell,"Good evening and welcome, the Democratic presi..."
1,Gayle King,And Super Tuesday is just a week away and this...
2,Norah O’Donnell,And CBS News is proud to bring you this debate...
3,Gayle King,And we are partnering tonight also with Twitte...
4,Norah O’Donnell,"Now, here are the rules for the next two hours..."
...,...,...
5911,Nuria Peñas,Probando nuevo proyecto
5912,Nuria Peñas,Probando nuevo proyecto
5913,Nuria Peñas,Probando nuevo proyecto
5914,Nuria Peñas,Probando nuevo proyecto


## 3. Sentiment analysis

### 3.1. NLTK

In [26]:
# Instantiate NLTK's VADER sentiment analyzer.
sia = SentimentIntensityAnalyzer()

In [27]:
# Toy input for a quick check of the analyzer.
sentence = "this is beautiful"

In [29]:
# polarity_scores returns a dict with `neg`, `neu`, `pos` and `compound` entries.
polarity = sia.polarity_scores(sentence)
polarity

{'neg': 0.0, 'neu': 0.339, 'pos': 0.661, 'compound': 0.5994}

In [30]:
def sentimentAnalysis(sentence):
    """Score ``sentence`` with VADER and return the full polarity dictionary.

    The returned mapping is whatever ``SentimentIntensityAnalyzer.polarity_scores``
    produces (``neg``, ``neu``, ``pos`` and ``compound`` entries).
    """
    # NOTE: a later cell defines a function with this same name that returns
    # only the compound score; once that cell runs, this version is shadowed.
    return SentimentIntensityAnalyzer().polarity_scores(sentence)

In [31]:
def sentimentAnalysis(sentence, analyzer=None):
    """Return the VADER compound sentiment score for ``sentence``.

    Args:
        sentence: Text to score.
        analyzer: Optional object exposing ``polarity_scores(text)``. When
            omitted, a single ``SentimentIntensityAnalyzer`` is created on
            first use and reused by every later call.

    Returns:
        The ``compound`` entry of the polarity scores for ``sentence``.
    """
    if analyzer is None:
        # Constructing the analyzer is the costly step, so build it once and
        # cache it on the function instead of once per row when this is used
        # with ``Series.apply``.
        analyzer = getattr(sentimentAnalysis, "_analyzer", None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            sentimentAnalysis._analyzer = analyzer
    return analyzer.polarity_scores(sentence)['compound']

In [32]:
# Score every speech with sentimentAnalysis and store the result in a new `sentiment_compound` column.
todo['sentiment_compound'] = todo['speech'].apply(sentimentAnalysis)
todo.head()

Unnamed: 0,speaker,speech,sentiment_compound
0,Norah O’Donnell,"Good evening and welcome, the Democratic presi...",0.7096
1,Gayle King,And Super Tuesday is just a week away and this...,0.9325
2,Norah O’Donnell,And CBS News is proud to bring you this debate...,0.6369
3,Gayle King,And we are partnering tonight also with Twitte...,0.0
4,Norah O’Donnell,"Now, here are the rules for the next two hours...",-0.6369


In [34]:
# Mean of `sentiment_compound` for each speaker.
todo.groupby(['speaker'])['sentiment_compound'].mean()

speaker
A. Cooper        0.000000
Abby Phillips    0.181614
Adam Sexton      0.087567
Amna Nawaz       0.258117
Amy Klobuchar    0.311199
                   ...   
Tulsi Gabbard   -0.035218
Vanessa Hauc     0.131005
Voiceover        0.011085
Wolf Blitzer     0.009519
Yamiche A.       0.401840
Name: sentiment_compound, Length: 107, dtype: float64