# VADER Sentiment

In [46]:
# Initial imports
import pandas_market_calendars as mcal
# pip install pandas_market_calendars
import re
import os
from path import Path
import pandas as pd
# from newsapi import NewsApiClient
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import numpy as np
import hvplot.pandas

In [2]:
# Download/Update the VADER Lexicon (the call is a no-op when the local copy
# is already up to date)
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer; this single instance is reused
# further down to score every tweet
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/baileycameron/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
# Import twitter feed from csv file

# Load the raw tweet export (columns: "Unnamed: 0", "Time", "Tweet").
# The former `infer_datetime_format=True, parse_dates=True` arguments were
# removed: without an `index_col` they never parsed anything ("Time" stays an
# ISO-8601 string, which the timestamp-splitting step below slices directly),
# and `infer_datetime_format` is deprecated since pandas 2.0.
trump_df = (
    pd.read_csv('trump_tweets_Jan_2020_Sep_2020.csv')
    .drop(columns="Unnamed: 0")                              # index column written by the export
    .drop_duplicates(subset=['Time', 'Tweet'], keep='last')  # same tweet at the same instant
    .dropna()                                                # rows missing a timestamp or text
)
trump_df.tail()

Unnamed: 0,Time,Tweet
2405,2020-01-03T13:54:43.000Z,General Qassem Soleimani has killed or badly w...
2406,2020-01-03T12:44:30.000Z,"Iran never won a war, but never lost a negotia..."
2407,2020-01-02T13:58:01.000Z,A lot of very good people were taken down by a...
2408,2020-01-02T13:42:41.000Z,"Sohrab Ahmari, New York Post “The Trump Campai..."
2409,2020-01-01T01:30:35.000Z,HAPPY NEW YEAR!


In [5]:
# Clean up the 'Time' column: split it into separate date and time-of-day columns

def split_timestamps(df):
    """Split the ``Time`` column of ``df`` into date and time-of-day strings.

    Each ``Time`` value is converted with ``str()`` and sliced positionally,
    so values are expected to look like ``2020-01-03T13:54:43.000Z``:
    characters 0-9 become ``Date`` and characters 11-18 become ``Time Stamp``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Time`` column. It is modified in place: the
        ``Date`` and ``Time Stamp`` columns are added (or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient re-assignment or chaining.

    Raises
    ------
    KeyError
        If ``df`` has no ``Time`` column.
    """
    # Read from the *argument*; the previous version iterated the global
    # `trump_df`, so the function only worked when called on that exact frame.
    stamps = [str(value) for value in df['Time']]

    df['Date'] = [stamp[:10] for stamp in stamps]
    df['Time Stamp'] = [stamp[11:19] for stamp in stamps]

    return df

# Derive Date / Time Stamp, discard the raw 'Time' column, and keep only tweets
# dated strictly after 2020-01-01 (ISO date strings compare chronologically).
trump_df = (
    split_timestamps(trump_df)
    .drop(columns='Time')
    .loc[lambda tweets: tweets['Date'] > '2020-01-01']
)
trump_df.tail()

Unnamed: 0,Tweet,Date,Time Stamp
2404,The United States has paid Iraq Billions of Do...,2020-01-03,15:09:32
2405,General Qassem Soleimani has killed or badly w...,2020-01-03,13:54:43
2406,"Iran never won a war, but never lost a negotia...",2020-01-03,12:44:30
2407,A lot of very good people were taken down by a...,2020-01-02,13:58:01
2408,"Sohrab Ahmari, New York Post “The Trump Campai...",2020-01-02,13:42:41


In [9]:
# Flag tweets that mention a COVID-related keyword

# Substring search on the lower-cased tweet text (no word boundaries, so
# e.g. "coronavirus" is caught by both "corona" and "virus")
flu_pattern = re.compile(r'(china|flu|vaccine|covid|virus|corona|chinese|pharma)')

# 1 when any keyword occurs anywhere in the tweet, otherwise 0;
# str() guards against non-string cell values
flu_column = [
    1 if flu_pattern.search(str(tweet).lower()) else 0
    for tweet in trump_df['Tweet']
]

trump_df['COVID_Tweet'] = flu_column

# Keep only the flagged tweets for sentiment scoring
vaccine_df = trump_df[trump_df['COVID_Tweet'] == 1]
vaccine_df.head()

Unnamed: 0,Tweet,Date,Time Stamp,COVID_Tweet
2,White House News Conference today at 6:00 P.M....,2020-09-23,13:21:10,1
20,Joe Biden delivered remarks to union members a...,2020-09-18,22:58:15,1
21,Joe Biden says this is a race between Scranton...,2020-09-18,22:56:18,1
22,Sleepy Joe Biden just said that he wished I cl...,2020-09-18,20:54:55,1
25,Biden FAILED BADLY with the Swine Flu. It was ...,2020-09-18,15:26:41,1


In [10]:
# Number of tweets that matched at least one COVID-related keyword
len(vaccine_df)

212

In [11]:
# Score every flagged tweet with VADER and collect one record per tweet
sentiments = []

for text, date in zip(vaccine_df['Tweet'], vaccine_df['Date']):
    try:
        scores = analyzer.polarity_scores(text)
    except AttributeError:
        # Raised when the analyzer cannot handle the value (for example a
        # non-text cell); such tweets are skipped rather than aborting the run.
        continue

    sentiments.append({
        "text": text,
        "date": date,
        "compound": scores["compound"],
        "positive": scores["pos"],
        "negative": scores["neg"],
        "neutral": scores["neu"],
    })

# One row per scored tweet
sentiment_df = pd.DataFrame(sentiments)

sentiment_df.head()

Unnamed: 0,text,date,compound,positive,negative,neutral
0,White House News Conference today at 6:00 P.M....,2020-09-23,0.8188,0.32,0.0,0.68
1,Joe Biden delivered remarks to union members a...,2020-09-18,0.34,0.094,0.0,0.906
2,Joe Biden says this is a race between Scranton...,2020-09-18,-0.6476,0.0,0.115,0.885
3,Sleepy Joe Biden just said that he wished I cl...,2020-09-18,-0.0387,0.102,0.106,0.793
4,Biden FAILED BADLY with the Swine Flu. It was ...,2020-09-18,-0.9092,0.042,0.322,0.636


In [12]:
# Average the per-tweet sentiment scores for each calendar day.
# (Alternative not implemented here -- "option 2": combine all the text from
# all the tweets on that day and run sentiment analysis once on the result.)
#
# The score columns are selected with a *list* (double brackets). The previous
# tuple form `[...]['compound','positive','negative','neutral']` triggered a
# FutureWarning on older pandas and raises on pandas >= 2.0.
daily_sent_df = (
    sentiment_df
    .groupby('date')[['compound', 'positive', 'negative', 'neutral']]
    .mean()
    .reset_index()
)

# creating an empty df with every date on which there was at least one tweet
all_dates = trump_df['Date'].unique()
placeholder = pd.DataFrame(all_dates, columns=['date'])

# joining dfs on the date: days that have tweets but none flagged as
# COVID-related come through with NaN scores
placeholder = placeholder.set_index('date')
daily_sent_df = daily_sent_df.set_index('date')
df = pd.concat([placeholder, daily_sent_df], axis=1, join='outer')

df = df.sort_index()
df.head(15)


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,compound,positive,negative,neutral
2020-01-02,,,,
2020-01-03,,,,
2020-01-04,,,,
2020-01-05,,,,
2020-01-06,,,,
2020-01-07,,,,
2020-01-08,,,,
2020-01-09,,,,
2020-01-10,,,,
2020-01-11,,,,


In [13]:
# Strategies considered for days that have no COVID-related sentiment score:
# option 1: fill from the last available date
# option 2: average all available entries
# option 3: put in zeros

# option 1 -- carry the most recent daily scores forward.
# `DataFrame.ffill()` replaces `fillna(method='ffill')`, whose `method`
# argument is deprecated since pandas 2.1.
df_clean = df.ffill()

# Rows before the first scored day have nothing to carry forward; set them
# to 0 (originally noted as Jan 1 to Jan 15, where there were no virus tweets)
df_clean = df_clean.fillna(0)
df_clean.head(15)

Unnamed: 0,compound,positive,negative,neutral
2020-01-02,0.0,0.0,0.0,0.0
2020-01-03,0.0,0.0,0.0,0.0
2020-01-04,0.0,0.0,0.0,0.0
2020-01-05,0.0,0.0,0.0,0.0
2020-01-06,0.0,0.0,0.0,0.0
2020-01-07,0.0,0.0,0.0,0.0
2020-01-08,0.0,0.0,0.0,0.0
2020-01-09,0.0,0.0,0.0,0.0
2020-01-10,0.0,0.0,0.0,0.0
2020-01-11,0.0,0.0,0.0,0.0


In [89]:
# Restrict the daily series to NYSE trading sessions
nyse = mcal.get_calendar('NYSE')

# Trading days in the study window, formatted as 'YYYY-MM-DD' strings so they
# line up with the string dates used as the index of df_clean.
# Index.to_frame() yields a one-column frame (column label 0) indexed by
# those same strings.
valid_days = (
    nyse.valid_days(start_date='2020-01-01', end_date='2020-09-24')
    .strftime('%Y-%m-%d')
    .to_frame()
)

# Inner join on the index keeps only dates present in both frames; the helper
# column 0 contributed by valid_days is then discarded.
df_final = pd.concat(
    [valid_days, df_clean[['compound']].copy()],
    join='inner',
    axis=1,
).drop(columns=0)

df_final.head(10)

Unnamed: 0,compound
2020-01-02,0.0
2020-01-03,0.0
2020-01-06,0.0
2020-01-07,0.0
2020-01-08,0.0
2020-01-09,0.0
2020-01-10,0.0
2020-01-13,0.0
2020-01-14,0.0
2020-01-15,0.0


In [90]:
# Persist the trading-day compound sentiment series (the index is written as the first CSV column)
df_final.to_csv('trump_sentiment_Jan_2020_Sep_2020.csv')

----------------
# Code Graveyard
Code that was written, but not used.

In [80]:

# One-hot encode each day's dominant sentiment class (positive / negative /
# neutral). A day whose three scores are all zero gets no class at all.
pos_list, neg_list, neu_list = [], [], []

for day_scores in zip(df_clean['positive'], df_clean['negative'], df_clean['neutral']):
    # Position of the largest score; on ties the earliest position wins
    # (positive, then negative, then neutral).
    top = day_scores.index(max(day_scores))

    # Only when 'positive' is the winner do we check for the all-zero case
    no_signal = top == 0 and all(score == 0 for score in day_scores)
    flags = [0, 0, 0] if no_signal else [int(slot == top) for slot in range(3)]

    pos_list.append(flags[0])
    neg_list.append(flags[1])
    neu_list.append(flags[2])

sent_score_only = pd.DataFrame()
# sent_score_only['date'] = list(all_dates)
for label, values in (('positive', pos_list), ('negative', neg_list), ('neutral', neu_list)):
    sent_score_only[label] = values

sent_score_only
# Abandoned idea: classify on the compound score instead, treating
# -0.5 < compound < 0.5 as neutral (never finished).

Unnamed: 0,positive,negative,neutral
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
...,...,...,...
258,0,0,1
259,0,0,1
260,0,0,1
261,0,0,1
