In [None]:
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import re
import string

import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *

from collections import Counter

from matplotlib import pyplot as plt
from matplotlib import ticker
import seaborn as sns


# Apply seaborn's "darkgrid" style so every plot drawn below shares one look.
sns.set(style="darkgrid")

In [None]:
# Read the tweets CSV into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer="cleaned_tweets_2021-08.csv")
df.head(n=5)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Per-column dtype and non-null count summary of the loaded frame.
df.info()

In [None]:
# Parse the timestamp column and keep only the calendar date (time of day is
# dropped).  Bracket access is used for the assignment because attribute-style
# assignment only works for columns that already exist and is easy to confuse
# with setting a plain Python attribute on the frame.
df['Datetime'] = pd.to_datetime(df['Datetime']).dt.date

# Preview a few rows rather than rendering the whole frame.
df.head()

In [None]:
# Pull the 'tweet' column out on its own; the cleaning cells below work on it.
texts = df.loc[:, 'tweet']
texts.head(n=5)

In [None]:
# Matches any run of non-whitespace characters starting with "http", which
# covers both http:// and https:// links (the previous pattern only caught
# tokens beginning with "https").
URL_PATTERN = re.compile(r'http\S+')


def remove_url(x):
    """Return ``x`` as a string with every http/https link removed.

    Missing values (None / NaN) become an empty string instead of the literal
    text ``'nan'`` / ``'None'``, so they cannot show up as words later on.
    Any other non-string value is converted with ``str`` first.
    """
    if not isinstance(x, str):
        if pd.isna(x):
            return ''
        x = str(x)
    return URL_PATTERN.sub('', x)


texts_lr = texts.apply(remove_url)
texts_lr.head()

In [None]:
def to_lower(x):
    """Return ``x`` converted to lowercase via ``x.lower()``."""
    return x.lower()


# Lowercase every URL-stripped tweet.
texts_lc = texts_lr.apply(to_lower)
texts_lc.head()

In [None]:
# Translation table that deletes every character in ``string.punctuation``.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def rmv_pcs(x):
    """Return ``x`` with all ``string.punctuation`` characters removed."""
    return x.translate(_PUNCTUATION_TABLE)


# Strip punctuation from every lowercased tweet.
texts_pcs = texts_lc.apply(rmv_pcs)
texts_pcs

In [None]:
# Load NLTK's English stop-word list.  On a machine where the corpus has never
# been downloaded the lookup raises LookupError, so fetch it once and retry
# instead of failing the whole run.
try:
    english_stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    english_stop_words = stopwords.words('english')

# Extra tokens to drop on top of the NLTK list.
update_words = ['covid', 'amp', 'coronavirus', 'covid19', 'https']
stop_words = set(english_stop_words)
stop_words.update(update_words)

# NOTE(review): the input below has already had punctuation stripped, so any
# stop-word entry that itself contains an apostrophe presumably can never
# match here - confirm whether that matters for this analysis.


def remove_words(x):
    """Return ``x`` with every whitespace-separated token in ``stop_words`` removed.

    Remaining tokens are re-joined with single spaces.
    """
    return ' '.join(word for word in x.split() if word not in stop_words)


texts_rs = texts_pcs.apply(remove_words)
texts_rs.head()

## Text Analysis

In [None]:
# Flatten the cleaned tweets into one list of whitespace-separated tokens,
# keeping the order in which they occur.
word_list = []
for cleaned_tweet in texts_rs:
    word_list.extend(cleaned_tweet.split())

word_list[:10]

In [None]:
# The 50 most frequent tokens as (word, frequency) pairs.
word_counts = Counter(word_list).most_common(50)

# Naming the columns in the constructor also works when word_counts is empty,
# where assigning two names to a column-less frame would raise.
words_df = pd.DataFrame(word_counts, columns=['word', 'frequency'])

# Drawn with seaborn/matplotlib, which this notebook imports; the previous
# call used `px`, a name that is never imported here and raised NameError.
fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(data=words_df, x='word', y='frequency', ax=ax)
ax.set_title('Most Common Words')
# Fifty category labels only fit on the axis when rotated.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

## Join Table

In [None]:
# Attach the fully cleaned tweets to the frame as a 'text' column
# (values are aligned on the index).
df = df.assign(text=texts_rs)
df.head()

In [None]:
# Re-inspect dtypes and non-null counts now that the 'text' column is attached.
df.info()

In [None]:
# Build the VADER analyzer.  Its lexicon is an NLTK resource that must be
# downloaded once; on a fresh environment construction raises LookupError,
# so fetch the lexicon and retry instead of failing the run.
try:
    sid = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sid = SentimentIntensityAnalyzer()


def ps(x):
    """Return the analyzer's polarity-score dict for the text ``x``."""
    return sid.polarity_scores(x)


# NOTE(review): the scored column is the lowercased, punctuation-free,
# stop-word-filtered text.  VADER is described as using capitalization,
# punctuation and negation cues - confirm whether scoring the original
# 'tweet' column would be more appropriate.
sentiment_scores = df['text'].apply(ps)
sentiment_scores.head()

In [None]:
# Expand the per-tweet score dicts into one column per score.  The original
# index is carried over explicitly so that later index-based joins back onto
# the tweets frame stay row-aligned even if that frame's index is not a
# default 0..n-1 range.
sentiment_df = pd.DataFrame(sentiment_scores.tolist(), index=sentiment_scores.index)
sentiment_df.head()

In [None]:
def labelize(x):
    """Map a score to a label: 0 -> 'neutral', above 0 -> 'positive', otherwise 'negative'."""
    if x == 0:
        return 'neutral'
    if x > 0:
        return 'positive'
    return 'negative'


# Label each row from its 'compound' score.
sentiment_df['label'] = sentiment_df['compound'].apply(labelize)
sentiment_df.head()

In [None]:
# Add the sentiment label to the tweets frame; the join matches rows by index.
label_column = sentiment_df['label']
data = df.join(label_column)
data.head()

In [None]:
# Number of tweets per sentiment label, turned into a two-column frame.
per_label = data['label'].value_counts()
counts_df = per_label.reset_index()
counts_df

In [None]:
# Plot straight from the value counts rather than from column names produced
# by reset_index(): those names differ between pandas versions ('index'/'label'
# before 2.0, 'label'/'count' from 2.0 on), which broke the previous call.
label_counts = data['label'].value_counts()

fig, ax = plt.subplots()
sns.barplot(x=label_counts.index.tolist(), y=label_counts.to_numpy(), ax=ax)
ax.set(xlabel='label', ylabel='tweets', title='Tweets per Sentiment Label')
plt.show()

In [None]:
# Write the labelled tweets to disk; the index is not written to the file.
data.to_csv(path_or_buf='sentiment_aug.csv', index=False)

In [None]:
# Count tweets per (day, sentiment label).
#
# .size() is used because both selected columns are grouping keys: .count()
# would have no remaining column to count and would return no value column,
# after which a two-name `columns` assignment silently mislabels the keys.
# The result has three columns: 'date', 'label' and 'counts'.
data_agg = (
    data.groupby(['Datetime', 'label'])
    .size()
    .reset_index(name='counts')
    .rename(columns={'Datetime': 'date'})
)
data_agg.head()

In [None]:
# One line per sentiment label showing its count over time.  Drawn with
# seaborn/matplotlib, which this notebook imports; the previous call used
# `px`, a name that is never imported here and raised NameError.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    # Convert the x values to real datetimes so the axis is a time axis
    # rather than a list of categories.
    data=data_agg.assign(date=pd.to_datetime(data_agg['date'])),
    x='date',
    y='counts',
    hue='label',
    ax=ax,
)
ax.set_title('COVID-19 Vaccines Sentiment Analysis')
fig.autofmt_xdate()
plt.show()