In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# For uploading and accessing the data
import geopandas as gpd
import mapclassify
import csv
import re
import os
import string
import pickle
from collections import Counter

# For visualizations
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#Plotly Libraries
import plotly.express as px
import plotly.graph_objects as go
#import plotly.figure_factory as ff
#from plotly.colors import n_colors
from plotly.subplots import make_subplots
# Minmax scaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
nltk.download('stopwords')
nltk.download('vader_lexicon')

In [None]:
pd.options.display.max_colwidth = 100

## **Visualization**: Can you create a visualization of the growth of interest. Something like a heat map of frequency of hashtags by geography.

In [None]:
early_apr_dir = "/kaggle/input/coronavirus-covid19-tweets-early-april"
late_apr_dir = "/kaggle/input/coronavirus-covid19-tweets-late-april"

In [None]:
# Create list of dataframes
list_of_geo_dfs = []
list_of_dfs = []

In [None]:
# Load every daily CSV from the two April datasets.  Rows that carry a country
# code feed the geographic heat map; a trimmed copy of every row feeds the
# text / sentiment analysis.

# Start from empty lists so re-running this cell does not append every file a
# second time (which would double all tweet counts).
list_of_geo_dfs = []
list_of_dfs = []

columns_to_display = ['created_at', 'text', 'lang']
tweet_dirs = {early_apr_dir, late_apr_dir}

for dirname, _, filenames in os.walk("/kaggle/input"):
    if dirname not in tweet_dirs:
        continue
    # os.walk lists files in arbitrary order; sort so runs are reproducible.
    for filename in sorted(filenames):
        csvfile = os.path.join(dirname, filename)
        print(csvfile)
        temp_df = pd.read_csv(csvfile)

        # Only rows with a country code can be placed on the map.
        country_code_df = temp_df[temp_df['country_code'].notna()]
        list_of_geo_dfs.append(country_code_df)

        # .copy() gives an independent frame, so cleaning its 'text' column
        # later does not assign into a slice of temp_df.
        trimmed_df = temp_df[columns_to_display].copy()
        list_of_dfs.append(trimmed_df)

In [None]:
# May take awhile to concat
merged_geo_df = pd.concat(list_of_geo_dfs)
merged_geo_df

In [None]:
# May take awhile to concat
merged_df = pd.concat(list_of_dfs)
merged_df

In [None]:
# Gather tweet count by country code
# Select 'text' before aggregating so only that one column is counted; the
# result is the same Series (named 'text', indexed by country_code).
merged_geo_tweets_count = merged_geo_df.groupby('country_code')['text'].count()
merged_geo_tweets_count

In [None]:
# Get base world map
# Reads the 'naturalearth_lowres' dataset bundled with GeoPandas into a
# GeoDataFrame; its 'iso_a3' column is used as the join key further down.
# NOTE(review): `gpd.datasets` was deprecated in GeoPandas 0.14 and removed in
# 1.0, so this cell presumably needs an older GeoPandas — confirm against the
# environment this notebook runs in.
world_filepath = gpd.datasets.get_path('naturalearth_lowres')
world = gpd.read_file(world_filepath)
world.head()

In [None]:
world.info()

In [None]:
# countries_codes_and_coordinates.csv
country_code_csv = "https://gist.githubusercontent.com/tadast/8827699/raw/f5cac3d42d16b78348610fc4ec301e9234f82821/countries_codes_and_coordinates.csv"
country_df = pd.read_csv(country_code_csv)

In [None]:
country_df.head()

In [None]:
# Clean country_df
# Every column is treated as text here: drop the literal double quotes first,
# then convert the coordinate columns to numbers and trim the whitespace that
# surrounds the ISO codes.
country_df = country_df.apply(lambda column: column.str.replace('"', ""))

for coord_col in ("Latitude (average)", "Longitude (average)"):
    country_df[coord_col] = pd.to_numeric(country_df[coord_col])

for code_col in ("Alpha-2 code", "Alpha-3 code"):
    country_df[code_col] = country_df[code_col].apply(lambda code: code.strip())

In [None]:
country_df.head()

In [None]:
# Attach the alpha-2 code and average coordinates to each country polygon,
# matching on the three-letter ISO code.  Left join: polygons without a match
# are kept with NaN in the added columns.
country_lookup_cols = ['Alpha-2 code', 'Alpha-3 code', 'Latitude (average)', 'Longitude (average)']
world2 = world.merge(
    country_df[country_lookup_cols],
    how="left",
    left_on='iso_a3',
    right_on='Alpha-3 code',
)
world2.head()

In [None]:
world2.info()

In [None]:
world_tweets = world2.merge(merged_geo_tweets_count, how="left", left_on='Alpha-2 code', right_index=True)
world_tweets.head()

In [None]:
world_tweets.info()

In [None]:
# Choropleth of tweet counts per country on a continuous colour scale.
heatmap_legend = {
    'label': "Tweets by Country",
    'orientation': "horizontal",
}
heatmap = world_tweets.plot(
    column='text',
    figsize=(15, 10),
    cmap='OrRd',
    legend=True,
    legend_kwds=heatmap_legend,
)

In [None]:
#plt.savefig('heatmap.png')
heatmap.get_figure().savefig("heatmap.png")

In [None]:
heatmap2 = world_tweets.plot(column='text', figsize=(15, 10), cmap='OrRd',
                             legend=True, scheme="quantiles")

In [None]:
#plt.savefig('heatmap2.png')
heatmap2.get_figure().savefig("heatmap2.png")

In [None]:
world_tweets.sort_values(by=["text"], ascending=False).head(10)

Looking at the tweet heatmaps, the US, India, UK, Spain and Canada appear to have the most tweets, possibly reflecting growing interest and rising Coronavirus cases in those countries. The large number of US tweets (about 178,749, compared to fewer than 7,100 for each of the others) might be skewed by the fact that some tweets may not have a country code attribute or any geographic data associated with them.

## **Analysis**: Can you do a sentiment analysis of people’s reaction to Coronavirus - fear, excitement, nervousness, etc..

In [None]:
cleaned_df = []

In [None]:
# This cell used to re-read every CSV in the two April directories, but each
# iteration only overwrote `sent_df`, and the single-day analysis below rebuilds
# `sent_df` from `first_df` before it is ever used.  The per-file frames are
# already held in `list_of_dfs`, so the redundant reload has been dropped.


In [None]:
# We will be analyzing the tweets from 3/29 to conduct sentiment analysis
first_csv = "/kaggle/input/coronavirus-covid19-tweets-early-april/2020-03-29 Coronavirus Tweets.CSV"
first_df = pd.read_csv(first_csv)
first_df.head()

In [None]:
# Keep only the columns needed for the sentiment analysis.  Take an explicit
# copy: later cells overwrite sent_df['text'], and assigning into a slice of
# first_df triggers pandas' SettingWithCopyWarning.
columns_to_display = ['created_at', 'text', 'lang']
sent_df = first_df[columns_to_display].copy()
sent_df.head()

### Tweet text cleaning

In [None]:
texts = first_df["text"]
texts

In [None]:
# Remove url
def remove_url(x):
    """Return str(x) with every 'http...' run (up to the next whitespace) removed."""
    return re.sub(r'http\S+', '', str(x))


texts_lr = texts.apply(remove_url)
texts_lr

In [None]:
# To lowercase
def to_lower(x):
    """Return x lower-cased."""
    return x.lower()


texts_lr_lc = texts_lr.apply(to_lower)
texts_lr_lc

In [None]:
# Remove punctuation
def remove_puncs(x):
    """Return x with every character in string.punctuation deleted."""
    deletion_table = str.maketrans('', '', string.punctuation)
    return x.translate(deletion_table)


texts_lr_lc_np = texts_lr_lc.apply(remove_puncs)
texts_lr_lc_np

In [None]:
# Remove stopwords
more_words=['say','going','like','U','u','yankees','diane3443','today','hey','covid','#coronavirus', '#coronavirusoutbreak', '#coronavirusPandemic', '#covid19', '#covid_19', '#epitwitter', '#ihavecorona', 'amp', 'coronavirus', 'covid19', 'coronaupdate', 'coronavirusoutbreak', 'corona']
stop_words=set(stopwords.words('english')) #nltk package
stop_words.update(stopwords.words('spanish'))
stop_words.update(more_words)

# The tweets were already lower-cased and stripped of punctuation above, so
# custom entries such as 'U', '#covid_19' or '#coronavirusPandemic' can never
# match as written.  Also add each custom word in the same normalised form the
# text is in (e.g. 'coronaviruspandemic', 'epitwitter', 'ihavecorona').
punct_table = str.maketrans('', '', string.punctuation)
stop_words.update(word.lower().translate(punct_table) for word in more_words)


def remove_words(x):
    """Return x with every whitespace-separated token found in stop_words dropped."""
    return ' '.join(word for word in x.split() if word not in stop_words)


texts_lr_lc_np_ns = texts_lr_lc_np.apply(remove_words)
texts_lr_lc_np_ns

In [None]:
words_list=[word for line in texts_lr_lc_np_ns for word in line.split()]
words_list[:5]

In [None]:
word_counts=Counter(words_list).most_common(50)
word_df=pd.DataFrame(word_counts)
word_df.columns=['word','frq']
display(word_df.head(50))

In [None]:
px.bar(word_df,x='word',y='frq',title='Most common words')

Looking at the most common words in the tweets, some words like cases, lockdown and deaths seem to portray a negative reaction to Coronavirus. In addition, precautionary words like stayathomeandstaysafe and home may carry neutral connotations as well.

In [None]:
display(sent_df.head(5))
sent_df['text'] = texts_lr_lc_np_ns
display(sent_df.head(5))

In [None]:
def clean_text(text):
    '''Strip noise from a single tweet string.

    Removes, in order: text in square brackets, links (http(s):// or www.),
    anything between angle brackets, ASCII punctuation, newline characters and
    every word that contains a digit.  Case is left untouched; lower-case the
    text beforehand if that is required.

    Args:
        text: the tweet text as a str.

    Returns:
        The cleaned string.  Whitespace around removed items is kept, so the
        result may contain runs of spaces.
    '''
    # Raw strings: '\[', '\S', '\w' and '\d' are invalid escape sequences in an
    # ordinary string literal and raise a warning on recent Python versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
sent_df['text'] = sent_df['text'].apply(lambda x: clean_text(x))
display(sent_df)

In [None]:
# Compiled once here rather than on every call.
# The previous catch-all range U+24C2..U+1F251 also spanned CJK ideographs,
# kana and Hangul, so it deleted ordinary Chinese/Japanese/Korean text.  It is
# replaced by explicit symbol blocks that do not overlap any script.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters, flags
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled M
    "\U0000FE00-\U0000FE0F"  # variation selectors
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return text with emoji and pictographic symbols removed.

    Only the symbol blocks listed in _EMOJI_PATTERN are stripped; letters of
    any script, digits, whitespace and ordinary punctuation are left as they
    are.

    Args:
        text: the string to clean.

    Returns:
        The string without the matched symbols.  Surrounding whitespace is
        not collapsed.
    """
    return _EMOJI_PATTERN.sub(r'', text)

In [None]:
sent_df['text']=sent_df['text'].apply(lambda x: remove_emoji(x))
display(sent_df)

In [None]:
# Sentiment Analysis
sid = SentimentIntensityAnalyzer()
ps = lambda x:sid.polarity_scores(x)
sentiment_scores = sent_df['text'].apply(ps)
sentiment_scores

In [None]:
sentiment_df=pd.DataFrame(data=list(sentiment_scores))
display(sentiment_df)

In [None]:
def labelize(x):
    """Label a compound score by its sign: 'positive', 'neutral' (exactly 0) or 'negative'."""
    if x == 0:
        return 'neutral'
    if x > 0:
        return 'positive'
    return 'negative'


sentiment_df['label'] = sentiment_df.compound.apply(labelize)
display(sentiment_df.head(10))

In [None]:
# Attach the compound score and its label to the cleaned tweets (index join).
display(sent_df.head(5))
data = sent_df.join(sentiment_df[['compound', 'label']])
display(data.head(5))

In [None]:
# value_counts().reset_index() names its columns differently across pandas
# versions ('index'/'label' before 2.0, 'label'/'count' from 2.0).  Name them
# explicitly so the bar plot can rely on 'index' (category) and 'label' (count).
counts_df = data.label.value_counts().rename_axis('index').reset_index(name='label')
display(counts_df)

In [None]:
plt.figure(figsize=(8,5))
sns_plot = sns.barplot(x='index',y='label',data=counts_df)

In [None]:
#plt.savefig('sentiment.png')
sns_plot.get_figure().savefig("sentiment.png")

Looking at the results of the sentiment analysis for this day, a majority of the sentiments were neutral and slightly positive. This could stem from the fact that these tweets are in a variety of different languages, which in turn could be interpreted as neutral.

In [None]:
# Analyze english tweets
en_data = data[data['lang'] == 'en']
en_data

In [None]:
# Name the columns explicitly: value_counts().reset_index() yields
# 'index'/'label' before pandas 2.0 but 'label'/'count' from 2.0, and the bar
# plot expects 'index' (category) and 'label' (count).
en_counts_df = en_data.label.value_counts().rename_axis('index').reset_index(name='label')
display(en_counts_df)

In [None]:
plt.figure(figsize=(8,5))
en_sns_plot = sns.barplot(x='index',y='label',data=en_counts_df)

In [None]:
# plt.savefig('en_sentiment.png')
en_sns_plot.get_figure().savefig("en_sentiment.png")

In [None]:
# Let's look at positive english tweets
en_data[en_data['label'] == 'positive']

In [None]:
# Let's look at negative english tweets
en_data[en_data['label'] == 'negative']

Taking a look at only the English tweets, we surprisingly see a decent number of positive tweets. There are certain words (like positive, remarkable, desires) with positive connotations that trigger the nltk model to lean towards a more optimistic sentiment. The model seems to "neutralize" tweets in languages other than English. It would be difficult to identify specific emotions like fear, excitement, and nervousness accurately from sentiment polarity alone, but we can gauge the general intensity of an emotion or feeling about Coronavirus for that day.

In [None]:
# Rest of data yields similar results
# Clean and analyze rest of the data
more_words=['say','going','like','U','u','yankees','diane3443','today','hey','covid','#coronavirus', '#coronavirusoutbreak', '#coronavirusPandemic', '#covid19', '#covid_19', '#epitwitter', '#ihavecorona', 'amp', 'coronavirus', 'covid19', 'coronaupdate', 'coronavirusoutbreak', 'corona']
stop_words=set(stopwords.words('english')) #nltk package
stop_words.update(stopwords.words('spanish'))
stop_words.update(more_words)

# Stop words are compared against text that is already lower-cased and free of
# punctuation, so also add each custom word in that normalised form; otherwise
# entries such as '#covid_19' or '#coronavirusPandemic' never match.
punct_table = str.maketrans('', '', string.punctuation)
stop_words.update(word.lower().translate(punct_table) for word in more_words)


def clean_tweet(raw_text):
    """Run the full cleaning pipeline on one tweet.

    Steps, in the same order as the single-day analysis above: drop URLs,
    lower-case, delete punctuation, drop stop words, then apply clean_text
    and remove_emoji.  Non-string input (e.g. NaN) is converted with str().
    """
    text = re.sub(r'http\S+', '', str(raw_text))
    text = text.lower()
    text = text.translate(punct_table)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return remove_emoji(clean_text(text))


# Replace each frame with a cleaned copy instead of assigning into what may be
# a slice of the originally loaded frame; one pass per frame instead of six.
for position, df in enumerate(list_of_dfs):
    list_of_dfs[position] = df.assign(text=df['text'].apply(clean_tweet))

In [None]:
# list_of_sentiment_dfs = []

In [None]:
# for df in list_of_dfs:
#     sid = SentimentIntensityAnalyzer()
#     ps = lambda x:sid.polarity_scores(x)
#     sentiment_scores = df['text'].apply(ps)
#     sentiment_df = pd.DataFrame(data=list(sentiment_scores))
#     labelize=lambda x:'neutral' if x==0 else('positive' if x>0 else 'negative')
#     sentiment_df['label']=sentiment_df.compound.apply(labelize)
#     data=df.join(sentiment_df.compound)
#     data=data.join(sentiment_df.label)
#     list_of_sentiment_dfs.append(data)

# Save to file
# with open("list_of_sentiment_dfs.txt", "wb") as fp:   # Pickling
#     pickle.dump(list_of_sentiment_dfs, fp)

In [None]:
# Load the precomputed per-file sentiment frames.  Presumably this is the
# output of the commented-out scoring cell above, which pickles to a file of
# the same name; despite the .txt extension it is read as binary pickle data.
# NOTE(review): pickle.load can execute arbitrary code while loading, so only
# read this file from a dataset you trust.
with open("../input/list-of-sentiment-dfs/list_of_sentiment_dfs.txt", "rb") as fp:   # Unpickling
    list_of_sentiment_dfs = pickle.load(fp)

In [None]:
merged_df2 = pd.concat(list_of_sentiment_dfs)

In [None]:
# Save to file as well
#merged_df2.to_pickle("merged_df2.pkl")

In [None]:
# merged_df2 = pd.read_pickle("merged_df2.pkl")

In [None]:
# Name the columns explicitly: value_counts().reset_index() yields
# 'index'/'label' before pandas 2.0 but 'label'/'count' from 2.0, and the bar
# plot expects 'index' (category) and 'label' (count).
counts_df = merged_df2.label.value_counts().rename_axis('index').reset_index(name='label')
display(counts_df)

In [None]:
plt.figure(figsize=(8,5))
sns_plot = sns.barplot(x='index',y='label',data=counts_df)

In [None]:
# Analyze english tweets
en_data = merged_df2[merged_df2['lang'] == 'en']
en_data

In [None]:
# Name the columns explicitly: value_counts().reset_index() yields
# 'index'/'label' before pandas 2.0 but 'label'/'count' from 2.0, and the bar
# plot expects 'index' (category) and 'label' (count).
en_counts_df = en_data.label.value_counts().rename_axis('index').reset_index(name='label')
display(en_counts_df)

In [None]:
plt.figure(figsize=(8,5))
en_sns_plot = sns.barplot(x='index',y='label',data=en_counts_df)