## Covid19 Tweets EDA and Text Sentiment Analysis

In [4]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
# import nltk

%matplotlib inline
plt.style.use('ggplot')

In [3]:
#To set up JupyterLab
#1. Make sure docker is running. If another container is running, use 'docker rm [name]' to remove it
#2. Run docker container:
#   'sudo docker run -d --name sparkbook -p 8881:8888 -v "$PWD":/home/jovyan/work jupyter/pyspark-notebook start.sh jupyter lab --LabApp.token='' '
#   'docker exec -it sparkbook bash'
#3. Open browser and go to 'localhost:8881'

In [5]:
#Setting up pyspark session
import pyspark as ps

spark = (ps.sql.SparkSession
            .builder
            .master('local[4]')
            .appName('Covid19_tweets')
            .getOrCreate())

sc = spark.sparkContext
sc

In [7]:
covid_data = pd.read_csv('covid19_tweets.csv')
covid_data.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ᏉᎥ☻լꂅϮ,astroworld,wednesday addams as a disney princess keepin i...,2017-05-26 05:46:42,624,950,18775,False,2020-07-25 12:27:21,If I smelled the scent of hand sanitizers toda...,,Twitter for iPhone,False
1,Tom Basile 🇺🇸,"New York, NY","Husband, Father, Columnist & Commentator. Auth...",2009-04-16 20:06:23,2253,1677,24,True,2020-07-25 12:27:17,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,,Twitter for Android,False
2,Time4fisticuffs,"Pewee Valley, KY",#Christian #Catholic #Conservative #Reagan #Re...,2009-02-28 18:57:41,9275,9525,7254,False,2020-07-25 12:27:14,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],Twitter for Android,False
3,ethel mertz,Stuck in the Middle,#Browns #Indians #ClevelandProud #[]_[] #Cavs ...,2019-03-07 01:45:06,197,987,1488,False,2020-07-25 12:27:10,@brookbanktv The one gift #COVID19 has give me...,['COVID19'],Twitter for iPhone,False
4,DIPR-J&K,Jammu and Kashmir,🖊️Official Twitter handle of Department of Inf...,2017-02-12 06:45:15,101009,168,101,False,2020-07-25 12:27:08,25 July : Media Bulletin on Novel #CoronaVirus...,"['CoronaVirusUpdates', 'COVID19']",Twitter for Android,False


In [8]:
covid_data.shape

(140101, 13)

In [9]:
#Removing duplicates
covid_data.drop_duplicates()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ᏉᎥ☻լꂅϮ,astroworld,wednesday addams as a disney princess keepin i...,2017-05-26 05:46:42,624,950,18775,False,2020-07-25 12:27:21,If I smelled the scent of hand sanitizers toda...,,Twitter for iPhone,False
1,Tom Basile 🇺🇸,"New York, NY","Husband, Father, Columnist & Commentator. Auth...",2009-04-16 20:06:23,2253,1677,24,True,2020-07-25 12:27:17,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,,Twitter for Android,False
2,Time4fisticuffs,"Pewee Valley, KY",#Christian #Catholic #Conservative #Reagan #Re...,2009-02-28 18:57:41,9275,9525,7254,False,2020-07-25 12:27:14,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],Twitter for Android,False
3,ethel mertz,Stuck in the Middle,#Browns #Indians #ClevelandProud #[]_[] #Cavs ...,2019-03-07 01:45:06,197,987,1488,False,2020-07-25 12:27:10,@brookbanktv The one gift #COVID19 has give me...,['COVID19'],Twitter for iPhone,False
4,DIPR-J&K,Jammu and Kashmir,🖊️Official Twitter handle of Department of Inf...,2017-02-12 06:45:15,101009,168,101,False,2020-07-25 12:27:08,25 July : Media Bulletin on Novel #CoronaVirus...,"['CoronaVirusUpdates', 'COVID19']",Twitter for Android,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
140096,Aaron Derfel,Montreal,I'm the health-care journalist at the Montreal...,2010-07-01 01:19:14,18063,197,6083,True,2020-08-16 01:56:08,"6) Meanwhile, the Institut national de santé p...",['COVID19'],Twitter Web App,False
140097,Pierre Alozie,World,"Photojournalist @ F8, FFR, NUJ",2009-03-20 12:27:35,300,1002,703,False,2020-08-16 01:56:00,"17912147095473612, 15/08/2020 12:43:02, Studen...",,Zapier.com,False
140098,Pierre Alozie,World,"Photojournalist @ F8, FFR, NUJ",2009-03-20 12:27:35,300,1002,703,False,2020-08-16 01:55:59,"18115162954178167, 15/08/2020 13:16:55, Studen...",,Zapier.com,False
140099,Pierre Alozie,World,"Photojournalist @ F8, FFR, NUJ",2009-03-20 12:27:35,300,1002,703,False,2020-08-16 01:55:58,"17899557754536668, 15/08/2020 13:16:57, Studen...",,Zapier.com,False


##### No duplicates in this dataset

In [10]:
covid_data.dtypes

user_name           object
user_location       object
user_description    object
user_created        object
user_followers       int64
user_friends         int64
user_favourites      int64
user_verified         bool
date                object
text                object
hashtags            object
source              object
is_retweet            bool
dtype: object

In [11]:
#Average number of followers user_followers
user_followers_avg = np.mean(covid_data['user_followers'])
user_followers_avg

108665.41141033969

In [12]:
#Average number of user_friends
user_friends_avg = np.mean(covid_data['user_friends'])
user_friends_avg

2104.862384993683

In [13]:
#Average number of user_favorites
user_favourites_avg = np.mean(covid_data['user_favourites'])
user_favourites_avg

14257.867802513902

In [14]:
#unique types of hashtags
print(covid_data['hashtags'].nunique())
print(covid_data['hashtags'].unique())

42175
[nan "['COVID19']" "['CoronaVirusUpdates', 'COVID19']" ...
 "['NEWS', 'NZ', 'COVID19nz', 'COVID19Aus', 'COVID19']"
 "['bbcnews', 'c4news', 'Newsnight', 'skynews', 'skypapers', 'covid19', 'coronavirus', 'covid19UK']"
 "['Chernobyl', 'COVID19']"]


In [34]:
#How many has the word 'news' in the hashtag
news_in_hashtag = []
for hashtag in covid_data['hashtags']:
    if type(hashtag) == float:
        news_in_hashtag.append(0)
    elif 'news' in hashtag or 'News' in hashtag:
        news_in_hashtag.append(1)
    else:
        news_in_hashtag.append(0)

print('number of hashtags containing "news": {}'.format(sum(news_in_hashtag)))

number of hashtags containing "news": 1204


In [35]:
#adding news_in_hashtag into the dataframe
covid_data['news_in_hashtag'] = news_in_hashtag

In [32]:
#How many has the word 'covid' in the hashtag
covid_in_hashtag = []
for hashtag in covid_data['hashtags']:
    if type(hashtag) == float:
        covid_in_hashtag.append(0)
    elif 'Covid' in hashtag or 'covid' in hashtag:
        covid_in_hashtag.append(1)
    else:
        covid_in_hashtag.append(0)

print('number of hashtags containing "covid": {}'.format(sum(covid_in_hashtag)))

number of hashtags containing "covid": 17086


In [33]:
#adding covid_in_hashtag into the dataframe
covid_data['covid_in_hashtag'] = covid_in_hashtag

In [17]:
covid_data['hashtags'].isnull().sum()

40162

In [18]:
#Number of unique users
print('number of unique user_names: ', covid_data['user_name'].nunique())
covid_data['user_name'].unique()

number of unique user_names:  76144


array(['ᏉᎥ☻լꂅϮ', 'Tom Basile 🇺🇸', 'Time4fisticuffs', ...,
       'The REALLY Old Guard', 'RANT-A-DAD', 'Pierre Alozie'],
      dtype=object)

In [19]:
#How many Verified Users
covid_data[covid_data['user_verified']==True].count()

user_name           18295
user_location       16788
user_description    18287
user_created        18295
user_followers      18295
user_friends        18295
user_favourites     18295
user_verified       18295
date                18295
text                18295
hashtags            13812
source              18295
is_retweet          18295
dtype: int64

In [20]:
#How many Unverified Users
covid_data[covid_data['user_verified']==False].count()

user_name           121806
user_location        94899
user_description    113847
user_created        121806
user_followers      121806
user_friends        121806
user_favourites     121806
user_verified       121806
date                121806
text                121806
hashtags             86127
source              121761
is_retweet          121806
dtype: int64

In [21]:
#Types of sources
print('number of unique sources: ', covid_data['source'].nunique())
# covid_data['source'].unique()

number of unique sources:  564


In [22]:
#Which sources are the tweets from? Iphone or Android?
tweet_source = []
for source in covid_data['source'][:6]:
    print(source)

Twitter for iPhone
Twitter for Android
Twitter for Android
Twitter for iPhone
Twitter for Android
Twitter Web App


In [24]:
#Setting up the spark dataframe for Querying
covid19_df = spark.read.csv('covid19_tweets.csv',
                           header = True,
                           sep = ',',
                           inferSchema = True)

covid19_df.createOrReplaceTempView('covid19_tweets')

In [25]:
covid19_df.printSchema()

root
 |-- user_name: string (nullable = true)
 |-- user_location: string (nullable = true)
 |-- user_description: string (nullable = true)
 |-- user_created: string (nullable = true)
 |-- user_followers: string (nullable = true)
 |-- user_friends: string (nullable = true)
 |-- user_favourites: string (nullable = true)
 |-- user_verified: string (nullable = true)
 |-- date: string (nullable = true)
 |-- text: string (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- source: string (nullable = true)
 |-- is_retweet: string (nullable = true)



In [26]:
covid19_df.show(10)

+--------------------+--------------------+--------------------+-------------------+--------------+------------+---------------+-------------+-------------------+--------------------+--------------------+-------------------+----------+
|           user_name|       user_location|    user_description|       user_created|user_followers|user_friends|user_favourites|user_verified|               date|                text|            hashtags|             source|is_retweet|
+--------------------+--------------------+--------------------+-------------------+--------------+------------+---------------+-------------+-------------------+--------------------+--------------------+-------------------+----------+
|             ᏉᎥ☻լꂅϮ|          astroworld|wednesday addams ...|2017-05-26 05:46:42|           624|         950|          18775|        False|2020-07-25 12:27:21|If I smelled the ...|                null| Twitter for iPhone|     False|
|     Tom Basile 🇺🇸|        New York, NY|Husband, Father,

In [29]:
#Creates a text label about each bar in *rects*, displaying its height
def autolabel_percent(rects, orientation='vert'):
    #Prints percentage above bars for vertical bars
    if orientation=='vert':
        for rect in rects:
            height - rect.get_height()
            ax.annotate('{0:2f}%'.format(height),
                       xy=(rect.get_x()+rect.get_width()/2, height),
                       xytext=(0,3),
                       weight='bold',
                       textcoords='offset points',
                       ha='center', va='botton',
                       size=15)
    elif orientation=='hort':
        for rect in rects:
            width = rect.get_width()
            ax.annotate("{0:2f}%".format(width),
                       xy=(width, rect.get_y() + rect.get_height()/2),
                       xytext=(3,-6),
                       textcoords='offset points',
                       size=15)

In [30]:
#Print functions for baar graphs specifically
def print_bar(x, y, x_label, y_label, title='insert title', orientation='vert', color='blue',
             width=0.65, fig_size=(23,7), percentage=False, weight='bold', tick_size=20, title_size=30):
    
    #TODO: might have a problem with bars_for_annotation being called in scope
    fig, ax = plt.subplots(figsize=fig_size)
    if orientation=='vert':
        bars_for_annotation = ax.bar(x, y, color=color, align='center', width=width)
    elif orientation=='hort':
        bars_for_annotation = ax.barh(x, y, color=color, align='center')
        
    plt.xticks(size=13, rotation=90)
    plt.yticks(size=15)
    plt.xlabel(x_label, size=tick_size, color=color)
    plt.ylabel(y_label, size=tick_size, color=color)
    plt.title(title, fontsize=title_size, color=color)
    
    autolabel_percentage(bars_for_annotation, orientation)
    
    plt.show()

In [None]:
#Converting pandas df to spark sql