# Background 
This notebook aims to study and visualise global views on climate change in light of the COVID-19 pandemic. These views are scraped from Twitter to capture the public's reaction throughout 2021, with a focus on the hashtags #climatechange and #COVID.

# Importing relevant libraries 

In [4]:
#Importing Libraries

#Standard library
import json
import re
import string
from collections import Counter

#Twitter Scraping and Data
import mysql.connector
import nltk
import numpy as np
import pandas as pd
import snscrape.modules.twitter as sntwitter
from snscrape.base import ScraperException
from tqdm.auto import tqdm

#Sentiment Analysis
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#Data Visualisation
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud

# Scraping tweets

In [6]:
#building a function to scrap twitter during specific time period and with #climatechange as keyword and only english tweets
%%time

cc_tweets = []
for i, tweet in tqdm(enumerate(sntwitter.TwitterSearchScraper('(#climatechange) lan:eng since:2021-01-01 until:2021-12-09').get_items())):
    try:
        cc_tweets.append([tweet.date,
                          tweet.id,
                          tweet.content,
                          tweet.replyCount,
                          tweet.retweetCount,
                          tweet.likeCount,
                          tweet.coordinates])
    except ScraperException:
        print('Scraper Exception... continuing')
        continue

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 12.2 µs


0it [00:01, ?it/s]


In [129]:
# Turn the scraped rows into a DataFrame, naming one column per collected tweet field
# (the 'place' column holds the value read from tweet.coordinates)
cc_tweets_df = pd.DataFrame(
    data=cc_tweets,
    columns=['date', 'tweet_id', 'text', 'replies', 'retweets', 'likes', 'place'],
)

In [130]:
# Display the climate-change tweets frame (the recorded output is an abbreviated head/tail view)
cc_tweets_df

Unnamed: 0,date,tweet_id,text,replies,retweets,likes,place
0,2021-12-08 23:59:56+00:00,1468732106507534336,Is Meat Really that Bad? [Yes.] https://t.co/A...,0,1,0,
1,2021-12-08 23:59:01+00:00,1468731877284478977,@samanthamaiden @GladysB Not surprising though...,1,9,11,
2,2021-12-08 23:58:00+00:00,1468731618428960771,"@TheRACP climate change report, co-authored by...",0,4,4,
3,2021-12-08 23:57:34+00:00,1468731508710064128,The Paris Agreement: knowledge management and ...,0,0,0,
4,2021-12-08 23:57:28+00:00,1468731487121985543,The Paris Agreement: knowledge management and ...,0,0,0,
...,...,...,...,...,...,...,...
874408,2021-01-01 00:01:03+00:00,1344795736437829633,Jax Brewery (The Katrina Portraits 3)\nhttps:/...,0,3,2,
874409,2021-01-01 00:00:43+00:00,1344795654728654851,Carriageway (The Katrina Portraits 4)\nhttps:/...,0,2,3,
874410,2021-01-01 00:00:22+00:00,1344795563359932417,Marigny Triangle (The Katrina Portraits 5)\nht...,0,1,1,
874411,2021-01-01 00:00:03+00:00,1344795486021152769,If you’re thinking of making a “green” New Yea...,0,0,1,


In [131]:
# Persist the scraped climate-change tweets as a CSV file, leaving out the DataFrame index
cc_tweets_df.to_csv(path_or_buf="climatechange_eng.csv", index=False)

In [None]:
# Inspect the dtype pandas assigned to each column of the climate-change frame
cc_tweets_df.dtypes

In [None]:
# convert the 'date' column to a datetime dtype
def date_type(df):
    """Parse the 'date' column of `df` into pandas datetimes.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned. Values that cannot be parsed become NaT
    (errors='coerce') rather than raising.
    """
    parsed_dates = pd.to_datetime(df['date'], errors='coerce')
    df['date'] = parsed_dates
    return df

In [5]:
#building a function to scrap twitter during specific time period and with #covid or #covid-19 as keywords and only english tweets
%time

covid_tweets = []
for i, tweet in tqdm(enumerate(sntwitter.TwitterSearchScraper('(#covid OR #covid-19) lang:en since:2021-01-01 until:2021-12-09').get_items())):
    try:
        covid_tweets.append([tweet.date,
                          tweet.id,
                          tweet.content,
                          tweet.replyCount,
                          tweet.retweetCount,
                          tweet.likeCount,
                          tweet.coordinates])
    except ScraperException:
        print('Scraper Exception... continuing')
        continue

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 16.2 µs


598303it [6:00:02, 27.70it/s]


ScraperException: Unable to find guest token

In [9]:
# Turn the scraped COVID rows into a DataFrame, naming one column per collected tweet field
# (the 'place' column holds the value read from tweet.coordinates)
covid2_tweets_df = pd.DataFrame(
    data=covid_tweets,
    columns=['date', 'tweet_id', 'text', 'replies', 'retweets', 'likes', 'place'],
)

In [10]:
# Persist the scraped COVID tweets as a CSV file, leaving out the DataFrame index
covid2_tweets_df.to_csv(path_or_buf="covid_2nd.csv", index=False)

In [11]:
# Display the COVID tweets frame (the recorded output is an abbreviated head/tail view)
covid2_tweets_df

Unnamed: 0,date,tweet_id,text,replies,retweets,likes,place
0,2021-12-08 23:59:56+00:00,1468732107719651329,"@LeftEye16 Kia ora, a map of locations of inte...",1,0,0,
1,2021-12-08 23:59:53+00:00,1468732093014310916,NOPE‼️ You cannot count the jobs that were los...,0,0,0,
2,2021-12-08 23:59:50+00:00,1468732082172178435,An increasing number of workplaces across Aust...,0,0,0,
3,2021-12-08 23:59:49+00:00,1468732075343851523,Final lecture of my course on Analytical Found...,0,0,12,
4,2021-12-08 23:59:43+00:00,1468732051708805124,@recneps51 What's a self respecting virus doin...,0,0,1,
...,...,...,...,...,...,...,...
598298,2021-11-26 00:51:51+00:00,1464034130878894106,COVID-19: Saudi Arabia to lift entry ban from ...,1,1,21,
598299,2021-11-26 00:51:50+00:00,1464034125342515206,REPORT: Get Ready For Hysteria Over The New #B...,14,30,42,
598300,2021-11-26 00:51:48+00:00,1464034117964779520,@leanneelford @JeanneGirvan @Cath_Tyldesley Yo...,2,0,1,
598301,2021-11-26 00:51:40+00:00,1464034084808806400,@elemare @Dai_Watson @ChanceTyColeman I agree ...,1,0,1,


# Text Pre-processing


## Filtering non-English tweets

## Dropping duplicates

In [None]:
#checking for the number of duplicate tweets
def count_duplicated_tweets(df):
    """Return how many rows of `df` repeat the 'text' of an earlier row.

    The first occurrence of each text is not counted; only the later
    repeats are. Operates on the frame passed in, not on a notebook global.
    """
    return df.duplicated(subset='text').sum()

#dropping duplicate tweets inplace
def duplicate_tweets_drop(df):
    """Drop rows whose 'text' repeats an earlier row and return the frame.

    The rows are removed from `df` itself, as before. `drop_duplicates`
    returns None when called with inplace=True, so the frame is returned
    explicitly instead of the call's result.
    """
    df.drop_duplicates(subset=['text'], inplace=True)
    return df

## Dealing with missing values

In [None]:
#counting the missing values in each column
def missing_values(df):
    """Return a Series giving, per column of `df`, the number of missing entries."""
    return df.isna().sum()

In [None]:
#dropping NaN values - excluding place column
def na_drop(df):
    """Drop rows with a missing value in any core tweet column and return the frame.

    'place' is deliberately left out of the check. The rows are removed from
    `df` itself, as before. `dropna` returns None when called with
    inplace=True, so the frame is returned explicitly instead of the call's
    result.
    """
    required_columns = ['date', 'tweet_id', 'text', 'replies', 'retweets', 'likes']
    df.dropna(subset=required_columns, inplace=True)
    return df

## Removing links & Mentions (URLs & @s)

In [None]:
#removing @mentions and URL links in tweets
def remove_usernames_links(tweet):
    """Return `tweet` as a string with @mentions and http(s) links removed.

    The input is passed through str() first, so non-string values are
    converted rather than rejected. Surrounding whitespace is left as it is,
    so a removed token can leave a double space behind.
    """
    # Raw strings: '\s' in a normal string literal is an invalid escape sequence.
    # Remove @mentions: '@' followed by any run of non-whitespace characters
    tweet = re.sub(r'@\S+', '', str(tweet))
    # Remove hyperlinks: anything starting with 'http' up to the next whitespace
    tweet = re.sub(r'http\S+', '', tweet)
    return tweet

## Removing stopwords

In [None]:
#loading the English stopword list from NLTK (this reads the corpus, it does not download it;
#presumably nltk.download('stopwords') must have been run beforehand — confirm)
stop_words=stopwords.words('english')
#function for removing stopwords from tweets
def remove_stopwords(df):
    """Strip English stopwords from the 'text' column of `df` and return `df`.

    Each text is split on whitespace, stopwords are dropped, and the rest is
    re-joined with single spaces. The column is overwritten on the frame that
    was passed in. Matching is case-insensitive, so capitalised stopwords such
    as "The" are removed too, while the casing of kept words is untouched.
    Non-string cells (e.g. NaN) are passed through unchanged.
    """
    # A set gives constant-time membership checks instead of scanning the list per word
    stopword_set = set(stop_words)

    def _strip_stopwords(text):
        if not isinstance(text, str):
            return text
        return ' '.join(word for word in text.split() if word.lower() not in stopword_set)

    df['text'] = df['text'].apply(_strip_stopwords)
    return df

In [None]:
# plotting helper for a generated word cloud
# NOTE: `plot_wordcloud` is defined a second time further down in this notebook;
# whichever definition was executed last is the one in effect.
def plot_wordcloud(wordcloud, language):
    """Show `wordcloud` as an image under a bold '<language> tweets' heading.

    Draws on a new 12x10 figure with the axes hidden and displays it
    immediately; nothing is returned.
    """
    heading = language + ' tweets\n'
    plt.figure(figsize=(12, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(heading, fontsize=18, fontweight='bold')
    plt.axis("off")
    plt.show()

In [None]:
# word cloud for English tweets
# NOTE(review): `df_eng` is not defined anywhere in the visible notebook — confirm
# which frame (and which column) this cell is meant to use.
# The comments are joined into one string: str() of a large NumPy array is
# abbreviated with '...', so almost none of the text would reach the word cloud.
english_text = ' '.join(df_eng.comments.dropna().astype(str))
wordcloud = WordCloud(max_font_size=None, max_words=200, background_color="lightgrey",
                      width=3000, height=2000,
                      stopwords=stopwords.words('english')).generate(english_text)

plot_wordcloud(wordcloud, 'English')

# Identifying sentiment using the VADER package
VADER belongs to a type of sentiment analysis that is based on lexicons of sentiment-related words. In this approach, each of the words in the lexicon is rated as to whether it is positive or negative and, in many cases, how positive or negative (more positive words have higher positive ratings and more negative words have lower negative ratings).

VADER produces four sentiment metrics from these word ratings, which you can see below. The first three, **positive, neutral and negative,** represent the proportion of the text that falls into those categories. The final metric, the **compound** score, is the sum of all of the lexicon ratings (which have been standardised to range between -1 and 1).

VADER particularly stands out for analysing social media text because it can handle some social-media specific terms in addition to word context (capitalisation, exclamation marks, sentiment terms such as really, extremely, kinda). Lastly, it also handles changes in a sentence’s sentiment intensity when it contains ‘but’, whereby the sentiment afterwards is weighted more heavily than that before. 

In [None]:
# One shared VADER analyser, created once and reused for every call below
analyser = SentimentIntensityAnalyzer()

#using polarity_scores() to get sentiment metrics
def get_sentiment_scores(text):
    """Score `text` with VADER, print the result, and return it.

    Prints the text left-aligned and padded with '-' to 40 characters,
    followed by the scores, then returns the scores object produced by
    `polarity_scores` (previously nothing was returned).
    """
    # The module-level instance is spelled `analyser`; the former `analyzer`
    # spelling raised a NameError on every call.
    snt = analyser.polarity_scores(text)
    print("{:-<40} {}".format(text, str(snt)))
    return snt

# Data Visualisation
## Using wordcloud

In [None]:
#plotting helper for a generated word cloud
def plot_wordcloud(wordcloud, title):
    """Show `wordcloud` as an image under a bold '<title> Tweets' heading.

    Draws on a new 12x10 figure with the axes hidden and displays it
    immediately; nothing is returned.
    """
    heading = title + ' Tweets\n'
    plt.figure(figsize=(12, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(heading, fontsize=18, fontweight='bold')
    plt.axis("off")
    plt.show()

In [None]:
#using the WordCloud class to visualise climate change tweets
# The tweets are joined into one string: str() of a large NumPy array is
# abbreviated with '...', so only a handful of tweets would reach the word cloud.
text = ' '.join(cc_tweets_df['text'].dropna().astype(str).str.lower())
wordcloud = WordCloud(max_font_size=None,
                      max_words=200,
                      background_color="lightgrey",
                      width=4000,
                      height=2000,
                      stopwords=stopwords.words('english')).generate(text)

plot_wordcloud(wordcloud, 'Climate Change')

In [None]:
#using the WordCloud class to visualise COVID tweets
# `covid_tweets_df` is never defined in this notebook; the COVID scrape is held
# in `covid2_tweets_df`, so that frame is used here.
# The tweets are joined into one string: str() of a large NumPy array is
# abbreviated with '...', so only a handful of tweets would reach the word cloud.
text = ' '.join(covid2_tweets_df['text'].dropna().astype(str).str.lower())
wordcloud = WordCloud(max_font_size=None,
                      max_words=200,
                      background_color="black",
                      width=4000,
                      height=2000,
                      stopwords=stopwords.words('english')).generate(text)

plot_wordcloud(wordcloud, 'COVID')

In [None]:
#practicing with flo 

In [275]:
# Compares the first `Place` value with itself; the recorded result was True.
# NOTE(review): this looks like a NaN probe (NaN is the one value that is not equal
# to itself) — confirm. `tweets_df` is not defined anywhere in the visible notebook.
tweets_df.Place.iloc[0]==tweets_df.Place.iloc[0]

True

In [None]:
def get_long_lat(row):
    """Return (longitude, latitude) taken from `row.coordinates`.

    When `row.coordinates` is None, None is returned instead of a tuple.
    """
    coords = row.coordinates
    if coords is not None:
        return coords.longitude, coords.latitude
    return coords

In [None]:
# Applies get_long_lat row by row (axis=1) and expands the returned values into columns.
# NOTE(review): `x("Place")` calls the row instead of indexing it (`x["Place"]` was
# presumably intended), and get_long_lat reads `.coordinates` from its argument, so it
# is unclear whether it should receive the row or the `Place` value — confirm intent.
# `tweets_df` is not defined anywhere in the visible notebook.
tweets_df.apply(lambda x: get_long_lat(x("Place")), axis=1, result_type="expand")

In [None]:
# Longitude of each tweet's `Place`; rows whose `Place` is None keep that value.
# np.where evaluated the `.longitude` lookup for every row before selecting, so it
# failed on the None rows it was meant to skip; the check now happens per element.
# NOTE(review): `tweets_df` is not defined anywhere in the visible notebook.
tweets_df["long"] = tweets_df.Place.apply(lambda x: x.longitude if x is not None else x)

In [None]:
# NOTE(review): stray fragment — `break` is only valid inside a loop, so this cell
# cannot run on its own. `max_tweets` is not defined anywhere in the visible notebook;
# presumably this was meant to cap one of the scraping loops — move it there or remove it.
if i>max_tweets:
        break