# Covid19 - Twitter data extraction
by Victoria, Maha, Gopi

## Table of contents
- Introduction
- Authentication
    - Twitter
    - Google sheets
- Gathering data & storing


## Introduction
This notebook is part of the project developed for the FLT Big Data Hackathon, whose objective is to create interesting and trustworthy analyses and visualizations about the COVID19 situation and its correlation with the stock market. 

In this notebook we use the Twitter API to retrieve tweets related to COVID19 hashtags and economic tags, perform a sentiment analysis on them, and store the results programmatically in a Google Sheets file. 

In [1]:
!pip install pycountry



In [2]:
#Load important libraries
import gspread 
from df2gspread import df2gspread as d2g
from oauth2client.service_account import ServiceAccountCredentials
import json
import tweepy
from textblob import TextBlob
from tweepy import Stream
from tweepy import StreamListener
import pandas as pd
import re
import csv
import nltk
from  geopy.geocoders import Nominatim
from datetime import datetime
import pycountry
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\v.perez\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Authentication
### Twitter

In [3]:
# Load the Twitter credentials from a local JSON file so no secret is hardcoded
# in the notebook. Expected keys: consumer_key, consumer_secret,
# access_token_key, access_token_secret.
with open("covid19-sentanalysis-twitter_credentials.json") as datafile:
    data = json.load(datafile)

# Define the keys
consumer_key = data['consumer_key']
consumer_secret = data['consumer_secret']

access_token = data['access_token_key']
access_token_secret = data['access_token_secret']


# Create the auth object
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Create the API client; wait (and notify) when a rate limit is hit instead of
# failing the request.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

try:
    api.verify_credentials()
    print("Authentication OK")
except Exception as exc:
    # Catch Exception rather than using a bare except so Ctrl-C / kernel
    # interrupts still propagate, and show the cause instead of hiding it.
    print("Error during authentication", repr(exc))

# Print 5 tweets for testing purposes - should be deleted afterwards
home_tweets = api.home_timeline(count=5)
print("printing tweets from timeline \n ")
for tweet in home_tweets:
    print(tweet.text)
    print("")
    print("")

Authentication OK
printing tweets from timeline 
 
#AHORA Vuelven a extender la cuarentena obligatoria, ahora hasta el 7 de junio

https://t.co/qb8rjD1xV2

RT @normanesnombre: ojalá alguien me cuidara tanto como Alberto está cuidando a Larreta.

RT @ElDedoAcusador: Harta de que hablen de una super cuarentena en Argentina cuando en muchisimos lugares del pais, definitivamente muchos…

#urgente @alferdez confirmó que la cuarentena seguirá hasta el 7 de junio https://t.co/XF0Wx2ud1i

RT @CharlotteBRF: Some intermediate &amp; advanced #JavaScript concepts very well explained in @LadybugPodcast 🐞 https://t.co/Iv9Zq6zWyw . This…



### Google sheets

In [4]:
# API scopes requested for the service account
scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']

# Build service-account credentials from the key file and authorise gspread
google_key_file = 'service_key.json'
credentials = ServiceAccountCredentials.from_json_keyfile_name(
    google_key_file,
    scope,
)
gc = gspread.authorize(credentials)

# Target spreadsheet (by key) and the worksheet that receives the rows
spreadsheet_key = '1auoQ9XanosnM7RUInzqeZi9EIgwtCtmtubNpXrfF6OM'
wks_name = 'sentimentAnalysis'

# Open the spreadsheet, then the worksheet inside it
book = gc.open_by_key(spreadsheet_key)
worksheet = book.worksheet(wks_name)

## Gathering data & storing
**Get the Twitter stream and run sentiment analysis in real time**

In [5]:
# Column order used when the collected tweets are turned into worksheet rows.
header_name = [
    'id',
    'user_id',
    'Text',
    'created_at',
    'timestamp',
    'location',
    'latitude',
    'longitude',
    'country',
    'country_code',
    'followers_count',
    'hashtags',
    'polarity',
    'subjectivity',
]

class Listener(StreamListener):
    """Stream listener that scores tweets and uploads them to the sheet in batches.

    For each incoming message the listener cleans the tweet text, sums the
    TextBlob polarity/subjectivity of its sentences, geocodes the author's
    profile location and, when coordinates are found, queues one row whose
    keys follow the module-level ``header_name``. Once ``max_tweets`` messages
    have been processed without error, the queued rows are appended to the
    ``wks_name`` worksheet of ``book`` and the queue is emptied.

    Parameters
    ----------
    max_tweets : int, optional
        Number of successfully processed messages between uploads (default 5).
    """

    def __init__(self, max_tweets=5):
        super().__init__()
        self.max_tweets = max_tweets
        self.tweet_count = 0
        # Identify this application explicitly instead of relying on geopy's
        # default user agent.
        self.geolocator = Nominatim(user_agent='covid19-sentanalysis-twitter')
        self.tweet_list = []

    @staticmethod
    def _clean_text(text):
        """Return ``text`` without @mentions, URLs and the retweet marker."""
        # Punctuation is deliberately kept so TextBlob can still split the
        # text into sentences.
        text = re.sub(r'(@[A-Za-z0-9_]+)|(\w+://\S+)', ' ', text)
        # Word boundaries keep words that merely contain "RT" intact.
        text = re.sub(r'\bRT\b', ' ', text)
        return ' '.join(text.split())

    @staticmethod
    def _sentiment(text):
        """Return (polarity, subjectivity) summed over the sentences of ``text``."""
        polarity = 0
        subjectivity = 0
        for sentence in TextBlob(text).sentences:
            polarity += round(sentence.sentiment.polarity, 2)
            subjectivity += round(sentence.sentiment.subjectivity, 2)
        return polarity, subjectivity

    @staticmethod
    def _to_timestamp(created_at):
        """Convert a ``created_at`` string into integer epoch seconds."""
        # Parsing the offset with %z yields an aware datetime, so the result
        # does not depend on the time zone of the machine running the notebook.
        parsed = datetime.strptime(created_at, '%a %b %d %H:%M:%S %z %Y')
        return int(parsed.timestamp())

    @staticmethod
    def _country_code(country):
        """Return the alpha-3 code for ``country``, or None when not found."""
        if not country:
            return None
        # The geocoder may report either the official or the common name, so
        # try both; names that match neither keep a None code.
        match = pycountry.countries.get(official_name=country)
        if not match:
            match = pycountry.countries.get(name=country)
        return match.alpha_3 if match else None

    def _locate(self, place):
        """Geocode ``place`` and return (lat, long, country, country_code).

        Every element is None when ``place`` is empty or cannot be geocoded;
        country and country_code are None when the reverse lookup yields no
        country.
        """
        if not place:
            return None, None, None, None
        loc = self.geolocator.geocode(place)
        if not loc:
            return None, None, None, None
        lat = loc.latitude
        long = loc.longitude
        reverse = self.geolocator.reverse(str(lat) + ',' + str(long))
        # A missing reverse result or address should not discard the tweet.
        address = reverse.raw.get('address', {}) if reverse else {}
        country = address.get('country')
        return lat, long, country, self._country_code(country)

    def _build_row(self, raw_twitts):
        """Build the worksheet row for one decoded tweet, or None without coordinates."""
        polarity, subjectivity = self._sentiment(self._clean_text(raw_twitts['text']))
        ts = self._to_timestamp(raw_twitts['created_at'])
        lat, long, country, country_code = self._locate(raw_twitts['user']['location'])

        # Compare against None so a coordinate of exactly 0.0 is still stored.
        if lat is None or long is None:
            return None

        return {'id': raw_twitts['id'],
                'user_id': raw_twitts['user']['id'],
                'Text': raw_twitts['text'],
                'created_at': raw_twitts['created_at'],
                'timestamp': ts,
                'location': raw_twitts['user']['location'],
                'latitude': lat,
                'longitude': long,
                'country': country,
                'country_code': country_code,
                'followers_count': raw_twitts['user']['followers_count'],
                'hashtags': ''.join(re.findall(r'\B#\w*[a-zA-Z]+\w*', raw_twitts['text'])),
                'polarity': polarity,
                'subjectivity': subjectivity}

    def _upload_batch(self):
        """Append the queued rows to the worksheet and reset the batch state."""
        # Going through a DataFrame fixes the column order to header_name.
        df_tweet_list = pd.DataFrame(self.tweet_list, columns=header_name)
        values = df_tweet_list.values.tolist()
        if values:
            book.values_append(wks_name, {'valueInputOption': 'USER_ENTERED'}, {'values': values})
        # Empty the queue so these rows are not appended again with the next batch.
        self.tweet_list = []
        print("completed")
        self.tweet_count = 0

    def on_data(self, data):
        """Handle one raw stream message (a JSON string)."""
        raw_twitts = json.loads(data)
        try:
            info = self._build_row(raw_twitts)
        except Exception as exc:
            # Messages that cannot be processed (missing keys, geocoder
            # failures, ...) are reported and skipped; they do not count
            # towards the batch size.
            print('ERROR got', repr(exc))
            return
        if info is not None:
            self.tweet_list.append(info)
        self.tweet_count += 1
        # Once the batch size is reached, write the queued rows to the sheet.
        if self.tweet_count >= self.max_tweets:
            self._upload_batch()

    def on_error(self, status):
        """Report the status code passed by the stream and ask it to stop."""
        print(status)
        # NOTE(review): returning False is expected to make tweepy disconnect
        # instead of retrying (relevant for HTTP 420 rate limiting) - confirm
        # against the installed tweepy version.
        return False

In [6]:
# Rebuild the OAuth handler that the streaming connection will use.
auth = tweepy.OAuthHandler(
    consumer_key=consumer_key,
    consumer_secret=consumer_secret,
)
auth.set_access_token(key=access_token, secret=access_token_secret)

In [13]:
# Keywords / phrases the streaming filter should match.
track_terms = [
    'covid',
    'covid19',
    'economic pandemic',
    'chinese virus impact',
    'lockdown',
    'lockdown recession',
]

# Open the stream with a fresh listener and restrict it to English tweets.
stream_listener = Listener()
twitter_stream = Stream(auth, stream_listener)
twitter_stream.filter(track=track_terms, languages=['en'])

  # Remove the CWD from sys.path while we load stuff.


completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
ERROR got
completed
completed
completed
ERROR got
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
completed
ERROR got
completed
completed
completed
completed
completed


ProtocolError: ('Connection broken: IncompleteRead(0 bytes read)', IncompleteRead(0 bytes read))

In [None]:
# Scratch check: look up the pycountry record whose alpha_2 code is 'DE'.
pycountry.countries.get(alpha_2='DE')