In [None]:
import re
import os
import sys
import json
import string

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from collections import Counter
from datetime import datetime, date, time, timedelta

import tweepy
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream

from textblob import TextBlob

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
nltk.download('punkt')

import preprocessor as p

# Apply seaborn's "whitegrid" style to every plot drawn after this point.
sns.set(style="whitegrid")

# Show every column when a DataFrame is displayed (None = no limit).
# The canonical option name is used; the previous spelling
# "display.max.columns" is not a registered option and only resolved through
# pandas' regex-based option lookup.
pd.set_option("display.max_columns", None)

In [None]:
# Load environment variables from .env files before anything reads os.environ.
from pathlib import Path
from dotenv import load_dotenv

# First pass: let python-dotenv locate a .env file on its own.
load_dotenv()

# Second pass: explicitly load the .env file in the current directory.
env_path = Path('.', '.env')
load_dotenv(dotenv_path=env_path)

In [None]:
class tweetsearch():
    '''
    Search Twitter and collect the results into pandas DataFrames.

    Two entry points are offered: ``get_tweets`` (keyword search) and
    ``get_timeline`` (a single user's timeline). Both clean each tweet's
    text, score it with TextBlob, return one row per tweet and can
    optionally append the rows to a CSV file.
    '''

    # Default output columns, in the exact order rows are assembled in
    # ``_status_to_row``.
    _DEFAULT_COLS = ['id', 'created_at', 'source', 'original_text', 'clean_text',
                     'sentiment', 'polarity', 'subjectivity', 'lang',
                     'favorite_count', 'retweet_count', 'original_author',
                     'possibly_sensitive', 'hashtags',
                     'user_mentions', 'place', 'place_coord_boundaries']

    # Row positions that are refreshed when the same tweet id shows up again.
    _FAVORITE_POS = _DEFAULT_COLS.index('favorite_count')
    _RETWEET_POS = _DEFAULT_COLS.index('retweet_count')

    # Happy emoticons
    _EMOTICONS_HAPPY = frozenset([
        ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
        ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
        '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
        'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
        '<3'
        ])

    # Sad emoticons
    _EMOTICONS_SAD = frozenset([
        ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
        ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
        ':c', ':{', '>:\\', ';('
        ])

    _EMOTICONS = _EMOTICONS_HAPPY | _EMOTICONS_SAD

    # Emoji ranges, compiled once instead of once per tweet.
    _EMOJI_PATTERN = re.compile("["
                 u"\U0001F600-\U0001F64F"  # emoticons
                 u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                 u"\U0001F680-\U0001F6FF"  # transport & map symbols
                 u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                 u"\U00002702-\U000027B0"
                 u"\U000024C2-\U0001F251"
                 "]+", flags=re.UNICODE)

    # Lazily filled cache of the NLTK English stop words.
    _stop_words = None

    def __init__(self, cols=None, auth=None):
        '''
        Create the API client.

        cols: optional list of output column names. It must have the same
              length and order as the default list and keep the 'id' and
              'created_at' names, which are used after the rows are built.
        auth: optional tweepy auth handler. When omitted, an OAuth handler
              is built from the TWITTER_API_KEY, TWITTER_API_SECRET,
              TWITTER_ACCESS_TOKEN and TWITTER_ACCESS_TOKEN_SECRET
              environment variables.
        '''
        # Copy the default so instances never share one mutable list.
        self.cols = cols if cols is not None else list(self._DEFAULT_COLS)

        if auth is None:
            # User credentials for the Twitter API come from the environment.
            consumer_key = os.environ.get('TWITTER_API_KEY')
            consumer_secret = os.environ.get('TWITTER_API_SECRET')
            access_token = os.environ.get('TWITTER_ACCESS_TOKEN')
            access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET')

            auth = OAuthHandler(consumer_key, consumer_secret)
            auth.set_access_token(access_token, access_token_secret)

        self.auth = auth
        self.api = tweepy.API(auth, wait_on_rate_limit=True)
        self.filtered_tweet = ''

    @classmethod
    def _english_stop_words(cls):
        '''Return the NLTK English stop words, loading them on first use only.'''
        if cls._stop_words is None:
            cls._stop_words = frozenset(stopwords.words('english'))
        return cls._stop_words

    def clean_tweets(self, twitter_text):
        '''
        Return the tweet text reduced to its meaningful tokens.

        The text goes through the tweet preprocessor, has colons, ellipses
        and non-ASCII characters (emojis included) stripped, and is then
        tokenized. Stop words, emoticons and punctuation tokens are dropped
        and the remaining tokens are joined with single spaces.
        '''
        tweet = p.clean(twitter_text)

        # The preprocessor leaves a colon behind after removing mentions.
        tweet = re.sub(r':', '', tweet)
        # Drop the ellipsis, both the real character and its mojibake form.
        tweet = re.sub(r'‚Ä¶|\u2026', '', tweet)

        # Replace consecutive non-ASCII characters with a space.
        tweet = re.sub(r'[^\x00-\x7F]+', ' ', tweet)

        # Remove emojis from the tweet.
        tweet = self._EMOJI_PATTERN.sub(r'', tweet)

        # Tokenize only after the cleaning above. Tokenizing first meant the
        # substitutions never reached the returned text.
        stop_words = self._english_stop_words()
        kept_tokens = [
            token for token in nltk.word_tokenize(tweet)
            if token not in stop_words
            and token not in self._EMOTICONS
            and token not in string.punctuation
        ]
        return ' '.join(kept_tokens)

    @staticmethod
    def _place_fields(status):
        '''
        Return ``(location, coordinates)`` for one tweet payload.

        location is the author's profile location ('' when unavailable);
        coordinates is the tweet place's bounding box flattened by one level,
        or None when the tweet carries no place information.
        '''
        try:
            bounding_box = status['place']['bounding_box']['coordinates']
            coordinates = [coord for ring in bounding_box for coord in ring]
        except (TypeError, KeyError):
            coordinates = None

        try:
            location = status['user']['location']
        except (TypeError, KeyError):
            location = ''

        return location, coordinates

    def _status_to_row(self, status):
        '''Build one output row (ordered like the default columns) from a tweet's JSON dict.'''
        cleaned_text = self.clean_tweets(status['full_text'])
        sentiment = TextBlob(cleaned_text).sentiment

        entities = status['entities']
        hashtags = ", ".join(tag['text'] for tag in entities['hashtags'])
        mentions = ", ".join(mention['screen_name'] for mention in entities['user_mentions'])

        location, coordinates = self._place_fields(status)

        return [
            status['id'], status['created_at'],
            status['source'], status['full_text'], cleaned_text,
            sentiment, sentiment.polarity, sentiment.subjectivity, status['lang'],
            status['favorite_count'], status['retweet_count'],
            status['user']['screen_name'],
            # Not every tweet payload carries this key.
            status.get('possibly_sensitive'),
            hashtags,
            mentions,
            # Order matters: location belongs under 'place', the bounding box
            # under 'place_coord_boundaries'. They used to be swapped.
            location,
            coordinates,
        ]

    @staticmethod
    def _append_csv(df, csvfile):
        '''Append df to csvfile, writing the header only when the file is new or empty.'''
        try:
            has_content = os.path.getsize(csvfile) > 0
        except (OSError, TypeError):
            # Missing file, or a file-like object rather than a path.
            has_content = False
        df.to_csv(csvfile, mode='a', index=True, header=not has_content, encoding="utf-8")

    def _harvest(self, pages, csvfile=None):
        '''
        Turn an iterable of result pages into a DataFrame indexed by timestamp.

        Rows are accumulated in a plain list and converted once at the end.
        A tweet id seen a second time only refreshes the favorite and retweet
        counts of the row already stored.
        '''
        rows = []
        position_by_id = {}

        for page in pages:
            for status in page:
                # The API object wraps the raw JSON payload of the tweet.
                payload = status._json

                known_position = position_by_id.get(payload['id'])
                if known_position is not None:
                    rows[known_position][self._FAVORITE_POS] = payload['favorite_count']
                    rows[known_position][self._RETWEET_POS] = payload['retweet_count']
                    continue

                position_by_id[payload['id']] = len(rows)
                rows.append(self._status_to_row(payload))

        df = pd.DataFrame(rows, columns=self.cols)
        df['timestamp'] = df['created_at'].map(pd.Timestamp)
        df = df.sort_values('timestamp').set_index('timestamp')
        df = df.drop('id', axis=1)

        if csvfile is not None:
            self._append_csv(df, csvfile)

        return df

    def get_tweets(self, keyword, csvfile=None):
        '''
        Search tweets matching ``keyword``.

        Returns a DataFrame indexed by tweet timestamp (sorted ascending)
        without the 'id' column. When ``csvfile`` is given, the rows are also
        appended to that file.
        '''
        # NOTE(review): include_rts is passed through to the search call
        # unchanged; confirm the endpoint honours it.
        cursor = tweepy.Cursor(self.api.search, q=keyword, count=100,
                               include_rts=False, tweet_mode='extended')
        return self._harvest(cursor.pages(), csvfile)

    def get_timeline(self, username, csvfile=None):
        '''
        Download the timeline of ``username``.

        Returns a DataFrame indexed by tweet timestamp (sorted ascending)
        without the 'id' column. When ``csvfile`` is given, the rows are also
        appended to that file.
        '''
        # NOTE(review): the since filter is passed through to the timeline
        # call unchanged; confirm the endpoint honours it.
        cursor = tweepy.Cursor(self.api.user_timeline, screen_name=username, count=100,
                               include_rts=False, tweet_mode='extended', since = '2019-09-01')
        return self._harvest(cursor.pages(), csvfile)

In [None]:
# Pandemic-related terms that get combined with the place and keyword queries.
additional = [
    'corona',
    'covid',
    'covid-19',
]

# Search locations, one list per country (country name first, then cities).
list_of_places_one = [
    'Kenya',
    'Nairobi',
    'Kisumu',
    'Mombasa',
]
list_of_places_two = [
    'Nigeria',
    'Lagos',
]
list_of_places_three = [
    'South Africa',
    'Johannesburg',
    'Cape Town',
]

# All countries, in the order they are processed.
all_list_of_places = [
    list_of_places_one,
    list_of_places_two,
    list_of_places_three,
]

In [None]:
# Food-related search phrases; each one is combined with every place
# (and with every pandemic term) when building search queries.
list_of_keywords = [
    'Covid-19 Food response',
    # supply and availability
    'Food Supply', 'Food Scarcity', 'Food Supply Chain', 'Food Availability',
    'Food Provision', 'Food Distribution', 'Food Rations', 'Food Web',
    'Food Network', 'Food Deficit',
    # hunger and security
    'Hunger', 'Food Poverty', 'Food Security', 'Food Insecurity', 'Food Price',
    'Starvation', 'Starving', 'Food Shortage', 'Food Safety',
    # prices
    'Cost of Food', 'Food Costs', 'Price of Food',
    'Cost of Foodstuffs', 'Foodstuffs Costs', 'Price of Foodstuffs',
    'Cost of Grocery', 'Grocery Costs', 'Price of Grocery',
]

def food_quantity(all_list_of_places):
    """Run every food-related search for each group of places.

    For each list of places the matching tweets are appended to one CSV file:
    'KenyaTweets.csv' when the list equals ``list_of_places_one``,
    'NigeriaTweets.csv' when it equals ``list_of_places_two`` and
    'SouthAfricaTweets.csv' for anything else.

    Queries are built by joining terms with " AND ":
      * 'Food', place and each term of ``additional``;
      * each keyword of ``list_of_keywords`` with each place;
      * each keyword with each place and each term of ``additional``.

    Every query is attempted independently. One that raises
    ``tweepy.TweepError`` is reported and skipped without suppressing the
    queries that follow it. Successful queries are printed as progress.
    """
    # One client is enough; it holds no per-query state.
    searcher = tweetsearch()

    def run_query(terms, file_name):
        """Search for the AND-combination of terms and append results to file_name."""
        query = " AND ".join(terms)
        try:
            searcher.get_tweets(query, csvfile=file_name)
        except tweepy.TweepError as err:
            # Best effort: report the failure instead of hiding it.
            print("skipped {!r}: {}".format(query, err))
            return
        print(query)

    for list_of_places in all_list_of_places:
        if list_of_places == list_of_places_one:
            file_name = 'KenyaTweets.csv'
        elif list_of_places == list_of_places_two:
            file_name = 'NigeriaTweets.csv'
        else:
            file_name = 'SouthAfricaTweets.csv'

        for place in list_of_places:
            for item in additional:
                run_query(['Food', place, item], file_name)

        for keyword in list_of_keywords:
            for place in list_of_places:
                run_query([keyword, place], file_name)
                for item in additional:
                    run_query([keyword, place, item], file_name)

In [None]:
# Run all searches for every country; results are appended to the
# per-country CSV files chosen inside food_quantity.
food_quantity(all_list_of_places)

In [None]:
# Read back the per-country search results written by the search step.
Kenya_Tweets, Nigeria_Tweets, South_Africa_Tweets = (
    pd.read_csv(csv_name, low_memory = False)
    for csv_name in ('KenyaTweets.csv', 'NigeriaTweets.csv', 'SouthAfricaTweets.csv')
)

In [None]:
def list_of_users(dataset):
    """Return the distinct authors found in ``dataset['original_author']``.

    Authors are returned in order of first appearance as plain Python
    values. Missing entries (NaN/None) are left out so that they are never
    mistaken for a screen name later on.
    """
    return dataset['original_author'].dropna().unique().tolist()

In [None]:
# Distinct tweet authors per country, in the order Kenya, Nigeria, South Africa.
kenyan_users, nigerian_users, south_african_users = map(
    list_of_users,
    (Kenya_Tweets, Nigeria_Tweets, South_Africa_Tweets),
)

In [None]:
# One list of authors per country, in the order Kenya, Nigeria, South Africa.
all_users = [kenyan_users, nigerian_users, south_african_users]

def user_tweets(all_users):
    """Download the timeline of every user in each group of users.

    Timelines are appended to 'KenyanTweets.csv' when the group equals
    ``kenyan_users``, to 'NigerianTweets.csv' when it equals
    ``nigerian_users`` and to 'SouthAfricanTweets.csv' for anything else.

    A user whose download raises ``tweepy.TweepError`` is reported and
    skipped; every successfully processed username is printed as progress.
    """
    # One client is enough; it holds no per-user state.
    searcher = tweetsearch()

    for users in all_users:
        if users == kenyan_users:
            file_name = 'KenyanTweets.csv'
        elif users == nigerian_users:
            file_name = 'NigerianTweets.csv'
        else:
            file_name = 'SouthAfricanTweets.csv'

        for user in users:
            try:
                searcher.get_timeline(user, csvfile=file_name)
            except tweepy.TweepError as err:
                # Best effort: report the failure instead of hiding it.
                print("skipped {!r}: {}".format(user, err))
                continue
            # Helps keep track of usernames
            print(user)

In [None]:
# Download the timeline of every collected author; results are appended to
# the per-country CSV files chosen inside user_tweets.
user_tweets(all_users)