#### Using scraped data from scrapping_starter notebook to get twitter data

> Import Libraries

In [103]:
import pandas as pd
import tweepy
from tweepy import OAuthHandler
from tweepy import API
from tweepy import Cursor
from datetime import datetime, date, time, timedelta
from collections import Counter
import os, sys
import csv

> Load dotenv to expose api keys to the application

In [104]:
from dotenv import load_dotenv
load_dotenv()

True

In [105]:
API_key="API_key"
API_secret_key="API_secret_key"
Access_token="Access_token"
Access_token_secret="Access_token_secret"
print(API_key, API_secret_key, Access_token, Access_token_secret)

API_key API_secret_key Access_token Access_token_secret


In [106]:
API_key = os.environ.get(API_key)
API_secret_key = os.environ.get(API_secret_key)
Access_token = os.environ.get(Access_token)
Access_token_secret=os.environ.get(Access_token_secret)

In [107]:
auth = OAuthHandler(API_key, API_secret_key)
auth.set_access_token(Access_token, Access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)
auth_api = API(auth)

> Testing Api

In [108]:
search_words = "#wildfires"
date_since= "2018-11-16"
# Collect tweets
tweets = tweepy.Cursor(api.search,
              q=search_words,
              lang="en",
              since=date_since).items(2)
# Iterate and print tweets
for tweet in tweets:
    print(tweet.text) 

RT @AlaskaWx: Alaska #wildfires mostly burn in the boreal forest between the Alaska &amp; Brooks Ranges, but also tundra in western/northern ar…
RT @HydroNewcomer: 📣Call for #AGU2020 #GlobalChange abstracts!

🔥What impact do #wildfires have on #water and #biodiversity? Fire Impacts o…


>> Define function to get tweet data

In [109]:
def get_tweets(handles):
    
    cols = ['id', 'name', 'screen_name', 'description', 
            'statuses_count', 'friends_count', 'followers_count', 
            'account_age_days', 'avg_daily_tweets', 'hashtags',
            'user_mentions','favorite_count', 'retweet_count',]
    
    # dataframe that would be returned at the end
    df = pd.DataFrame(columns=cols)
    #print(df)
    handle_data = []
    off_users = []
            
    if len(handles) > 0: 
        for handle in handles:
            value_list = []
            print("Getting data for " + handle)
            # this helps avoid Tweepy errors like suspended users or user not ound errors
            try:
                item = auth_api.get_user(handle)
            except tweepy.TweepError as e:
                continue
            value_list+= item.id_str, item.name, item.screen_name,\
            item.description, item.statuses_count, item.friends_count, item.followers_count
            
            #get average daily tweets
            
            no_tweets = item.statuses_count
            account_created_date = item.created_at
            delta = datetime.utcnow() - account_created_date
            account_age_days = delta.days
            value_list.append(str(account_age_days))
            #print(str(account_age_days))
            if account_age_days > 0:
                   value_list.append(int(float(no_tweets)/float(account_age_days)))
                    
                    
                    
            hashtags = []
            mentions = []
            favorite_count =[]
            retweet_count=[]
            tweet_count = 0
            end_date = datetime.utcnow() - timedelta(days=30)
            

            for status in Cursor(auth_api.user_timeline, id=handle).items():
                tweet_count+= 1
                if hasattr(status, "entities"):
                    entities = status.entities

                # get hashtags
                if "hashtags" in entities:
                    for ent in entities["hashtags"]:
                        if ent is not None:
                            if "text" in ent:
                                hashtag = ent["text"]
                                if hashtag is not None:
                                    hashtags.append(hashtag)
                # get usermentions
                if "user_mentions" in entities:
                    for ent in entities["user_mentions"]:
                        if ent is not None:
                            if "screen_name" in ent:
                                name = ent["screen_name"]
                                if name is not None:
                                    mentions.append(name)

                # get retweets    
                if hasattr(status, "retweet_count"):
                    retweets = status.retweet_count
                    if retweets is not None:
                        retweet_count.append(retweets)
                        
                # favorite count     
                if hasattr(status, "favorite_count"):
                    likes = status.favorite_count 
                    if likes is not None:
                        favorite_count.append(likes)
                if status.created_at < end_date:
                    break
                    
            
            value_list.append(len(hashtags))
            value_list.append(len(mentions))
            value_list.append(sum(favorite_count))
            value_list.append(sum(retweet_count))
            handle_data.append(value_list)
            #print(handle_data)
            #break
    #ls = {}
            #df_1 = pd.DataFrame([handle_data], columns=cols)
            #print(handle_data)
            #complete_df = pd.concat(df, df_1)
            df = df.append(pd.DataFrame([value_list], columns=cols))
            #print(df)
    return df

>> Load scraped handles into pandas 

In [110]:
afriq_users_handle = pd.read_csv('scraped_handles/top_100_influencers.csv')
afriq_users_handle = afriq_users_handle.handles.tolist()

In [111]:
afriq_users_handle[:4]

['@gettleman', '@a24media', '@andiMakinana', '@AfricaCheck']

In [112]:
afriq_govt_handle = pd.read_csv('scraped_handles/africa_govt_covid_resp.csv')
afriq_govt_handle = afriq_govt_handle.handles.to_list()
afriq_govt_handle[:4]

['@EswatiniGovern1', '@MalawiGovt', '@hagegeingob', '@FinanceSC']

> Gather Twitter Data for Influencers

In [52]:
df_inf = get_tweets(afriq_users_handle)

Getting data for @gettleman
Getting data for @a24media
Getting data for @andiMakinana
Getting data for @AfricaCheck
Getting data for @JamesCopnall
Getting data for @oafrica
Getting data for @PatrickNgowi
Getting data for @StateAfrica
Getting data for @Moadow
Getting data for @BrendanSAfrica
Getting data for @CityTshwane
Getting data for @VISI_Mag
Getting data for @beyondsafari
Getting data for @ThisIsAfricaTIA
Getting data for @sarzss
Getting data for @TheEIU_Africa
Getting data for @InvestInAfrica
Getting data for @malonebarry
Getting data for @artsouthafrica
Getting data for @KahnMorbee
Getting data for @JamalMOsman
Getting data for @iamsuede
Getting data for @mikestopforth
Getting data for @equal_education
Getting data for @t_mcconnell
Getting data for @forbeesta
Getting data for @hurricanevaness
Getting data for @BBCKarenAllen
Getting data for @jaxpanik
Getting data for @thisisafrica
Getting data for @audisouthafrica
Getting data for @ONEinAfrica
Getting data for @Hamza_Africa
Gett

In [54]:
df_inf

Unnamed: 0,id,name,screen_name,description,statuses_count,friends_count,followers_count,account_age_days,avg_daily_tweets,hashtags,user_mentions,favorite_count,retweet_count
0,305125998,Jeffrey Gettleman,gettleman,South Asia bureau chief for the New York Times...,3770,37,25700,3340,1,5,38,268,111152
0,26475943,A24 Media,a24media,Africa 24 produces compelling content that mak...,16869,3059,31285,4132,4,32,52,96,190
0,72013267,Scapegoat,AndiMakinana,In pursuit of scoops. I do not write headlines...,142223,2839,101246,3967,35,39,626,19466,398095
0,625489039,Africa Check,AfricaCheck,Africa's first independent fact-checking websi...,27317,4590,68061,2936,9,156,205,1353,1398
0,401520924,James Copnall,JamesCopnall,BBC reporter + presenter. Author A Poisonous T...,19434,5046,21958,3182,6,19,82,139,47608
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,117102398,Julius Sello Malema,Julius_S_Malema,Commander in Chief of Economic Freedom Fighter...,37192,652,3126246,3795,9,64,524,177856,286707
0,14697575,News24,News24,South Africa's premier online news resource. F...,322655,631,3575424,4453,72,227,1022,204918,89815
0,1102508781781557248,jdwtweet,SAPresident,,19,14,18,501,0,0,1,0,38
0,17962204,Gareth Cliff,GarethCliff,President of https://t.co/scMZ7lsVKF ⚜. Enquir...,31624,357,1974477,4239,7,40,161,3325,2253


> Save Gathered Twitter Data of Top African Influencers to csv file

In [57]:
df_inf.to_csv('twitter_datasets/acct_info/afriqTopInfluencers.csv', sep=',')

> Gather Twitter Data for Governement Covid19 Responders

In [64]:
df_gov = get_tweets(afriq_govt_handle)

Getting data for @EswatiniGovern1
Getting data for @MalawiGovt
Getting data for @hagegeingob
Getting data for @FinanceSC
Getting data for @PresidencyZA
Getting data for @mohzambia
Getting data for @edmnangagwa
Getting data for @MinSantedj
Getting data for @hawelti
Getting data for @StateHouseKenya
Getting data for @PaulKagame
Getting data for @M_Farmaajo
Getting data for @SouthSudanGov
Getting data for @SudanPMHamdok
Getting data for @TZSpokesperson
Getting data for @KagutaMuseveni
Getting data for @angola_Mirex
Getting data for @willynyamitwe
Getting data for @Cherif_MZ
Getting data for @Presidence_RDC
Getting data for @PresidentABO
Getting data for @PresidenceBenin
Getting data for @rochkaborepf
Getting data for @PresidenciaCV
Getting data for @AOuattara_PRCI
Getting data for @Presidency_GMB
Getting data for @NAkufoAddo
Getting data for @President_GN
Getting data for @USEmbalo
Getting data for @PresidenceMali
Getting data for @CheikhGhazouani
Getting data for @IssoufouMhm
Getting dat

> Save gathered data to csv

In [66]:
df_gov.to_csv('twitter_datasets/acct_info/afriqGovCovid19Resp.csv', sep=',')

> Define function to gather hashtags of handles

In [77]:
def get_hashtags(handles):
#    import timeit as t

    
    cols = ['id', 'name', 'screen_name', 'hashtags']#'hashtag_counts']
    
    # dataframe that would be returned at the end
    df = pd.DataFrame(columns=cols)
    #print(df)
    handle_data = []
            
    if len(handles) > 0: 
        for handle in handles:
            value_list = {}
            print("Getting hashtags for " + handle)
            # this helps avoid Tweepy errors like suspended users or user not ound errors
            try:
                item = auth_api.get_user(handle)
            except tweepy.TweepError as e:
                continue
            #value_list+= item.id_str, item.name, item.screen_name,
            value_list['id'] = item.id_str
            value_list['name'] = item.name
            value_list['screen_name'] = item.screen_name
            #value_list['hashtags'] = []
            #print(value_list, "hell")
            
            #get average daily tweets
            no_tweets = item.statuses_count
            account_created_date = item.created_at
            delta = datetime.utcnow() - account_created_date
            account_age_days = delta.days
           
            hashtags = set()
            hash_dic = {}
            tweet_count = 0
            end_date = datetime.utcnow() - timedelta(days=30)
            

            for status in Cursor(auth_api.user_timeline, id=handle).items():
            #    tweet_count+= 1
                if hasattr(status, "entities"):
                    entities = status.entities

                # get hashtags
                if "hashtags" in entities:
                    for ent in entities["hashtags"]:
                        if ent is not None:
                            if "text" in ent:
                                hashtag = ent["text"]
                                if hashtag is not None:
                                    if hashtag in hashtags:
                                        hash_dic[hashtag]+=1
                                    else:
                                        hashtags.add(hashtag)
                                        hash_dic[hashtag] = 1
                value_list['hashtags'] = hash_dic
            df = df.append(pd.DataFrame(value_list))
    return df.reset_index().rename(columns={'hashtags':'hashtags_count','index':'hashtags'})

> Gather hashtags of govt covid19 responders

In [96]:
df_gov_hashtags = get_hashtags(afriq_govt_handle)

Getting hashtags for @EswatiniGovern1
Getting hashtags for @MalawiGovt
Getting hashtags for @hagegeingob
Getting hashtags for @FinanceSC
Getting hashtags for @PresidencyZA
Getting hashtags for @mohzambia
Getting hashtags for @edmnangagwa
Getting hashtags for @MinSantedj
Getting hashtags for @hawelti
Getting hashtags for @StateHouseKenya
Getting hashtags for @PaulKagame
Getting hashtags for @M_Farmaajo
Getting hashtags for @SouthSudanGov
Getting hashtags for @SudanPMHamdok
Getting hashtags for @TZSpokesperson
Getting hashtags for @KagutaMuseveni
Getting hashtags for @angola_Mirex
Getting hashtags for @willynyamitwe
Getting hashtags for @Cherif_MZ
Getting hashtags for @Presidence_RDC
Getting hashtags for @PresidentABO
Getting hashtags for @PresidenceBenin
Getting hashtags for @rochkaborepf
Getting hashtags for @PresidenciaCV
Getting hashtags for @AOuattara_PRCI
Getting hashtags for @Presidency_GMB
Getting hashtags for @NAkufoAddo
Getting hashtags for @President_GN
Getting hashtags for @U

In [98]:
df

In [100]:
df_gov_hashtags.to_csv('twitter_datasets/acct_hashtags/govtTweets.csv')

In [None]:
df_inf_hashtags = get_hashtags(afriq_users_handle)

Getting hashtags for @gettleman
Getting hashtags for @a24media
Getting hashtags for @andiMakinana
Getting hashtags for @AfricaCheck
Getting hashtags for @JamesCopnall
