## Common imports

In [2]:
#Twitter code adapted from http://socialmedia-class.org/twittertutorial.html
#Twitter api docs at https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets
import json
import tweepy
import pandas as pd
import numpy as np
import requests

## Twitter api tokens and keys

In [2]:
api_key = '<api key goes here>'
api_secret_key = '<api secret key goes here>'
access_token = '<access token goes here>'
access_token_secret = '<access token secret goes here>'

## Authenticate using tweepy

In [3]:
# Setup tweepy to authenticate with Twitter credentials:

#auth = tweepy.OAuthHandler(api_key, api_secret_key)
auth = tweepy.AppAuthHandler(api_key, api_secret_key)
#auth.set_access_token(access_token, access_token_secret)

# Create the api to connect to twitter with your creadentials
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True, compression=True)
#---------------------------------------------------------------------------------------------------------------------
# wait_on_rate_limit= True;  will make the api to automatically wait for rate limits to replenish
# wait_on_rate_limit_notify= Ture;  will make the api  to print a notification when Tweepyis waiting for rate limits to replenish
#---------------------------------------------------------------------------------------------------------------------

## Function for getting tweets

In [4]:
def get_tweets(listOfTweets, keyword, numOfTweets,start_id,max_id,update):
    # Iterate through all tweets containing the given word, api search mode
    if update==1:
        tweets = tweepy.Cursor(api.search, q=keyword, tweet_mode='extended', since_id=str(start_id+1),max_id=str(max_id-1)).items(numOfTweets)
    if update==0:
        tweets = tweepy.Cursor(api.search, q=keyword, tweet_mode='extended', max_id=str(start_id-1)).items(numOfTweets)
        
    for tweet in tweets:
        try:
            #get results in json format
            status = tweet
            json_str = json.dumps(status._json)
            json_str = json.loads(json_str)
            
            #get the necessary values from each tweet
            id = json_str['id']
            screen_name = json_str['user']['screen_name']
            user_name = json_str['user']['name']
            followers = json_str['user']['followers_count']
            following = json_str['user']['friends_count']
            tweet_created_at = json_str['created_at']
            tweet_text = json_str['full_text']
            bio = json_str['user']['description']
            location = json_str['user']['location']
            #coord = json_str['coordinates']['coordinates']
            coord = ''
            no_tweets = json_str['user']['statuses_count']

            if len(json_str['entities']['hashtags'])==0:
                hashtags = ''
            else:
                hashtags = []
                for i in range(0,len(json_str['entities']['hashtags'])):
                    hashtags = pd.Series(np.append(hashtags,json_str['entities']['hashtags'][i]['text']))
                    hashtags = hashtags.str.cat(sep=' ')

            if len(json_str['entities']['urls'])==0:
                urls=''
            else:
                urls = []
                for i in range(0,len(json_str['entities']['urls'])):
                    urls = pd.Series(np.append(urls,json_str['entities']['urls'][i]['url']))
                    urls = urls.str.cat(sep=' ')

            if len(json_str['entities']['user_mentions'])==0:
                mentions = ''
            else:
                mentions = []
                for i in range(0,len(json_str['entities']['user_mentions'])):
                    mentions = pd.Series(np.append(mentions,json_str['entities']['user_mentions'][i]['screen_name']))
                    mentions = mentions.str.cat(sep=' ')

            # Add tweets in this format
            dict_ = {'Tweet ID': id,
                    'Name': screen_name,
                    'Username': user_name,
                    'Number of Followers': followers,
                    'Number Following': following,
                    'Time': tweet_created_at,
                    'Tweet': tweet_text,
                     'Hashtags': hashtags,
                     'Web': urls,
                     'Bio': bio,
                     'Location': location,
                     'Location Coordinates': coord,
                     'Number of Tweets': no_tweets,
                     'Mentions': mentions
                    }
            listOfTweets.append(dict_)
        except:
            pass 
    return listOfTweets

## Get tweets

In [5]:
listOfTweets=[] #empty array for holding tweets
#search terms
keyword = '(nicola sturgeon) OR (snp sturgeon)  OR (scotland sturgeon) OR (scottish sturgeon) OR (holyrood sturgeon)  OR (westminster sturgeon) -filter:retweets'
#no of tweets per page
numOfTweets = 100

try:
    existing_data = pd.read_pickle('pickle_files/existing_data.pickle') #if database of tweets already exists
    existing_data.to_pickle('pickle_files/existing_data_backup.pickle')
    print("success")
    since_id = max(existing_data['Tweet ID']) #find the max tweetd id from existing database
    max_id = 9999999999999999999
    for i in range(0,30):
        x = get_tweets(listOfTweets, keyword, numOfTweets, since_id,max_id,1) #get tweets with tweetd id > since_id
        df = pd.DataFrame(x) #convert to dataframe
        max_id = min(df['Tweet ID']) #reset since_id
    existing_data = existing_data.append(df) #append new results to exisiting results
    existing_data = existing_data.reset_index(drop=True) #reset the index
    existing_data = existing_data.drop_duplicates() #just in case!
    existing_data.to_pickle('pickle_files/existing_data.pickle') #save the dataframe
except: #database of tweets does not yet exist
    pass
    #print("failure")
    #max_id=1150486892451946496 #this is the start point for this exercise
    #for i in range(0,30):
    #    x = get_tweets(listOfTweets, keyword, numOfTweets, max_id,0) #get tweets with tweet id < max_id
    #    df = pd.DataFrame(x) #convert to dataframe
    #    max_id = min(df['Tweet ID']) #reset max_id
    #df.to_pickle('pickle_files/existing_data.pickle') #save the dataframe

success


In [6]:
existing_data = pd.read_pickle('pickle_files/existing_data.pickle')
print(existing_data.shape)


(14804, 14)


## Remove all tweets that contain a term from hatebase

In [8]:
hate_tweets = [] #to hold list of ids containing 'hatebase' terms
counter = 1
hate_terms = pd.read_pickle('pickle_files/hate_terms.pickle') #read in the terms from hatebase
hate_lower = hate_terms.hate_terms.str.lower() #lowercase all hate terms
existing_data_lower = pd.read_pickle('pickle_files/existing_data.pickle') #read in all the Nicola Sturgeon tweets
existing_data_lower.Tweet = existing_data_lower.Tweet.str.lower() #lowercase the Nicola Sturgeon tweets
chars = [' ','!','.'] #additional leading and trailing characters for the hatebase terms
for t in hate_lower: #loop through each hatebase term
    for lead in chars:
        for trail in chars:
            try:
                term = lead + t + trail #add the lead and trail characters
                hate_twitter = existing_data_lower[existing_data_lower.Tweet.str.contains(term)]
                hate_tweets = np.append(hate_tweets,hate_twitter.index) #add the index if the tweet contains a hatebase term
            except:
                pass
hate_tweets = np.unique(hate_tweets) #remove the duplicates
existing_clean = pd.read_pickle('pickle_files/existing_data.pickle')
existing_clean = existing_clean[~existing_clean.index.isin(hate_tweets)] #drop all the Nicola Sturgeon tweets that include a hatebase term

existing_clean.to_pickle('pickle_files/existing_clean.pickle') #save the dataframe of "clean" tweets


    


In [9]:
existing_clean.shape

(5706, 14)