## Common imports

In [1]:
#code obtained from http://socialmedia-class.org/twittertutorial.html
#api docs at https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets
import json
import os

import numpy as np
import pandas as pd
import tweepy

## Twitter api tokens and keys

In [2]:
# Credentials are read from environment variables so real secrets never have to
# be saved in the notebook. When a variable is unset the original placeholder
# string is used, so the cell still runs (authentication will fail until real
# values are supplied).
api_key = os.environ.get('TWITTER_API_KEY', '<api key goes here>')
api_secret_key = os.environ.get('TWITTER_API_SECRET_KEY', '<api secret key goes here>')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '<access token goes here>')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '<access token secret goes here>')

## Authenticate using tweepy

In [3]:
# Authenticate with Twitter: the handler is built from the application key pair,
# then the user access token pair is attached to it.
auth = tweepy.OAuthHandler(api_key, api_secret_key)
auth.set_access_token(access_token, access_token_secret)

# Create the API client used by the rest of the notebook.
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,         # automatically wait for rate limits to replenish
    wait_on_rate_limit_notify=True,  # print a notification while Tweepy is waiting
    compression=True,
)

## Function for getting tweets

In [4]:
def hate_tweets(listOfTweets, ids):
    """Look up tweets by id and append one record per retweet to ``listOfTweets``.

    Each id in ``ids`` is fetched through the notebook-level ``api`` client
    (``statuses_lookup`` with ``tweet_mode='extended'``). For every returned
    status that carries a ``retweeted_status`` payload, a dict describing the
    retweeted tweet and its author is appended to ``listOfTweets``.

    Parameters
    ----------
    listOfTweets : list
        List that receives the records; it is mutated in place.
    ids : sequence
        Tweet ids to look up in a single request.

    Returns
    -------
    list
        The same ``listOfTweets`` object, for convenience.

    Notes
    -----
    Statuses without a ``retweeted_status`` entry (or missing any of the
    expected fields) are skipped, matching the original best-effort behaviour.
    Errors raised by the lookup call itself are not caught.
    """
    # An empty batch has nothing to look up; avoid sending a request with no ids.
    if len(ids) == 0:
        return listOfTweets

    tweets = api.statuses_lookup(ids, tweet_mode='extended')

    for status in tweets:
        try:
            payload = status._json
            original = payload['retweeted_status']
            author = original['user']
            entities = original['entities']

            # Key order is significant: it becomes the DataFrame column order.
            # NOTE(review): 'Name' holds the screen name and 'Username' the display
            # name; this looks swapped but is kept as-is because downstream data
            # may depend on it. Confirm against the NVivo export columns.
            record = {
                'Tweet ID': payload['id'],
                'Retweeted ID': original['id'],
                'Name': author['screen_name'],
                'Username': author['name'],
                'Number of Followers': author['followers_count'],
                'Number Following': author['friends_count'],
                'Time': original['created_at'],
                'Tweet': original['full_text'],
                # Entity lists are flattened to space-separated strings ('' when empty).
                'Hashtags': ' '.join(tag['text'] for tag in entities['hashtags']),
                'Web': ' '.join(link['url'] for link in entities['urls']),
                'Bio': author['description'],
                'Location': author['location'],
                'Location Coordinates': original['coordinates'],
                'Number of Tweets': author['statuses_count'],
                'Mentions': ' '.join(m['screen_name'] for m in entities['user_mentions']),
            }
        except (KeyError, TypeError, AttributeError):
            # Not a retweet, or the payload lacks an expected field: skip this
            # status, but let unrelated errors (and KeyboardInterrupt) surface.
            continue
        listOfTweets.append(record)
    return listOfTweets

## Get tweets

In [5]:
# Batch size per lookup request (the original processed the ids 100 at a time).
numOfTweets = 100

data = pd.read_csv('cps_cat4.txt', sep=" ", header=None)  # read list of hateful tweet ids

# Ceiling division: number of batches needed to cover every row, without a
# trailing empty batch when the row count is an exact multiple of the batch size.
loops = -(-data.shape[0] // numOfTweets)

# Collect one DataFrame per batch and concatenate once at the end, instead of
# growing the frame with DataFrame.append (removed in pandas 2.0).
batch_frames = []
for n in range(loops):
    ids = data.iloc[n * numOfTweets:(n + 1) * numOfTweets, 0].tolist()  # tweet ids for this batch
    df = pd.DataFrame(hate_tweets([], ids))  # records for the tweets still available
    batch_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when there are no ids.
existing_hate = pd.concat(batch_frames) if batch_frames else pd.DataFrame()

In [6]:
# (rows, columns) of the records retrieved from the Twitter lookup.
existing_hate.shape

(590, 15)

## Not all tweets are still available on Twitter. Supplement the removed tweets from the NVivo export file

In [7]:
# Load the NVivo export, which still contains tweets that are no longer on Twitter.
all_hate = pd.read_excel('csv_excel/full_nvivo_export.xlsx', sheet_name='Table1-1')

# Keep only the NVivo rows that were NOT found on Twitter, then align the columns
# with the looked-up frame: add an empty 'Retweeted ID' and remove the NVivo-only
# columns. Chaining with .assign avoids assigning a column on a filtered slice.
all_hate = (
    all_hate[~all_hate['Tweet ID'].isin(existing_hate['Tweet ID'])]
    .assign(**{'Retweeted ID': ''})
    .drop(['Row ID', 'Tweet Type', 'Retweeted By', 'Number of Retweets'], axis=1)
)

# Combine both sources (pd.concat replaces DataFrame.append, removed in pandas 2.0).
existing_hate = pd.concat([existing_hate, all_hate])

# Only include tweets whose id appears in the list of tweets identified as hateful.
existing_hate = existing_hate[existing_hate['Tweet ID'].isin(data.iloc[:, 0].tolist())]

# Persist the combined frame; the 'pickle_files' directory must already exist.
existing_hate.to_pickle('pickle_files/existing_hate.pickle')


In [8]:
# (rows, columns) after adding the NVivo rows and filtering to the hateful-id list.
existing_hate.shape

(845, 15)