Script was modified for our use from the original source: https://github.com/twitterdev/Twitter-API-v2-sample-code/blob/master/Tweet-Lookup/get_tweets_with_bearer_token.py

For this script to work, save your own private Twitter API bearer token in a file named TwitterBearerToken.txt in the working directory; the script reads the token from that file.

In [None]:
import requests
import os
import json
import pandas as pd
import time
import re

# The bearer token is read from the file ./TwitterBearerToken.txt, which must
# contain nothing but your private Twitter API bearer token.
# Surrounding whitespace (e.g. the trailing newline most editors append) is
# stripped so that it does not end up inside the Authorization header.
with open('./TwitterBearerToken.txt', encoding="utf-8") as _token_file:
    bearer_token = _token_file.read().strip()

# Root folder that holds the dataset files referenced by the cells below.
DATASETS_FOLDER = "/Users/slavkoz/OneDrive - Univerza v Ljubljani/Datasets/Offensive language datasets/"

In [None]:
def get_lists_of_ids(lst, n):
    """Yield consecutive slices of ``lst``, each holding at most ``n`` items."""
    offsets = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in offsets)

In [None]:
def create_url(post_ids):
    """Build the Twitter API v2 tweet-lookup URL for the given tweet IDs.

    ``post_ids`` is an iterable of ID strings; they are joined with commas
    into the ``ids`` query parameter (a single ID or up to 100 per request).
    """
    # Only the tweet text is requested. Other selectable tweet fields are:
    # attachments, author_id, context_annotations, conversation_id,
    # created_at, entities, geo, id, in_reply_to_user_id, lang,
    # non_public_metrics, organic_metrics, possibly_sensitive,
    # promoted_metrics, public_metrics, referenced_tweets, source, text,
    # and withheld.
    fields_param = "tweet.fields=text"
    ids_param = "ids=" + ",".join(post_ids)
    query = "&".join((ids_param, fields_param))
    return "https://api.twitter.com/2/tweets?" + query


def bearer_oauth(r):
    """Attach the bearer-token authentication headers to request ``r``.

    Used as the ``auth`` callable for ``requests``: it sets the
    ``Authorization`` and ``User-Agent`` headers and returns the same request.
    """
    auth_headers = (
        ("Authorization", f"Bearer {bearer_token}"),
        ("User-Agent", "v2TweetLookupPython"),
    )
    for header_name, header_value in auth_headers:
        r.headers[header_name] = header_value
    return r


def connect_to_endpoint(url, timeout=30):
    """Send an authenticated GET request to ``url`` and return the parsed JSON.

    Args:
        url: Full request URL (see ``create_url``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection, which would hang a long-running retrieval.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the response status code is not 200; the message
            contains the status code and the response body.
    """
    response = requests.request("GET", url, auth=bearer_oauth, timeout=timeout)
    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {}".format(
                response.status_code, response.text
            )
        )
    return response.json()

def get_tweets(post_ids):
    """Look up the given tweet IDs and return ``[id, text]`` pairs.

    Every run of whitespace in a tweet's text is collapsed to a single space.
    A response without a ``"data"`` key produces an empty list.
    """
    payload = connect_to_endpoint(create_url(post_ids))
    if "data" not in payload:
        return []
    return [
        [entry["id"], re.sub(r'\s+', ' ', entry["text"])]
        for entry in payload["data"]
    ]

def saveDF(results, filename):
    """Write ``results`` (rows of ``[id, text]``) to ``filename`` as CSV.

    The file gets an ``id,text`` header row and no index column.
    """
    column_names = ['id', 'text']
    pd.DataFrame(results, columns=column_names).to_csv(filename, index=False)

def retrieve_tweets(tweet_ids, filename, batch_size=100, pause_seconds=3):
    """Download the tweets for ``tweet_ids`` in batches and save them as CSV.

    Args:
        tweet_ids: List of tweet ID strings to look up.
        filename: Path of the CSV file to write (columns ``id`` and ``text``).
        batch_size: Number of IDs sent per request; the default of 100 is
            the per-request maximum noted in ``create_url``.
        pause_seconds: Seconds to sleep after each request.

    Whatever has been retrieved so far is written to ``filename`` even when a
    request fails part-way through, so a long run is not lost to one error;
    the exception itself still propagates to the caller.
    """
    results = []
    try:
        for tweet_lst in get_lists_of_ids(tweet_ids, batch_size):
            tweets = get_tweets(tweet_lst)
            results.extend(tweets)
            time.sleep(pause_seconds)
            print(f"Retrieved {len(results)} tweets, input {len(tweet_lst)}, output {len(tweets)}")
    finally:
        # Persist partial progress on failure as well as the full result on success.
        saveDF(results, filename)

In [None]:
# DATASET 18
# Read the tweet IDs from the dataset CSV and fetch the matching tweet texts.
dataset_path = os.path.join(DATASETS_FOLDER, '18/18_hatespeechtwitter.csv')
df = pd.read_csv(dataset_path)
# The lookup URL is built by joining strings, so every ID is converted to str.
tweet_ids = [str(tweet_id) for tweet_id in df["tweet_id"].tolist()]

retrieve_tweets(tweet_ids, '18_retrieved_tweets.csv')

In [None]:
# DATASET 20
# Each .tsv file is read line by line, with every line used as one tweet ID
# once its trailing newline is removed. The files are opened with `with` so
# the handles are closed as soon as the IDs are read.
with open(os.path.join(DATASETS_FOLDER, '20_NLP_CSS_2017-master/benevolent_sexist.tsv'), 'r') as benevolent_file:
    benevolents = [re.sub(r'\n$', '', line) for line in benevolent_file]
retrieve_tweets(benevolents, '20_retrieved_benevolent_tweets.csv')

with open(os.path.join(DATASETS_FOLDER, '20_NLP_CSS_2017-master/hostile_sexist.tsv'), 'r') as hostile_file:
    hostiles = [re.sub(r'\n$', '', line) for line in hostile_file]
retrieve_tweets(hostiles, '20_retrieved_hostile_tweets.csv')

In [None]:
# DATASET 29
# Read the tweet IDs from the dataset CSV and fetch the matching tweet texts.
dataset_path = os.path.join(DATASETS_FOLDER, '29/29_NAACL_SRW_2016.csv')
df = pd.read_csv(dataset_path)
# The lookup URL is built by joining strings, so every ID is converted to str.
tweet_ids = [str(tweet_id) for tweet_id in df["tweet_id"].tolist()]

retrieve_tweets(tweet_ids, '29_retrieved_tweets.csv')

In [None]:
# DATASET 30
# The file is tab-separated despite its .csv extension. It is opened with
# `with` so the handle is closed once the rows have been read.
with open(os.path.join(DATASETS_FOLDER, '30/30_NLP_CSS_2016.csv'), 'r') as dataset_file:
    rows = [re.sub(r'\n$', '', line).split('\t') for line in dataset_file]
# Keep the first two columns (tweet_id, expert) and drop the header row.
data = [[row[0], row[1]] for row in rows][1:]

tweet_ids = [record[0] for record in data]
retrieve_tweets(tweet_ids, '30_retrieved_tweets.csv')