This notebook collects tweets posted between January 2019 and June 2022 and saves them to a CSV file.

In [169]:
from pandas import json_normalize
import pip._vendor.requests 
import os
import json
from dotenv import load_dotenv
import csv
import pandas as pd
from datetime import datetime
from operator import itemgetter
import numpy as np
from time import sleep

In [170]:
# Endpoint used for every search request in this notebook.
search_url = "https://api.twitter.com/2/tweets/search/all"

# Load variables from a .env file (via python-dotenv) into the environment,
# then read the bearer token from it. The value is None when the variable
# "BEARER-TOKEN" is not set.
load_dotenv()
bearer_token = os.getenv("BEARER-TOKEN")

def bearer_oauth(r):
    """
    Auth hook that attaches the bearer token to an outgoing request.

    Sets the ``Authorization`` header of ``r`` to ``Bearer <token>``, using the
    module-level ``bearer_token``, and returns the same request object.
    """
    auth_value = "Bearer {}".format(bearer_token)
    r.headers["Authorization"] = auth_value
    return r




In [171]:
def connect_to_endpoint(url, params):
    """
    Send an authenticated GET request and return the decoded JSON body.

    Parameters
    ----------
    url : str
        Endpoint to query.
    params : dict
        Query-string parameters forwarded to the request.

    Returns
    -------
    The JSON-decoded response body.

    Raises
    ------
    Exception
        Raised with ``(status_code, response_text)`` as arguments when the
        response status is anything other than 200.
    """
    # Honour the caller-supplied ``url``; previously the argument was ignored
    # and the module-level ``search_url`` was always used.
    response = pip._vendor.requests.request("GET", url, auth=bearer_oauth, params=params)
    # Fixed pause after every request, successful or not (presumably to stay
    # under the API rate limit -- confirm against the account's limits).
    sleep(3)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [172]:
def getKeywordQueryString():
    """
    Build the keyword part of the search query from ``finalAsianKeywords.csv``.

    Reads the ``keyword`` column of the CSV and joins its values with
    ``" OR "``, in file order.

    Returns
    -------
    str
        ``"kw1 OR kw2 OR ..."``; an empty string when the column has no values.
    """
    keyWordData = pd.read_csv('finalAsianKeywords.csv')
    # Drop blank cells and coerce to str so a stray NaN/number cannot break
    # the string concatenation.
    words = keyWordData['keyword'].dropna().astype(str).tolist()
    # Join by position rather than comparing each word to the last one by
    # value: a keyword equal to the final entry that also appeared earlier
    # used to lose its " OR " separator and corrupt the query.
    return " OR ".join(words)




In [173]:



def query_params(keywordQueryString, startTime, endTime, maxResults, nextToken):
    """
    Assemble the query-string parameters for one search request.

    The keyword expression is wrapped in parentheses and combined with the
    fixed filters ``lang:en -is:retweet place_country:gb``. The remaining
    arguments are passed through unchanged under the keys ``start_time``,
    ``end_time``, ``max_results`` and ``next_token``; the requested tweet
    fields are always ``text,created_at,id``.
    """
    params = {}
    params['query'] = f'({keywordQueryString}) lang:en -is:retweet place_country:gb'
    params['start_time'] = startTime
    params['end_time'] = endTime
    params['max_results'] = maxResults
    params['tweet.fields'] = 'text,created_at,id'
    params['next_token'] = nextToken
    return params
   

In [174]:
def getTweetsToCsv(keywordQueryString, startTime, endTime, maxResults, csvString):
    """
    Page through the search results for one time window and append them to a CSV.

    Requests are repeated, following ``meta.next_token``, until the response no
    longer carries a token. Each tweet becomes one row ``[created_at, id, text]``
    with newlines in the text replaced by spaces. Rows are appended to
    ``csvString`` (no header, no index) one page at a time, so tweets already
    fetched are kept on disk if a later request raises.

    Parameters
    ----------
    keywordQueryString : str
        Keyword expression forwarded to ``query_params``.
    startTime, endTime : str
        Window bounds forwarded to ``query_params``.
    maxResults : int
        Page size forwarded to ``query_params``.
    csvString : str
        Path of the CSV file to append to.

    Returns
    -------
    str
        ``created_at`` of the last tweet processed, or ``""`` when the window
        contained no tweets. NOTE(review): despite the "most recent" wording,
        this is the last row returned; if the API orders results newest-first
        it is the *oldest* tweet in the window -- confirm against the API docs.
    """
    next_token = {}  # empty on the first request
    mostRecentDate = ""

    while True:
        json_response = connect_to_endpoint(
            search_url,
            query_params(keywordQueryString, startTime, endTime, maxResults, next_token),
        )

        page_rows = []
        for tweet in json_response.get('data', []):
            date = tweet['created_at']
            tweetText = tweet['text'].replace('\n', ' ')
            page_rows.append([date, tweet['id'], tweetText])
            mostRecentDate = date

        # Flush this page immediately instead of holding every page in memory
        # until pagination ends; skip the write entirely for an empty page.
        if page_rows:
            pd.DataFrame(page_rows).to_csv(
                csvString, index=False, header=False, mode='a', encoding='utf-8'
            )

        # A response without a next_token (or without 'meta' at all) is the
        # last page.
        next_token = json_response.get('meta', {}).get('next_token')
        if not next_token:
            break

    print("most recent date: " + mostRecentDate)
    return mostRecentDate



In [175]:



def twitterScraperLoopUntilComplete(keywordQueryString, startTime, endTime, maxResults, csvString):
    """
    Collect all tweets in ``[startTime, endTime]`` into ``csvString``.

    Writes the header row ``date,id,tweet`` if the file is new or empty, then
    calls ``getTweetsToCsv`` repeatedly, each time moving the end of the
    window back to the date it returned. The loop stops when that date falls
    on the same calendar day as ``startTime`` (first 10 characters match) or
    when a call finds no tweets at all.

    Parameters
    ----------
    keywordQueryString : str
        Keyword expression for the search query.
    startTime, endTime : str
        Window bounds; only the first 10 characters of ``startTime`` are used
        for the stop test.
    maxResults : int
        Page size for each request.
    csvString : str
        Path of the output CSV (opened in append mode).
    """
    # Only write the header once: re-running against an existing file used to
    # append a second header row in the middle of the data.
    if not os.path.exists(csvString) or os.path.getsize(csvString) == 0:
        with open(csvString, "a", newline="", encoding='utf-8') as csvFile:
            csv.writer(csvFile).writerow(['date', 'id', 'tweet'])

    end = endTime
    shortFormatStartString = startTime[0:10]
    while True:
        lastDate = getTweetsToCsv(keywordQueryString, startTime, end, maxResults, csvString)
        if not lastDate:
            # Nothing left in the window. Without this guard the empty string
            # was sent as the next end time and the loop could never finish
            # unless a tweet happened to exist on the start day.
            print("No more tweets in range")
            break
        end = lastDate
        if shortFormatStartString == end[0:10]:
            print("Reached end time")
            break
        
    


In [176]:
# Keyword expression ("kw1 OR kw2 OR ...") built from the keyword CSV.
queryStringOfKeywords = getKeywordQueryString()
# Collection window, as UTC timestamps (trailing "Z").
startTime = '2019-01-01T00:00:00Z'
endTime = '2022-06-01T00:00:00Z'
# Page size sent as max_results on every request.
maxResults = 500
# File the scraper appends to; also read by the duplicate check further down.
outputCsv = "twitterDataOutput.csv"


# Run the full collection. This issues network requests and appends to outputCsv.
twitterScraperLoopUntilComplete(queryStringOfKeywords, startTime, endTime, maxResults, outputCsv)

most recent date: 2019-01-01T00:14:27.000Z
Reached end time


The following cell checks that no tweet ID appears more than once in the output CSV.

In [177]:
def confirmNoDuplicateTweets(csvFile):
    """
    Print whether the ``id`` column of the CSV at ``csvFile`` has repeats.

    The file is read with pandas and the number of distinct ids is compared
    with the total number of rows; one of two fixed messages is printed.
    Nothing is returned.
    """
    ids = pd.read_csv(csvFile)['id'].tolist()
    distinct_count = len(set(ids))
    if distinct_count != len(ids):
        print("There are duplicates in the data")
        return
    print("There are no duplicates in the data")

# Check the scraper's output file for repeated tweet ids.
confirmNoDuplicateTweets(outputCsv)


There are no duplicates in the data
