In [1]:
import ast
import collections
import copy
import html
import os
import time
import urllib.parse
from datetime import datetime, timezone
from time import sleep

import pandas as pd
import requests
from dateutil import parser

In [2]:
class HTTPError(Exception):
    """Signals that an HTTP request came back with a non-200 status code.

    The response object is passed as the single constructor argument, so
    handlers can recover it from ``args`` to inspect the status and headers.
    """


def make_headers(bearer_token):
    """Return the HTTP headers that authenticate a request with ``bearer_token``."""
    return {'Authorization': 'Bearer {}'.format(bearer_token)}


def request(url, headers, params, timeout=2):
    """Send a GET request and return the response, raising on any non-200 status.

    Args:
        url: Endpoint URL.
        headers: HTTP headers to send with the request.
        params: Query-string parameters.
        timeout: Seconds to wait for the server before giving up. Defaults to
            2, the value that was previously hard-coded, so existing callers
            are unaffected.

    Returns:
        The response object when the status code is 200.

    Raises:
        HTTPError: For any non-200 status. The response is the exception's
            sole argument so callers can inspect its status code and headers.
    """
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    if response.status_code != 200:
        # Report here so every caller gets the diagnostic, then let the caller
        # decide whether the status is recoverable (e.g. 429).
        print(f"Request returned an error: {response.status_code} {response.text}")
        raise HTTPError(response)
    return response


def get_users(user_names, headers):
    """Look up the user objects for a collection of usernames.

    Args:
        user_names: Iterable of username strings; they are sent comma-joined
            in a single request.
        headers: Authenticated request headers.

    Returns:
        The ``data`` entry of the JSON response.

    Raises:
        KeyError: If the response carries no ``data`` entry.
        HTTPError: Propagated from ``request`` on a non-200 status.
    """
    wanted_fields = ('name', 'id', 'created_at', 'location', 'public_metrics')
    lookup_params = {}
    lookup_params['usernames'] = ','.join(user_names)
    lookup_params['user.fields'] = ','.join(wanted_fields)
    payload = request("https://api.twitter.com/2/users/by", headers, lookup_params).json()
    return payload['data']


def flatten(ls):
    """Return a flattened deep copy of a list of tweet dicts.

    In each copied dict the ``text`` is HTML-unescaped, ``created_at`` is
    parsed into a datetime, and the entries of ``public_metrics`` are hoisted
    to the top level (the nested ``public_metrics`` key is then removed).
    The input list and its dicts are left untouched.
    """
    records = copy.deepcopy(ls)
    for record in records:
        record['text'] = html.unescape(record['text'])
        record['created_at'] = parser.parse(record['created_at'])
        # Promote each metric to a top-level key, then drop the nested dict.
        for metric_name, metric_value in record['public_metrics'].items():
            record[metric_name] = metric_value
        del record['public_metrics']
    return records


def wait_if_exceeded(response, wait=False):
    """Sleep until the rate-limit window resets when the quota is nearly used up.

    Args:
        response: Object whose ``headers`` mapping carries
            ``x-rate-limit-remaining`` and ``x-rate-limit-reset`` (the latter
            is read as a Unix timestamp in seconds).
        wait: Force the wait regardless of the remaining quota; callers use
            this after receiving a 429.

    Returns:
        None. Returns immediately when ``wait`` is false and at least 100
        requests remain.
    """
    # Test ``wait`` first so a forced wait does not depend on the
    # remaining-quota header being present.
    if not (wait or int(response.headers['x-rate-limit-remaining']) < 100):
        return
    reset_unix_timestamp = int(response.headers['x-rate-limit-reset'])
    # Work in plain seconds: a timedelta can neither be formatted with ``.2f``
    # nor passed to sleep(). Clamp at zero because the reset moment may already
    # have passed and sleep() rejects negative values.
    remaining = max(reset_unix_timestamp - time.time(), 0.0)
    print(f"Waiting for {remaining:.2f} seconds...")
    sleep(remaining)
    return


def get_tweets_timeline(user_names, start_time, end_time, bearer_token, max_results=5):
    """Collect the tweets on the timelines of the given users.

    Resolves ``user_names`` to user objects, then pages through each user's
    ``/2/users/{id}/tweets`` endpoint, excluding retweets and replies.
    https://developer.twitter.com/en/docs/twitter-api/tweets/timelines/api-reference/get-users-id-tweets

    Args:
        user_names: Iterable of usernames to look up.
        start_time: Value sent as the ``start_time`` query parameter.
        end_time: Value sent as the ``end_time`` query parameter.
        bearer_token: Token used to build the Authorization header.
        max_results: Value sent as the ``max_results`` query parameter on
            every page request; it does not cap the total collected.

    Returns:
        ``(tweets, users)``: the accumulated tweet dicts and the looked-up
        user objects, or ``(None, None)`` when the user lookup fails.
        Collection is best-effort: if a request fails, the tweets gathered so
        far are returned. KeyboardInterrupt is not suppressed.
    """
    headers = make_headers(bearer_token)
    try:
        users = get_users(user_names, headers)
    except Exception as e:
        print(e)
        return None, None
    tweets = []
    tweet_fields = ['lang', 'author_id', 'created_at', 'geo', 'public_metrics', 'source', 'context_annotations']
    base_params = {
        'tweet.fields': ','.join(tweet_fields),
        'max_results': str(max_results),
        'start_time': start_time,
        'end_time': end_time,
        'exclude': 'retweets,replies'
    }
    try:
        for user in users:
            url = f"https://api.twitter.com/2/users/{user['id']}/tweets"
            next_page_token = None
            while True:
                # Build the parameters per request so a pagination token can
                # never leak into another user's first request.
                request_params = dict(base_params)
                if next_page_token:
                    request_params['pagination_token'] = next_page_token
                try:
                    response = request(url, headers, request_params)
                except HTTPError as e:
                    r, = e.args
                    if r.status_code == 429:
                        # Rate limited: wait for the window to reset, then
                        # retry the same page instead of falling through with
                        # no (or a stale) response.
                        wait_if_exceeded(r, wait=True)
                        continue
                    # request() has already printed the status and body.
                    return tweets, users
                except Exception as e:
                    print("Something went wrong when requesting.")
                    print(e)
                    return tweets, users
                results = response.json()
                # A page may legitimately carry no 'data' entry.
                tweets.extend(results.get('data') or [])
                next_page_token = results.get('meta', {}).get('next_token')
                if not next_page_token:
                    break
                wait_if_exceeded(response)
    except Exception as e:
        # Keep the best-effort contract explicitly rather than through a
        # ``return`` inside ``finally``, which also hid KeyboardInterrupt.
        print("Stopped early; returning the tweets collected so far.")
        print(e)
    return tweets, users


def get_tweets_search(query, start_time, end_time, bearer_token, max_results=10):
    """Collect tweets matching ``query`` from the recent-search endpoint.

    https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-recent
    https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query

    Args:
        query: Search query string.
        start_time: Value sent as the ``start_time`` query parameter.
        end_time: Value sent as the ``end_time`` query parameter.
        bearer_token: Token used to build the Authorization header.
        max_results: Number of tweets to collect before paging stops. It is
            NOT forwarded to the API (whose ``max_results`` field is a
            per-page size, not a total), so whole pages are appended and the
            result may exceed this number by part of a page.

    Returns:
        ``(tweets, users)``: the accumulated tweet dicts and the user objects
        from each page's ``includes`` (the ``author_id`` expansion). Users are
        appended page by page without de-duplication. Collection is
        best-effort: if a request fails, what was gathered so far is returned.
        KeyboardInterrupt is not suppressed.
    """
    headers = make_headers(bearer_token)
    tweets = []
    users = []
    tweet_fields = ['lang', 'id', 'created_at', 'geo', 'public_metrics', 'source', 'context_annotations']
    user_fields = ['name', 'public_metrics']
    base_params = {
        'query': query,
        'tweet.fields': ','.join(tweet_fields),
        'expansions': "author_id",
        'user.fields': ','.join(user_fields),
        'start_time': start_time,
        'end_time': end_time,
    }
    url = "https://api.twitter.com/2/tweets/search/recent"
    next_page_token = None
    try:
        while len(tweets) < max_results:
            # Build the parameters per request; the page token is always sent
            # under the single key 'next_token'.
            request_params = dict(base_params)
            if next_page_token:
                request_params['next_token'] = next_page_token
            try:
                response = request(url, headers, request_params)
            except HTTPError as e:
                r, = e.args
                if r.status_code == 429:
                    # Rate limited: wait for the window to reset, then retry
                    # the same page instead of falling through with no (or a
                    # stale) response.
                    wait_if_exceeded(r, wait=True)
                    continue
                # request() has already printed the status and body.
                break
            except Exception as e:
                print("Something went wrong when requesting.")
                print(e)
                break
            results = response.json()
            # Pages may lack 'data' or 'includes'; treat both as empty. Users
            # are collected on every page, including the last (or only) one.
            tweets.extend(results.get('data') or [])
            users.extend((results.get('includes') or {}).get('users') or [])
            next_page_token = results.get('meta', {}).get('next_token')
            if not next_page_token:
                break
            wait_if_exceeded(response)
    except Exception as e:
        # Keep the best-effort contract explicitly rather than through a
        # ``return`` inside ``finally``, which also hid KeyboardInterrupt.
        print("Stopped early; returning the tweets collected so far.")
        print(e)
    return tweets, users  # users contain user information

In [3]:
# Apply for API access at
# https://developer.twitter.com/en/products/twitter-api/academic-research/application-info
# and export the token as the TWITTER_BEARER_TOKEN environment variable.
# Never paste the token itself into the notebook.
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN", "")
if not bearer_token:
    print("TWITTER_BEARER_TOKEN is not set; Twitter API requests will not authenticate.")

In [4]:
# Fetch the timeline tweets of the listed accounts for the first days of January 2021.
user_names = ['discord']
tweets_timeline, users_timeline = get_tweets_timeline(
    user_names,
    start_time='2021-01-01T00:00:00Z',
    end_time='2021-01-10T00:00:00Z',
    bearer_token=bearer_token,
    max_results=100,
)

In [5]:
# Tabulate the timeline tweets, one row per tweet. get_tweets_timeline returns
# None when the user lookup fails, so fall back to an empty list instead of
# crashing inside flatten.
pd.DataFrame(flatten(tweets_timeline or []))

Unnamed: 0,source,author_id,created_at,lang,id,context_annotations,text,retweet_count,reply_count,like_count,quote_count
0,Agorapulse app,3065618342,2021-01-08 19:13:06+00:00,en,1347622377056776197,"[{'domain': {'id': '47', 'name': 'Brand', 'des...",the gangs all here https://t.co/HA7Qt3vHm4,2334,1192,58353,187
1,Agorapulse app,3065618342,2021-01-06 19:00:13+00:00,en,1346894357320921092,"[{'domain': {'id': '47', 'name': 'Brand', 'des...",tip of the day: click on text channel and pres...,352,289,5660,31
2,Agorapulse app,3065618342,2021-01-04 19:26:05+00:00,en,1346176089169809410,"[{'domain': {'id': '47', 'name': 'Brand', 'des...",maybe this is the year some of you finally lea...,8332,4600,133163,2169
3,Twitter Web App,3065618342,2021-01-02 19:07:48+00:00,en,1345446715168444417,"[{'domain': {'id': '47', 'name': 'Brand', 'des...",some1 from our art team made this and i'm just...,613,610,24681,52


In [6]:
# The special operators (lang:, is:, ...) cannot make up a query on their own,
# so the keyword 'a' is added; it requires 'a' to occur in the tweet.
tweets_search, users_search = get_tweets_search(
    query="a lang:en is:verified -is:retweet -is:reply",
    start_time='2021-09-22T00:00:00Z',
    end_time='2021-09-28T00:00:00Z',
    bearer_token=bearer_token,
    max_results=10,
)

In [7]:
# Tabulate the search results, one row per tweet with its public metrics as columns.
pd.DataFrame(data=flatten(tweets_search))

Unnamed: 0,source,text,created_at,lang,context_annotations,id,author_id,retweet_count,reply_count,like_count,quote_count
0,Twitter for Advertisers.,Find out how your current iPhone can help you ...,2021-09-27 23:59:58+00:00,en,"[{'domain': {'id': '47', 'name': 'Brand', 'des...",1442640186530168837,380749300,0,0,0,0
1,Twitter for iPhone,I promise https://t.co/SKa5mfqL8w is a pretty ...,2021-09-27 23:59:57+00:00,en,,1442640186060443649,962712250992689152,8,2,26,0
2,Twitter for Android,"Even in the middle of a pandemic, @GovAndyBesh...",2021-09-27 23:59:57+00:00,en,"[{'domain': {'id': '45', 'name': 'Brand Vertic...",1442640184248520705,2249451608,1,0,5,0
3,SocialFlow,Norfolk's police chief has gathered crime data...,2021-09-27 23:59:57+00:00,en,,1442640183615176709,16259594,3,5,6,0
4,IFTTT,New from JSOnline and PackersNews: Matt LaFleu...,2021-09-27 23:59:56+00:00,en,"[{'domain': {'id': '3', 'name': 'TV Shows', 'd...",1442640180364648448,62173133,0,0,5,0
5,Twitter for iPhone,I need a new word to call myself instead of “b...,2021-09-27 23:59:55+00:00,en,,1442640177474715651,747596934941126656,1,71,111,2
6,SocialFlow,With its big butterscotch and chocolate flavor...,2021-09-27 23:59:55+00:00,en,"[{'domain': {'id': '65', 'name': 'Interests an...",1442640174891028488,25374040,2,0,2,0
7,Echobox,ICYMI: The paintings of Dutch master Rembrandt...,2021-09-27 23:59:55+00:00,en,"[{'domain': {'id': '30', 'name': 'Entities [En...",1442640174131863552,71594919,2,3,4,1
8,Echobox,Travie McCoy details canceling tours due to he...,2021-09-27 23:59:52+00:00,en,"[{'domain': {'id': '10', 'name': 'Person', 'de...",1442640161590939653,24036264,4,0,25,0
9,Echobox,"A useful analogy — we have the right to drive,...",2021-09-27 23:59:52+00:00,en,"[{'domain': {'id': '123', 'name': 'Ongoing New...",1442640161381158915,224495471,13,0,29,2


In [8]:
# df = pd.read_csv("discord.csv")
# df['context_annotations'] = df['context_annotations'].apply(lambda x: ast.literal_eval(x))