# Twitter API v2 scraper utilities

First, you need to get an API Bearer Token from Twitter.

How to get a Bearer Token:
 1. Apply for a Twitter developer account at https://developer.twitter.com/ (create a Project & App).
 2. In the Developer Portal go to your App -> Keys and tokens -> Generate Bearer Token (OAuth 2.0 Bearer Token).
 3. Create a `.env` file in the current directory and put your token in it, in this format:

 `TWITTER_BEARER_TOKEN=[YOUR BEARER TOKEN]`

In [None]:
import os
import requests
import pandas as pd
from typing import Optional
from dotenv import load_dotenv
import urllib.parse
import time
import random

load_dotenv()

def _get_bearer_token_or_raise(token: Optional[str] = None) -> str:
    """Resolve the bearer token to use for API calls.

    An explicitly passed, non-empty ``token`` wins; otherwise the
    ``TWITTER_BEARER_TOKEN`` environment variable is consulted. Whichever
    value is found is URL-decoded before being returned, in case it was
    pasted in URL-encoded form.

    Args:
      token: optional bearer token supplied by the caller

    Returns:
      The URL-decoded bearer token.

    Raises:
      RuntimeError: if neither the argument nor the environment provides a token.
    """
    # `or` short-circuits, so the environment is only read when no argument was given.
    candidate = token or os.getenv("TWITTER_BEARER_TOKEN")
    if not candidate:
        raise RuntimeError(
            "TWITTER_BEARER_TOKEN not found. Generate one at https://developer.twitter.com/ "
            "and set it with `setx TWITTER_BEARER_TOKEN \"<token>\"` (PowerShell) or export in your shell. "
            "If you keep it in a .env file at project root, install `python-dotenv` and this cell will load it automatically."
        )
    return urllib.parse.unquote(candidate)


def _sleep_until_reset_or_backoff(resp: requests.Response, attempt: int):
    """Pause after a 429 response, preferring the server-provided reset time.

    If the response carries an ``x-rate-limit-reset`` header holding an integer
    epoch timestamp, sleep until that moment plus a 5-second buffer. If the
    header is missing or cannot be parsed as an integer, fall back to
    exponential backoff with random jitter.

    Args:
      resp: the Response object that returned 429
      attempt: current retry attempt (0-based); only used by the fallback backoff
    """
    reset = resp.headers.get("x-rate-limit-reset")
    if reset:
        # Only the header parse is guarded: a malformed value should fall back to
        # backoff, but unrelated errors must not be silently swallowed.
        try:
            reset_ts = int(reset)
        except (TypeError, ValueError):
            reset_ts = None
        if reset_ts is not None:
            now = int(time.time())
            # Never sleep a negative amount; add a small buffer past the reset.
            wait = max(reset_ts - now, 0) + 5
            print(f"Rate limit reached. Sleeping until reset in {wait} seconds...")
            time.sleep(wait)
            return
    # Fallback: exponential backoff with jitter (exponent capped at 6, i.e. base <= 64s)
    base = 2 ** min(attempt, 6)
    jitter = random.uniform(0.5, 1.5)
    wait = base * jitter
    print(f"Rate limit (no reset header). Backing off for {wait:.1f} seconds (attempt {attempt})")
    time.sleep(wait)


def search_tweets_v2(query: str, count: int = 100, bearer_token: Optional[str] = None, lang: Optional[str] = None) -> pd.DataFrame:
    """
    Search recent tweets using Twitter API v2 and save results to CSV.

    Results are fetched page by page until ``count`` tweets have been collected
    or the API reports no further pages. A 429 (Too Many Requests) response is
    handled by sleeping until the reset time given in the response headers, or
    by exponential backoff with jitter; network errors are retried with
    exponential backoff (capped at 60 seconds).

    The collected tweets are written to ``<query>-<count // 1000>k-tweets.csv``
    (spaces and slashes in the query replaced by underscores) in the current
    working directory.

    Args:
      query: search query string (e.g. "indonesia -is:retweet")
      count: number of tweets to fetch
      bearer_token: optional bearer token string; if omitted reads env var or .env
      lang: optional language tag to restrict results (e.g. 'id' for Indonesian)

    Returns:
      pandas.DataFrame with columns ['Datetime', 'Text']

    Raises:
      RuntimeError: if no bearer token is available or the API answers 401.
      requests.HTTPError: for other 4xx/5xx responses.
    """
    token = _get_bearer_token_or_raise(bearer_token)
    headers = {"Authorization": f"Bearer {token}"}
    url = "https://api.twitter.com/2/tweets/search/recent"

    query_with_lang = f"{query} lang:{lang}" if lang else query

    # The recent-search endpoint only accepts a per-page size of 10..100; a larger
    # value is rejected as an invalid request. Clamp to that range, and do not
    # ask for a full page when fewer tweets are wanted overall.
    page_size = max(10, min(100, count))

    params = {
        "query": query_with_lang,
        "max_results": page_size,
        "tweet.fields": "created_at,text,lang",
    }

    tweets = []
    next_token = None
    attempt = 0

    while len(tweets) < count:
        if next_token:
            params["next_token"] = next_token
        try:
            resp = requests.get(url, headers=headers, params=params, timeout=30)
        except requests.RequestException as e:
            # network error; backoff and retry
            print(f"Network error: {e}; backing off...")
            time.sleep(min(60, 2 ** attempt))
            attempt += 1
            continue

        if resp.status_code == 200:
            # A successful page resets the retry counter used for backoff.
            attempt = 0
            payload = resp.json()
            for t in payload.get("data", []):
                # Double-check language if provided (defensive)
                if lang and t.get("lang") != lang:
                    continue
                tweets.append([t.get("created_at"), t.get("text")])
                if len(tweets) >= count:
                    break
            meta = payload.get("meta", {})
            next_token = meta.get("next_token")
            if not next_token:
                # No further pages: return what was collected, even if fewer than count.
                break
            # small pause between pages to avoid hitting very short-timescale limits
            time.sleep(0.5)
        elif resp.status_code == 401:
            raise RuntimeError("Unauthorized: check your TWITTER_BEARER_TOKEN value")
        elif resp.status_code == 429:
            # Too Many Requests: check headers and sleep until reset or backoff
            _sleep_until_reset_or_backoff(resp, attempt)
            attempt += 1
            continue
        else:
            # raise HTTPError for other status codes so user sees the cause
            resp.raise_for_status()

    df = pd.DataFrame(tweets, columns=["Datetime", "Text"])
    # Save CSV (filename safe-ish; replace spaces with underscores)
    safe_query = query.replace(" ", "_").replace('/', '_')
    filename = f"{safe_query}-{int(count/1000)}k-tweets.csv"
    df.to_csv(filename, index=False)
    print(f"Wrote {len(df)} rows to {filename}")
    return df


def user_tweets_v2(username: str, count: int = 100, bearer_token: Optional[str] = None, lang: Optional[str] = None) -> pd.DataFrame:
    """
    Fetch recent tweets posted by ``username`` using Twitter API v2 and save them to CSV.

    The username is first resolved to a user id via users/by/username/:username;
    the timeline is then paged through /users/:id/tweets until ``count`` tweets
    are collected or no further page is reported. Inside the paging loop, 429
    responses and network errors are retried with the same backoff approach as
    the search function. The result is written to
    ``<username>-<count // 1000>k-tweets.csv``.

    Args:
      username: account handle to look up
      count: number of tweets to fetch
      bearer_token: optional bearer token string; if omitted reads env var or .env
      lang: optional language tag; tweets whose 'lang' field differs are skipped

    Returns:
      pandas.DataFrame with columns ['Datetime', 'Text']
    """
    auth_headers = {"Authorization": f"Bearer {_get_bearer_token_or_raise(bearer_token)}"}

    # Step 1: resolve the handle to a numeric user id.
    lookup = requests.get(
        f"https://api.twitter.com/2/users/by/username/{username}",
        headers=auth_headers,
        timeout=30,
    )
    if lookup.status_code != 200:
        lookup.raise_for_status()
    user_id = lookup.json().get("data", {}).get("id")
    if not user_id:
        raise RuntimeError("User id not found for username: %s" % username)

    # Step 2: page through the user's timeline.
    timeline_url = f"https://api.twitter.com/2/users/{user_id}/tweets"
    request_params = {"max_results": 100, "tweet.fields": "created_at,text,lang"}

    rows = []
    page_token = None
    retries = 0
    while len(rows) < count:
        if page_token:
            request_params["pagination_token"] = page_token

        try:
            page = requests.get(timeline_url, headers=auth_headers, params=request_params, timeout=30)
        except requests.RequestException as err:
            print(f"Network error: {err}; backing off...")
            time.sleep(min(60, 2 ** retries))
            retries += 1
            continue

        status = page.status_code
        if status == 429:
            # Rate limited: wait as instructed by the headers (or back off), then retry.
            _sleep_until_reset_or_backoff(page, retries)
            retries += 1
            continue
        if status == 401:
            raise RuntimeError("Unauthorized: check your TWITTER_BEARER_TOKEN value")
        if status != 200:
            # Surfaces 4xx/5xx as HTTPError; any other status just retries the loop.
            page.raise_for_status()
            continue

        # Successful page: reset the retry counter and harvest the tweets.
        retries = 0
        body = page.json()
        for item in body.get("data", []):
            if not lang or item.get("lang") == lang:
                rows.append([item.get("created_at"), item.get("text")])
                if len(rows) >= count:
                    break

        page_token = body.get("meta", {}).get("next_token")
        if not page_token:
            break
        time.sleep(0.5)

    df = pd.DataFrame(rows, columns=["Datetime", "Text"])
    filename = f"{username}-{int(count/1000)}k-tweets.csv"
    df.to_csv(filename, index=False)
    print(f"Wrote {len(df)} rows to {filename}")
    return df


# Script entry point: only prints a usage hint; nothing is fetched or written.
if __name__ == "__main__":
    print(
        "Loaded Twitter API v2 utilities. Example usage:\n"
        "  df = search_tweets_v2('indonesia -is:retweet', 10, lang='id')"
    )

In [None]:
# Example run: edit the query, the number of tweets, and the language code below.
# Besides returning the DataFrame, search_tweets_v2 also writes the results to a
# CSV file in the current working directory.
df = search_tweets_v2('indonesia -is:retweet', 30000, lang='id')
# Preview the first rows of the result.
df.head()