# GitHub Repository Intelligence via Thordata Web Scraper API

This notebook demonstrates how to use **Thordata's Web Scraper API** to collect
GitHub repository intelligence (stars, language, issues, etc.) and load it into
a Pandas DataFrame for analysis.

We support two modes:

- **Live mode** (`USE_LIVE_THORDATA = True`): calls Thordata Web Scraper API and
  consumes credits.
- **Offline mode** (`USE_LIVE_THORDATA = False`): loads previously cached JSON
  data from `data/` without consuming credits.

In [1]:
# Optional: run this cell once if you don't have these packages installed.
# If you already installed them via requirements.txt, you can skip this cell.

#%pip install pandas requests

In [2]:
import json
import os
import time
from pathlib import Path
from typing import Any

import pandas as pd
import requests
from dotenv import load_dotenv
from IPython.display import display
from thordata import ThordataClient

load_dotenv()

True

In [3]:
# Resolve project root from this notebook location:
# notebooks/devtools -> parent = notebooks -> parent = repo root
ROOT_DIR = Path.cwd().parents[1]

# Toggle between live API calls and local cached JSON data.
# Set to True only when you want to consume Thordata credits.
USE_LIVE_THORDATA = False

# Local cache path for GitHub scraper JSON (under repo root)
CACHE_DIR = ROOT_DIR / "data"
GITHUB_JSON_CACHE_PATH = CACHE_DIR / "github_repo_intel_sample.json"

print("CWD:", os.getcwd())
print("ROOT_DIR:", ROOT_DIR)
print("USE_LIVE_THORDATA:", USE_LIVE_THORDATA)
print("GITHUB_JSON_CACHE_PATH:", GITHUB_JSON_CACHE_PATH)

CWD: D:\Thordata_Work\thordata-cookbook\notebooks\devtools
ROOT_DIR: D:\Thordata_Work\thordata-cookbook
USE_LIVE_THORDATA: False
GITHUB_JSON_CACHE_PATH: D:\Thordata_Work\thordata-cookbook\data\github_repo_intel_sample.json


In [4]:
# Thordata credentials
SCRAPER_TOKEN = os.getenv("THORDATA_SCRAPER_TOKEN")
PUBLIC_TOKEN = os.getenv("THORDATA_PUBLIC_TOKEN")
PUBLIC_KEY = os.getenv("THORDATA_PUBLIC_KEY")

if not SCRAPER_TOKEN:
    raise ValueError("Please set THORDATA_* variables in your .env file.")

# Values copied from Thordata Dashboard → Web Scraper Store → GitHub tool
GITHUB_SPIDER_ID = "github_repository_by-repo-url"
GITHUB_SPIDER_NAME = "github.com"

# Target repository (same parameter name as in dashboard: repo_url)
GITHUB_TARGET_PARAMS = {
    "repo_url": "https://github.com/TheAlgorithms/Python",
}

# Build a slug from the repo URL, e.g. "TheAlgorithms_Python"
repo_slug = "_".join(
    GITHUB_TARGET_PARAMS["repo_url"].rstrip("/").split("/")[-2:]
)

# Dynamic output file name, e.g. github_repo_intel_TheAlgorithms_Python_20251127_163045.csv
timestamp = time.strftime("%Y%m%d_%H%M%S")
OUTPUT_FILE_NAME = f"github_repo_intel_{repo_slug}_{timestamp}"
OUTPUT_FILE_NAME

'github_repo_intel_TheAlgorithms_Python_20251128_101402'

In [5]:
client = ThordataClient(
    scraper_token=SCRAPER_TOKEN,
    public_token=PUBLIC_TOKEN,
    public_key=PUBLIC_KEY,
)

client

<thordata.client.ThordataClient at 0x28c2ab11400>

In [6]:
def run_github_repo_task(
    spider_id: str,
    spider_name: str,
    repo_params: dict[str, Any],
    file_name: str,
    poll_interval: int = 10,
    max_wait_minutes: int = 6,
) -> str:
    """
    Run a single GitHub repository scraper task and return the download URL.

    Args:
        spider_id: Spider ID from the dashboard.
        spider_name: Spider name from the dashboard.
        repo_params: Parameters for the GitHub spider (e.g. {"repo_url": "..."}).
        file_name: Desired output file name.
        poll_interval: Seconds between status checks.
        max_wait_minutes: Maximum minutes to wait before timing out.

    Returns:
        str: Download URL of the finished task.

    Raises:
        RuntimeError: If the task fails or exceeds the timeout.
    """
    max_wait_seconds = max_wait_minutes * 60

    print(f"Creating task for: {repo_params.get('repo_url')}")
    task_id = client.create_scraper_task(
        file_name=file_name,
        spider_id=spider_id,
        spider_name=spider_name,
        individual_params=repo_params,
    )
    print(f"Task created. ID = {task_id}")

    elapsed = 0
    status = "Unknown"
    while elapsed < max_wait_seconds:
        status = client.get_task_status(task_id)
        print(f"[{elapsed:>3}s] Status: {status}")
        if status in ["Ready", "Success", "finished", "completed"]:
            break
        if status in ["Failed", "Error"]:
            raise RuntimeError(f"Task failed with status: {status}")
        time.sleep(poll_interval)
        elapsed += poll_interval

    if status not in ["Ready", "Success", "finished", "completed"]:
        raise RuntimeError(
            f"Task did not finish within {max_wait_minutes} minutes "
            f"(last status: {status})"
        )

    download_url = client.get_task_result(task_id, file_type="json")
    print("Download URL:", download_url)
    return download_url

In [7]:
def load_scraper_json_as_dataframe(url: str) -> pd.DataFrame:
    """
    Load Web Scraper JSON result into a Pandas DataFrame.

    The JSON structure may vary by spider. We handle two common patterns:
    - A top-level list of records.
    - A dict with a "data" field containing the list of records.
    """
    resp = requests.get(url)
    resp.raise_for_status()
    data = resp.json()

    # Case 1: dict with "data" field
    if isinstance(data, dict) and "data" in data:
        records = data["data"]
    # Case 2: top-level list
    elif isinstance(data, list):
        records = data
    else:
        print("Unexpected JSON structure, returning raw object.")
        return pd.DataFrame([data])

    # Flatten nested structures if needed
    df = pd.json_normalize(records)
    return df

In [8]:
if USE_LIVE_THORDATA:
    # Call Thordata Web Scraper API and consume credits
    download_url = run_github_repo_task(
        spider_id=GITHUB_SPIDER_ID,
        spider_name=GITHUB_SPIDER_NAME,
        repo_params=GITHUB_TARGET_PARAMS,
        file_name=OUTPUT_FILE_NAME,
        poll_interval=10,
        max_wait_minutes=6,
    )
    df = load_scraper_json_as_dataframe(download_url)

    # Cache to local JSON so we can work offline later
    os.makedirs(CACHE_DIR, exist_ok=True)
    with open(GITHUB_JSON_CACHE_PATH, "w", encoding="utf-8") as f:
        json.dump(df.to_dict(orient="records"), f, ensure_ascii=False, indent=2)

    print(f"Cached JSON to {GITHUB_JSON_CACHE_PATH}")
else:
    # Load from local cache without consuming any credits
    print(f"Loading cached JSON from {GITHUB_JSON_CACHE_PATH}")
    with open(GITHUB_JSON_CACHE_PATH, encoding="utf-8") as f:
        records = json.load(f)
    df = pd.DataFrame.from_records(records)

df.head(3)

Loading cached JSON from D:\Thordata_Work\thordata-cookbook\data\github_repo_intel_sample.json


Unnamed: 0,url,id,code_language,code,num_lines,user_name,user_url,size,size_unit,size_num,...,num_issues,num_pull_requests,num_projects,num_fork,num_stared,last_feature,latest_update,input,error,error_code
0,https://github.com/TheAlgorithms/Python/blob/m...,63476337.0,Python,"[""""""For more information about the Binomial Di...",41,TheAlgorithms,https://github.com/TheAlgorithms,1.51 KB,KB,1.51,...,142,628,0,49408,213800,[pre-commit.ci] pre-commit autoupdate (#11322),2024-03-13T07:52:41.000+01:00,"{'spider_errors': True, 'proxy_region': 'us', ...",,
1,https://github.com/TheAlgorithms/Python/blob/m...,63476337.0,Python,"[import sys, , """""", Dynamic Programming, Imple...",67,TheAlgorithms,https://github.com/TheAlgorithms,1.81 KB,KB,1.81,...,142,628,0,49408,213800,Update matrix_chain_order calculation with mor...,2025-05-23T00:17:48.000+03:00,"{'spider_errors': True, 'proxy_region': 'us', ...",,
2,https://github.com/TheAlgorithms/Python/blob/m...,63476337.0,,,0,TheAlgorithms,https://github.com/TheAlgorithms,0 Bytes,Bytes,0.0,...,142,628,0,49408,213800,Rename Project Euler directories and other dep...,2020-10-15T12:43:28.000+05:30,"{'spider_errors': True, 'proxy_region': 'us', ...",,


In [9]:
print("All columns:", df.columns.tolist())

# Convert numeric-like columns from strings (with commas or 'None') to numbers
numeric_cols = [
    "num_issues",
    "num_pull_requests",
    "num_projects",
    "num_fork",
    "num_stared",
    "num_lines",
    "size_num",
]

for col in numeric_cols:
    if col in df.columns:
        # 1. unify as string
        series = df[col].astype(str)

        # 2. remove thousand separators
        series = series.str.replace(",", "", regex=False)

        # 3. handle obvious non-numeric values
        series = series.replace(["None", "none", ""], pd.NA)

        # 4. coerce to numeric; invalid values become NaN
        df[col] = pd.to_numeric(series, errors="coerce")

# Define columns we care about
possible_cols = [
    "url",
    "code_language",
    "user_name",
    "user_url",
    "num_issues",
    "num_pull_requests",
    "num_projects",
    "num_fork",
    "num_stared",
    "latest_update",
    "size",
    "size_unit",
    "size_num",
]

selected_cols = [c for c in possible_cols if c in df.columns]
df_selected = df[selected_cols].copy() if selected_cols else df.copy()

df_selected.head(5)

All columns: ['url', 'id', 'code_language', 'code', 'num_lines', 'user_name', 'user_url', 'size', 'size_unit', 'size_num', 'breadcrumbs', 'num_issues', 'num_pull_requests', 'num_projects', 'num_fork', 'num_stared', 'last_feature', 'latest_update', 'input', 'error', 'error_code']


Unnamed: 0,url,code_language,user_name,user_url,num_issues,num_pull_requests,num_projects,num_fork,num_stared,latest_update,size,size_unit,size_num
0,https://github.com/TheAlgorithms/Python/blob/m...,Python,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2024-03-13T07:52:41.000+01:00,1.51 KB,KB,1.51
1,https://github.com/TheAlgorithms/Python/blob/m...,Python,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2025-05-23T00:17:48.000+03:00,1.81 KB,KB,1.81
2,https://github.com/TheAlgorithms/Python/blob/m...,,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2020-10-15T12:43:28.000+05:30,0 Bytes,Bytes,0.0
3,https://github.com/TheAlgorithms/Python/blob/m...,Python,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2023-10-05T08:39:29.000-04:00,1.02 KB,KB,1.02
4,https://github.com/TheAlgorithms/Python/blob/m...,Python,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2021-10-21T21:13:42.000+08:00,1.34 KB,KB,1.34


In [10]:
# Convert numeric-like columns from strings (with commas) to numbers
numeric_cols = [
    "num_issues",
    "num_pull_requests",
    "num_projects",
    "num_fork",
    "num_stared",
    "num_lines",
    "size_num",
]

for col in numeric_cols:
    if col in df.columns:
        # 1. 统一转成字符串
        series = df[col].astype(str)

        # 2. 去掉千分位逗号
        series = series.str.replace(",", "", regex=False)

        # 3. 把明显不是数字的值（"None"、"none"、""）先替换成 NaN
        series = series.replace(["None", "none", ""], pd.NA)

        # 4. 使用 to_numeric，无法解析的值变成 NaN（errors='coerce'）
        df[col] = pd.to_numeric(series, errors="coerce")

# Define columns we care about
possible_cols = [
    "url",
    "code_language",
    "user_name",
    "user_url",
    "num_issues",
    "num_pull_requests",
    "num_projects",
    "num_fork",
    "num_stared",
    "latest_update",
    "size",
    "size_unit",
    "size_num",
]

selected_cols = [c for c in possible_cols if c in df.columns]
df_selected = df[selected_cols].copy() if selected_cols else df.copy()

df_selected.head(5)

Unnamed: 0,url,code_language,user_name,user_url,num_issues,num_pull_requests,num_projects,num_fork,num_stared,latest_update,size,size_unit,size_num
0,https://github.com/TheAlgorithms/Python/blob/m...,Python,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2024-03-13T07:52:41.000+01:00,1.51 KB,KB,1.51
1,https://github.com/TheAlgorithms/Python/blob/m...,Python,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2025-05-23T00:17:48.000+03:00,1.81 KB,KB,1.81
2,https://github.com/TheAlgorithms/Python/blob/m...,,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2020-10-15T12:43:28.000+05:30,0 Bytes,Bytes,0.0
3,https://github.com/TheAlgorithms/Python/blob/m...,Python,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2023-10-05T08:39:29.000-04:00,1.02 KB,KB,1.02
4,https://github.com/TheAlgorithms/Python/blob/m...,Python,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2021-10-21T21:13:42.000+08:00,1.34 KB,KB,1.34


In [11]:
# Top repositories by stars (GitHub tool uses 'num_stared' as field name)
if "num_stared" in df_selected.columns:
    star_col = "num_stared"
elif "stars" in df_selected.columns:
    star_col = "stars"
else:
    star_col = None

if star_col:
    print("Top repositories by stars:")
    display(df_selected.sort_values(by=star_col, ascending=False).head(10))
else:
    print("No stars column found (neither 'num_stared' nor 'stars'). Skipping top-stars view.")

# Language distribution (GitHub tool uses 'code_language')
if "code_language" in df_selected.columns:
    lang_col = "code_language"
elif "language" in df_selected.columns:
    lang_col = "language"
else:
    lang_col = None

if lang_col:
    print("\nLanguage distribution:")
    print(df_selected[lang_col].value_counts())
else:
    print("\nNo language column found (neither 'code_language' nor 'language'). Skipping language distribution.")

# Repo-level summary (each row may repeat repo-level metrics; drop duplicates)
summary_cols = [
    "user_name",
    "user_url",
    "num_issues",
    "num_pull_requests",
    "num_projects",
    "num_fork",
    "num_stared",
    "latest_update",
]

summary_cols = [c for c in summary_cols if c in df_selected.columns]
if summary_cols:
    repo_summary = df_selected[summary_cols].drop_duplicates()
    print("\nRepository summary:")
    display(repo_summary)

Top repositories by stars:


Unnamed: 0,url,code_language,user_name,user_url,num_issues,num_pull_requests,num_projects,num_fork,num_stared,latest_update,size,size_unit,size_num
0,https://github.com/TheAlgorithms/Python/blob/m...,Python,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2024-03-13T07:52:41.000+01:00,1.51 KB,KB,1.51
1,https://github.com/TheAlgorithms/Python/blob/m...,Python,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2025-05-23T00:17:48.000+03:00,1.81 KB,KB,1.81
2,https://github.com/TheAlgorithms/Python/blob/m...,,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2020-10-15T12:43:28.000+05:30,0 Bytes,Bytes,0.0
3,https://github.com/TheAlgorithms/Python/blob/m...,Python,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2023-10-05T08:39:29.000-04:00,1.02 KB,KB,1.02
4,https://github.com/TheAlgorithms/Python/blob/m...,Python,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2021-10-21T21:13:42.000+08:00,1.34 KB,KB,1.34
5,https://github.com/TheAlgorithms/Python/blob/m...,Python,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2025-07-06T01:35:29.000+03:00,5.5 KB,KB,5.5
6,https://github.com/TheAlgorithms/Python/blob/m...,Python,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2021-10-27T00:43:46.000+08:00,2.07 KB,KB,2.07
8,https://github.com/TheAlgorithms/Python/blob/m...,Python,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2023-10-16T03:21:43.000-04:00,887 Bytes,Bytes,887.0
9,https://github.com/TheAlgorithms/Python/blob/m...,Python,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2023-10-07T21:32:28.000+02:00,5.3 KB,KB,5.3
10,https://github.com/TheAlgorithms/Python/blob/m...,Python,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2022-10-13T22:03:15.000+02:00,1.67 KB,KB,1.67



Language distribution:
code_language
Python            350
Text               14
Markdown           12
YAML                5
JSON                2
Shell               2
INI                 1
Git Attributes      1
TOML                1
Name: count, dtype: int64

Repository summary:


Unnamed: 0,user_name,user_url,num_issues,num_pull_requests,num_projects,num_fork,num_stared,latest_update
0,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2024-03-13T07:52:41.000+01:00
1,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2025-05-23T00:17:48.000+03:00
2,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2020-10-15T12:43:28.000+05:30
3,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2023-10-05T08:39:29.000-04:00
4,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2021-10-21T21:13:42.000+08:00
...,...,...,...,...,...,...,...,...
468,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2023-10-01T16:46:12.000+02:00
474,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2023-10-17T03:10:24.000-04:00
476,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2023-09-06T15:16:51.000-04:00
487,TheAlgorithms,https://github.com/TheAlgorithms,142.0,628.0,0.0,49408.0,213800.0,2022-10-30T11:49:05.000+01:00


In [12]:
# Save CSV under the shared data/ directory at repo root
output_csv = CACHE_DIR / f"{OUTPUT_FILE_NAME}.csv"
df_selected.to_csv(output_csv, index=False, encoding="utf-8")
print(f"Saved cleaned data to {output_csv}")

Saved cleaned data to D:\Thordata_Work\thordata-cookbook\data\github_repo_intel_TheAlgorithms_Python_20251128_101402.csv
