In [1]:
import re, html, time
from typing import Optional
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from tqdm.auto import tqdm

def make_session(user_agent: str = "yt-wayback-scraper/1.0 (+contact@example.com)") -> requests.Session:
    s = requests.Session()
    s.headers.update({"User-Agent": user_agent})
    retry = Retry(
        total=8, connect=5, read=5, backoff_factor=1.5,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=("GET",),
        raise_on_status=False,
        respect_retry_after_header=True,
    )
    s.mount("https://", HTTPAdapter(max_retries=retry))
    return s


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def list_snapshots(channel_id: str, start_year: int = 2010, end_year: Optional[int] = None) -> pd.DataFrame:
    """
    Return DataFrame with columns:
      timestamp (str 'YYYYMMDDhhmmss'), dt (datetime64)
    Uses the CDX API; collapsed to one snapshot per day.
    """
    if end_year is None:
        end_year = pd.Timestamp.utcnow().year

    url = "https://web.archive.org/cdx/search/cdx"
    params = {
        "url": f"https://www.youtube.com/channel/{channel_id}",
        "output": "json",
        "filter": "statuscode:200",
        "from": start_year,
        "to": end_year,
        "collapse": "timestamp:8",  # one per day
    }
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    rows = r.json()
    if not rows or len(rows) == 1:
        return pd.DataFrame(columns=["timestamp", "dt"])
    rows = rows[1:]  # drop header row
    df = pd.DataFrame(rows, columns=["urlkey","timestamp","orig","mime","status","digest","length"])
    df = df[["timestamp"]].copy()
    df["dt"] = pd.to_datetime(df["timestamp"], format="%Y%m%d%H%M%S", utc=True).dt.tz_convert(None)
    return df


In [3]:
# JSON first, then HTML fallbacks (handles multiple eras/locales)
JSON_PATTERNS = [
    re.compile(r'"subscriberCountText"\s*:\s*{\s*"simpleText"\s*:\s*"([^"]+)"', re.S),
    re.compile(r'"approximateSubscriberCount"\s*:\s*"([^"]+)"', re.S),
    re.compile(r'"subscriberCountText"\s*:\s*{\s*"runs"\s*:\s*\[\s*{\s*"text"\s*:\s*"([^"]+)"', re.S),
]

HTML_PATTERNS = [
    re.compile(r'yt-subscription-button-subscriber-count[^>]*>([^<]+)</', re.I),
    re.compile(r'([\d\.,\sA-Za-z]{1,24})\s*subscribers?', re.I),
    re.compile(r'([\d\.,\sA-Za-z]{1,24})\s*(abonnés|abonné|suscriptores|iscritti|abonnenten)', re.I),
]

def human_to_int(raw: str) -> int:
    """
    Convert "10.8K", "1.3M", "200 thousand", "260ksubscribers", etc. → int
    (with a few non-English labels handled).
    """
    s = html.unescape(raw).strip().lower()
    s = re.sub(r"(subscribers?|abonnés|abonné|suscriptores|iscritti|abonnenten)\b", "", s)
    s = s.replace(",", "").replace(" ", "")
    s = s.replace("thousand", "k").replace("million", "m").replace("billion", "b")
    m = re.fullmatch(r"([0-9]*\.?[0-9]+)([kmb])?", s)
    if not m:
        raise ValueError(f"can't parse subscriber count: {raw!r}")
    num = float(m.group(1))
    mult = {"k": 1_000, "m": 1_000_000, "b": 1_000_000_000}.get(m.group(2), 1)
    return int(round(num * mult))

def extract_subs(html_text: str) -> Optional[int]:
    """
    Look for subscriber count in the page HTML.
    Prefer structured JSON near the channel header; fallback to HTML patterns.
    """
    h = html_text
    header_window = None
    key = "c4TabbedHeaderRenderer"  # anchor near channel header JSON
    i = h.find(key)
    if i != -1:
        header_window = h[max(0, i - 3000): i + 8000]

    spaces = [w for w in (header_window, h) if w]

    for space in spaces:
        for pat in JSON_PATTERNS:
            m = pat.search(space)
            if m:
                try:
                    return human_to_int(m.group(1))
                except ValueError:
                    pass

    for space in spaces:
        for pat in HTML_PATTERNS:
            m = pat.search(space)
            if m:
                try:
                    return human_to_int(m.group(1))
                except ValueError:
                    pass
    return None


In [4]:
def fetch_wayback_html(session: requests.Session, ts: str, channel_id: str, read_timeout: int = 60) -> str:
    """
    Download the raw HTML for a specific snapshot.
    Use 'id_/' to bypass Wayback’s UI banner.
    """
    url = f"https://web.archive.org/web/{ts}id_/https://www.youtube.com/channel/{channel_id}"
    r = session.get(url, timeout=(8, read_timeout))
    r.raise_for_status()
    return r.text

def scrape_wayback_subs(
    channel_id: str,
    start_year: int = 2010,
    end_year: Optional[int] = None,
    per_month: int = 1,
    sleep_between: float = 1.0,
    read_timeout: int = 60,
    show_progress: bool = True,
) -> pd.DataFrame:
    """
    Return DataFrame: columns ['date','subscribers','source'] with Wayback observations.
    - per_month: sample at most N snapshots per calendar month (1 recommended).
    """
    snaps = list_snapshots(channel_id, start_year, end_year)
    if snaps.empty:
        return pd.DataFrame(columns=["date", "subscribers", "source"])

    # Down-sample: keep up to `per_month` timestamps per month
    snaps["month"] = snaps.dt.dt.to_period("M")
    snaps = snaps.sort_values("dt").groupby("month").head(per_month).reset_index(drop=True)

    sess = make_session()
    rows = []
    iterator = tqdm(snaps["timestamp"], desc="Wayback fetch", unit="snap") if show_progress else snaps["timestamp"]

    for ts in iterator:
        try:
            html_txt = fetch_wayback_html(sess, ts, channel_id, read_timeout=read_timeout)
            subs = extract_subs(html_txt)
            if subs is not None:
                rows.append({
                    "date": pd.to_datetime(ts[:8], format="%Y%m%d"),
                    "subscribers": int(subs),
                    "source": "wayback",
                })
        except requests.exceptions.ReadTimeout:
            print(f"⚠️  Read timeout at {ts}; skipping after retries.")
        except requests.HTTPError as e:
            code = e.response.status_code if e.response is not None else "?"
            print(f"⚠️  HTTP {code} at {ts}; skipping.")
        except Exception as ex:
            print(f"⚠️  skip {ts} {ex}")
        time.sleep(sleep_between)  # be polite (≤ ~1 req/sec)

    df = pd.DataFrame(rows).sort_values("date").reset_index(drop=True)
    return df


In [5]:
def clean_wayback_series(
    df: pd.DataFrame,
    drop_large_dips: bool = True,
    dip_threshold: float = 0.7,
    enforce_monotone: bool = False
) -> pd.DataFrame:
    """
    Remove obvious mis-parses (huge drops) and optionally enforce non-decreasing subs.
    - drop_large_dips: drop rows < dip_threshold * prior_max (e.g., 0.7 = drop 30%+ dips)
    - enforce_monotone: replace series with cumulative max
    """
    if df.empty:
        return df.copy()
    df = df.sort_values("date").reset_index(drop=True)

    if drop_large_dips:
        keep = []
        max_so_far = -1
        for _, row in df.iterrows():
            s = int(row["subscribers"])
            if max_so_far >= 0 and s < dip_threshold * max_so_far:
                continue  # suspicious drop
            keep.append(row)
            if s > max_so_far:
                max_so_far = s
        df = pd.DataFrame(keep)

    if enforce_monotone and not df.empty:
        df["subscribers"] = df["subscribers"].cummax()

    return df.reset_index(drop=True)


In [6]:
CHANNEL_ID = "UC1E-JS8L0j1Ei70D9VEFrPQ"  # replace with your target "UC…" id

wb = scrape_wayback_subs(
    CHANNEL_ID,
    start_year=2015,     # change as needed
    per_month=1,         # 1 snapshot per month is usually enough
    sleep_between=1.0,   # be kind to the archive
    read_timeout=60,
    show_progress=True,
)

wb_clean = clean_wayback_series(wb, drop_large_dips=True, dip_threshold=0.7, enforce_monotone=False)
wb_clean.head(10)


Wayback fetch: 100%|██████████| 67/67 [03:32<00:00,  3.17s/snap]


Unnamed: 0,date,subscribers,source
0,2019-04-16,12097,wayback
1,2019-05-08,18975,wayback
2,2019-09-16,35800,wayback
3,2019-12-03,46700,wayback
4,2020-01-03,49700,wayback
5,2020-03-13,60900,wayback
6,2020-04-29,76800,wayback
7,2020-05-02,78200,wayback
8,2020-06-01,88200,wayback
9,2020-08-24,205000,wayback


In [7]:
# Save to CSV
wb_clean.to_csv("subs_wayback.csv", index=False)

# Quick look
display(wb_clean.tail(10))
wb_clean.plot(x="date", y="subscribers", title="Wayback: Subscriber History", figsize=(8, 3))


Unnamed: 0,date,subscribers,source
10,2020-09-01,210000,wayback
11,2020-11-01,260000,wayback
12,2023-08-10,447000,wayback
13,2023-10-02,455000,wayback
14,2023-11-23,463000,wayback
15,2024-01-13,470000,wayback
16,2024-02-10,471000,wayback
17,2024-03-05,452000,wayback
18,2024-05-09,450000,wayback
19,2024-06-01,450000,wayback


ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.