In [None]:
import requests
from bs4 import BeautifulSoup, Comment
from urllib.parse import urljoin, urlparse, urlunparse
import time
import sqlite3

# Root of the site being scraped.
BASE_URL = "https://github.com"
# Listing page where the scrape starts: the "repositories" tab of the google org.
ORG_REPOS_URL = "https://github.com/google?tab=repositories"
# HTTP request headers; the User-Agent is a desktop Chrome string
# (presumably so the request looks like a regular browser visit).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# SQLite database file that stores the scraped repositories.
DB_PATH = "google_repos.db"

def init_db(db_path=None):
    """Open the SQLite database and make sure the ``repos`` table exists.

    Args:
        db_path: Database file to open. Defaults to the module-level
            ``DB_PATH`` when omitted, so existing ``init_db()`` callers are
            unaffected. ``":memory:"`` is handy for tests.

    Returns:
        An open ``sqlite3.Connection``; the caller is responsible for
        closing it.

    Raises:
        sqlite3.Error: If the schema cannot be created. The connection is
            closed before the error propagates so it does not leak.
    """
    conn = sqlite3.connect(DB_PATH if db_path is None else db_path)
    try:
        conn.execute("""
        CREATE TABLE IF NOT EXISTS repos (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL UNIQUE,
            language TEXT,
            stars INTEGER NOT NULL
        )
    """)
        conn.commit()
    except sqlite3.Error:
        # Do not hand back (or leak) a half-initialised connection.
        conn.close()
        raise
    return conn

def parse_star_count(text):
    """Convert a displayed star count such as ``"12,715"`` or ``"1.5k"`` to an int.

    Thousands separators are removed, and a trailing ``k`` / ``m`` suffix
    (case-insensitive) multiplies the value by 1,000 / 1,000,000.

    Args:
        text: The raw label text; ``None`` is treated as empty.

    Returns:
        The parsed count, or 0 when the text is empty or not a number.
    """
    s = (text or "").strip().lower().replace(",", "")
    if not s:
        return 0

    suffix_multipliers = {"k": 1_000, "m": 1_000_000}
    multiplier = suffix_multipliers.get(s[-1])
    try:
        if multiplier is None:
            return int(s)
        # round() rather than int(): binary floats make e.g. 32.3 * 1000
        # come out as 32299.999999999996, which int() would truncate to 32299.
        return round(float(s[:-1]) * multiplier)
    except (ValueError, OverflowError):
        # OverflowError covers inputs like "infk"; ValueError covers the rest.
        return 0

def normalize_repo_full_name(href_text, link_text, default_owner="google"):
    """Derive a repository name such as ``owner/repo`` from a link.

    The href is preferred: its URL path (without surrounding slashes) is
    returned. Query strings, fragments and the host are always discarded,
    whether the href is absolute, protocol-relative or relative. When the
    href yields no path, the link text is used instead.

    Args:
        href_text: The anchor's ``href`` attribute, or ``None``.
        link_text: The anchor's visible text, or ``None``.
        default_owner: Owner prefixed to a bare repository name taken from
            the link text.

    Returns:
        The href path, else the link text (prefixed with ``default_owner``
        when it contains no ``/``), else ``None`` when both are empty.
    """
    href = (href_text or "").strip()
    txt = (link_text or "").strip()

    if href:
        # Always go through urlparse so "?query" / "#fragment" never end up
        # in the name, even for relative hrefs.
        path = urlparse(href).path.strip("/")
        if path:
            return path

    if "/" in txt:
        return txt
    if txt:
        return f"{default_owner}/{txt}"
    return None

def extract_repos_from_page(soup):
    """Collect repository entries from a parsed listing page.

    Several selectors are tried in order and the first one that matches
    anything supplies the candidate ``<li>`` elements.

    Args:
        soup: A parsed document (or fragment) supporting ``select`` /
            ``select_one``.

    Returns:
        A list of dicts with keys ``"name"`` (``owner/repo``),
        ``"language"`` (``None`` when absent or empty) and ``"stars"``
        (0 when no stargazers link is found).
    """
    candidates = (
        soup.select("li[itemprop='owns']")                       # old UI
        or soup.select("div[data-testid='results-list'] li")     # new UI candidate
        or soup.select("li.Box-row")                             # generic
        # Last resort: any list item that contains a repository-looking link.
        or [
            li
            for li in soup.select("li")
            if li.select_one("a[href*='/google/'], a[data-hovercard-type='repository']")
        ]
    )

    repos = []
    for li in candidates:
        name_a = li.select_one(
            "h3 a, a[itemprop='name codeRepository'], a[data-hovercard-type='repository'], a[href*='/google/']"
        )
        if not name_a:
            continue

        full_name = normalize_repo_full_name(name_a.get("href"), name_a.get_text())
        # Anything without an owner/repo separator is not a repository link.
        if not full_name or "/" not in full_name:
            continue

        lang_el = (
            li.select_one("[itemprop='programmingLanguage']")
            or li.select_one("span[data-testid='repo-language-color'] + span")
        )
        language = lang_el.get_text(strip=True) if lang_el else None
        if language == "":
            language = None

        # One selector is enough: the former "a.Link--muted[...]" fallback
        # matched a strict subset of this one and could never fire.
        star_el = li.select_one("a[href$='/stargazers']")
        stars = parse_star_count(star_el.get_text(strip=True)) if star_el else 0

        repos.append({"name": full_name, "language": language, "stars": stars})

    return repos

def get_next_page_url(soup, current_url):
    """Work out the URL of the page that follows ``current_url``.

    An explicit "next" link in the document wins. Without one, the next URL
    is guessed by incrementing the ``page`` query parameter; a URL with no
    ``page`` parameter is treated as page 1, so the guess is ``page=2``.
    The caller is expected to stop when guessed pages yield nothing new.

    Args:
        soup: Parsed document supporting ``select_one``.
        current_url: Absolute URL the document was fetched from.

    Returns:
        The absolute URL of the next page, or ``None`` when the current
        ``page`` parameter is not an integer and no next link exists.
    """
    next_el = (
        soup.select_one("a.next_page")
        or soup.select_one("a[rel='next']")
        or soup.select_one("nav[aria-label='Pagination'] a[aria-label='Next']")
    )
    if next_el and next_el.get("href"):
        # Resolve against the page itself so query-only hrefs ("?page=2")
        # keep the current path instead of landing on the site root.
        return urljoin(current_url, next_el["href"])

    parsed = urlparse(current_url)
    new_qs = []
    saw_page = False
    for param in parsed.query.split("&"):
        if not param:
            continue
        if not param.startswith("page="):
            new_qs.append(param)
            continue
        try:
            page_val = int(param.split("=", 1)[1])
        except ValueError:
            # Cannot guess a successor for a malformed page number.
            return None
        new_qs.append(f"page={page_val + 1}")
        saw_page = True

    if not saw_page:
        # The first listing URL carries no page parameter; it is page 1.
        new_qs.append("page=2")

    return urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "&".join(new_qs), ""))

def scrape_google_repos():
    """Walk the organisation's repository listing and collect every repo found.

    Pages are fetched one at a time starting at ``ORG_REPOS_URL``, with a
    one-second pause after each request. Repositories are looked for both in
    the live document and inside HTML comments that appear to hold listing
    markup. The walk stops on a fetch failure, a non-200 response, a missing
    or already-visited next page, or three consecutive pages with nothing new.

    Returns:
        A list of repo dicts (``name``, ``language``, ``stars``) without
        duplicate names, in the order they were first seen. Results gathered
        before a failure are still returned.
    """
    # Accept both quote styles: serialised HTML normally uses double quotes.
    comment_markers = (
        "itemprop='owns'",
        'itemprop="owns"',
        "data-testid='results-list'",
        'data-testid="results-list"',
    )

    url = ORG_REPOS_URL
    all_repos = []
    seen_names = set()
    visited_urls = set()
    empty_pages_in_a_row = 0
    page_index = 0

    while url:
        if url in visited_urls:
            print(f"[Stop] Already visited: {url}")
            break
        visited_urls.add(url)

        print(f"[Fetch] page {page_index+1}: {url}")
        try:
            resp = requests.get(url, headers=HEADERS, timeout=30)
        except requests.RequestException as exc:
            # Treat network errors like a bad status: stop, but keep what
            # has been collected so far instead of losing it to a traceback.
            print(f"[Error] Failed to fetch {url}: {exc}")
            break
        time.sleep(1)
        if resp.status_code != 200:
            print(f"[Error] Failed to fetch {url}: {resp.status_code}")
            break

        soup = BeautifulSoup(resp.text, "html.parser")

        repos = []
        # Listing markup is sometimes shipped inside HTML comments.
        for c in soup.find_all(string=lambda t: isinstance(t, Comment)):
            if any(marker in c for marker in comment_markers):
                inner = BeautifulSoup(c, "html.parser")
                repos.extend(extract_repos_from_page(inner))

        repos.extend(extract_repos_from_page(soup))

        new_count = 0
        for r in repos:
            if r["name"] not in seen_names:
                all_repos.append(r)
                seen_names.add(r["name"])
                new_count += 1

        print(f"[Parsed] new repos: {new_count}, total: {len(all_repos)}")

        if new_count == 0:
            empty_pages_in_a_row += 1
        else:
            empty_pages_in_a_row = 0

        if empty_pages_in_a_row >= 3:
            print("[Stop] No new repos for 3 pages in a row. Ending.")
            break

        page_index += 1

        next_url = get_next_page_url(soup, url)
        if not next_url:
            print("[Stop] No next page.")
            break
        if next_url == url:
            print(f"[Stop] Next page equals current URL ({next_url}).")
            break
        if next_url in visited_urls:
            print(f"[Stop] Next page already visited ({next_url}).")
            break

        print(f"[Next] {next_url}")
        url = next_url

    return all_repos

def save_repos(conn, repos):
    """Write scraped repositories to the ``repos`` table and commit.

    Uses ``INSERT OR IGNORE``, so a row that would violate a table
    constraint is skipped rather than raising.

    Args:
        conn: Open SQLite connection.
        repos: Iterable of dicts with ``"name"``, ``"language"`` and
            ``"stars"`` keys.
    """
    rows = ((repo["name"], repo["language"], repo["stars"]) for repo in repos)
    conn.cursor().executemany(
        "INSERT OR IGNORE INTO repos (name, language, stars) VALUES (?, ?, ?)",
        rows,
    )
    conn.commit()

def main():
    """Scrape the repositories, store them, and print the table contents.

    The database connection is closed even when scraping, saving or
    printing raises.
    """
    conn = init_db()
    try:
        repos = scrape_google_repos()
        print(f"Scraped {len(repos)} repositories.")
        save_repos(conn, repos)

        # The table may hold rows from earlier runs, so this can list more
        # repositories than were scraped just now.
        print("Saved repos:")
        for row in conn.execute("SELECT name, language, stars FROM repos ORDER BY stars DESC, name ASC"):
            print(row)
    finally:
        conn.close()

# Run the scrape only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

[Fetch] page 1: https://github.com/google?tab=repositories
[Parsed] new repos: 10, total: 10
[Stop] No next page.
Scraped 10 repositories.
Saved repos:
('google/or-tools', 'C++', 12715)
('google/perfetto', 'C++', 5019)
('google/angle', 'C++', 3843)
('google/XNNPACK', 'C', 2181)
('google/tunix', 'Python', 1916)
('google/nomulus', 'Java', 1768)
('google/yggdrasil-decision-forests', 'C++', 622)
('google/osv-scalibr', 'Go', 538)
('google/orbax', 'Python', 455)
('google/skia-buildbot', 'Go', 158)
('google/device-infra', 'Java', 58)
('google/dwh-migration-tools', 'Java', 54)
('google/koladata', 'C++', 27)
('google/kotlin-fhirpath', 'Kotlin', 6)
