Setup & Token (run first)

In [2]:
# =========================
# Cell 1: Setup & Token
# =========================
# What this cell does:
# - Imports libraries used later
# - Sets up your GitHub token (the only credential this notebook needs)
# - Sets base download folder (change if you want)

import getpass
import json
import math
import os
import pathlib
import shutil
import subprocess
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse

import requests

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# GITHUB TOKEN — REQUIRED. Never hardcode it in this notebook and never print it:
# notebooks get shared and committed, and a token written here leaks with them.
# Any token that was ever saved in this cell must be treated as compromised and
# revoked on GitHub.
#
# Resolution order:
#   1. the GITHUB_TOKEN environment variable
#   2. a hidden interactive prompt (the value is not echoed or saved in the notebook)
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "").strip()
if not GITHUB_TOKEN:
    GITHUB_TOKEN = getpass.getpass("GitHub token: ").strip()
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

# Base directory where repos will be cloned/updated (change if you like)
BASE_DIR = pathlib.Path("./github_repos").resolve()

# Fast toggle: include forks? (True/False)
INCLUDE_FORKS = True

# Fast toggle: include archived repos? (True/False)
INCLUDE_ARCHIVED = True

# Parallelism for cloning/pulling: twice the CPU count, but never fewer than 4 workers
MAX_WORKERS = max(4, (os.cpu_count() or 4) * 2)

# Git executable name (change if needed, e.g., "git.exe" on Windows if PATH issues)
GIT_BIN = "git"

# Fail early if token missing
if not GITHUB_TOKEN:
    raise RuntimeError(
        "GITHUB_TOKEN is empty. Set the GITHUB_TOKEN environment variable "
        "or enter the token at the prompt in Cell 1."
    )

# Create base directory
BASE_DIR.mkdir(parents=True, exist_ok=True)
print(f"Repos will be synced into: {BASE_DIR}")


Repos will be synced into: C:\Users\admin\DOWNLOAD_REPOS\github_repos


Step 2 — GitHub API helpers (no owner needed)

In [3]:
# =========================
# Cell 2: Helper functions
# =========================

def _mask_token(text: str) -> str:
    """Return ``text`` with every occurrence of the GitHub token redacted.

    Intended for any string that is about to be printed or logged.

    Args:
        text: The string to sanitize. Falsy values (``None``, ``""``) are
            returned unchanged.

    Returns:
        ``text`` with the module-level ``GITHUB_TOKEN`` replaced by
        ``***TOKEN***``.
    """
    # The token check matters: str.replace("", x) would insert x between
    # every character of text instead of leaving it alone.
    if not text or not GITHUB_TOKEN:
        return text
    return text.replace(GITHUB_TOKEN, "***TOKEN***")

def run_cmd(cmd, cwd=None, timeout=1800):
    """
    Run a command (argument list, no shell) and capture its output as text.

    The command line is never printed or put into an error message, because
    it can contain the token inside an authenticated URL.

    Args:
        cmd: Program and arguments, passed to ``subprocess.run`` as a list.
        cwd: Optional working directory; converted with ``str()``.
        timeout: Seconds to wait before the child process is killed.

    Returns:
        ``(returncode, stdout, stderr)``. If the command times out the result
        is ``(124, "", <message>)`` so callers handle it like any other
        non-zero exit.

    Raises:
        RuntimeError: if the executable ``cmd[0]`` cannot be started because
            it was not found.
    """
    try:
        p = subprocess.run(
            cmd,
            cwd=str(cwd) if cwd else None,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout,
            check=False,
            text=True
        )
        return p.returncode, p.stdout, p.stderr
    except subprocess.TimeoutExpired:
        # str(TimeoutExpired) embeds the complete command line, which may
        # include the token-bearing URL. Do not let it propagate; report a
        # sanitized failure instead (124 mirrors the `timeout` utility).
        return 124, "", f"Command timed out after {timeout} seconds: {cmd[0]}"
    except FileNotFoundError as exc:
        raise RuntimeError(
            f"Command not found: {cmd[0]}. Make sure '{GIT_BIN}' is installed and in PATH."
        ) from exc

def ensure_git_available():
    """Verify the git CLI can be executed; raise RuntimeError if it cannot.

    Runs ``<GIT_BIN> --version`` through run_cmd and treats any non-zero
    exit status as "git is unavailable".
    """
    status, _stdout, stderr_text = run_cmd([GIT_BIN, "--version"])
    if status == 0:
        return
    # Non-zero exit: surface whatever git wrote to stderr as the detail.
    raise RuntimeError(
        f"'git' not available. Install Git and ensure it's on PATH. Details:\n{stderr_text}"
    )

def auth_clone_url(plain_clone_url: str, token: str) -> str:
    """
    Convert 'https://github.com/owner/repo.git' to
    'https://x-access-token:{token}@github.com/owner/repo.git'.

    Args:
        plain_clone_url: HTTPS clone URL of the repository. Any credentials
            already present in it (``user[:password]@``) are discarded.
        token: Access token to embed as the password.

    Returns:
        An HTTPS URL with ``x-access-token:<token>`` as the userinfo part.
        The scheme is always forced to https; ssh is not supported by design.
        Only host (plus port, if any) and path of the input are kept.
    """
    u = urlparse(plain_clone_url)
    # netloc may already carry "user:pass@"; keep only what follows the last
    # '@' so we never produce a URL with two userinfo sections.
    host = u.netloc.rpartition("@")[2]
    # NOTE: the token is part of the URL, so it is visible wherever the URL
    # is (command lines, git remote configuration). Never print this value.
    return f"https://x-access-token:{token}@{host}{u.path}"

def repo_local_path(base_dir: pathlib.Path, full_name: str) -> pathlib.Path:
    """
    Translate a repository's 'owner/repo' name into 'base_dir/owner/repo'.

    The owner component comes straight from the API's full_name, which keeps
    same-named repos of different owners apart on disk. Only the first '/'
    separates owner from repo; a name without any '/' raises ValueError.
    """
    pieces = full_name.split("/", 1)
    owner_part, repo_part = pieces
    return base_dir.joinpath(owner_part, repo_part)

def human_repo(r):
    """Build a short one-line description of a repo dict for display.

    Only full_name, private, fork and archived are read (missing keys show
    up as None), so the token can never appear in the result.
    """
    shown = {key: r.get(key) for key in ("full_name", "private", "fork", "archived")}
    template = "{full_name} (private={private}, fork={fork}, archived={archived})"
    return template.format(**shown)

# Check at cell-run time that the git CLI works; ensure_git_available()
# raises RuntimeError otherwise, so the message below is printed only on success.
ensure_git_available()
print("Git is available.")


Git is available.


🧪 Cell 3 — GitHub API: Fetch all repos you can access

In [15]:
# =========================================
# Cell 3: List all repos (public + private)
# =========================================
# This cell uses ONLY your token (no owner input) to list repos you can access:
# - Your own repos
# - Private repos you own
# - Repos you collaborate on
# - Organization repos you can access
# It handles pagination (no artificial limits).

def list_all_repos(token: str,
                   include_forks: bool = True,
                   include_archived: bool = True) -> list:
    """
    Return every repository the token can access, as GitHub API dicts.

    Uses GET /user/repos, so no owner name is needed, and walks the pages
    until an empty page is returned.

    Args:
        token: GitHub access token sent in the Authorization header.
        include_forks: When False, repos whose "fork" field is truthy are dropped.
        include_archived: When False, repos whose "archived" field is truthy
            are dropped.

    Returns:
        List of repository dicts sorted case-insensitively by "full_name".

    Raises:
        RuntimeError: on rate limiting, on any HTTP status >= 400, or when the
            response body is not a JSON list.
    """
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "User-Agent": "gh-sync-script"
    }

    # Affiliation covers repos you own, collaborate on, and org membership repos.
    # visibility=all -> public+private
    # per_page=100 -> max per page, then paginate until empty.
    base_url = "https://api.github.com/user/repos"
    params = {
        "visibility": "all",
        "affiliation": "owner,collaborator,organization_member",
        "per_page": 100,
        "page": 1
    }

    all_repos = []
    # One Session for all pages so the HTTPS connection is reused.
    with requests.Session() as session:
        while True:
            resp = session.get(base_url, headers=headers, params=params, timeout=60)

            # GitHub signals rate limiting with 429 as well as 403; on 403 the
            # body text or an exhausted X-RateLimit-Remaining header tells it
            # apart from a plain permission error.
            rate_limited = resp.status_code == 429 or (
                resp.status_code == 403
                and (
                    "rate limit" in resp.text.lower()
                    or resp.headers.get("X-RateLimit-Remaining") == "0"
                )
            )
            if rate_limited:
                raise RuntimeError("Rate limited by GitHub. Try again later or use a fresh token with higher limits.")
            if resp.status_code >= 400:
                raise RuntimeError(f"GitHub API error {resp.status_code}: {resp.text}")

            page_repos = resp.json()
            if not page_repos:
                break
            if not isinstance(page_repos, list):
                # A dict here would otherwise fail obscurely inside the loop below.
                raise RuntimeError(
                    f"Unexpected GitHub API response: expected a list of repos, got {type(page_repos).__name__}."
                )

            for r in page_repos:
                if not include_forks and r.get("fork"):
                    continue
                if not include_archived and r.get("archived"):
                    continue
                all_repos.append(r)
            params["page"] += 1

    # Sort by full_name for stable processing order ("or" guards a null full_name)
    all_repos.sort(key=lambda r: (r.get("full_name") or "").lower())
    return all_repos

# Query GitHub once and keep the result in the notebook-level `repos` list,
# filtered by the INCLUDE_FORKS / INCLUDE_ARCHIVED toggles from Cell 1.
repos = list_all_repos(GITHUB_TOKEN, INCLUDE_FORKS, INCLUDE_ARCHIVED)
print(f"Discovered {len(repos)} repos to sync.")
# Optional: peek at a few names (safe to print) — at most the first 11 entries;
# human_repo() only reads name/private/fork/archived fields.
for r in repos[:11]:
    print(" -", human_repo(r))


Discovered 11 repos to sync.
 - Tushar-Padole/Alphabets-Recognition (private=False, fork=False, archived=False)
 - Tushar-Padole/Cat-Vs-Dog (private=True, fork=False, archived=False)
 - Tushar-Padole/Digit-Recognizer (private=True, fork=False, archived=False)
 - Tushar-Padole/House-Prices-Prediction (private=True, fork=False, archived=False)
 - Tushar-Padole/Next-Word-Predictor-Sherlock-Holmes- (private=True, fork=False, archived=False)
 - Tushar-Padole/Recipe_making_recommender (private=False, fork=False, archived=False)
 - Tushar-Padole/Summer-Olympic-Games-Analysis-1896-2024- (private=False, fork=False, archived=False)
 - Tushar-Padole/test_git_pro (private=False, fork=False, archived=False)
 - Tushar-Padole/Titanic-survival-prediction (private=False, fork=False, archived=False)
 - Tushar-Padole/Titanic-survival-prediction_old (private=True, fork=False, archived=False)
 - Tushar-Padole/TripC_Bike (private=False, fork=False, archived=False)


🧪 Cell 4 — Clone or Update a Single Repo

In [11]:
# =======================================
# Cell 4: Clone / Update one repository
# =======================================
# This cell defines functions to clone a missing repo or update an existing one.
# It uses HTTPS with token auth. Do NOT print the command since it may contain token in URL.

def clone_repo(r: dict, token: str, base_dir: pathlib.Path) -> tuple:
    """
    Clone a repository that doesn't exist locally.

    Args:
        r: Repository dict from the GitHub API; "full_name" and "clone_url"
            are read.
        token: Access token embedded into the HTTPS clone URL.
        base_dir: Root folder; the repo lands in base_dir/owner/repo.

    Returns:
        (success: bool, message: str). The message never contains the token.
        A clone whose submodule update fails is reported as
        (False, "[CLONE OK][SUBMODULE WARN] ...").
    """
    def _redact(text: str) -> str:
        # git may echo the authenticated URL in its error output.
        return text.replace(token, "***TOKEN***") if token else text

    full_name = r["full_name"]
    dest = repo_local_path(base_dir, full_name)
    dest.parent.mkdir(parents=True, exist_ok=True)

    # Remember whether the destination was there before we touched it, so the
    # failure cleanup below can never delete a directory we did not create.
    existed_before = dest.exists()

    url = auth_clone_url(r["clone_url"], token)

    # We do a full clone (no depth limit) to honor "no limit" requirement.
    # NOTE(review): git stores the clone URL as the 'origin' remote, so the
    # token in `url` ends up in the repo's .git/config — consider a
    # credential helper if that is not acceptable.
    cmd = [GIT_BIN, "clone", "--recursive", url, str(dest)]
    code, out, err = run_cmd(cmd)
    if code != 0:
        # If the clone left a partial, non-git directory behind, clean it up to
        # avoid a broken state — but only when the clone itself created it.
        if not existed_before and dest.exists() and not (dest / ".git").exists():
            shutil.rmtree(dest, ignore_errors=True)
        return False, f"[CLONE FAIL] {full_name}: {_redact(err.strip() or out.strip())}"

    # Optional: initialize submodules (clone --recursive should already do this)
    sm_code, sm_out, sm_err = run_cmd([GIT_BIN, "-C", str(dest), "submodule", "update", "--init", "--recursive", "--jobs", "4"])
    if sm_code != 0:
        return False, f"[CLONE OK][SUBMODULE WARN] {full_name}: {_redact(sm_err.strip())}"
    return True, f"[CLONE OK] {full_name}"

def update_repo(r: dict, token: str, base_dir: pathlib.Path) -> tuple:
    """
    Update an existing local repo by fetching/pulling latest commits.

    Falls back to clone_repo() when base_dir/owner/repo has no .git entry.

    Args:
        r: Repository dict from the GitHub API; "full_name", "clone_url" and
            "default_branch" are read.
        token: Access token embedded into the HTTPS remote URL.
        base_dir: Root folder containing owner/repo checkouts.

    Returns:
        (success: bool, message: str). The message never contains the token.
        If local changes had to be stashed to complete the pull, the success
        message says so.
    """
    def _redact(text: str) -> str:
        # git may echo the authenticated URL in its error output.
        return text.replace(token, "***TOKEN***") if token else text

    full_name = r["full_name"]
    dest = repo_local_path(base_dir, full_name)
    url = auth_clone_url(r["clone_url"], token)

    if not (dest / ".git").exists():
        return clone_repo(r, token, base_dir)

    # Ensure 'origin' remote uses an authenticated URL (important for private repos)
    # NOTE(review): this writes the token into the repo's .git/config.
    code, out, err = run_cmd([GIT_BIN, "-C", str(dest), "remote", "set-url", "origin", url])
    if code != 0:
        return False, f"[REMOTE SET-URL FAIL] {full_name}: {_redact(err.strip() or out.strip())}"

    # Track all branches to keep everything current (best effort, result ignored)
    run_cmd([GIT_BIN, "-C", str(dest), "remote", "set-branches", "origin", "*"])

    # Fetch all refs and prune deleted ones, plus tags
    code, out, err = run_cmd([GIT_BIN, "-C", str(dest), "fetch", "--all", "--prune", "--tags"])
    if code != 0:
        return False, f"[FETCH FAIL] {full_name}: {_redact(err.strip() or out.strip())}"

    # Ensure we're on default branch (helps if repo was checked out elsewhere)
    default_branch = r.get("default_branch") or "main"
    # Checkout default branch (ignore errors if already on it)
    run_cmd([GIT_BIN, "-C", str(dest), "checkout", default_branch])

    stash_note = ""
    # Fast-forward pull on the default branch
    code, out, err = run_cmd([GIT_BIN, "-C", str(dest), "pull", "--ff-only"])
    if code != 0:
        # Record whether the working tree is dirty so the user can be told that
        # their edits were moved into the stash (it is never popped here).
        st_code, st_out, _st_err = run_cmd([GIT_BIN, "-C", str(dest), "status", "--porcelain"])
        had_local_changes = st_code == 0 and bool(st_out.strip())

        # If fast-forward fails because of local changes, try stash+pull (non-destructive)
        run_cmd([GIT_BIN, "-C", str(dest), "stash", "push", "-u", "-m", "auto-stash-before-ff-pull"])
        code2, out2, err2 = run_cmd([GIT_BIN, "-C", str(dest), "pull", "--ff-only"])
        if code2 != 0:
            return False, f"[PULL FAIL] {full_name}: {_redact(err2.strip() or out2.strip())}"
        if had_local_changes:
            stash_note = " (local changes were auto-stashed; run 'git stash pop' to restore them)"

    # Update submodules if any (best effort, result ignored)
    run_cmd([GIT_BIN, "-C", str(dest), "submodule", "update", "--init", "--recursive", "--jobs", "4"])

    return True, f"[UPDATE OK] {full_name}{stash_note}"


🧪 Cell 5 — Sync All Repos (parallel, fast)

In [12]:
# ======================================
# Cell 5: Sync ALL repos in parallel
# ======================================
# - Detects whether repo exists locally
# - Clones if missing, otherwise fetches updates
# - Runs in parallel for speed

def sync_all_repos(repos: list, token: str, base_dir: pathlib.Path, max_workers: int = MAX_WORKERS):
    """
    Clone or update every repo in ``repos`` using a thread pool.

    Args:
        repos: Repository dicts from the GitHub API ("full_name" is required).
        token: Access token handed to clone_repo / update_repo.
        base_dir: Root folder containing owner/repo checkouts.
        max_workers: Size of the thread pool.

    Returns:
        Dict with "ok" and "fail" counters and a "messages" list holding one
        line per repo in completion order. Every message is redacted so the
        token cannot reach the printed summary.
    """
    def _redact(text: str) -> str:
        # Exception text and git output can contain the authenticated URL.
        return text.replace(token, "***TOKEN***") if token else text

    # Decide up front, per repo, whether it needs a clone or an update.
    to_process = []
    for r in repos:
        local = repo_local_path(base_dir, r["full_name"])
        exists = (local / ".git").exists()
        to_process.append((exists, r))

    results = {"ok": 0, "fail": 0, "messages": []}

    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        future_to_repo = {}
        for exists, r in to_process:
            worker = update_repo if exists else clone_repo
            future_to_repo[ex.submit(worker, r, token, base_dir)] = r

        for fut in as_completed(future_to_repo):
            r = future_to_repo[fut]
            try:
                ok, msg = fut.result()
                results["messages"].append(_redact(str(msg)))
                if ok:
                    results["ok"] += 1
                else:
                    results["fail"] += 1
            except Exception as e:
                # One failing repo must not abort the whole run; record it.
                results["fail"] += 1
                results["messages"].append(_redact(f"[ERROR] {r.get('full_name')}: {e}"))

    # Print a concise summary (no secrets)
    print("\n==== Sync Summary ====")
    print(f"Total repos: {len(repos)}")
    print(f"OK: {results['ok']}  |  FAIL: {results['fail']}")
    if results["fail"]:
        print("Some failures occurred. See messages below:")
    # Print last 30 messages to avoid too much output
    for m in results["messages"][-30:]:
        print(m)

    return results

# Nothing is synced in this cell: it only defines sync_all_repos() and prints
# a reminder. The actual one-shot sync is triggered from the next cell.
print("Ready to sync. Run the next cell to execute a one-shot sync.")


Ready to sync. Run the next cell to execute a one-shot sync.


🧪 Cell 6 — One-Shot Sync Now

In [17]:
# ======================================
# Cell 6: One-shot sync now
# ======================================
# This will:
# 1) Re-list repos (in case new ones were added)
# 2) Clone missing repos and update existing ones

# Refresh the repo list first so repos created since Cell 3 ran are included.
repos = list_all_repos(GITHUB_TOKEN, INCLUDE_FORKS, INCLUDE_ARCHIVED)
# Clone/update everything in parallel; `result` keeps the ok/fail counters and
# per-repo messages returned by sync_all_repos().
result = sync_all_repos(repos, GITHUB_TOKEN, BASE_DIR, MAX_WORKERS)



==== Sync Summary ====
Total repos: 11
OK: 11  |  FAIL: 0
[UPDATE OK] Tushar-Padole/Recipe_making_recommender
[UPDATE OK] Tushar-Padole/Alphabets-Recognition
[UPDATE OK] Tushar-Padole/TripC_Bike
[UPDATE OK] Tushar-Padole/Titanic-survival-prediction
[UPDATE OK] Tushar-Padole/test_git_pro
[UPDATE OK] Tushar-Padole/Titanic-survival-prediction_old
[UPDATE OK] Tushar-Padole/Digit-Recognizer
[UPDATE OK] Tushar-Padole/House-Prices-Prediction
[UPDATE OK] Tushar-Padole/Next-Word-Predictor-Sherlock-Holmes-
[UPDATE OK] Tushar-Padole/Cat-Vs-Dog
[UPDATE OK] Tushar-Padole/Summer-Olympic-Games-Analysis-1896-2024-
