In [None]:
import yaml
import duckdb
from pathlib import Path

def run_job(job):
    # Ensure destination folder exists
    dest_path = Path(job["destination"])
    dest_path.parent.mkdir(parents=True, exist_ok=True)

    # Connect to DuckDB and load CSV directly
    con = duckdb.connect(dest_path)
    con.execute(f"""
        CREATE OR REPLACE TABLE {job['table']} AS
        SELECT * FROM read_csv_auto('{job["source_url"]}')
    """)

    print(f"✔ Saved table '{job['table']}' to: {dest_path}")

def main():
    with open("fetch_jobs.yaml", "r") as f:
        config = yaml.safe_load(f)
    for job in config["jobs"]:
        run_job(job)

if __name__ == "__main__":
    main()

In [21]:
#!/usr/bin/env python3
"""
Starred GitHub Repositories Fetcher
Fetches starred repositories from GitHub API and uploads to Google Sheets
"""

import os
import sys
import time
import json
from datetime import datetime
import requests
import pandas as pd
import gspread
from gspread_dataframe import set_with_dataframe
from oauth2client.service_account import ServiceAccountCredentials
from dotenv import load_dotenv


# === CONFIGURATION ===
load_dotenv()
GHUB_TOKEN = os.getenv('GHUB_TOKEN')
GCP_CREDENTIALS = os.getenv('GCP_CREDENTIALS')
GOOGLE_SHEET_NAME = os.getenv('GOOGLE_SHEET_NAME')
GOOGLE_SHEET_ID = os.getenv('GOOGLE_SHEET_ID')

# GitHub API endpoints
API_STARRED_URL = 'https://api.github.com/user/starred'
API_RELEASES_URL = 'https://api.github.com/repos/{owner}/{repo}/releases/latest'
API_TOPICS_URL = 'https://api.github.com/repos/{owner}/{repo}/topics'

auth_headers = {
    'Authorization': f'token {GHUB_TOKEN}',
    'Accept': 'application/vnd.github.v3+json'
}

topics_headers = {
    'Authorization': f'token {GHUB_TOKEN}',
    'Accept': 'application/vnd.github.mercy-preview+json'  # Needed to access topics
}

def get_starred_repos():
    """Fetch all starred repositories from GitHub API"""
    print("🔍 Fetching starred repositories...")
    starred = []
    page = 1

    while True:
        response = requests.get(
            API_STARRED_URL, 
            headers=auth_headers, 
            params={'per_page': 100, 'page': page}
        )
        
        if response.status_code != 200:
            print(f"Error fetching starred repos: {response.status_code} - {response.text}")
            break

        data = response.json()
        if not data:
            break

        starred.extend(data)
        page += 1
        print(f"📦 Fetched page {page-1} ({len(data)} repos)")

    print(f"✅ Total starred repositories: {len(starred)}")
    return starred


def get_last_release_date(owner, repo):
    """Get the last release date for a repository"""
    url = API_RELEASES_URL.format(owner=owner, repo=repo)
    response = requests.get(url, headers=auth_headers)

    if response.status_code == 200:
        return response.json().get("published_at")
    elif response.status_code == 404:
        return "No releases"
    else:
        return f"Error: {response.status_code}"


def get_repo_topics(owner, repo):
    """Get topics for a repository"""
    url = API_TOPICS_URL.format(owner=owner, repo=repo)
    response = requests.get(url, headers=topics_headers)

    if response.status_code == 200:
        return response.json().get('names', [])
    else:
        return []


def process_repositories(repos):
    """Process repositories and gather additional data"""
    print("🔄 Processing repositories and gathering additional data...")
    data = []

    for i, repo in enumerate(repos):
        full_name = repo['full_name']
        owner, repo_name = full_name.split('/')
        
        print(f"📊 Processing {full_name} ({i+1}/{len(repos)})")
        
        # Get additional data
        last_release = get_last_release_date(owner, repo_name)
        topics = get_repo_topics(owner, repo_name)

        data.append({
            'name': full_name,
            'description': repo.get('description', ''),
            'stars': repo['stargazers_count'],
            'forks': repo['forks_count'],
            'language': repo.get('language', 'Unknown'),
            'url': repo['html_url'],
            'last_release': last_release,
            'topics': ", ".join(topics),
            'created_at': repo['created_at'],
            'updated_at': repo['updated_at'],
            'pushed_at': repo.get('pushed_at', ''),
            'open_issues': repo.get('open_issues_count', 0),
            'archived': repo.get('archived', False),
            'fork': repo.get('fork', False),
            'fetched_at': datetime.now().isoformat()
        })

        time.sleep(0.1)

    print(f"✅ Processed {len(data)} repositories")
    return pd.DataFrame(data)


def upload_to_google_sheet(df, sheet_name=GOOGLE_SHEET_NAME):
    """Upload DataFrame to Google Sheets"""
    print(f"📤 Uploading to Google Sheet: {sheet_name}")
    
    # Scope for Sheets + Drive
    scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
    
    try:
        # Load credentials from environment variable (JSON string)
        if GCP_CREDENTIALS.startswith('{'):
            # JSON string
            creds_dict = json.loads(GCP_CREDENTIALS)
        else:
            # File path
            with open(GCP_CREDENTIALS, 'r') as f:
                creds_dict = json.load(f)
        
        creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
        client = gspread.authorize(creds)
        
        # Open (or create) spreadsheet
        try:
            sheet = client.open(sheet_name)
        except gspread.SpreadsheetNotFound:
            sheet = client.create(sheet_name)
            print(f"📝 Created new spreadsheet: {sheet_name}")

        # Use the first worksheet or create it
        worksheet = sheet.worksheet("starred")

        # Clear the sheet and upload new data
        worksheet.clear()
        set_with_dataframe(worksheet, df)
        
        print(f"✅ Uploaded {len(df)} rows to Google Sheet: {sheet_name}")
        print(f"🔗 Sheet URL: https://docs.google.com/spreadsheets/d/{sheet.id}")
        
        return sheet.id
        
    except Exception as e:
        print(f"❌ Error uploading to Google Sheet: {type(e).__name__}: {e.args}")
        raise


def main():
    """Main execution function"""
    print("🚀 Starting starred repositories sync...")
    print(f"⏰ Started at: {datetime.now().isoformat()}")
    
    repos = get_starred_repos()
    df = process_repositories(repos)
    return df    




In [22]:
df=main()

🚀 Starting starred repositories sync...
⏰ Started at: 2025-07-02T21:38:50.378480
🔍 Fetching starred repositories...
📦 Fetched page 1 (42 repos)
✅ Total starred repositories: 42
🔄 Processing repositories and gathering additional data...
📊 Processing onlook-dev/onlook (1/42)
📊 Processing dbt-labs/dbt-core (2/42)
📊 Processing mendableai/firecrawl (3/42)
📊 Processing microsoft/ML-For-Beginners (4/42)
📊 Processing ml-tooling/best-of-ml-python (5/42)
📊 Processing sdmg15/Best-websites-a-programmer-should-visit (6/42)
📊 Processing gitleaks/gitleaks (7/42)
📊 Processing eyaltoledano/claude-task-master (8/42)
📊 Processing celery/celery (9/42)
📊 Processing python-poetry/poetry (10/42)
📊 Processing evidence-dev/evidence (11/42)
📊 Processing 7PH/powerglitch (12/42)
📊 Processing microsoft/typescript-go (13/42)
📊 Processing lucide-icons/lucide (14/42)
📊 Processing topoteretes/cognee (15/42)
📊 Processing mlabonne/llm-course (16/42)
📊 Processing optuna/optuna (17/42)
📊 Processing coleifer/huey (18/42)
📊

In [None]:
df

In [25]:
df.to_clipboard(index=False)

In [23]:
df

Unnamed: 0,name,description,stars,forks,language,url,last_release,topics,created_at,updated_at,pushed_at,open_issues,archived,fork,fetched_at
0,onlook-dev/onlook,The Cursor for Designers • An Open-Source Visu...,20025,1295,TypeScript,https://github.com/onlook-dev/onlook,2025-05-28T06:12:15Z,"react, typescript, tailwindcss, nextjs, webflo...",2024-06-25T19:16:02Z,2025-07-02T19:36:28Z,2025-07-02T19:01:46Z,247,False,False,2025-07-02T21:38:52.728844
1,dbt-labs/dbt-core,dbt enables data analysts and engineers to tra...,11040,1760,Python,https://github.com/dbt-labs/dbt-core,2025-06-20T19:28:47Z,"dbt-viewpoint, slack, pypa, data-modeling, bus...",2016-03-10T02:38:00Z,2025-07-02T14:53:05Z,2025-07-02T17:27:36Z,723,False,False,2025-07-02T21:38:53.531144
2,mendableai/firecrawl,🔥 Turn entire websites into LLM-ready markdown...,41907,3949,TypeScript,https://github.com/mendableai/firecrawl,2025-06-27T15:19:08Z,"ai, crawler, data, markdown, scraper, html-to-...",2024-04-15T21:02:29Z,2025-07-02T19:35:04Z,2025-07-02T19:34:50Z,194,False,False,2025-07-02T21:38:54.243396
3,microsoft/ML-For-Beginners,"12 weeks, 26 lessons, 52 quizzes, classic Mach...",73476,16151,HTML,https://github.com/microsoft/ML-For-Beginners,No releases,"ml, data-science, machine-learning, machine-le...",2021-03-03T01:34:05Z,2025-07-02T19:37:37Z,2025-06-16T13:28:55Z,15,False,False,2025-07-02T21:38:55.342764
4,ml-tooling/best-of-ml-python,🏆 A ranked list of awesome machine learning Py...,21421,2879,,https://github.com/ml-tooling/best-of-ml-python,2025-06-26T15:25:44Z,"python, machine-learning, data-science, nlp, d...",2020-11-29T19:41:36Z,2025-07-02T15:03:59Z,2025-06-26T15:25:35Z,32,False,False,2025-07-02T21:38:56.073361
5,sdmg15/Best-websites-a-programmer-should-visit,:link: Some useful websites for programmers.,70902,8251,,https://github.com/sdmg15/Best-websites-a-prog...,No releases,"books, programmer, cs, links, sites, hacktober...",2017-03-05T20:25:17Z,2025-07-02T19:35:05Z,2025-07-01T08:13:14Z,921,False,False,2025-07-02T21:38:56.803884
6,gitleaks/gitleaks,Find secrets with Gitleaks 🔑,21324,1666,Go,https://github.com/gitleaks/gitleaks,2025-06-09T00:35:40Z,"security, security-tools, git, golang, go, sec...",2018-01-27T18:19:31Z,2025-07-02T17:49:38Z,2025-07-01T20:59:34Z,286,False,False,2025-07-02T21:38:57.536355
7,eyaltoledano/claude-task-master,An AI-powered task-management system you can d...,17354,1746,JavaScript,https://github.com/eyaltoledano/claude-task-ma...,2025-06-21T21:00:33Z,"ai, cursor, task-manager, tasks, tasks-list, c...",2025-03-04T18:54:54Z,2025-07-02T19:30:11Z,2025-07-02T10:53:12Z,150,False,False,2025-07-02T21:38:58.227084
8,celery/celery,Distributed Task Queue (development branch),26721,4799,Python,https://github.com/celery/celery,2025-06-01T11:08:20Z,"python, task-manager, task-scheduler, task-run...",2009-04-24T11:31:24Z,2025-07-02T14:07:49Z,2025-07-02T14:44:23Z,749,False,False,2025-07-02T21:38:58.919942
9,python-poetry/poetry,Python packaging and dependency management mad...,33392,2350,Python,https://github.com/python-poetry/poetry,2025-05-04T13:37:05Z,"python, dependency-manager, package-manager, p...",2018-02-28T15:23:47Z,2025-07-02T16:05:50Z,2025-06-30T20:04:16Z,572,False,False,2025-07-02T21:38:59.710678
