In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_github_list(list_url, timeout=30):
    """Scrape one GitHub starred-list page into a DataFrame.

    Only the page at ``list_url`` is requested.
    NOTE(review): if GitHub paginates long lists, later pages are not
    followed here -- confirm against a list with many repositories.

    Args:
        list_url: URL of the list page, e.g.
            "https://github.com/stars/<user>/lists/<list>".
        timeout: Seconds to wait for GitHub before giving up, so a stalled
            connection cannot hang the notebook.

    Returns:
        pandas.DataFrame with one row per repository block and the columns
        name, url, description, language, stars, forks, updated_at.

    Raises:
        Exception: if the page does not answer with HTTP 200.
        requests.RequestException: on connection errors or timeout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0"
    }

    response = requests.get(list_url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to load page: {response.status_code}")

    soup = BeautifulSoup(response.text, 'html.parser')
    repo_blocks = soup.select('div#user-list-repositories > div.border-bottom')

    data = []
    for block in repo_blocks:
        name_tag = block.select_one('h3 a')
        href = name_tag.get('href') if name_tag else None
        if not href:
            # Without a repository link there is nothing meaningful to record
            # (previously this produced a bogus "N/A" row and URL).
            continue
        full_name = href.strip('/')

        desc_tag = block.select_one('[itemprop=description]')
        lang_tag = block.select_one('[itemprop=programmingLanguage]')
        stars_tag = block.select_one('a[href$="/stargazers"]')
        forks_tag = block.select_one('a[href$="/forks"]')
        updated_tag = block.select_one('relative-time')

        data.append({
            'name': full_name,
            'url': f"https://github.com/{full_name}",
            'description': desc_tag.text.strip() if desc_tag else '',
            'language': lang_tag.text.strip() if lang_tag else '',
            'stars': _parse_count(stars_tag.text) if stars_tag else 0,
            'forks': _parse_count(forks_tag.text) if forks_tag else 0,
            'updated_at': updated_tag.get('datetime', '') if updated_tag else ''
        })

    return pd.DataFrame(data)


def _parse_count(text):
    """Convert a counter label such as '1,234' or '1.2k' to an int (0 if unparseable)."""
    cleaned = text.strip().lower().replace(',', '')
    try:
        if cleaned.endswith('k'):
            # round() rather than int(): float error must not truncate a unit
            return round(float(cleaned[:-1]) * 1000)
        return int(cleaned)
    except (ValueError, OverflowError):
        return 0

# Example usage: scrape the "stack" list and preview the first rows of the result
df_list = scrape_github_list("https://github.com/stars/Veatec22/lists/stack")
print(df_list.head())


                        name                                           url  \
0  scikit-learn/scikit-learn  https://github.com/scikit-learn/scikit-learn   
1          onlook-dev/onlook          https://github.com/onlook-dev/onlook   
2       python-poetry/poetry       https://github.com/python-poetry/poetry   
3      evidence-dev/evidence      https://github.com/evidence-dev/evidence   
4            7PH/powerglitch            https://github.com/7PH/powerglitch   

                                         description    language  stars  \
0           scikit-learn: machine learning in Python      Python  62514   
1  The Cursor for Designers • An Open-Source Visu...  TypeScript  20044   
2  Python packaging and dependency management mad...      Python  33392   
3  Business intelligence as code: build fast, int...  JavaScript   5338   
4      Tiny JS library to glitch anything on the web  TypeScript   1389   

   forks            updated_at  
0  26008  2025-07-02T17:30:25Z  
1   1296  2025

In [2]:
df_list

Unnamed: 0,name,url,description,language,stars,forks,updated_at
0,scikit-learn/scikit-learn,https://github.com/scikit-learn/scikit-learn,scikit-learn: machine learning in Python,Python,62514,26008,2025-07-02T17:30:25Z
1,onlook-dev/onlook,https://github.com/onlook-dev/onlook,The Cursor for Designers • An Open-Source Visu...,TypeScript,20044,1296,2025-07-02T20:09:53Z
2,python-poetry/poetry,https://github.com/python-poetry/poetry,Python packaging and dependency management mad...,Python,33392,2350,2025-06-30T20:04:16Z
3,evidence-dev/evidence,https://github.com/evidence-dev/evidence,"Business intelligence as code: build fast, int...",JavaScript,5338,268,2025-06-26T14:58:31Z
4,7PH/powerglitch,https://github.com/7PH/powerglitch,Tiny JS library to glitch anything on the web,TypeScript,1389,15,2025-01-16T14:31:58Z
5,lucide-icons/lucide,https://github.com/lucide-icons/lucide,Beautiful & consistent icon toolkit made by th...,TypeScript,17952,877,2025-06-30T19:59:20Z
6,optuna/optuna,https://github.com/optuna/optuna,A hyperparameter optimization framework,Python,12221,1128,2025-07-02T10:22:41Z


In [1]:
# GitHub starred lists configuration
# Each key names one starred list and is used as the tag for its repositories.

GITHUB_LISTS = {
    "stack": dict(
        url="https://github.com/stars/Veatec22/lists/stack",
        description="Core development stack and essential tools",
    ),
    "nice-to-have": dict(
        url="https://github.com/stars/Veatec22/lists/nice-to-have",
        description="Useful tools and libraries for future consideration",
    ),
    "future-ideas": dict(
        url="https://github.com/stars/Veatec22/lists/future-ideas",
        description="Innovative projects and experimental technologies",
    ),
}

# Tag names in declaration order, for easy iteration
TAG_NAMES = [*GITHUB_LISTS]

# Sheet tab that receives the combined lists data
LISTS_SHEET_TAB = "lists"

In [2]:
#!/usr/bin/env python3
"""
GitHub Starred Lists Fetcher with Tags
Fetches repositories from multiple GitHub starred lists and combines them with tags
"""

import os
import sys
import time
import json
from datetime import datetime
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import pandas as pd
import gspread
from gspread_dataframe import set_with_dataframe
from oauth2client.service_account import ServiceAccountCredentials
from dotenv import load_dotenv

# Lists configuration (GITHUB_LISTS, TAG_NAMES, LISTS_SHEET_TAB) is defined in the configuration cell above
# === CONFIGURATION ===
# load_dotenv() runs first so the lookups below see whatever it loads.
load_dotenv()
# Service-account credentials: either an inline JSON string or a path to a key file
GCP_CREDENTIALS = os.environ.get('GCP_CREDENTIALS')
# Title of the target spreadsheet
GOOGLE_SHEET_NAME = os.environ.get('GOOGLE_SHEET_NAME')
# Spreadsheet id (not referenced by the code visible in this cell)
GOOGLE_SHEET_ID = os.environ.get('GOOGLE_SHEET_ID')

def scrape_github_list(list_url, tag_name, timeout=30):
    """Scrape a single GitHub list and return repository data with tag.

    Best-effort: any failure (non-200 status, network error, parsing error)
    is printed and reported as an empty list, so the caller can carry on with
    the remaining lists.

    Args:
        list_url: URL of the starred-list page to fetch.
        tag_name: Tag stored in the 'tag' field of every returned record.
        timeout: Seconds to wait for GitHub before giving up, so a stalled
            connection cannot hang the whole sync.

    Returns:
        A list of dicts with the keys name, url, description, language,
        stars, forks, updated_at and tag; [] on failure.
    """
    print(f"🔍 Scraping list '{tag_name}': {list_url}")
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        response = requests.get(list_url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"❌ Failed to load page for {tag_name}: {response.status_code}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        repo_blocks = soup.select('div#user-list-repositories > div.border-bottom')

        data = []
        for block in repo_blocks:
            name_tag = block.select_one('h3 a')
            # Skip blocks without a usable repository link; a link lacking
            # href must not abort the whole list through the except below.
            href = name_tag.get('href') if name_tag else None
            if not href:
                continue

            full_name = href.strip('/')
            repo_url = f"https://github.com/{full_name}"

            desc_tag = block.select_one('[itemprop=description]')
            description = desc_tag.text.strip() if desc_tag else ''

            lang_tag = block.select_one('[itemprop=programmingLanguage]')
            language = lang_tag.text.strip() if lang_tag else ''

            stars_tag = block.select_one('a[href$="/stargazers"]')
            stars_text = stars_tag.get_text().strip().replace(',', '') if stars_tag else '0'
            
            forks_tag = block.select_one('a[href$="/forks"]')
            forks_text = forks_tag.get_text().strip().replace(',', '') if forks_tag else '0'

            updated_tag = block.select_one('relative-time')
            # .get() so an element without the attribute yields '' instead of KeyError
            updated = updated_tag.get('datetime', '') if updated_tag else ''

            # Parse stars and forks (handle 'k' suffix)
            stars = parse_number_with_suffix(stars_text)
            forks = parse_number_with_suffix(forks_text)

            data.append({
                'name': full_name,
                'url': repo_url,
                'description': description,
                'language': language,
                'stars': stars,
                'forks': forks,
                'updated_at': updated,
                'tag': tag_name
            })

        print(f"✅ Found {len(data)} repositories in '{tag_name}' list")
        return data
        
    except Exception as e:
        # Deliberately broad: one failing list must not stop the others
        print(f"❌ Error scraping {tag_name}: {str(e)}")
        return []

def parse_number_with_suffix(text):
    """Parse a counter label that may be abbreviated (e.g., '1.2k' -> 1200).

    Accepts plain integers ('15'), thousands separators ('1,234') and the
    suffixes 'k' (thousand) and 'm' (million) in either case, with optional
    surrounding whitespace.

    Args:
        text: The label to parse; None, empty or non-numeric input is tolerated.

    Returns:
        The count as an int, or 0 when the text is empty or cannot be parsed.
    """
    if not text:
        return 0

    cleaned = str(text).strip().lower().replace(',', '')

    multipliers = {'k': 1000, 'm': 1000000}
    multiplier = multipliers.get(cleaned[-1:], 1)

    try:
        if multiplier == 1:
            return int(cleaned)
        # round() instead of int(): 1.001 * 1000 is 1000.9999999999999 in
        # binary floating point and must not be truncated to 1000.
        return round(float(cleaned[:-1]) * multiplier)
    except (ValueError, OverflowError):
        # OverflowError covers inputs such as 'infk'
        return 0

def combine_repos_with_tags(all_repos_data):
    """Combine repositories from multiple lists and concatenate tags.

    Records are grouped by their 'name'. When a repository occurs in several
    lists, the record with the greatest 'updated_at' string is kept (the first
    one seen wins when either value is empty) and all of its tags are merged.

    Args:
        all_repos_data: Iterable of dicts carrying at least 'name' and 'tag'.
            The input dicts are not modified.

    Returns:
        pandas.DataFrame with one row per repository, sorted by 'stars'
        descending. The per-list 'tag' field is replaced by 'tags'
        (sorted, comma-separated), 'tags_count' and 'fetched_at'.
    """
    print("🔄 Combining repositories and merging tags...")

    # Taken once so every row of this batch carries the same timestamp
    fetched_at = datetime.now().isoformat()

    # full_name -> {'tags': set of tag names, 'data': record without 'tag'}
    merged = {}
    for repo_data in all_repos_data:
        repo_name = repo_data['name']
        tag = repo_data['tag']
        record = {key: value for key, value in repo_data.items() if key != 'tag'}

        entry = merged.get(repo_name)
        if entry is None:
            merged[repo_name] = {'tags': {tag}, 'data': record}
            continue

        entry['tags'].add(tag)

        # Prefer the more recently updated record when both carry a timestamp
        candidate = record.get('updated_at')
        current = entry['data'].get('updated_at')
        if candidate and current and candidate > current:
            entry['data'] = record

    combined_data = []
    for entry in merged.values():
        tags_list = sorted(entry['tags'])
        row = entry['data']
        row['tags'] = ', '.join(tags_list)
        row['tags_count'] = len(tags_list)
        row['fetched_at'] = fetched_at
        combined_data.append(row)

    print(f"✅ Combined {len(combined_data)} unique repositories with tags")

    # Sort by stars descending; a missing or None count sorts as 0
    combined_data.sort(key=lambda x: x.get('stars') or 0, reverse=True)

    return pd.DataFrame(combined_data)

def fetch_all_lists():
    """Scrape every list named in TAG_NAMES and merge the results.

    Returns:
        The DataFrame produced by combine_repos_with_tags, or an empty
        DataFrame when no list yielded any repository.
    """
    print("🚀 Starting to fetch all GitHub lists...")
    print(f"📋 Lists to process: {', '.join(TAG_NAMES)}")

    collected = []
    for tag in TAG_NAMES:
        collected += scrape_github_list(GITHUB_LISTS[tag]['url'], tag)
        # Pause after each request so GitHub is not hit in a burst
        time.sleep(1)

    print(f"📊 Total repositories fetched: {len(collected)}")

    # Nothing scraped at all: hand back an empty frame
    if not collected:
        print("⚠️ No repositories found in any list")
        return pd.DataFrame()

    return combine_repos_with_tags(collected)

def upload_to_google_sheet(df, sheet_name=GOOGLE_SHEET_NAME, tab_name=LISTS_SHEET_TAB):
    """Replace the contents of a Google Sheets tab with a DataFrame.

    The spreadsheet is opened by title and created when it is not found; the
    worksheet is created when missing. The tab is cleared before the new data
    is written, so its previous contents are not preserved.

    Args:
        df: DataFrame to upload.
        sheet_name: Spreadsheet title. Defaults to GOOGLE_SHEET_NAME.
        tab_name: Worksheet title. Defaults to LISTS_SHEET_TAB.

    Returns:
        The id of the spreadsheet that was written.

    Raises:
        ValueError: if GCP_CREDENTIALS or sheet_name is empty.
        Exception: any error from credential loading or the Sheets API is
            printed and re-raised unchanged.
    """
    print(f"📤 Uploading to Google Sheet: {sheet_name}, tab: {tab_name}")
    
    # Scope for Sheets + Drive
    scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
    
    try:
        # Validate configuration up front so the failure names the real cause
        if not GCP_CREDENTIALS:
            raise ValueError("GCP_CREDENTIALS environment variable is not set")
        if not sheet_name:
            raise ValueError("sheet_name is empty: set GOOGLE_SHEET_NAME or pass sheet_name")
        
        # Credentials are either an inline JSON string or a path to a key file.
        # Strip first so leading whitespace does not turn JSON into a "path".
        credentials_source = GCP_CREDENTIALS.strip()
        if credentials_source.startswith('{'):
            # JSON string
            creds_dict = json.loads(credentials_source)
        else:
            # File path
            with open(credentials_source, 'r', encoding='utf-8') as f:
                creds_dict = json.load(f)
        
        creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_dict, scope)
        client = gspread.authorize(creds)
        
        # Open (or create) spreadsheet
        try:
            sheet = client.open(sheet_name)
        except gspread.SpreadsheetNotFound:
            sheet = client.create(sheet_name)
            print(f"📝 Created new spreadsheet: {sheet_name}")

        # Try to get the worksheet, create if it doesn't exist
        try:
            worksheet = sheet.worksheet(tab_name)
        except gspread.WorksheetNotFound:
            worksheet = sheet.add_worksheet(title=tab_name, rows=1000, cols=20)
            print(f"📝 Created new worksheet: {tab_name}")

        # Clear the sheet and upload new data
        worksheet.clear()
        set_with_dataframe(worksheet, df)
        
        print(f"✅ Uploaded {len(df)} rows to Google Sheet: {sheet_name}/{tab_name}")
        print(f"🔗 Sheet URL: https://docs.google.com/spreadsheets/d/{sheet.id}")
        
        return sheet.id
        
    except Exception as e:
        # Log with the exception type, then let the caller decide what to do
        print(f"❌ Error uploading to Google Sheet: {type(e).__name__}: {e}")
        raise

def main():
    """Run the whole sync: scrape the lists, upload the table, print a summary.

    Returns early (without uploading) when nothing was scraped. Any exception
    is printed and the process exits with status 1.
    """
    print("🚀 Starting GitHub starred lists sync with tags...")
    print(f"⏰ Started at: {datetime.now().isoformat()}")
    
    try:
        repos_df = fetch_all_lists()
        
        # Nothing to upload: stop before touching the sheet
        if repos_df.empty:
            print("⚠️ No repositories found in any list")
            return
        
        sheet_id = upload_to_google_sheet(repos_df)
        
        print(f"\n📈 Summary:")
        print(f"   • Total unique repositories: {len(repos_df)}")
        print(f"   • Lists processed: {', '.join(TAG_NAMES)}")
        
        # Count how many repositories carry each tag
        tag_counts = defaultdict(int)
        for tags in repos_df['tags']:
            for tag in tags.split(', '):
                tag_counts[tag] += 1
        
        print(f"   • Tag distribution:")
        for tag in sorted(tag_counts):
            print(f"     - {tag}: {tag_counts[tag]} repos")
        
        print(f"\n🎉 Successfully synced GitHub lists!")
        print(f"📊 Data available at: https://docs.google.com/spreadsheets/d/{sheet_id}")
        
    except Exception as err:
        print(f"❌ Error: {str(err)}")
        sys.exit(1)
    
    print(f"✅ Completed at: {datetime.now().isoformat()}")

# Entry point: run the sync only when this code executes as the main module, not when it is imported
if __name__ == '__main__':
    main()

🚀 Starting GitHub starred lists sync with tags...
⏰ Started at: 2025-07-03T18:37:28.370291
🚀 Starting to fetch all GitHub lists...
📋 Lists to process: stack, nice-to-have, future-ideas
🔍 Scraping list 'stack': https://github.com/stars/Veatec22/lists/stack
✅ Found 14 repositories in 'stack' list
🔍 Scraping list 'nice-to-have': https://github.com/stars/Veatec22/lists/nice-to-have
✅ Found 1 repositories in 'nice-to-have' list
🔍 Scraping list 'future-ideas': https://github.com/stars/Veatec22/lists/future-ideas
✅ Found 17 repositories in 'future-ideas' list
📊 Total repositories fetched: 32
🔄 Combining repositories and merging tags...
✅ Combined 32 unique repositories with tags
📤 Uploading to Google Sheet: github_data, tab: lists
📝 Created new worksheet: lists
✅ Uploaded 32 rows to Google Sheet: github_data/lists
🔗 Sheet URL: https://docs.google.com/spreadsheets/d/1sC--EoeGVjOfcKjeI5U9jL55nN0LhZXD4sepUTqCE20

📈 Summary:
   • Total unique repositories: 32
   • Lists processed: stack, nice-to-