In [1]:
# GitHub Lists Configuration
# This file contains the dictionary of GitHub starred lists to be used as tags

GITHUB_LISTS = {
    "stack": {
        "url": "https://github.com/stars/Veatec22/lists/stack",
        "description": "Core development stack and essential tools"
    },
    "nice-to-have": {
        "url": "https://github.com/stars/Veatec22/lists/nice-to-have", 
        "description": "Useful tools and libraries for future consideration"
    },
    "future-ideas": {
        "url": "https://github.com/stars/Veatec22/lists/future-ideas",
        "description": "Innovative projects and experimental technologies"
    },
    "ignore": {
        "url": "https://github.com/stars/Veatec22/lists/ignore",
        "description": "Repositories to ignore"
    }
    
}

# List of tag names for easy iteration
TAG_NAMES = list(GITHUB_LISTS.keys())

# Default sheet tab name for the unified starred data
STARRED_SHEET_TAB = "starred"

In [2]:
#!/usr/bin/env python3
"""
Test script to verify MotherDuck connection
"""

import os
import duckdb
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

MOTHERDUCK_TOKEN = os.getenv('MOTHERDUCK_TOKEN')
MOTHERDUCK_DB = os.getenv('MOTHERDUCK_DB', 'github')

def test_motherduck_connection():
    """Test the MotherDuck connection"""
    print("üß™ Testing MotherDuck connection...")
    
    try:
        # Try to connect to MotherDuck
        if MOTHERDUCK_TOKEN:
            connection_string = f'md:{MOTHERDUCK_DB}?motherduck_token={MOTHERDUCK_TOKEN}'
            print(f"üîê Using token authentication for database: {MOTHERDUCK_DB}")
        else:
            connection_string = f'md:{MOTHERDUCK_DB}'
            print(f"üåê Using browser authentication for database: {MOTHERDUCK_DB}")
        
        conn = duckdb.connect(connection_string)
        print("‚úÖ Successfully connected to MotherDuck!")
        
        # Test basic functionality
        result = conn.execute("SELECT 1 as test_value").fetchone()
        print(f"‚úÖ Basic query test: {result}")
        
        # Show databases
        print("\nüìä Available databases:")
        databases = conn.execute("SHOW DATABASES").fetchall()
        for db in databases:
            print(f"  - {db[0]}")
        
        # Check if our tables exist
        print(f"\nüìã Tables in {MOTHERDUCK_DB} database:")
        try:
            tables = conn.execute("SHOW TABLES").fetchall()
            if tables:
                for table in tables:
                    print(f"  - {table[0]}")
            else:
                print("  No tables found - run the sync scripts to populate data")
        except Exception as e:
            print(f"  Could not list tables: {e}")
        
        conn.close()
        print("\nüéâ MotherDuck connection test completed successfully!")
        return True
        
    except Exception as e:
        print(f"‚ùå Error testing MotherDuck connection: {e}")
        return False

if __name__ == '__main__':
    success = test_motherduck_connection()
    if not success:
        exit(1)

üß™ Testing MotherDuck connection...
üåê Using browser authentication for database: github
‚úÖ Successfully connected to MotherDuck!
‚úÖ Basic query test: (1,)

üìä Available databases:
  - github
  - md_information_schema

üìã Tables in github database:
  No tables found - run the sync scripts to populate data

üéâ MotherDuck connection test completed successfully!


In [3]:
#!/usr/bin/env python3
"""
Unified Starred GitHub Repositories Fetcher
Fetches starred repositories from GitHub API and merges with curated list tags
Creates a comprehensive portfolio view combining detailed repo data with tag organization
Now uses MotherDuck (cloud DuckDB) for data storage
"""

import os
import sys
import time
import duckdb
from datetime import datetime
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import pandas as pd
from dotenv import load_dotenv

# === CONFIGURATION ===
load_dotenv()
GHUB_TOKEN = os.getenv('GHUB_TOKEN')
MOTHERDUCK_TOKEN = os.getenv('MOTHERDUCK_TOKEN')
MOTHERDUCK_DB = os.getenv('MOTHERDUCK_DB', 'github')  # Default database name

# GitHub API endpoints
API_STARRED_URL = 'https://api.github.com/user/starred'
API_RELEASES_URL = 'https://api.github.com/repos/{owner}/{repo}/releases/latest'
API_TOPICS_URL = 'https://api.github.com/repos/{owner}/{repo}/topics'

auth_headers = {
    'Authorization': f'token {GHUB_TOKEN}',
    'Accept': 'application/vnd.github.v3+json'
}

topics_headers = {
    'Authorization': f'token {GHUB_TOKEN}',
    'Accept': 'application/vnd.github.mercy-preview+json'  # Needed to access topics
}

def get_motherduck_connection():
    """Get connection to MotherDuck"""
    try:
        if MOTHERDUCK_TOKEN:
            connection_string = f'md:{MOTHERDUCK_DB}?motherduck_token={MOTHERDUCK_TOKEN}'
        else:
            # Use browser-based authentication
            connection_string = f'md:{MOTHERDUCK_DB}'
        
        conn = duckdb.connect(connection_string)
        print(f"‚úÖ Connected to MotherDuck database: {MOTHERDUCK_DB}")
        return conn
    except Exception as e:
        print(f"‚ùå Error connecting to MotherDuck: {e}")
        raise

def get_starred_repos():
    """Fetch all starred repositories from GitHub API"""
    print("üîç Fetching starred repositories...")
    starred = []
    page = 1

    while True:
        response = requests.get(
            API_STARRED_URL, 
            headers=auth_headers, 
            params={'per_page': 100, 'page': page}
        )
        
        if response.status_code != 200:
            print(f"Error fetching starred repos: {response.status_code} - {response.text}")
            break

        data = response.json()
        if not data:
            break

        starred.extend(data)
        page += 1
        print(f"üì¶ Fetched page {page-1} ({len(data)} repos)")

    print(f"‚úÖ Total starred repositories: {len(starred)}")
    return starred

def get_last_release_date(owner, repo):
    """Get the last release date for a repository"""
    url = API_RELEASES_URL.format(owner=owner, repo=repo)
    response = requests.get(url, headers=auth_headers)

    if response.status_code == 200:
        return response.json().get("published_at")
    elif response.status_code == 404:
        return "No releases"
    else:
        return f"Error: {response.status_code}"

def get_repo_topics(owner, repo):
    """Get topics for a repository"""
    url = API_TOPICS_URL.format(owner=owner, repo=repo)
    response = requests.get(url, headers=topics_headers)

    if response.status_code == 200:
        return response.json().get('names', [])
    else:
        return []

def scrape_github_list(list_url, tag_name):
    """Scrape a single GitHub list and return repository names with tag"""
    print(f"üîç Scraping list '{tag_name}': {list_url}")
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        response = requests.get(list_url, headers=headers)
        if response.status_code != 200:
            print(f"‚ùå Failed to load page for {tag_name}: {response.status_code}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        repo_blocks = soup.select('div#user-list-repositories > div.border-bottom')

        repo_names = []
        for block in repo_blocks:
            name_tag = block.select_one('h3 a')
            if not name_tag:
                continue
                
            full_name = name_tag['href'].strip('/')
            repo_names.append(full_name)

        print(f"‚úÖ Found {len(repo_names)} repositories in '{tag_name}' list")
        return repo_names
        
    except Exception as e:
        print(f"‚ùå Error scraping {tag_name}: {str(e)}")
        return []

def get_curated_tags():
    """Fetch all curated lists and create a mapping of repo names to tags"""
    print("üè∑Ô∏è Fetching curated list tags...")
    repo_tags = defaultdict(set)
    ignore_repos = set()
    
    for tag_name in TAG_NAMES:
        list_config = GITHUB_LISTS[tag_name]
        list_url = list_config['url']
        
        repo_names = scrape_github_list(list_url, tag_name)
        
        if tag_name == 'ignore':
            # Special handling for ignore list
            ignore_repos.update(repo_names)
            print(f"üö´ Added {len(repo_names)} repositories to ignore list")
        else:
            for repo_name in repo_names:
                repo_tags[repo_name].add(tag_name)
        
        # Be nice to GitHub
        time.sleep(1)
    
    print(f"‚úÖ Collected tags for {len(repo_tags)} repositories")
    print(f"üö´ Ignoring {len(ignore_repos)} repositories")
    return repo_tags, ignore_repos

def process_repositories(repos, repo_tags, ignore_repos):
    """Process repositories and gather additional data, merging with curated tags"""
    print("üîÑ Processing repositories and gathering additional data...")
    data = []

    for i, repo in enumerate(repos):
        full_name = repo['full_name']
        
        # Skip ignored repositories
        if full_name in ignore_repos:
            print(f"üö´ Skipping ignored repository: {full_name}")
            continue
            
        owner, repo_name = full_name.split('/')
        
        print(f"üìä Processing {full_name} ({i+1}/{len(repos)})")
        
        # Get additional data
        last_release = get_last_release_date(owner, repo_name)
        topics = get_repo_topics(owner, repo_name)
        
        # Get curated tags for this repo
        curated_tags = list(repo_tags.get(full_name, set()))
        
        # Combine GitHub topics and curated tags
        all_tags = topics + curated_tags
        
        data.append({
            'name': full_name,
            'description': repo.get('description', ''),
            'stars': repo['stargazers_count'],
            'forks': repo['forks_count'],
            'language': repo.get('language', 'Unknown'),
            'url': repo['html_url'],
            'last_release': last_release,
            'topics': ", ".join(topics),
            'curated_tags': ", ".join(sorted(curated_tags)),
            'all_tags': ", ".join(sorted(all_tags)),
            'tags_count': len(all_tags),
            'is_curated': len(curated_tags) > 0,
            'created_at': repo['created_at'],
            'updated_at': repo['updated_at'],
            'pushed_at': repo.get('pushed_at', ''),
            'open_issues': repo.get('open_issues_count', 0),
            'archived': repo.get('archived', False),
            'fork': repo.get('fork', False),
            'fetched_at': datetime.now().isoformat()
        })

        time.sleep(0.1)

    print(f"‚úÖ Processed {len(data)} repositories (filtered out {len([r for r in repos if r['full_name'] in ignore_repos])} ignored)")
    return pd.DataFrame(data)

def upload_to_motherduck(df, table_name="starred"):
    """Upload DataFrame to MotherDuck"""
    print(f"üì§ Uploading to MotherDuck: {table_name}")
    
    try:
        conn = get_motherduck_connection()
        
        # Create table if it doesn't exist and insert data
        conn.execute(f"DROP TABLE IF EXISTS {table_name}")
        conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM df")
        
        # Verify upload
        result = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()
        row_count = result[0]
        
        print(f"‚úÖ Uploaded {row_count} rows to MotherDuck table: {table_name}")
        
        conn.close()
        return True
        
    except Exception as e:
        print(f"‚ùå Error uploading to MotherDuck: {type(e).__name__}: {e}")
        raise

def main():
    """Main execution function"""
    print("üöÄ Starting unified starred repositories sync with MotherDuck...")
    print(f"‚è∞ Started at: {datetime.now().isoformat()}")
    
    try:
        # Fetch starred repositories
        repos = get_starred_repos()
        
        if not repos:
            print("‚ö†Ô∏è No starred repositories found")
            return
        
        # Get curated tags from lists and ignore list
        repo_tags, ignore_repos = get_curated_tags()
        
        # Process repositories with merged data
        df = process_repositories(repos, repo_tags, ignore_repos)
        
        # Upload to MotherDuck
        upload_to_motherduck(df)
        
        # Print summary
        curated_count = len(df[df['is_curated'] == True])
        languages_count = len(df['language'].unique())
        
        print(f"\nüìà Portfolio Summary:")
        print(f"   ‚Ä¢ Total starred repositories: {len(df)}")
        print(f"   ‚Ä¢ Curated repositories: {curated_count}")
        print(f"   ‚Ä¢ Programming languages: {languages_count}")
        print(f"   ‚Ä¢ Total stars accumulated: {df['stars'].sum():,}")
        print(f"   ‚Ä¢ Ignored repositories: {len(ignore_repos)}")
        
        # Show curated tag distribution
        if curated_count > 0:
            print(f"   ‚Ä¢ Curated tag distribution:")
            for tag in [t for t in TAG_NAMES if t != 'ignore']:
                tag_count = len(df[df['curated_tags'].str.contains(tag, na=False)])
                if tag_count > 0:
                    print(f"     - {tag}: {tag_count} repos")
        
        print(f"\nüéâ Successfully synced unified starred repositories to MotherDuck!")
        
    except Exception as e:
        print(f"‚ùå Error: {str(e)}")
        sys.exit(1)
    
    print(f"‚úÖ Completed at: {datetime.now().isoformat()}")

if __name__ == '__main__':
    main()

üöÄ Starting unified starred repositories sync with MotherDuck...
‚è∞ Started at: 2025-07-04T13:49:48.864583
üîç Fetching starred repositories...
üì¶ Fetched page 1 (63 repos)
‚úÖ Total starred repositories: 63
üè∑Ô∏è Fetching curated list tags...
üîç Scraping list 'stack': https://github.com/stars/Veatec22/lists/stack
‚úÖ Found 22 repositories in 'stack' list
üîç Scraping list 'nice-to-have': https://github.com/stars/Veatec22/lists/nice-to-have
‚úÖ Found 4 repositories in 'nice-to-have' list
üîç Scraping list 'future-ideas': https://github.com/stars/Veatec22/lists/future-ideas
‚úÖ Found 27 repositories in 'future-ideas' list
üîç Scraping list 'ignore': https://github.com/stars/Veatec22/lists/ignore
‚úÖ Found 3 repositories in 'ignore' list
üö´ Added 3 repositories to ignore list
‚úÖ Collected tags for 53 repositories
üö´ Ignoring 3 repositories
üîÑ Processing repositories and gathering additional data...
üìä Processing fastai/fastai (1/63)
üö´ Skipping ignored repository:

In [6]:
'''
GitHub Repository Recommender
Recommends GitHub repositories based on user's starred repositories topics.
Optimized for speed and relevance.
Now uses MotherDuck (cloud DuckDB) for data storage.
'''

import os
import sys
import time
import duckdb
import requests
import pandas as pd
from dotenv import load_dotenv
from collections import Counter
from tqdm import tqdm

# Import our lists configuration for ignore list
# === CONFIGURATION ===
load_dotenv()
GHUB_TOKEN = os.getenv('GHUB_TOKEN')
MOTHERDUCK_TOKEN = os.getenv('MOTHERDUCK_TOKEN')
MOTHERDUCK_DB = os.getenv('MOTHERDUCK_DB', 'github')  # Default database name

auth_headers = {
    'Authorization': f'token {GHUB_TOKEN}',
    'Accept': 'application/vnd.github.v3+json'
}

def get_motherduck_connection():
    """Get connection to MotherDuck"""
    try:
        if MOTHERDUCK_TOKEN:
            connection_string = f'md:{MOTHERDUCK_DB}?motherduck_token={MOTHERDUCK_TOKEN}'
        else:
            # Use browser-based authentication
            connection_string = f'md:{MOTHERDUCK_DB}'
        
        conn = duckdb.connect(connection_string)
        print(f"‚úÖ Connected to MotherDuck database: {MOTHERDUCK_DB}")
        return conn
    except Exception as e:
        print(f"‚ùå Error connecting to MotherDuck: {e}")
        raise

def get_starred_repos_from_motherduck():
    """Read starred repositories from MotherDuck"""
    print(f"üìä Reading starred repositories from MotherDuck")
    
    try:
        conn = get_motherduck_connection()
        
        # Get starred repositories data
        query = "SELECT * FROM starred"
        df = conn.execute(query).df()
        
        conn.close()
        
        print(f"‚úÖ Read {len(df)} rows from starred table")
        return df
        
    except Exception as e:
        print(f"‚ùå Error reading from MotherDuck: {type(e).__name__}: {e}")
        raise

def get_ignore_repos():
    """Get the list of repositories to ignore from the ignore list"""
    print("üö´ Getting ignore list...")
    
    try:
        # Import the scraper function from starred_fetcher
        
        ignore_config = GITHUB_LISTS.get('ignore')
        if not ignore_config:
            print("‚ö†Ô∏è No ignore list found in configuration")
            return set()
        
        ignore_repos = scrape_github_list(ignore_config['url'], 'ignore')
        ignore_set = set(ignore_repos)
        
        print(f"üö´ Found {len(ignore_set)} repositories to ignore")
        return ignore_set
        
    except Exception as e:
        print(f"‚ùå Error getting ignore list: {e}")
        return set()

def extract_topic_frequencies(df_starred):
    """Extract and count topic frequencies from starred repositories"""
    print("üîç Analyzing topics from starred repositories...")
    
    all_topics = []
    starred_repo_full_names = set()
    
    for _, row in df_starred.iterrows():
        # Collect starred repo full names for filtering (owner/repo format)
        if pd.notna(row.get('name')):
            starred_repo_full_names.add(row['name'])
        
        # Extract topics
        topics_str = row.get('topics', '')
        if pd.notna(topics_str) and topics_str.strip():
            topics = [t.strip().lower() for t in topics_str.split(',') if t.strip()]
            all_topics.extend(topics)
    
    # Count topic frequencies
    topic_counter = Counter(all_topics)
    
    print(f"üìà Found {len(topic_counter)} unique topics from {len(df_starred)} starred repos")
    print(f"üîç Tracking {len(starred_repo_full_names)} starred repositories for filtering")
    for topic, count in topic_counter.most_common(10):
        print(f"  ‚Ä¢ {topic}: {count} repos")
    
    return topic_counter, starred_repo_full_names

def search_repositories_by_topic(topic, min_stars=1000, max_results=50):
    """Search GitHub repositories by a specific topic"""
    print(f"üîé Searching for topic: '{topic}' (min {min_stars} stars)")
    
    repositories = []
    per_page = 100
    max_pages = (max_results // per_page) + 1
    
    for page in range(1, max_pages + 1):
        query = f"topic:{topic} stars:>={min_stars}"
        params = {
            'q': query,
            'sort': 'stars',
            'order': 'desc',
            'per_page': per_page,
            'page': page
        }
        
        try:
            response = requests.get('https://api.github.com/search/repositories', 
                                  headers=auth_headers, params=params)
            
            if response.status_code != 200:
                print(f"‚ö†Ô∏è API error for topic '{topic}': {response.status_code}")
                break
            
            data = response.json()
            items = data.get('items', [])
            
            if not items:
                break
                
            repositories.extend(items)
            print(f"  üì¶ Page {page}: {len(items)} repos (total: {len(repositories)})")
            
            # Stop if we have enough results
            if len(repositories) >= max_results:
                repositories = repositories[:max_results]
                break
                
            # Rate limiting
            time.sleep(0.1)
            
        except Exception as e:
            print(f"‚ùå Error searching for topic '{topic}': {e}")
            break
    
    print(f"‚úÖ Found {len(repositories)} repositories for topic '{topic}'")
    return repositories

def get_recommendations(topic_counter, starred_repo_full_names, ignore_repos, min_stars=1000, max_per_topic=50):
    """Get repository recommendations based on topic frequencies"""
    print("üéØ Generating recommendations based on topic analysis...")
    
    all_recommendations = {}
    filtered_count = 0
    ignored_count = 0
    
    # Process topics in order of frequency (most common first)
    for topic, frequency in tqdm(topic_counter.most_common(), desc="Processing topics"):
        # print(f"\n--- Processing topic: '{topic}' (appears in {frequency} starred repos) ---")
        
        repos = search_repositories_by_topic(topic, min_stars, max_per_topic)
        
        for repo in repos:
            repo_id = repo['id']
            repo_full_name = repo['full_name']  # This is "owner/repo" format
            
            # Skip if already starred (check against full_name)
            if repo_full_name in starred_repo_full_names:
                filtered_count += 1
                print(f"  üîÑ Skipping already starred: {repo_full_name}")
                continue
            
            # Skip if in ignore list
            if repo_full_name in ignore_repos:
                ignored_count += 1
                print(f"  üö´ Skipping ignored repository: {repo_full_name}")
                continue
            
            # If we haven't seen this repo before, add it
            if repo_id not in all_recommendations:
                all_recommendations[repo_id] = {
                    'repo_data': repo,
                    'topic_matches': [],
                    'total_frequency': 0
                }
            
            # Add this topic match
            all_recommendations[repo_id]['topic_matches'].append(topic)
            all_recommendations[repo_id]['total_frequency'] += frequency
    
    print(f"\n‚úÖ Found {len(all_recommendations)} unique recommendations")
    print(f"üîÑ Filtered out {filtered_count} already-starred repositories")
    print(f"üö´ Filtered out {ignored_count} ignored repositories")
    return all_recommendations

def format_recommendations(recommendations_dict):
    """Format recommendations into a DataFrame sorted by relevance and stars"""
    print("üìä Formatting and ranking recommendations...")
    
    formatted_recommendations = []
    
    for repo_id, data in recommendations_dict.items():
        repo = data['repo_data']
        topic_matches = data['topic_matches']
        total_frequency = data['total_frequency']
        
        formatted_recommendations.append({
            'name': repo['full_name'],
            'description': repo.get('description', ''),
            'stars': repo['stargazers_count'],
            'forks': repo['forks_count'],
            'language': repo.get('language', 'Unknown'),
            'url': repo['html_url'],
            'topics': ', '.join(repo.get('topics', [])),
            'matched_topics': ', '.join(topic_matches),
            'topic_frequency_score': total_frequency,
            'num_topic_matches': len(topic_matches)
        })
    
    df = pd.DataFrame(formatted_recommendations)
    
    # Sort by topic frequency score (descending) and then by stars (descending)
    df = df.sort_values(['topic_frequency_score', 'stars'], ascending=[False, False])
    
    print(f"üèÜ Top 10 recommendations:")
    for i, row in df.head(10).iterrows():
        print(f"  {i+1}. {row['name']} ({row['stars']:,} ‚≠ê) - Score: {row['topic_frequency_score']}")
        print(f"     Matched topics: {row['matched_topics']}")
    
    return df

def upload_recommendations_to_motherduck(df, table_name='recommendations'):
    """Upload DataFrame to MotherDuck"""
    print(f"üì§ Uploading recommendations to MotherDuck: {table_name}")
    
    try:
        conn = get_motherduck_connection()
        
        # Create table if it doesn't exist and insert data
        conn.execute(f"DROP TABLE IF EXISTS {table_name}")
        conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM df")
        
        # Verify upload
        result = conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()
        row_count = result[0]
        
        print(f"‚úÖ Uploaded {row_count} rows to MotherDuck table: {table_name}")
        
        conn.close()
        return True
        
    except Exception as e:
        print(f"‚ùå Error uploading to MotherDuck: {type(e).__name__}: {e}")
        raise

def main():
    """Main execution function"""
    print("üöÄ Starting optimized repository recommendation sync with MotherDuck...")
    
    try:
        # Read starred repositories from MotherDuck
        df_starred = get_starred_repos_from_motherduck()
        
        if df_starred.empty:
            print("‚ö†Ô∏è No starred repositories found in MotherDuck. Cannot generate recommendations.")
            return
        
        # Get ignore list
        ignore_repos = get_ignore_repos()
        
        # Extract topic frequencies and starred repo names
        topic_counter, starred_repo_full_names = extract_topic_frequencies(df_starred)
        
        if not topic_counter:
            print("‚ö†Ô∏è No topics found in starred repositories. Cannot generate recommendations.")
            return
        
        # Get recommendations based on topics
        recommendations_dict = get_recommendations(topic_counter, starred_repo_full_names, ignore_repos)
        
        if not recommendations_dict:
            print("‚ö†Ô∏è No recommendations found.")
            return
        
        # Format and sort recommendations
        df_recommendations = format_recommendations(recommendations_dict)
        
        # Upload recommendations to MotherDuck
        upload_recommendations_to_motherduck(df_recommendations)
        
        print("üéâ Successfully generated and uploaded repository recommendations!")
        print(f"üìà Generated {len(df_recommendations)} recommendations based on {len(topic_counter)} topics")
        print(f"üö´ Ignored {len(ignore_repos)} repositories from ignore list")
        
    except Exception as e:
        print(f"‚ùå Error: {str(e)}")
        sys.exit(1)

if __name__ == '__main__':
    main() 

üöÄ Starting optimized repository recommendation sync with MotherDuck...
üìä Reading starred repositories from MotherDuck
‚úÖ Connected to MotherDuck database: github
‚úÖ Read 60 rows from starred table
üö´ Getting ignore list...
üîç Scraping list 'ignore': https://github.com/stars/Veatec22/lists/ignore
‚úÖ Found 3 repositories in 'ignore' list
üö´ Found 3 repositories to ignore
üîç Analyzing topics from starred repositories...
üìà Found 391 unique topics from 60 starred repos
üîç Tracking 60 starred repositories for filtering
  ‚Ä¢ python: 35 repos
  ‚Ä¢ machine-learning: 25 repos
  ‚Ä¢ data-science: 20 repos
  ‚Ä¢ deep-learning: 14 repos
  ‚Ä¢ data-analysis: 12 repos
  ‚Ä¢ hacktoberfest: 11 repos
  ‚Ä¢ data-visualization: 9 repos
  ‚Ä¢ llm: 7 repos
  ‚Ä¢ ai: 7 repos
  ‚Ä¢ analytics: 7 repos
üéØ Generating recommendations based on topic analysis...


Processing topics:   0%|          | 0/391 [00:00<?, ?it/s]

üîé Searching for topic: 'python' (min 1000 stars)


Processing topics:   0%|          | 1/391 [00:03<22:49,  3.51s/it]

  üì¶ Page 1: 100 repos (total: 100)
‚úÖ Found 50 repositories for topic 'python'
  üîÑ Skipping already starred: tensorflow/tensorflow
  üîÑ Skipping already starred: pytorch/pytorch
  üö´ Skipping ignored repository: home-assistant/core
  üîÑ Skipping already starred: microsoft/ML-For-Beginners
  üîÑ Skipping already starred: apache/superset
  üö´ Skipping ignored repository: sherlock-project/sherlock
  üîÑ Skipping already starred: keras-team/keras
  üîÑ Skipping already starred: scikit-learn/scikit-learn
  üîÑ Skipping already starred: scrapy/scrapy
  üîÑ Skipping already starred: Avik-Jain/100-Days-Of-ML-Code
  üîÑ Skipping already starred: pandas-dev/pandas
  üîÑ Skipping already starred: aymericdamien/TensorFlow-Examples
üîé Searching for topic: 'machine-learning' (min 1000 stars)


Processing topics:   1%|          | 2/391 [00:06<20:47,  3.21s/it]

  üì¶ Page 1: 100 repos (total: 100)
‚úÖ Found 50 repositories for topic 'machine-learning'
  üîÑ Skipping already starred: tensorflow/tensorflow
  üîÑ Skipping already starred: pytorch/pytorch
  üîÑ Skipping already starred: microsoft/ML-For-Beginners
  üîÑ Skipping already starred: keras-team/keras
  üîÑ Skipping already starred: scikit-learn/scikit-learn
  üîÑ Skipping already starred: mlabonne/llm-course
  üîÑ Skipping already starred: Avik-Jain/100-Days-Of-ML-Code
  üîÑ Skipping already starred: aymericdamien/TensorFlow-Examples
  üîÑ Skipping already starred: streamlit/streamlit
  üîÑ Skipping already starred: gradio-app/gradio
  üîÑ Skipping already starred: ray-project/ray
  üîÑ Skipping already starred: AMAI-GmbH/AI-Expert-Roadmap
  üîÑ Skipping already starred: fastai/fastai
üîé Searching for topic: 'data-science' (min 1000 stars)


Processing topics:   1%|          | 3/391 [00:10<22:14,  3.44s/it]

  üì¶ Page 1: 100 repos (total: 100)
‚úÖ Found 50 repositories for topic 'data-science'
  üîÑ Skipping already starred: microsoft/ML-For-Beginners
  üîÑ Skipping already starred: apache/superset
  üîÑ Skipping already starred: keras-team/keras
  üîÑ Skipping already starred: scikit-learn/scikit-learn
  üîÑ Skipping already starred: pandas-dev/pandas
  üîÑ Skipping already starred: streamlit/streamlit
  üîÑ Skipping already starred: gradio-app/gradio
  üîÑ Skipping already starred: ray-project/ray
  üîÑ Skipping already starred: AMAI-GmbH/AI-Expert-Roadmap
  üîÑ Skipping already starred: ml-tooling/best-of-ml-python
  üîÑ Skipping already starred: sinaptik-ai/pandas-ai
  üîÑ Skipping already starred: ydataai/ydata-profiling
üîé Searching for topic: 'deep-learning' (min 1000 stars)


Processing topics:   1%|          | 4/391 [00:12<19:56,  3.09s/it]

  üì¶ Page 1: 100 repos (total: 100)
‚úÖ Found 50 repositories for topic 'deep-learning'
  üîÑ Skipping already starred: tensorflow/tensorflow
  üîÑ Skipping already starred: pytorch/pytorch
  üîÑ Skipping already starred: keras-team/keras
  üîÑ Skipping already starred: Avik-Jain/100-Days-Of-ML-Code
  üîÑ Skipping already starred: aymericdamien/TensorFlow-Examples
  üîÑ Skipping already starred: streamlit/streamlit
  üîÑ Skipping already starred: gradio-app/gradio
  üîÑ Skipping already starred: ray-project/ray
  üîÑ Skipping already starred: AMAI-GmbH/AI-Expert-Roadmap
  üîÑ Skipping already starred: fastai/fastai
üîé Searching for topic: 'data-analysis' (min 1000 stars)


Processing topics:   1%|‚ñè         | 5/391 [00:16<20:47,  3.23s/it]

  üì¶ Page 1: 100 repos (total: 100)
‚úÖ Found 50 repositories for topic 'data-analysis'
  üîÑ Skipping already starred: apache/superset
  üîÑ Skipping already starred: scikit-learn/scikit-learn
  üîÑ Skipping already starred: pandas-dev/pandas
  üîÑ Skipping already starred: metabase/metabase
  üîÑ Skipping already starred: streamlit/streamlit
  üîÑ Skipping already starred: gradio-app/gradio
  üîÑ Skipping already starred: AMAI-GmbH/AI-Expert-Roadmap
  üîÑ Skipping already starred: ml-tooling/best-of-ml-python
  üîÑ Skipping already starred: sinaptik-ai/pandas-ai
  üö´ Skipping ignored repository: dataease/dataease
  üîÑ Skipping already starred: Kanaries/pygwalker
  üîÑ Skipping already starred: ydataai/ydata-profiling
üîé Searching for topic: 'hacktoberfest' (min 1000 stars)


Processing topics:   2%|‚ñè         | 6/391 [00:19<20:48,  3.24s/it]

  üì¶ Page 1: 100 repos (total: 100)
‚úÖ Found 50 repositories for topic 'hacktoberfest'
  üö´ Skipping ignored repository: home-assistant/core
  üîÑ Skipping already starred: sdmg15/Best-websites-a-programmer-should-visit
  üö´ Skipping ignored repository: sherlock-project/sherlock
  üîÑ Skipping already starred: scrapy/scrapy
üîé Searching for topic: 'data-visualization' (min 1000 stars)


Processing topics:   2%|‚ñè         | 7/391 [00:23<21:25,  3.35s/it]

  üì¶ Page 1: 100 repos (total: 100)
‚úÖ Found 50 repositories for topic 'data-visualization'
  üîÑ Skipping already starred: apache/superset
  üîÑ Skipping already starred: metabase/metabase
  üîÑ Skipping already starred: streamlit/streamlit
  üîÑ Skipping already starred: gradio-app/gradio
  üîÑ Skipping already starred: ml-tooling/best-of-ml-python
  üîÑ Skipping already starred: sinaptik-ai/pandas-ai
  üö´ Skipping ignored repository: dataease/dataease
  üîÑ Skipping already starred: Avaiga/taipy
üîé Searching for topic: 'llm' (min 1000 stars)


Processing topics:   2%|‚ñè         | 8/391 [00:26<20:41,  3.24s/it]

  üì¶ Page 1: 100 repos (total: 100)
‚úÖ Found 50 repositories for topic 'llm'
  üîÑ Skipping already starred: mlabonne/llm-course
  üîÑ Skipping already starred: mendableai/firecrawl
  üîÑ Skipping already starred: ray-project/ray
  üîÑ Skipping already starred: gitleaks/gitleaks
  üîÑ Skipping already starred: sinaptik-ai/pandas-ai
üîé Searching for topic: 'ai' (min 1000 stars)


Processing topics:   2%|‚ñè         | 9/391 [00:29<21:33,  3.39s/it]

  üì¶ Page 1: 100 repos (total: 100)
‚úÖ Found 50 repositories for topic 'ai'
  üîÑ Skipping already starred: supabase/supabase
  üîÑ Skipping already starred: mendableai/firecrawl
  üîÑ Skipping already starred: AMAI-GmbH/AI-Expert-Roadmap
üîé Searching for topic: 'analytics' (min 1000 stars)


Processing topics:   3%|‚ñé         | 10/391 [00:32<20:59,  3.31s/it]

  üì¶ Page 1: 100 repos (total: 100)
‚úÖ Found 50 repositories for topic 'analytics'
  üîÑ Skipping already starred: apache/superset
  üîÑ Skipping already starred: metabase/metabase
  üîÑ Skipping already starred: duckdb/duckdb
  üîÑ Skipping already starred: getredash/redash
  üîÑ Skipping already starred: dbt-labs/dbt-core
  üîÑ Skipping already starred: modin-project/modin
üîé Searching for topic: 'pandas' (min 1000 stars)


Processing topics:   3%|‚ñé         | 11/391 [00:35<19:39,  3.10s/it]

  üì¶ Page 1: 99 repos (total: 99)
‚úÖ Found 50 repositories for topic 'pandas'
  üîÑ Skipping already starred: pandas-dev/pandas
  üîÑ Skipping already starred: sinaptik-ai/pandas-ai
  üîÑ Skipping already starred: Kanaries/pygwalker
  üîÑ Skipping already starred: dask/dask
  üîÑ Skipping already starred: ydataai/ydata-profiling
  üîÑ Skipping already starred: modin-project/modin
üîé Searching for topic: 'pytorch' (min 1000 stars)


Processing topics:   3%|‚ñé         | 12/391 [00:38<18:28,  2.93s/it]

  üì¶ Page 1: 100 repos (total: 100)
‚úÖ Found 50 repositories for topic 'pytorch'
  üîÑ Skipping already starred: keras-team/keras
  üîÑ Skipping already starred: ray-project/ray
  üîÑ Skipping already starred: fastai/fastai
  üîÑ Skipping already starred: ml-tooling/best-of-ml-python
üîé Searching for topic: 'tensorflow' (min 1000 stars)


Processing topics:   3%|‚ñé         | 13/391 [00:41<19:00,  3.02s/it]

  üì¶ Page 1: 100 repos (total: 100)
‚úÖ Found 50 repositories for topic 'tensorflow'
  üîÑ Skipping already starred: tensorflow/tensorflow
  üîÑ Skipping already starred: keras-team/keras
  üîÑ Skipping already starred: aymericdamien/TensorFlow-Examples
  üîÑ Skipping already starred: ray-project/ray
  üîÑ Skipping already starred: ml-tooling/best-of-ml-python
üîé Searching for topic: 'scikit-learn' (min 1000 stars)


Processing topics:   4%|‚ñé         | 14/391 [00:44<18:22,  2.92s/it]

  üì¶ Page 1: 91 repos (total: 91)
‚úÖ Found 50 repositories for topic 'scikit-learn'
  üîÑ Skipping already starred: microsoft/ML-For-Beginners
  üîÑ Skipping already starred: Avik-Jain/100-Days-Of-ML-Code
  üîÑ Skipping already starred: ml-tooling/best-of-ml-python
  üîÑ Skipping already starred: dask/dask
  üîÑ Skipping already starred: autogluon/autogluon
üîé Searching for topic: 'business-intelligence' (min 1000 stars)


Processing topics:   4%|‚ñç         | 15/391 [00:45<15:32,  2.48s/it]

  üì¶ Page 1: 30 repos (total: 30)
‚úÖ Found 30 repositories for topic 'business-intelligence'
  üîÑ Skipping already starred: apache/superset
  üîÑ Skipping already starred: metabase/metabase
  üîÑ Skipping already starred: getredash/redash
  üö´ Skipping ignored repository: dataease/dataease
  üîÑ Skipping already starred: dbt-labs/dbt-core
  üîÑ Skipping already starred: evidence-dev/evidence
üîé Searching for topic: 'gpu' (min 1000 stars)


Processing topics:   4%|‚ñç         | 15/391 [00:48<20:15,  3.23s/it]


KeyboardInterrupt: 