In [None]:
# If running in an environment for the first time, install these packages.
# %pip install PyGithub pandas psycopg2-binary

In [None]:
import os
from datetime import datetime, timedelta, timezone

import numpy as np
import pandas as pd
import psycopg2
from github import Github

In [None]:
# The API token is read from the environment so it never appears in the notebook.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
assert GITHUB_TOKEN, "GITHUB_TOKEN environment variable not set!"

# Authenticated PyGithub client used by the cells below.
g = Github(GITHUB_TOKEN)

# Repositories whose issues are collected, in "owner/name" form.
repo_names = [
    "meta-llama/llama3",
    "ollama/ollama",
    "langchain-ai/langgraph",
    "openai/openai-cookbook",
    "milvus-io/pymilvus",
]


In [None]:
# GitHub reports timestamps in UTC, so the window is built from the current UTC
# time rather than local wall-clock time; otherwise both bounds would be shifted
# by the machine's UTC offset when compared with issue timestamps.
end_date = datetime.now(timezone.utc).replace(tzinfo=None)
start_date = end_date - timedelta(days=60)

# Offset-naive copies (already naive, kept under the names the later cells use).
end_date_naive = end_date.replace(tzinfo=None)
start_date_naive = start_date.replace(tzinfo=None)
print("Start:", start_date_naive, "End:", end_date_naive)


In [None]:
def fetch_issues(repo_name, start_datetime, end_datetime):
    """Collect the issues of one repository that were created inside a time window.

    Parameters
    ----------
    repo_name : str
        Repository in "owner/name" form, resolved through the module-level
        PyGithub client ``g``.
    start_datetime, end_datetime : datetime
        Inclusive bounds on the issue creation time. Any tzinfo is dropped
        before comparing, matching how the issue timestamps are handled.

    Returns
    -------
    pandas.DataFrame
        One row per issue with the columns repo, number, title, state,
        created_at, closed_at and labels (comma-joined label names). The
        columns are present even when no issue matches, so a CSV written from
        an empty result still has a header row.
    """
    columns = ["repo", "number", "title", "state", "created_at", "closed_at", "labels"]
    window_start = start_datetime.replace(tzinfo=None)
    window_end = end_datetime.replace(tzinfo=None)

    repo = g.get_repo(repo_name)
    # NOTE(review): GitHub's issues endpoint also returns pull requests; they are
    # kept here as before. Filter on `issue.pull_request` if only issues are wanted.
    issues = repo.get_issues(state="all", since=start_datetime)

    data = []
    for issue in issues:
        # Compare on offset-naive values so aware and naive datetimes never mix.
        created_at = issue.created_at.replace(tzinfo=None)
        # `since` selects on last-update time, so older issues that were merely
        # touched inside the window come back too; enforce both bounds here.
        if created_at < window_start or created_at > window_end:
            continue
        closed_at = issue.closed_at.replace(tzinfo=None) if issue.closed_at else None
        data.append({
            "repo": repo_name,
            "number": issue.number,
            "title": issue.title,
            "state": issue.state,
            "created_at": created_at,
            "closed_at": closed_at,
            "labels": ",".join([label.name for label in issue.labels])
        })
    return pd.DataFrame(data, columns=columns)


In [None]:
# Fetch every repository's issues for the window and store one CSV per repository.
for repo_name in repo_names:
    df = fetch_issues(repo_name, start_date_naive, end_date_naive)
    # "owner/name" becomes "owner_name_issues.csv" so the slash never ends up in a path.
    filename = f"{repo_name.replace('/', '_')}_issues.csv"
    df.to_csv(filename, index=False)
    print(f"Saved {filename} with {len(df)} rows.")


In [None]:
# Reload one of the saved files to check the CSV round trip.
df = pd.read_csv(filepath_or_buffer="meta-llama_llama3_issues.csv")
df.head()


In [None]:
# Plain-text view: the (rows, columns) shape followed by the first rows.
print(df.shape, df.head(), sep="\n")


In [None]:
# Connect to the maintenance database. Credentials come from the standard libpq
# environment variables when they are set; the fallbacks are the previous
# local-development defaults and should not be relied on outside a dev machine.
conn = psycopg2.connect(
    dbname="postgres",
    user=os.getenv("PGUSER", "postgres"),
    password=os.getenv("PGPASSWORD", "admin"),
    host=os.getenv("PGHOST", "localhost"),
    port=int(os.getenv("PGPORT", "5432")),
)

# CREATE DATABASE cannot run inside a transaction block, so autocommit is
# switched on before any statement is issued on this connection.
conn.autocommit = True

cur = conn.cursor()


In [None]:
dbname = "spm_db"
try:
    # Bind the name as a query parameter instead of formatting it into the SQL text.
    cur.execute("SELECT 1 FROM pg_catalog.pg_database WHERE datname = %s;", (dbname,))
    exists = cur.fetchone()
    if not exists:
        # Identifiers cannot be bound as parameters; dbname is the constant
        # defined just above, not external input.
        cur.execute(f"CREATE DATABASE {dbname};")
        print(f"Database '{dbname}' created.")
    else:
        print(f"Database '{dbname}' already exists.")
finally:
    # The connection runs in autocommit mode, so there is nothing to commit;
    # always release it, even when one of the statements above fails.
    conn.close()


In [None]:
# Connect to the project database. Credentials come from the standard libpq
# environment variables when they are set; the fallbacks are the previous
# local-development defaults and should not be relied on outside a dev machine.
conn = psycopg2.connect(
    dbname="spm_db",
    user=os.getenv("PGUSER", "postgres"),
    password=os.getenv("PGPASSWORD", "admin"),
    host=os.getenv("PGHOST", "localhost"),
    port=int(os.getenv("PGPORT", "5432")),
)
cur = conn.cursor()

# One row per fetched issue. IF NOT EXISTS keeps this cell safe to re-run.
# NOTE(review): the table has no key on (repo, number), so loading the same
# CSV twice stores duplicate rows.
cur.execute("""
    CREATE TABLE IF NOT EXISTS github_issues (
        repo VARCHAR(50),
        number INTEGER,
        title TEXT,
        state VARCHAR(10),
        created_at TIMESTAMP,
        closed_at TIMESTAMP,
        labels TEXT
    );
""")
conn.commit()


In [None]:
def insert_issues_csv(csv_filename, conn):
    """Load one issues CSV into the ``github_issues`` table.

    Parameters
    ----------
    csv_filename : str
        Path of a CSV with the columns repo, number, title, state, created_at,
        closed_at and labels.
    conn : connection
        Open DB-API connection (psycopg2 in this notebook). The function
        commits once after all rows have been attempted.

    Notes
    -----
    * Blank CSV cells are read back by pandas as NaN; every such value is sent
      as NULL instead of a float NaN (this matters for ``labels`` and
      ``closed_at`` in particular).
    * Each row is wrapped in a savepoint. Without it, the first failing INSERT
      leaves the transaction aborted, so every following row fails as well and
      nothing is stored. With it, a bad row is rolled back on its own, reported
      and counted, and the remaining rows are still inserted.
    """
    df = pd.read_csv(csv_filename)
    print(f"Inserting {len(df)} rows from {csv_filename}")
    print(df.head())

    columns = ("repo", "number", "title", "state", "created_at", "closed_at", "labels")
    insert_sql = """
        INSERT INTO github_issues
        (repo, number, title, state, created_at, closed_at, labels)
        VALUES (%s, %s, %s, %s, %s, %s, %s);
    """

    error_count = 0
    cur = conn.cursor()
    try:
        for _, row in df.iterrows():
            # NaN (missing value) -> None so the driver sends NULL.
            values = tuple(None if pd.isna(row[col]) else row[col] for col in columns)
            cur.execute("SAVEPOINT issue_row;")
            try:
                cur.execute(insert_sql, values)
            except Exception as e:
                # Undo only this row; earlier successful inserts stay pending.
                cur.execute("ROLLBACK TO SAVEPOINT issue_row;")
                print("Error on row:", row.to_dict())
                print("Exception:", e)
                error_count += 1
            else:
                cur.execute("RELEASE SAVEPOINT issue_row;")
        conn.commit()
    finally:
        cur.close()
    print(f"Total errors: {error_count}")


In [None]:
# CSV files written by the fetch step, one per repository.
csv_files = [
    "meta-llama_llama3_issues.csv",
    "ollama_ollama_issues.csv",
    "langchain-ai_langgraph_issues.csv",
    "openai_openai-cookbook_issues.csv",
    "milvus-io_pymilvus_issues.csv",
]

# Load each file into the database over the shared connection.
for csv_file in csv_files:
    insert_issues_csv(csv_filename=csv_file, conn=conn)


In [None]:
# Check the load by counting the rows now in the table.
cur.execute("SELECT COUNT(*) FROM github_issues;")
(total_issues,) = cur.fetchone()
print("Total issues loaded:", total_issues)
