In [None]:
import requests
from dotenv import load_dotenv
import os
import time 
import json
# python-dotenv: read a local .env file (if present) into the process environment.
load_dotenv()
# GitHub API token; os.getenv returns None when GITHUB_TOKEN is not set.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

# Auth header sent with every GraphQL request.
# NOTE(review): with an unset token this becomes "Bearer None" — confirm the token is configured.
HEADERS = {"Authorization": f"Bearer {GITHUB_TOKEN}"}
API_URL = "https://api.github.com/graphql"

# GraphQL search for repositories with more than 1000 stars, 50 per page.
# `$after` is the pagination cursor (null for the first page). For each
# repository the query returns name, description, star/fork counts, URL,
# last-update timestamp, OPEN issue count and watcher count.
query = """
query ($after: String) {
  search(query: "stars:>1000", type: REPOSITORY, first: 50, after: $after) {
    repositoryCount
    pageInfo {
      endCursor
      hasNextPage
    }
    edges {
      node {
        ... on Repository {
          nameWithOwner
          description
          stargazerCount
          forkCount
          url
          updatedAt
          issues(states: OPEN) {
            totalCount
          }
          watchers {
            totalCount
          }
        }
      }
    }
  }
}
"""


In [None]:
# Page through the search results and keep every raw GraphQL response.
return_data = []  # one response dict per successfully fetched page
cursor = None     # pagination cursor; None asks for the first page

while True:
    variables = {"after": cursor}
    try:
        # A timeout keeps a stalled connection from hanging the cell forever.
        response = requests.post(
            API_URL,
            json={"query": query, "variables": variables},
            headers=HEADERS,
            timeout=30,
        )
    except requests.exceptions.RequestException as e:
        # Stop paging but fall through to the dump below so the pages
        # fetched so far are not lost.
        print("Request failed:", e)
        break

    # Non-200 responses (bad token, rate limit, ...) may not carry the
    # expected JSON body, so check the status before parsing.
    if response.status_code != 200:
        print(f"HTTP error {response.status_code}: {response.text}")
        break

    data = response.json()

    # Optional: basic error handling
    if "errors" in data:
        print("GraphQL error:", data["errors"])
        break

    return_data.append(data)
    # Short progress line instead of dumping the whole response body.
    print(f"Fetched page {len(return_data)}")

    page_info = data['data']['search']['pageInfo']
    if not page_info["hasNextPage"]:
        break

    cursor = page_info["endCursor"]

    # Optional: sleep to respect rate limits
    time.sleep(1)

# Write all responses to a file
with open("github_response.json", "w") as f:
    json.dump(return_data, f, indent=2)

In [None]:
# Checkpoint files used to resume an interrupted crawl.
CURSOR_FILE = 'github_cursor.json'
DATA_FILE = 'github_response_2.json'

def save_state(cursor, data):
    """Persist the crawl checkpoint to disk.

    Args:
        cursor: Pagination cursor to store (string or None).
        data: JSON-serializable list of page responses collected so far.

    Both files are rewritten in full on every call.
    """
    # Overwrite rather than append: the file must hold exactly one JSON
    # object, otherwise json.load() fails with "Extra data" on resume.
    with open(CURSOR_FILE, 'w') as f:
        json.dump({'cursor': cursor}, f)
    with open(DATA_FILE, 'w') as f:
        json.dump(data, f, indent=2)

def load_state():
    """Read the crawl checkpoint back from disk.

    Returns:
        A ``(cursor, data)`` tuple. When either checkpoint file is missing
        the result is ``(None, [])``, i.e. a fresh start.
    """
    have_checkpoint = os.path.exists(CURSOR_FILE) and os.path.exists(DATA_FILE)
    if not have_checkpoint:
        return None, []

    with open(CURSOR_FILE) as cursor_fh:
        resume_cursor = json.load(cursor_fh).get('cursor')
    with open(DATA_FILE) as data_fh:
        pages = json.load(data_fh)
    return resume_cursor, pages

def run_query():
    """Fetch all search result pages, checkpointing after every page.

    Resumes from the state returned by ``load_state()``. Connection errors
    and non-200 responses are retried with exponential backoff (1s doubling
    up to 300s); a 403 sleeps until the time given in ``X-RateLimit-Reset``
    (at least 60s). A GraphQL ``errors`` payload stops the crawl.
    """
    cursor, return_data = load_state()
    retry_delay = 1
    max_delay = 300

    # If the last saved page was already the final one, there is nothing left
    # to fetch; requesting again would only append a useless empty page.
    if return_data:
        try:
            finished = not return_data[-1]['data']['search']['pageInfo']['hasNextPage']
        except (KeyError, TypeError):
            finished = False
        if finished:
            print(f'Already complete: {len(return_data)} pages of data on disk.')
            return

    while True:
        variables = {'after': cursor}
        try:
            # Timeouts raise a RequestException subclass, so a stalled
            # connection goes through the same retry path.
            response = requests.post(
                API_URL,
                json={'query': query, 'variables': variables},
                headers=HEADERS,
                timeout=30,
            )
        except requests.exceptions.RequestException as e:
            print(f'Connection error: {e}, retrying in {retry_delay}s...')
            time.sleep(retry_delay)
            retry_delay = min(retry_delay * 2, max_delay)
            continue

        if response.status_code == 403:
            reset = int(response.headers.get('X-RateLimit-Reset', time.time() + 60))
            wait_time = max(reset - time.time(), 60)
            print(f'Rate limited! Sleeping for {wait_time:.2f} seconds...')
            time.sleep(wait_time)
            continue

        if response.status_code != 200:
            print(f'Error {response.status_code}: {response.text}, retrying in {retry_delay}s...')
            time.sleep(retry_delay)
            retry_delay = min(retry_delay * 2, max_delay)
            continue

        data = response.json()
        if 'errors' in data:
            print('GraphQL error:', data['errors'])
            break

        page_info = data['data']['search']['pageInfo']
        return_data.append(data)

        # Advance the cursor BEFORE checkpointing. Saving the cursor that was
        # used for this request would make a resumed run fetch this same page
        # again and store it twice.
        cursor = page_info['endCursor']
        save_state(cursor, return_data)

        if not page_info['hasNextPage']:
            print('No more pages to fetch.')
            break

        retry_delay = 1  # reset delay after success
        time.sleep(1)    # be nice to GitHub

    print(f'Fetched {len(return_data)} pages of data.')

if __name__ == '__main__':
    run_query()


In [None]:
""""
FOSS project name, description, stars, forks, issues, watchers
"""

import json
import csv

# Load the list of raw page responses written by the fetch cell.
with open('github_response.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten every repository node of every page into one CSV row.
rows = [
    [
        repo['node']['nameWithOwner'],
        repo['node']['description'],
        repo['node']['stargazerCount'],
        repo['node']['forkCount'],
        repo['node']['issues']['totalCount'],
        repo['node']['watchers']['totalCount'],
    ]
    for item in data
    for repo in item['data']['search']['edges']
]

# Write the data to a CSV file.
# Explicit UTF-8: descriptions may contain non-ASCII text (emoji, CJK) that
# the platform default encoding cannot always represent.
with open('github_repositories.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    # Write the header
    writer.writerow(["FOSS project name", "description", "stars", "forks", "issues", "watchers"])
    # Write the rows
    writer.writerows(rows)

print("Data successfully written to github_repositories.csv")


In [None]:
import os
import json
import time
import requests

# GitHub GraphQL endpoint and auth header.
# os.environ[...] raises KeyError when GITHUB_TOKEN is not set.
API_URL = "https://api.github.com/graphql"
HEADERS = {
    "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}"
}
# Directory that receives one cursor file and one data file per star range.
OUTPUT_DIR = "github_data"
# Width (in stars) of each search range passed to generate_star_ranges.
STEP = 1000

# Parameterized repository search: `$range` is the full search string and
# `$after` the pagination cursor; 50 results per page.
# NOTE(review): unlike the earlier `query`, `issues` is not filtered by
# `states: OPEN` here, so the count covers all issues — confirm this is intended.
query_template = """
query($after: String, $range: String!) {
  search(query: $range, type: REPOSITORY, first: 50, after: $after) {
    repositoryCount
    pageInfo {
      endCursor
      hasNextPage
    }
    edges {
      node {
        ... on Repository {
          nameWithOwner
          description
          stargazerCount
          forkCount
          url
          updatedAt
          issues {
            totalCount
          }
          watchers {
            totalCount
          }
        }
      }
    }
  }
}
"""

def generate_star_ranges(start=1000, end=450000, step=1000):
    """Build GitHub search qualifiers that partition star counts >= ``start``.

    Args:
        start: Lowest star count covered (inclusive).
        end: Star count at which the final open-ended range begins.
        step: Width of each bounded range.

    Returns:
        A list of strings of the form ``"stars:LO..HI"`` (both bounds
        inclusive) followed by a final ``"stars:>=END"`` entry.
    """
    ranges = []
    for s in range(start, end, step):
        # Clamp the upper bound so the last bounded range never overlaps the
        # open-ended ">=end" range when (end - start) is not a multiple of step.
        e = min(s + step - 1, end - 1)
        ranges.append(f"stars:{s}..{e}")
    ranges.append(f"stars:>={end}")
    return ranges

def save_state(star_range, cursor, data):
    """Checkpoint one star range: write its cursor and its collected pages.

    Args:
        star_range: Search qualifier string; it is turned into a filename
            prefix by replacing ":", ".." and ">=".
        cursor: Pagination cursor to store.
        data: JSON-serializable list of page responses.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Make the qualifier safe to use as a filename.
    prefix = star_range
    for old, new in ((":", "_"), ("..", "_to_"), (">=", "gte")):
        prefix = prefix.replace(old, new)

    cursor_path = os.path.join(OUTPUT_DIR, f"{prefix}_cursor.json")
    with open(cursor_path, "w") as cursor_fh:
        json.dump({"cursor": cursor}, cursor_fh)

    data_path = os.path.join(OUTPUT_DIR, f"{prefix}_data.json")
    with open(data_path, "w") as data_fh:
        json.dump(data, data_fh, indent=2)

def load_state(star_range):
    """Load the checkpoint of one star range.

    Args:
        star_range: Search qualifier string identifying the range.

    Returns:
        ``(cursor, data)`` read from the range's two checkpoint files, or
        ``(None, [])`` when either file does not exist.
    """
    # Same filename mangling as used when the checkpoint is written.
    prefix = star_range
    for old, new in ((":", "_"), ("..", "_to_"), (">=", "gte")):
        prefix = prefix.replace(old, new)

    cursor_path = os.path.join(OUTPUT_DIR, f"{prefix}_cursor.json")
    data_path = os.path.join(OUTPUT_DIR, f"{prefix}_data.json")

    if not (os.path.exists(cursor_path) and os.path.exists(data_path)):
        return None, []

    with open(cursor_path) as cursor_fh:
        saved_cursor = json.load(cursor_fh).get("cursor")
    with open(data_path) as data_fh:
        pages = json.load(data_fh)
    return saved_cursor, pages

def fetch_star_range(star_range):
    """Fetch every result page for one star range, checkpointing each page.

    Args:
        star_range: Search qualifier such as ``"stars:1000..1999"``;
            ``" sort:stars-desc"`` is appended before it is sent.

    Resumes from ``load_state(star_range)``. Network errors and non-200
    responses are retried with exponential backoff (1s doubling up to 300s);
    a 403 sleeps until ``X-RateLimit-Reset`` (at least 60s). A GraphQL
    ``errors`` payload aborts the range.
    """
    cursor, all_data = load_state(star_range)
    retry_delay = 1
    max_delay = 300

    # Skip ranges whose last stored page was already the final one. Fetching
    # again would append an empty page and store a null cursor, after which
    # the next run would re-download the whole range as duplicates.
    if all_data:
        try:
            finished = not all_data[-1]["data"]["search"]["pageInfo"]["hasNextPage"]
        except (KeyError, TypeError):
            finished = False
        if finished:
            print(f"\n⏭ Range already complete: {star_range}. Pages on disk: {len(all_data)}")
            return

    print(f"\n▶ Fetching range: {star_range}")

    while True:
        variables = {
            "after": cursor,
            "range": f"{star_range} sort:stars-desc"
        }
        try:
            # Timeouts raise a RequestException subclass and are retried below.
            res = requests.post(
                API_URL,
                json={"query": query_template, "variables": variables},
                headers=HEADERS,
                timeout=30,
            )
        except requests.exceptions.RequestException as e:
            print(f"❌ Network error: {e}. Retrying in {retry_delay}s...")
            time.sleep(retry_delay)
            retry_delay = min(retry_delay * 2, max_delay)
            continue

        if res.status_code == 403:
            reset = int(res.headers.get("X-RateLimit-Reset", time.time() + 60))
            wait_time = max(reset - time.time(), 60)
            print(f"⏳ Rate limited. Sleeping for {wait_time:.1f}s...")
            time.sleep(wait_time)
            continue

        if res.status_code != 200:
            print(f"❌ Error {res.status_code}: {res.text}")
            time.sleep(retry_delay)
            retry_delay = min(retry_delay * 2, max_delay)
            continue

        data = res.json()
        if "errors" in data:
            print(f"❌ GraphQL error: {data['errors']}")
            break

        all_data.append(data)
        page_info = data["data"]["search"]["pageInfo"]
        # Store the cursor of the NEXT page so a resume does not repeat this one.
        cursor = page_info["endCursor"]
        save_state(star_range, cursor, all_data)

        if not page_info["hasNextPage"]:
            print(f"✅ Finished range: {star_range}. Pages fetched: {len(all_data)}")
            break

        retry_delay = 1  # Reset retry delay on success
        time.sleep(1)

def run_all():
    """Crawl every star range from 1000 up to 450000 and above, in order."""
    for star_range in generate_star_ranges(1000, 450000, STEP):
        fetch_star_range(star_range)

if __name__ == "__main__":
    run_all()


In [None]:
from pathlib import Path
import json
import csv

github_stats = Path("../github_data")
output_csv = Path("../csv_github_data/final_github_repos.csv")

# Prepare the list to hold the rows for the CSV
rows = []

# Sorted so the CSV row order is reproducible between runs.
for data_file in sorted(github_stats.iterdir()):
    # Cursor checkpoints hold a single {"cursor": ...} object, not pages.
    if not data_file.is_file() or data_file.name.endswith("_cursor.json"):
        continue

    with open(data_file, encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list):
        print(f"Skipping {data_file.name}: expected a list of pages")
        continue

    # Loop through the search results and extract the relevant fields
    for item in data:
        try:
            for repo in item['data']['search']['edges']:
                repo_data = repo['node']
                rows.append([
                    repo_data['nameWithOwner'],
                    repo_data['description'],
                    repo_data['stargazerCount'],
                    repo_data['forkCount'],
                    repo_data['issues']['totalCount'],
                    repo_data['watchers']['totalCount'],
                    repo_data['updatedAt']
                ])
        except (KeyError, TypeError) as e:
            # Malformed page: report it and carry on with the next one.
            print(f"Error parsing {data_file.name}: {e}")

# Create the output directory if needed, and write UTF-8 explicitly because
# descriptions may contain non-ASCII text.
output_csv.parent.mkdir(parents=True, exist_ok=True)
with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    # "last_updated" (no embedded space) is the name the enrichment step looks up.
    writer.writerow(["FOSS project name", "description", "stars", "forks", "issues", "watchers", "last_updated"])
    # Write the rows
    writer.writerows(rows)

print(f"Data successfully written to {output_csv}")

In [None]:
import os
import csv
import json
import time
import requests
import fnmatch

# Config 
CSV_PATH = "../csv_github_data/FOSS_projects_slash.csv"
OUTPUT_PATH = "../enriched_github_data/FOSS_projects_with_dependencies.json"
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")

# Dependency filenames
dependency_files = [
    "package.json", "package-lock.json", "yarn.lock", "pnpm-lock.yaml",
    "requirements.txt", "setup.py", "pyproject.toml", "Pipfile", "Pipfile.lock",
    "Cargo.toml", "Cargo.lock", "go.mod", "go.sum", "pom.xml",
    "build.gradle", "build.gradle.kts", "settings.gradle", "settings.gradle.kts",
    "Gemfile", "Gemfile.lock", "composer.json", "composer.lock",
    "*.csproj", "*.fsproj", "packages.config", "global.json",
    "conanfile.txt", "conanfile.py", "vcpkg.json", "mix.exs", "rebar.config", "rebar.lock",
    "DESCRIPTION", "renv.lock", "packrat.lock", "cpanfile", "Makefile.PL", "Build.PL",
    "stack.yaml", "cabal.project", "*.cabal", "dune", "dune-project", "*.opam",
    "build.sbt", "project/*.scala", "Project.toml", "Manifest.toml",
    "pubspec.yaml", "nimble.json", "v.mod", "build.zig.zon",
    "deno.json", "import_map.json", "Dockerfile", "*.sh", "install.sh",
    "*.tf", "terraform.lock.hcl", "default.nix", "flake.nix", "AndroidManifest.xml",
    "Podfile", "Cartfile", "Package.swift", "project.clj", "deps.edn",
    "requirements.yml", "playbook.yml", "Chart.yaml", "requirements.yaml"
]
HEADERS = {
    "Authorization": f"Bearer {GITHUB_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

# Pattern matching: split the list into names that must match exactly and
# patterns that need glob matching (wildcards or a directory component).
exact_matches = set()
wildcard_patterns = []

for pattern in dependency_files:
    if '*' in pattern or '?' in pattern or '/' in pattern:
        wildcard_patterns.append(pattern)
    else:
        exact_matches.add(pattern)

def matches_dependency_file(filename):
    """Return True when ``filename`` names a known dependency manifest.

    Args:
        filename: A bare file name or a "/"-separated repository path.

    Exact names and slash-free glob patterns are tested against the last
    path component, so a manifest is recognized at any directory depth.
    Patterns containing "/" are tested against the whole path, anchored
    either at the root or at any sub-directory. Empty or ``None`` input
    yields False.
    """
    if not filename:
        return False

    # Previously exact names were compared with the full path and therefore
    # only matched files in the repository root.
    basename = filename.rsplit("/", 1)[-1]
    if basename in exact_matches:
        return True

    for glob in wildcard_patterns:
        if "/" in glob:
            if fnmatch.fnmatch(filename, glob) or fnmatch.fnmatch(filename, "*/" + glob):
                return True
        elif fnmatch.fnmatch(basename, glob):
            return True
    return False

# Retry wrapper for API requests
def request_with_backoff(url, headers, max_retries=5):
    """GET ``url`` with retries; return the response or None on failure.

    Args:
        url: Absolute URL to fetch.
        headers: Request headers.
        max_retries: Maximum number of attempts, including rate-limit waits.

    Returns:
        The ``requests.Response`` for a 200 reply, otherwise ``None``.

    Rate-limit replies (403/429) sleep until ``X-RateLimit-Reset`` (at least
    60s). Network errors and 5xx/408 replies are retried with exponential
    backoff (1s doubling up to 300s). Other 4xx replies are treated as
    permanent and return ``None`` immediately.
    """
    retry_delay = 1
    max_delay = 300
    for attempt in range(max_retries):
        try:
            # Timeout so one stalled connection cannot block the whole scan.
            res = requests.get(url, headers=headers, timeout=30)
        except requests.exceptions.RequestException as e:
            print(f"❌ Network error: {e}. Retrying in {retry_delay}s...")
            time.sleep(retry_delay)
            retry_delay = min(retry_delay * 2, max_delay)
            continue

        status = res.status_code
        if status == 200:
            return res

        if status in (403, 429):
            reset = int(res.headers.get("X-RateLimit-Reset", time.time() + 60))
            wait_time = max(reset - time.time(), 60)
            print(f"⏳ Rate limited. Sleeping for {wait_time:.1f}s...")
            time.sleep(wait_time)
            continue

        print(f"❌ Error {status} from {url}")
        if 400 <= status < 500 and status != 408:
            # e.g. 404 for a deleted/renamed repository: retrying cannot help
            # and would only burn ~30s of backoff per repository.
            return None

        time.sleep(retry_delay)
        retry_delay = min(retry_delay * 2, max_delay)
    return None

def get_repo_tree(owner, repo):
    """Return the recursive file tree of a repository's HEAD commit.

    Args:
        owner: Repository owner (user or organization login).
        repo: Repository name.

    Returns:
        The list under the ``"tree"`` key of the Git Trees API response, or
        an empty list when the request fails or the body cannot be parsed.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/HEAD?recursive=1"
    res = request_with_backoff(url, HEADERS)
    if res:
        try:
            payload = res.json()
            if payload.get("truncated"):
                # The API caps recursive listings; make the gap visible
                # instead of silently reporting an incomplete file list.
                print(f"⚠️ Tree for {owner}/{repo} was truncated; dependency files may be missed")
            return payload.get("tree", [])
        except Exception as e:
            print(f"❌ JSON parsing error in get_repo_tree: {e}")
    return []

def get_dependency_file_content(owner, repo, path):
    """Download one file through the Contents API and return it as text.

    Args:
        owner: Repository owner.
        repo: Repository name.
        path: "/"-separated path of the file inside the repository.

    Returns:
        The UTF-8 decoded file content, or ``None`` when the request fails,
        the payload is not base64-encoded, or decoding fails.
    """
    import base64
    from urllib.parse import quote

    # Percent-encode the path (keeping "/") so characters such as "#", "?"
    # or spaces are not interpreted as URL syntax.
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{quote(path)}"
    res = request_with_backoff(url, HEADERS)
    if res:
        try:
            content = res.json()
            if content.get("encoding") == "base64":
                return base64.b64decode(content["content"]).decode("utf-8")
        except Exception as e:
            print(f"❌ Error decoding content for {path}: {e}")
    return None

def get_dependencies(owner, repo):
    """List the paths of all dependency manifests found in a repository.

    Args:
        owner: Repository owner.
        repo: Repository name.

    Returns:
        Paths of the tree entries of type ``"blob"`` accepted by
        ``matches_dependency_file``, in tree order.
    """
    return [
        entry.get("path")
        for entry in get_repo_tree(owner, repo)
        if entry.get("type") == "blob" and matches_dependency_file(entry.get("path"))
    ]

def _to_int(value, default=0):
    """Parse a CSV cell as an int; blank, missing or invalid cells give ``default``."""
    try:
        return int(str(value).strip())
    except (TypeError, ValueError):
        return default

def main():
    """Scan every project listed in CSV_PATH and write the results as JSON.

    Each row must provide ``owner/repo`` in the "FOSS project" column; rows
    without a "/" are skipped. For every project the dependency manifests
    are looked up and stored together with the row's metadata. The results
    collected so far are written to OUTPUT_PATH even if the scan is
    interrupted.
    """
    results = []

    # Create the output directory up front so a missing folder fails fast
    # rather than after the whole scan has finished.
    out_dir = os.path.dirname(OUTPUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    try:
        with open(CSV_PATH, newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                full_name = (row['FOSS project'] or '').strip()
                if not full_name or '/' not in full_name:
                    continue

                owner, repo = full_name.split("/", 1)
                print(f"🔍 Scanning {owner}/{repo} for dependency files...")
                deps = get_dependencies(owner, repo)

                results.append({
                    "FOSS project": full_name,
                    "dependency_files": deps,
                    "description": row.get("description", ""),
                    # Blank or missing numeric cells must not abort the run.
                    "stars": _to_int(row.get("stars", 0)),
                    "forks": _to_int(row.get("forks", 0)),
                    "issues": _to_int(row.get("issues", 0)),
                    "watchers": _to_int(row.get("watchers", 0)),
                    "last updated": row.get("last_updated", "")
                })

                time.sleep(1)  # Avoid hammering API
    finally:
        # Runs on errors and Ctrl-C too, so partial progress is kept.
        with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

        print(f"\n✅ Finished writing data for {len(results)} projects to {OUTPUT_PATH}")

if __name__ == "__main__":
    main()
