# GitHub Contributor Analytics Pipeline

End-to-end ingestion and transformation pipeline using the GitHub REST API
for the apache/airflow repository.

## Part 1: Data Ingestion

In [1]:
import os
import time
import requests
import json

# Root of the GitHub REST API and the repository every request targets.
BASE_URL = "https://api.github.com"
OWNER, REPO = "apache", "airflow"

# The token is read from the environment so it never lands in the notebook.
# It is optional: without it the requests are sent unauthenticated.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")

# Headers sent with every request; the Authorization entry is only present
# when a token was found.
HEADERS = {
    "Accept": "application/vnd.github+json",
    **({"Authorization": f"Bearer {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}),
}

# Directory that receives the raw JSON dumps; created on first run.
DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)


# Server-side statuses that are typically transient and therefore retried.
_RETRYABLE_STATUSES = frozenset({500, 502, 503, 504})


def _get_with_retry(endpoint, query, max_retries, timeout):
    """Fetch one page; return the 200 response, or None after logging the failure."""
    failure = "no attempt made"
    attempts = max(0, max_retries) + 1

    for attempt in range(attempts):
        try:
            response = requests.get(
                f"{BASE_URL}{endpoint}",
                headers=HEADERS,
                params=query,
                timeout=timeout,
            )
        except requests.RequestException as exc:
            # Connection errors and timeouts are treated like a transient
            # server error: worth another attempt.
            failure = exc
            retryable = True
        else:
            if response.status_code == 200:
                return response
            failure = response.status_code
            retryable = response.status_code in _RETRYABLE_STATUSES

        if not retryable or attempt == attempts - 1:
            break
        time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...

    print(f"Failed {endpoint}: {failure}")
    return None


def fetch_paginated(endpoint, params=None, max_pages=5, per_page=100,
                    max_retries=3, timeout=30):
    """
    Fetch paginated GitHub API data and return all items as one list.

    Parameters
    ----------
    endpoint : str
        Path appended to BASE_URL, e.g. "/repos/apache/airflow/commits".
    params : dict, optional
        Extra query parameters; "per_page" and "page" are always overridden.
    max_pages : int
        Upper bound on the number of pages requested (controls API usage).
    per_page : int
        Items requested per page.
    max_retries : int
        Extra attempts made for a page after a 5xx status or a network error.
    timeout : float
        Seconds to wait for each HTTP request before giving up on it.

    Returns
    -------
    list
        Items collected so far. A page that still fails after all retries is
        reported with a "Failed ..." line and ends the fetch, so the result
        may be partial (or empty) rather than raising.
    """
    results = []

    for page in range(1, max_pages + 1):
        query = {**(params or {}), "per_page": per_page, "page": page}

        response = _get_with_retry(endpoint, query, max_retries, timeout)
        if response is None:
            break

        data = response.json()
        if not data:
            break

        results.extend(data)

        # The Link header advertises rel="next" only while more pages exist,
        # so its absence ends the loop without an extra empty request.
        if page == max_pages or "next" not in response.links:
            break

        time.sleep(0.5)  # polite rate limiting between consecutive pages

    return results


def save_json(filename, data):
    """Write ``data`` as 2-space-indented JSON to DATA_DIR/filename.

    Returns ``len(data)`` so the caller can record how many rows were saved.
    """
    destination = os.path.join(DATA_DIR, filename)

    with open(destination, mode="w") as handle:
        json.dump(data, handle, indent=2)

    record_count = len(data)
    return record_count


def main(review_sample_size=20):
    """
    Download the raw datasets for OWNER/REPO and store each as JSON in DATA_DIR.

    Five files are written: commits, pulls, pull_comments, issues and
    pull_reviews. Every fetch is best-effort: a dataset whose request failed
    is saved as whatever was collected (possibly an empty list).

    Parameters
    ----------
    review_sample_size : int or None
        Reviews are fetched one pull request at a time, so only the first
        ``review_sample_size`` pull requests are sampled to limit API usage.
        ``None`` fetches reviews for every pull request that was downloaded.

    Prints the number of rows saved per dataset.
    """
    repo_path = f"/repos/{OWNER}/{REPO}"
    row_counts = {}

    # 1. Commits
    commits = fetch_paginated(f"{repo_path}/commits")
    row_counts["commits"] = save_json("commits.json", commits)

    # 2. Pull Requests
    pulls = fetch_paginated(f"{repo_path}/pulls", params={"state": "all"})
    row_counts["pulls"] = save_json("pulls.json", pulls)

    # 3. Pull Request Comments
    pr_comments = fetch_paginated(f"{repo_path}/pulls/comments")
    row_counts["pull_comments"] = save_json("pull_comments.json", pr_comments)

    # 4. Issues (this listing also contains pull requests; stored unfiltered)
    issues = fetch_paginated(f"{repo_path}/issues", params={"state": "all"})
    row_counts["issues"] = save_json("issues.json", issues)

    # 5. Pull Reviews (one request series per sampled pull request)
    if not pulls:
        # Reviews are derived from the pull list, so make the cause visible
        # instead of silently reporting zero reviews.
        print("Warning: no pull requests were fetched; pull_reviews will be empty")

    pull_reviews = []
    for pr in pulls[:review_sample_size]:
        pull_reviews.extend(
            fetch_paginated(f"{repo_path}/pulls/{pr['number']}/reviews")
        )

    row_counts["pull_reviews"] = save_json("pull_reviews.json", pull_reviews)

    print("\nIngestion Complete — Row Counts:")
    for k, v in row_counts.items():
        print(f"{k}: {v}")


# Run the ingestion only when this code executes as the top-level program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Failed /repos/apache/airflow/pulls: 500

Ingestion Complete — Row Counts:
commits: 500
pulls: 0
pull_comments: 500
issues: 500
pull_reviews: 0


## Part 2: Contributor Analytics Transformation


In [2]:
import json
import pandas as pd
from collections import defaultdict

# Directory holding the raw JSON files this cell reads.
DATA_DIR = "data"


def load_json(name):
    """Return the decoded contents of ``<DATA_DIR>/<name>.json``."""
    source_path = f"{DATA_DIR}/{name}.json"
    with open(source_path, mode="r") as handle:
        payload = json.load(handle)
    return payload

# -------------------------------
# Load datasets
# -------------------------------
commits = load_json("commits")
pulls = load_json("pulls")
comments = load_json("pull_comments")
reviews = load_json("pull_reviews")

# One counter record per GitHub login; created on first sight of that login.
metrics = defaultdict(lambda: {
    "commits": 0,
    "prs": 0,
    "comments": 0,
    "reviews": 0
})


def _login_of(record, account_key):
    """Return the login stored under ``record[account_key]``, or None.

    The account object may be absent or an explicit JSON null (for example a
    commit whose author is not linked to a GitHub account). ``dict.get(key, {})``
    only covers the absent case, so null is normalised to an empty dict here.
    """
    account = record.get(account_key) or {}
    return account.get("login")


# -------------------------------
# Aggregate metrics
# -------------------------------
# (records, key holding the account object, counter to increment).
# The order is kept so rows appear in the DataFrame in first-seen order.
for dataset, account_key, metric in (
    (commits, "author", "commits"),
    (pulls, "user", "prs"),
    (comments, "user", "comments"),
    (reviews, "user", "reviews"),
):
    for record in dataset:
        login = _login_of(record, account_key)
        if login:
            metrics[login][metric] += 1

# -------------------------------
# Create DataFrame
# -------------------------------
# One row per login; the login moves from the index into an "author" column.
df = (
    pd.DataFrame.from_dict(metrics, orient="index")
    .reset_index()
    .rename(columns={"index": "author"})
)

# -------------------------------
# Score
# -------------------------------
# Weighted activity: commit = 5, PR = 10, comment = 2, review = 3.
df["raw_score"] = (
    df["commits"] * 5
    + df["prs"] * 10
    + df["comments"] * 2
    + df["reviews"] * 3
)

# The published score is capped at 100, so heavy contributors tie at the cap;
# raw_score keeps the uncapped value.
df["score"] = df["raw_score"].clip(upper=100)

# -------------------------------
# Tier assignment (ORDER MATTERS)
# -------------------------------
def assign_tier(row):
    """Map a contributor row to a tier based on commits + PRs.

    Thresholds are checked from highest to lowest, so the first one met wins:
    20+ -> "core", 5+ -> "active", 1+ -> "contributor", otherwise "observer".
    """
    activity = row["commits"] + row["prs"]

    for minimum, label in ((20, "core"), (5, "active"), (1, "contributor")):
        if activity >= minimum:
            return label

    return "observer"

# Label each contributor with a tier, then order the table best score first.
df = (
    df.assign(tier=lambda frame: frame.apply(assign_tier, axis=1))
    .sort_values("score", ascending=False)
)

# -------------------------------
# Ranking
# -------------------------------
# Dense ranking: tied scores share a rank and no rank numbers are skipped.
dense_descending = {"method": "dense", "ascending": False}

df["overall_rank"] = df["score"].rank(**dense_descending).astype(int)
df["tier_rank"] = (
    df.groupby("tier")["score"].rank(**dense_descending).astype(int)
)

# Percentile of the score within all contributors, on a 0-100 scale.
df["percentile"] = df["score"].rank(pct=True).mul(100).round(2)

# -------------------------------
# Required Outputs
# -------------------------------
top_columns = ["author", "score", "tier", "commits", "prs", "comments", "reviews"]

print("\nTop 10 Contributors (by score):")
print(df[top_columns].head(10))

print("\nTier Distribution:")
print(df["tier"].value_counts())

print("\nSummary:")
print(f"Total contributors: {len(df)}")
print(f"Min score: {df['score'].min()}")
print(f"Max score: {df['score'].max()}")
print(f"Contributors with max score: {(df['score'] == 100).sum()}")



Top 10 Contributors (by score):
              author  score      tier  commits  prs  comments  reviews
4             potiuk    100      core       22    0         0        0
7           jscheffl    100      core       25    0         0        0
27   dependabot[bot]    100      core       31    0         0        0
23       amoghrajesh    100      core       24    0         0        0
132     mistercrunch    100  observer        0    0       210        0
46          vincbeck     95    active       19    0         0        0
36    pierrejeambrun     95    active       19    0         0        0
153       criccomini     94  observer        0    0        47        0
55     jedcunningham     80    active       16    0         0        0
32         henry3260     75    active       15    0         0        0

Tier Distribution:
tier
contributor    99
observer       43
active         27
core            4
Name: count, dtype: int64

Summary:
Total contributors: 173
Min score: 2
Max score: 100
Contributors with max score: 5

In [3]:
# Columns written to the deliverable, in the order they appear in the file.
export_columns = [
    "author",
    "commits",
    "prs",
    "comments",
    "reviews",
    "score",
    "tier",
    "overall_rank",
    "tier_rank",
    "percentile",
]

# The row index is omitted from the file.
df.loc[:, export_columns].to_csv("Part2_Contributors.csv", index=False)
