In [2]:
import re
import pandas as pd
from tqdm import tqdm
from loguru import logger
from pymongo import MongoClient

# Logger config
logger.add("linking.log", rotation="1 MB")

# Mongo connection
client = MongoClient("mongodb://localhost:27017/")
db = client.github_data
issues_col = db.issues

logger.info("🔌 MongoDB connected.")


[32m2025-05-19 10:15:06.085[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1m🔌 MongoDB connected.[0m


In [None]:
# Cross-referenced timeline PR
def find_cross_referenced_pr(timeline):
    for event in timeline:
        if event.get("event") == "cross-referenced":
            source = event.get("source", {})
            issue = source.get("issue", {})
            if issue.get("pull_request"):
                return issue.get("number"), "cross-referenced"
    return None, None

# Closed by PR commit or source
def find_closed_by_pr(timeline):
    for event in timeline:
        if event.get("event") == "closed": 
            source = event.get("source", {})
            if source.get("issue", {}).get("pull_request"):
                return source["issue"]["number"], "closed_by"
    return None, None

# Keyword patterns (issue description/comments)
def find_keyword_pr(issue, comments):
    patterns = [
        r"(?:close|closes|closed|fix|fixes|fixed|resolve|resolves|resolved)\s+#(\d+)",
        r"#(\d+)\s+(?:has\s+been\s+)?(?:closed|fixed|resolved)",
        r"(?:(?:related\s+to|references)\s+#\d+)"
    ]
    texts = [issue.get("body", "")] + [c.get("body", "") for c in comments]
    for text in texts:
        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return int(match.group(1)), "keyword_match"
    return None, None


In [5]:
results = []
cursor = issues_col.find()

logger.info("🔎 Starting issue-PR linking...")
for record in tqdm(cursor, desc="🔗 Linking"):
    issue_data = record.get("issue", {})
    comments = record.get("comments", [])
    timeline = record.get("timeline", [])

    issue_number = issue_data.get("number")
    pr_number, strategy = find_cross_referenced_pr(timeline)

    if not pr_number:
        pr_number, strategy = find_closed_by_pr(timeline)

    if not pr_number:
        pr_number, strategy = find_keyword_pr(issue_data, comments)

    if pr_number:
        logger.success(f"Issue #{issue_number} ➡️ PR #{pr_number} via {strategy}")
        results.append({
            "issue_number": issue_number,
            "linked_pr": pr_number,
            "strategy": strategy
        })
    else:
        logger.warning(f"Issue #{issue_number} ➡️ No PR linked.")

logger.info("✅ Linking process complete.")


[32m2025-05-16 10:38:23.407[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1m🔎 Starting issue-PR linking...[0m
[32m2025-05-16 10:38:24.558[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m20[0m - [32m[1mIssue #43204 ➡️ PR #43224 via keyword_match[0m
🔗 Linking: 381it [00:01, 312.46it/s]


TypeError: expected string or bytes-like object, got 'NoneType'

In [None]:
df = pd.DataFrame(results)
df.to_csv("data/linked_issues_prs.csv", index=False)
logger.success("📄 CSV saved as 'linked_issues_prs.csv'")
