In [None]:
from dotenv import load_dotenv
import os
import sys

# Load environment variables from .env file
load_dotenv()

import re
import pandas as pd
from tqdm import tqdm
from loguru import logger
from pymongo import MongoClient
import requests 
import time 

# GitHub Personal Access Token (optional, but recommended for higher rate limits).
# Read from the environment (populated from .env by load_dotenv() above).
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
if not GITHUB_TOKEN:
    logger.warning("GitHub token not found. Rate limits may apply.")

BATCH_SIZE = 500  # Number of records to process before writing to CSV

# Logger config.
# logger.add() registers a NEW sink on every call, so simply re-running this
# cell would write every log line to the file twice (then three times, ...).
# Remember the sink id and drop the sink from a previous run before re-adding it.
LOG_FILE_PATH = "logs/issue-pr-keyworld.log"
if "_LOG_SINK_ID" in globals():
    try:
        logger.remove(_LOG_SINK_ID)
    except ValueError:
        # The sink was already removed elsewhere (e.g. a bare logger.remove());
        # there is nothing left to undo.
        pass
_LOG_SINK_ID = logger.add(LOG_FILE_PATH)


1

In [2]:
# Mongo connection
# The URI may be overridden with the MONGO_URI environment variable (e.g. in
# .env); when it is unset the original local default is used.
MONGO_URI = os.getenv("MONGO_URI", "mongodb://localhost:27017/")
client = MongoClient(MONGO_URI)

# MongoClient() does not contact the server when it is constructed, so do one
# cheap round trip here. If the server is unreachable this raises now, instead
# of in a later cell after output files have already been truncated.
client.admin.command("ping")

db = client.github_data
issues_col = db.issues

logger.info("🔌 MongoDB connected.")

[32m2025-05-23 11:02:01.635[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1m🔌 MongoDB connected.[0m


In [3]:
# Keyword patterns that tie an issue to a "#<number>" reference, each paired
# with a priority (1 = strongest evidence, larger = weaker).
# The \b anchors stop a keyword from matching inside a longer word, e.g.
# "prefix #12" must not count as "fix #12" and "unrelated to #7" must not
# count as "related to #7".
_KEYWORD_PR_PATTERNS = tuple(
    (re.compile(pattern, re.IGNORECASE), priority)
    for pattern, priority in (
        (r"\bClosed\s+with\s+#(\d+)", 1),
        (r"\bFixed\s+in\s+#(\d+)", 2),
        (r"\bFix\s+Issue\s+#(\d+)", 3),
        (r"\b(?:close|closes|closed|fix|fixes|fixed|resolve|resolves|resolved)\s+#(\d+)", 4),
        (r"#(\d+)\s+(?:has\s+been\s+)?(?:closed|fixed|resolved)\b", 5),
        (r"\b(?:related\s+to|references)\s+#(\d+)", 6),
    )
)


def find_keyword_pr(issue, comments):
    """Find "#<number>" references introduced by linking keywords.

    The issue title, the issue body and every comment body are scanned with
    the patterns in ``_KEYWORD_PR_PATTERNS``.

    Args:
        issue: Mapping with optional ``"title"`` and ``"body"`` entries.
            Missing, ``None`` or non-string values are skipped.
        comments: Iterable of mappings with an optional ``"body"`` entry, or
            ``None``. Entries that are not dicts are skipped.

    Returns:
        A list of ``(number, strategy)`` tuples where ``strategy`` is
        ``"title_match"``, ``"body_match"`` or ``"comment_match"``. The list is
        ordered by pattern priority (strongest first); ties keep discovery
        order (title, then body, then comments). Exact duplicates are removed,
        but the same number can still appear once per distinct
        strategy/priority combination.

    Note:
        The number is simply whatever follows ``#``; nothing here verifies
        that it refers to a pull request rather than another issue.
    """
    def scan(text, strategy):
        # Collect (number, strategy, priority) for every pattern hit in `text`.
        if not isinstance(text, str):
            return []
        return [
            (int(match.group(1)), strategy, priority)
            for compiled, priority in _KEYWORD_PR_PATTERNS
            for match in compiled.finditer(text)
        ]

    issue = issue or {}
    hits = scan(issue.get("title"), "title_match")
    hits += scan(issue.get("body"), "body_match")
    for comment in comments or []:
        if isinstance(comment, dict):
            hits += scan(comment.get("body"), "comment_match")

    # dict.fromkeys drops exact duplicates while keeping first-seen order; the
    # stable sort then orders by priority without disturbing ties.
    unique_hits = list(dict.fromkeys(hits))
    unique_hits.sort(key=lambda hit: hit[2])

    # Priority is an internal ranking detail and is not part of the result.
    return [(pr_number, strategy) for pr_number, strategy, _ in unique_hits]


In [4]:
def _rate_limit_wait_seconds(response_headers):
    """Return how many seconds to pause before retrying, or None if unknown.

    Looks at ``Retry-After`` first (sent with secondary rate limits), then at
    ``X-RateLimit-Remaining`` / ``X-RateLimit-Reset`` (primary rate limit).
    A 5 second buffer is added to whatever the headers ask for.
    """
    retry_after = response_headers.get("Retry-After")
    if retry_after is not None:
        try:
            return max(0.0, float(retry_after)) + 5
        except ValueError:
            # Not a plain number of seconds; fall back to the X-RateLimit headers.
            pass

    remaining = response_headers.get("X-RateLimit-Remaining")
    if remaining is not None and int(remaining) == 0:
        reset_time = int(response_headers.get("X-RateLimit-Reset", time.time() + 60))
        return max(0, reset_time - time.time()) + 5

    return None


def fetch_pr_details_from_github(repo_api_url, pr_number, token=None, timeout=30):
    """Fetch the title, body and HTML URL of one pull request from the GitHub API.

    Args:
        repo_api_url: API URL of the repository, e.g.
            ``https://api.github.com/repos/owner/repo``. A trailing slash is
            tolerated.
        pr_number: Number of the pull request to look up.
        token: Optional GitHub token, sent as ``Authorization: token <token>``.
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        ``{"title": ..., "body": ..., "html_url": ...}`` on success, where a
        field that is absent or JSON ``null`` (GitHub returns ``null`` for an
        empty PR description) is reported as ``"N/A"``. ``None`` if the URL is
        missing or the request fails for any reason; failures are logged, never
        raised.
    """
    if not repo_api_url:
        logger.warning(f"Repository API URL is missing. Cannot fetch PR #{pr_number}.")
        return None

    # Example repo_api_url: "https://api.github.com/repos/wso2/product-is"
    pr_api_url = f"{repo_api_url.rstrip('/')}/pulls/{pr_number}"

    headers = {
        "Accept": "application/vnd.github.v3+json"
    }
    if token:
        headers["Authorization"] = f"token {token}"

    def field_or_na(pr_data, key):
        # .get(key, "N/A") alone would let an explicit JSON null through as None.
        value = pr_data.get(key)
        return "N/A" if value is None else value

    try:
        response = requests.get(pr_api_url, headers=headers, timeout=timeout)

        # 403 is used for the primary rate limit (and real permission errors);
        # 429 can be returned for secondary rate limits.
        if response.status_code in (403, 429):
            logger.warning(f"Rate limit hit or forbidden access for PR #{pr_number} at {pr_api_url}. Checking headers...")
            sleep_duration = _rate_limit_wait_seconds(response.headers)
            if sleep_duration is not None:
                logger.info(f"Rate limit exceeded. Sleeping for {sleep_duration:.2f} seconds.")
                time.sleep(sleep_duration)
                # Retry the request once after sleeping
                response = requests.get(pr_api_url, headers=headers, timeout=timeout)

        # Raises an HTTPError for 4XX/5XX responses (after the retry, if any).
        response.raise_for_status()

        pr_data = response.json()
        return {
            "title": field_or_na(pr_data, "title"),
            "body": field_or_na(pr_data, "body"),
            "html_url": field_or_na(pr_data, "html_url"),
        }
    except requests.exceptions.HTTPError as e:
        status = e.response.status_code if e.response is not None else None
        if status == 404:
            logger.warning(f"PR #{pr_number} not found at {pr_api_url}. Status: {status}")
        elif status in (403, 429):  # Still rate limited / forbidden after the retry
            logger.error(f"Persistent {status} error for PR #{pr_number} at {pr_api_url} after retry. Token issue or access denied.")
        else:
            logger.error(f"HTTP error fetching PR #{pr_number} from {pr_api_url}: {e}")
        return None
    except requests.exceptions.RequestException as e:
        # Connection errors, timeouts, invalid JSON bodies, ...
        logger.error(f"Error fetching PR #{pr_number} from {pr_api_url}: {e}")
        return None
    except Exception as e:
        # Callers treat None as "details unavailable"; keep one bad record from
        # aborting a long batch run, but make sure it is logged.
        logger.error(f"An unexpected error occurred while fetching PR #{pr_number} from {pr_api_url}: {e}")
        return None

In [None]:
# Pass 1: scan every closed issue stored in MongoDB for keyword references to
# PR numbers and stream the resulting (issue, PR) rows to a CSV in batches.
#
# This cell produces:
#   * csv_issue_only_path        - one CSV row per (issue, linked PR, strategy)
#   * issues_for_full_pr_details - the same rows plus "repository_api_url",
#                                  kept in memory for the PR-detail cell below

# Buffer of rows not yet written to the CSV; emptied on every flush.
results_issue_only = []
# This list will store data needed for the next cell to fetch full PR details.
# Each item will be a dictionary like:
# { 'issue_number': ..., 'issue_link': ..., ..., 'linked_pr_number': ..., 'strategy': ..., 'repository_api_url': ... }
issues_for_full_pr_details = []

# NOTE(review): "kelinked_..." looks like a typo for "linked_..."; the name is
# left unchanged in case other tooling already reads this file - confirm.
csv_issue_only_path = "../data/keyword-linking/issue-pr/kelinked_issues_prs_issue_only.csv"

headers_issue_only = [
    "issue_number", "issue_link", "issue_title", "issue_body",
    "linked_pr_number", "strategy"
]

closed_issues_query = {"issue.state": "closed"}


def _flush_issue_only_rows(batch_label):
    """Append the buffered issue-only rows to the CSV, then empty the buffer."""
    if not results_issue_only:
        return
    # columns= pins the column order to the header row written at the start.
    df_issue = pd.DataFrame(results_issue_only, columns=headers_issue_only)
    df_issue.to_csv(csv_issue_only_path, mode='a', header=False, index=False)
    logger.info(f"Appended {batch_label} of {len(results_issue_only)} records to {csv_issue_only_path}")
    results_issue_only.clear()


# Talk to MongoDB BEFORE truncating the CSV: if the database is unreachable the
# cell fails here and a CSV from an earlier run is left intact.
total_issues = issues_col.count_documents(closed_issues_query)
cursor = issues_col.find(closed_issues_query)

# Write headers to csv_issue_only_path (creating the output directory if needed)
os.makedirs(os.path.dirname(csv_issue_only_path), exist_ok=True)
pd.DataFrame(columns=headers_issue_only).to_csv(csv_issue_only_path, index=False, mode='w')
logger.info(f"Initialized {csv_issue_only_path} with headers.")

processed_issue_count = 0

logger.info("🔎 Starting initial issue-PR linking and data collection (for issue_only.csv)...")
for record in tqdm(cursor, total=total_issues, desc="🔗 Linking Issues (Pass 1)"):
    # `or` also covers documents where the key exists but holds null.
    issue_data = record.get("issue") or {}
    comments = record.get("comments") or []

    issue_number_val = issue_data.get("number")
    issue_html_url_val = issue_data.get("html_url", "N/A")
    issue_title_val = issue_data.get("title", "N/A")
    issue_body_val = issue_data.get("body", "N/A")
    repository_api_url_val = issue_data.get("repository_url", None)

    linked_prs_info = find_keyword_pr(issue_data, comments)

    if linked_prs_info:
        for pr_number_val, strategy_val in linked_prs_info:
            issue_only_entry = {
                "issue_number": issue_number_val,
                "issue_link": issue_html_url_val,
                "issue_title": issue_title_val,
                "issue_body": issue_body_val,
                "linked_pr_number": pr_number_val,
                "strategy": strategy_val
            }
            results_issue_only.append(issue_only_entry)

            # Same row plus the repository API URL, for the PR-detail cell.
            issues_for_full_pr_details.append({
                **issue_only_entry,
                "repository_api_url": repository_api_url_val
            })

            logger.success(f"Issue #{issue_number_val} ➡️ PR #{pr_number_val} via {strategy_val}. Logged for issue_only.csv.")
    else:
        # Issues without any keyword link are logged but not written to the CSV.
        logger.warning(f"Issue #{issue_number_val} ➡️ No PR linked.")

    processed_issue_count += 1

    # Flush every BATCH_SIZE processed issues (not every BATCH_SIZE rows).
    if processed_issue_count % BATCH_SIZE == 0:
        _flush_issue_only_rows("batch")

# Write any remaining records for issue_only.csv
_flush_issue_only_rows("final batch")

logger.info(f"✅ Initial linking complete. Output in '{csv_issue_only_path}'.")
logger.info(f"{len(issues_for_full_pr_details)} PRs identified for full detail fetching in the next step.")


[32m2025-05-23 11:02:14.107[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m17[0m - [1mInitialized data/linked_issues_prs_issue_only.csv with headers.[0m
[32m2025-05-23 11:02:14.335[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1m🔎 Starting initial issue-PR linking and data collection (for issue_only.csv)...[0m
[32m2025-05-23 11:02:14.553[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m55[0m - [32m[1mIssue #43789 ➡️ PR #43820 via comment_match. Logged for issue_only.csv.[0m
[32m2025-05-23 11:02:15.044[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m55[0m - [32m[1mIssue #43348 ➡️ PR #43303 via comment_match. Logged for issue_only.csv.[0m
[32m2025-05-23 11:02:15.073[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m55[0m - [32m[1mIssue #43293 ➡️ PR #43295 via comment_match. Logged for issue_only.csv.[0m
[32m2025-05-23 11:02:15.115[0m | [32m[1mSUCCES

In [None]:
# Pass 2: this cell generates csv_full_details_path using the rows collected in
# issues_for_full_pr_details by the previous cell, adding the title, body and
# link of each referenced PR fetched from the GitHub API.

# Buffer of rows not yet written to the CSV; emptied on every flush.
results_full_details = []
# The directory is "issue-pr", the same one the issue-only CSV is written to
# (the path previously said "ssue-pr", a directory that pass 1 never uses).
csv_full_details_path = "../data/keyword-linking/issue-pr/linked_issues_prs_full_details.csv"

headers_full = [
    "issue_number", "issue_link", "issue_title", "issue_body",
    "linked_pr_number", "pr_title", "pr_body", "pr_link", "strategy"
]


def _flush_full_detail_rows(batch_label):
    """Append the buffered full-detail rows to the CSV, then empty the buffer."""
    if not results_full_details:
        return
    # columns= pins the column order to the header row written at the start.
    df_full = pd.DataFrame(results_full_details, columns=headers_full)
    df_full.to_csv(csv_full_details_path, mode='a', header=False, index=False)
    logger.info(f"Appended {batch_label} of {len(results_full_details)} records to {csv_full_details_path}")
    results_full_details.clear()


def _pr_field_or_na(pr_details, key):
    """Return pr_details[key], or "N/A" when the details or the value are missing/None."""
    value = pr_details.get(key) if pr_details else None
    return "N/A" if value is None else value


# Write headers to csv_full_details_path (creating the output directory if needed)
os.makedirs(os.path.dirname(csv_full_details_path), exist_ok=True)
pd.DataFrame(columns=headers_full).to_csv(csv_full_details_path, index=False, mode='w')
logger.info(f"Initialized {csv_full_details_path} with headers.")

processed_pr_count_git = 0

# Successful fetches keyed by (repository API URL, PR number). The same PR is
# often referenced by several rows; reusing the result saves API quota.
# Failed fetches are not cached, so a transient error can still be retried.
pr_details_cache = {}

# The globals() check gives a readable warning instead of a NameError when the
# previous cell has not been run in this kernel.
if 'issues_for_full_pr_details' not in globals() or not issues_for_full_pr_details:
    logger.warning("No data ('issues_for_full_pr_details') found from the previous cell to process for full PR details.")
else:
    logger.info(f"🔎 Starting full PR detail fetching for {len(issues_for_full_pr_details)} identified PRs...")
    for item_data in tqdm(issues_for_full_pr_details, desc="🔗 Fetching PR Details (Pass 2)"):
        repository_api_url = item_data.get("repository_api_url")
        pr_number = item_data.get("linked_pr_number")

        pr_details_fetched = None
        if repository_api_url and pr_number not in (None, "N/A"):  # Ensure pr_number is valid
            cache_key = (repository_api_url, pr_number)
            pr_details_fetched = pr_details_cache.get(cache_key)
            if pr_details_fetched is None:
                pr_details_fetched = fetch_pr_details_from_github(repository_api_url, pr_number, GITHUB_TOKEN)
                if pr_details_fetched is not None:
                    pr_details_cache[cache_key] = pr_details_fetched
        else:
            logger.warning(f"Issue #{item_data.get('issue_number')} - PR #{pr_number}: Missing repo URL or PR number. Skipping PR detail fetch.")

        pr_title_val = _pr_field_or_na(pr_details_fetched, "title")
        pr_link_val = _pr_field_or_na(pr_details_fetched, "html_url")

        pr_body_val = _pr_field_or_na(pr_details_fetched, "body")
        if isinstance(pr_body_val, str) and pr_body_val != "N/A":
            # Replace all types of newlines with a single space so the PR body
            # occupies a single line in the CSV.
            pr_body_val = pr_body_val.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")

        results_full_details.append({
            "issue_number": item_data.get("issue_number"),
            "issue_link": item_data.get("issue_link"),
            "issue_title": item_data.get("issue_title"),
            "issue_body": item_data.get("issue_body"),
            "linked_pr_number": pr_number,
            "pr_title": pr_title_val,
            "pr_body": pr_body_val,  # Consider summarizing/truncating
            "pr_link": pr_link_val,
            "strategy": item_data.get("strategy")
        })

        processed_pr_count_git += 1

        # Flush every BATCH_SIZE processed rows.
        if processed_pr_count_git % BATCH_SIZE == 0:
            _flush_full_detail_rows("batch")

    # Write any remaining records for full_details.csv
    _flush_full_detail_rows("final batch")

    logger.info(f"✅ Full PR detail fetching complete. Output in '{csv_full_details_path}'.")


In [None]:
# No-op cell kept only as a marker: the one-shot CSV export that used to live
# here (it wrote "data/linked_issues_prs_keyword.csv" from a `results` list) is
# no longer needed because the processing cells above append to their CSV
# files in batches. The cell now only logs that fact.
logger.info("Previous CSV saving cell is now integrated into the main processing loop.")