In [2]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

import re
import pandas as pd
from tqdm import tqdm
from loguru import logger
from pymongo import MongoClient
import requests 
import time 
import concurrent.futures

# GitHub Personal Access Token (optional, but recommended for higher rate limits)
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
if not GITHUB_TOKEN:
    logger.warning("GitHub token not found. Rate limits may apply.")

BATCH_SIZE = 500 # Number of records to process before writing to CSV

# Logger config
logger.add("logs/prs-linking-commite.log")


1

 mongoDB configuration


In [4]:
# Mongo connection
client = MongoClient("mongodb://localhost:27017/")
db = client.github_data
pull_request_col = db.pull_requests

logger.info("🔌 MongoDB connected.")

[32m2025-06-02 10:41:21.178[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1m🔌 MongoDB connected.[0m


In [5]:
# we are interested in isspull_requestues that are closed and referenced in commits
qurey = {
    "pull_request.state": "closed",
    "timeline.event": "referenced"
}

github APIs


In [6]:
def get_commit_data(url):
    """
    Fetch commit data for a given pull_request number from GitHub API.
    """
    headers = {
        "Authorization": f"token {GITHUB_TOKEN}" if GITHUB_TOKEN else None
    }
    
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        data = response.json()
        html_url = data.get("html_url")
        message = data.get("commit").get("message")
        if not message:
            logger.error(f"Missing commit message in response: {data}")   
        logger.success(f"✅ Successfully fetched commit data from {url}")
        return html_url, message
    else:
        # The variable 'issue_number' is not in the scope of this function.
        # Logging the URL that caused the error instead.
        # Also, providing more error details from response.text
        logger.error(f"Error fetching commit data from {url}: {response.status_code} - {response.text}")
        # Ensure two values are returned as expected by the calling code
        return None, None

csv initialization 

In [3]:
results_commite_id = []

csv_path = "../../data/commit_id-linking/pr-issue/linked_pull_request_commite_id.csv"

headers_pull_request_commite_id = [
    "pull_request_number", "pull_request_link", "pull_request_title", "pull_request_body",
    "linked_commit_id", "linked_commit_url", "linked_commit_message",
]

pd.DataFrame(columns=headers_pull_request_commite_id).to_csv(csv_path, index=False)
logger.info(f"Initialized CSV file at {csv_path}.")


[32m2025-06-02 10:40:34.626[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mInitialized CSV file at ../../data/commit_id-linking/pr-issue/linked_pull_request_commite_id.csv.[0m


fetching the data from mongoDB

In [8]:
# Fetch all records into a list to prevent cursor timeout during processing
logger.info("Fetching all records from MongoDB...")
all_records_list = list(pull_request_col.find(qurey))
toal_pull_request_count = len(all_records_list) # Actual number of records fetched
logger.info(f"Successfully fetched {toal_pull_request_count} records. Starting processing.")

[32m2025-06-02 10:43:01.285[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mFetching all records from MongoDB...[0m
[32m2025-06-02 10:43:04.530[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mSuccessfully fetched 998 records. Starting processing.[0m


Commit data fetching

In [9]:

logger.info("Starting to process pull_requests and fetch initial commit data concurrently...")

# Temporary list to hold data before converting to DataFrame rows
processed_event_data = []
MAX_WORKERS_COMMIT_DATA = 10  # Number of threads to use for fetching commit data
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS_COMMIT_DATA) as executor:
    future_to_event = {}
    for record_idx, record in enumerate(tqdm(all_records_list, total=toal_pull_request_count, desc="Submitting commit data tasks")):
        pull_request_data = record["pull_request"]
        for event_idx, event in enumerate(record.get("timeline", [])):
            if event.get("event") == "referenced":
                linked_commit_url_api = event.get("commit_url") # This is the API URL for the commit
                if linked_commit_url_api:
                    # Store record_idx and event_idx to link back results
                    future = executor.submit(get_commit_data, linked_commit_url_api)
                    future_to_event[future] = (record_idx, event_idx, pull_request_data, event)

    logger.info(f"Submitted {len(future_to_event)} tasks to fetch commit data. Waiting for completion...")
    for future in tqdm(concurrent.futures.as_completed(future_to_event), total=len(future_to_event), desc="Processing commit data results"):
        record_idx, event_idx, pull_request_data, event = future_to_event[future]
        try:
            linked_commit_html_url, linked_commit_message = future.result()
            linked_commit_id = event.get("commit_id")
            
            if linked_commit_id and linked_commit_html_url: # Ensure we have the essential data
                processed_event_data.append({
                    "pull_request_number": pull_request_data["number"],
                    "pull_request_link": pull_request_data.get("html_url"),
                    "pull_request_title": pull_request_data.get("title"),
                    "pull_request_body": pull_request_data.get("body"),
                    "linked_commit_id": linked_commit_id,
                    "linked_commit_url": linked_commit_html_url, 
                    "linked_commit_message": linked_commit_message,
                })
            elif linked_commit_id:
                 logger.warning(f"Commit data fetched for commit ID {linked_commit_id} but HTML URL was missing. API URL was {event.get('commit_url')}")

        except Exception as exc:
            original_commit_url = event.get("commit_url")
            logger.error(f"Fetching commit data for {original_commit_url} (event in pull_request {pull_request_data.get('number')}) generated an exception: {exc}")

logger.info(f"Finished fetching commit data. {len(processed_event_data)} events processed into commit details.")

# Convert processed_event_data to DataFrame and save to CSV
if processed_event_data:
    df_commits = pd.DataFrame(processed_event_data)
    df_commits.to_csv(csv_path, mode='w', header=headers_pull_request_commite_id, index=False) # Write header once with all data
    logger.info(f"Saved {len(df_commits)} records with commit data to {csv_path}.")
else:
    logger.info("No commit data was successfully processed to save.")

logger.info("✅ Initial commit data processing complete.")

[32m2025-06-02 10:43:10.118[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mStarting to process pull_requests and fetch initial commit data concurrently...[0m
Submitting commit data tasks: 100%|██████████| 998/998 [00:00<00:00, 3911.45it/s]
[32m2025-06-02 10:43:10.404[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mSubmitted 2308 tasks to fetch commit data. Waiting for completion...[0m
Processing commit data results:   0%|          | 0/2308 [00:00<?, ?it/s][32m2025-06-02 10:43:11.145[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mget_commit_data[0m:[36m17[0m - [32m[1m✅ Successfully fetched commit data from https://api.github.com/repos/chiranSachintha/ballerina-lang/commits/ce793f94a38df5a844d84519473e8b04317e60bd[0m
Processing commit data results:   0%|          | 1/2308 [00:00<28:56,  1.33it/s][32m2025-06-02 10:43:11.163[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mget_commit_data[0m:[36m17[0m - 

getting commit url


In [10]:
# Function to parse owner and repo from a GitHub commit URL
def parse_commit_url(commit_url):
    if pd.isna(commit_url) or not isinstance(commit_url, str):
        logger.warning(f"Invalid commit_url for parsing: {commit_url}")
        return None, None
    # Regex for HTML URLs like https://github.com/owner/repo/commit/sha
    match = re.search(r"https://github.com/([^/]+)/([^/]+)/commit/.*", commit_url)
    if match:
        return match.group(1), match.group(2)
    
    # Regex for API URLs like https://api.github.com/repos/owner/repo/commits/sha
    # This is a fallback, as current 'linked_commit_url' seems to be HTML URLs
    match_api = re.search(r"https://api.github.com/repos/([^/]+)/([^/]+)/commits/.*", commit_url)
    if match_api:
        return match_api.group(1), match_api.group(2)
        
    logger.warning(f"Could not parse owner/repo from URL: {commit_url}")
    return None, None


search for issue refference

In [21]:
# csv_path = "../../data/commit_id-linking/pr-issue/linked_pull_request_commite_id.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(csv_path)

def extract_issue_details_from_commit(row: pd.Series) -> pd.Series:
    """
    Extracts an issue number from a commit message and constructs a full GitHub issue link.
    """
    commit_message = str(row.get('linked_commit_message', ''))
    if pd.isna(commit_message) or not commit_message.strip():
        return pd.Series([None, None], index=['extracted_issue_number', 'extracted_issue_link'])

    # Covers: fix(es/ed/ing), clos(e/es/ed), resolv(e/es/ed), issue(s)
    keyword_pattern = r'\b(?:fix(?:es|ed|ing)?|clos(?:e|es|ed)|resolv(?:e|es|ed)|issue(?:s)?)\b'

    # Optimized patterns to find issue numbers, ordered by specificity.
    issue_patterns = [
        # Priority 1: Keywords followed by a full GitHub issue or pull request URL.
        rf'{keyword_pattern}\s+https?://github\.com/[^/]+/[^/]+/(?:issues|pull)/(\d+)',

        # Priority 2: Keywords followed by #number.
        rf'{keyword_pattern}\s+#(\d+)\b',

        # Priority 3: Keywords followed by a plain number (no #).
        rf'{keyword_pattern}\s+(\d+)\b',

        # Priority 4: Keywords followed by "Fix <something> (from #number)".
        rf'{keyword_pattern}.*\(from\s+#(\d+)\)',

        # Priority 5: Keywords followed by "Fix <something> #number".
        rf'{keyword_pattern}.*#(\d+)\b',

        # Priority 6: Keywords followed by "part of #number" or "part of issue #number".
        rf'{keyword_pattern}.*(?:#(\d+)\b|part of\s+(?:issue\s+)?#(\d+)\b)',

        # Priority 7: Keywords followed by "/fix-#number" or "/fix-number".
        rf'{keyword_pattern}.*(?:/fix-#(\d+)\b|/fix-(\d+)\b)',

        # Priority 8: Specific pattern to extract issue numbers from "Fixes" URL.
        rf'Fixes:\s+https?://github\.com/[^/]+/[^/]+/issues/(\d+)'

    ]

    
    extracted_issue_number = None
    for pattern in issue_patterns:
        match = re.search(pattern, commit_message, re.IGNORECASE)
        if match:
            # The issue number is in the first captured group for these patterns.
            if match.groups(): 
                 extracted_issue_number = match.group(1)
                 if extracted_issue_number:
                    break  # Found a qualifying issue number, stop searching
    
    if not extracted_issue_number:
        return pd.Series([None, None], index=['extracted_issue_number', 'extracted_issue_link'])

    extracted_issue_link = None
    owner, repo = None, None
    
    commit_url_val = row.get('linked_commit_url')
    if pd.notna(commit_url_val):
        try:
            parsed_info = parse_commit_url(str(commit_url_val))
            if isinstance(parsed_info, tuple) and len(parsed_info) == 2:
                owner_candidate, repo_candidate = parsed_info
                # Ensure owner and repo are strings and not None before assignment
                if isinstance(owner_candidate, str) and isinstance(repo_candidate, str):
                    owner, repo = owner_candidate, repo_candidate
                else:
                    logger.warning(
                        f"parse_commit_url returned non-string owner/repo for URL: {commit_url_val}. "
                        f"Owner type: {type(owner_candidate)}, Repo type: {type(repo_candidate)}. "
                        f"PR: {row.get('pull_request_number', 'N/A')}."
                    )
                    # owner, repo remain None
            elif parsed_info is not None: # If it's None, it means parsing failed gracefully.
                logger.warning(
                    f"parse_commit_url did not return expected owner/repo tuple for URL: {commit_url_val}. "
                    f"Got: {type(parsed_info)}. PR: {row.get('pull_request_number', 'N/A')}."
                )
                # owner, repo remain None
        except Exception as e:
            logger.error(
                f"Exception during parse_commit_url for URL: {commit_url_val}. Error: {e}. "
                f"PR: {row.get('pull_request_number', 'N/A')}."
            )
            # owner, repo remain None
    
    if owner and repo and extracted_issue_number: # Check if owner and repo are valid strings
        extracted_issue_link = f"https://github.com/{owner}/{repo}/issues/{extracted_issue_number}"
    elif extracted_issue_number: 
        # This warning logs if issue number was found but link couldn't be formed
        # (e.g. owner/repo determination failed).
        current_owner_repo_state = f"owner: '{owner}', repo: '{repo}'"
        logger.warning(
            f"Could not determine valid owner/repo ({current_owner_repo_state}) from commit URL: {commit_url_val} "
            f"to form issue link for extracted issue #{extracted_issue_number} "
            f"from PR: {row.get('pull_request_number', 'N/A')}."
        )

    return pd.Series([extracted_issue_number, extracted_issue_link], index=['extracted_issue_number', 'extracted_issue_link'])

# Apply the function to the DataFrame to create new columns
# Ensure that 'linked_commit_message' and 'linked_commit_url' columns exist in df
logger.info("Extracting issue details from commit messages...")
df[['extracted_issue_number', 'extracted_issue_link']] = df.apply(
    extract_issue_details_from_commit, axis=1
)

df_referenced_issues = df.copy()
logger.info(f"Found {len(df_referenced_issues)} commits with extracted issue references.")

# Define the path for the new CSV file
output_csv_path_extracted_issues = "../../data/commit_id-linking/pr-issue/commits_with_extracted_issue_references.csv"

# Save the filtered DataFrame to the new CSV file
if not df_referenced_issues.empty:
    df_referenced_issues.to_csv(output_csv_path_extracted_issues, index=False)
    logger.info(f"Saved DataFrame with extracted issue references to {output_csv_path_extracted_issues}")
else:
    logger.info("No issue references were extracted, so no new CSV file was saved.")



[32m2025-06-02 12:33:37.279[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m107[0m - [1mExtracting issue details from commit messages...[0m
[32m2025-06-02 12:33:38.113[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m113[0m - [1mFound 2308 commits with extracted issue references.[0m
[32m2025-06-02 12:33:38.197[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m121[0m - [1mSaved DataFrame with extracted issue references to ../../data/commit_id-linking/pr-issue/commits_with_extracted_issue_references.csv[0m


In [27]:
output_csv_path_extracted_issues = "../../data/commit_id-linking/pr-issue/commits_with_extracted_issue_references.csv"
# Load the dataset
df_loaded = pd.read_csv(output_csv_path_extracted_issues)
logger.info(f"Loaded dataset from {output_csv_path_extracted_issues}. Shape: {df_loaded.shape}")

# Remove rows where 'issue_number' or 'pr_number' is missing
df_cleaned = df_loaded.dropna(subset=['extracted_issue_number', 'pull_request_number'])
logger.info(f"Shape after dropping rows with missing 'extracted_issue_number' or 'pull_request_number': {df_cleaned.shape}")

# Drop duplicate 'issue_number' and 'pr_number' pairs
# Keep the first occurrence by default
df_cleaned = df_cleaned.drop_duplicates(subset=['extracted_issue_number', 'pull_request_number'], keep='first')
logger.info(f"Shape after dropping duplicate 'extracted_issue_number'-'pull_request_number' pairs: {df_cleaned.shape}")

# Display the first few rows of the cleaned DataFrame and its info
print("Cleaned DataFrame head:")
print(df_cleaned.head())
print("\nCleaned DataFrame info:")
df_cleaned.info()

# Optionally, save the cleaned DataFrame to a new CSV file
cleaned_csv_path = "../../data/commit_id-linking/pr-issue/linked_pr_commit_issue_data_cleaned.csv"
df_cleaned.to_csv(cleaned_csv_path, index=False)
logger.info(f"💾 Cleaned DataFrame saved to {cleaned_csv_path}")

[32m2025-06-02 15:07:19.471[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mLoaded dataset from ../../data/commit_id-linking/pr-issue/commits_with_extracted_issue_references.csv. Shape: (2308, 9)[0m
[32m2025-06-02 15:07:19.471[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mShape after dropping rows with missing 'extracted_issue_number' or 'pull_request_number': (515, 9)[0m
[32m2025-06-02 15:07:19.485[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mShape after dropping duplicate 'extracted_issue_number'-'pull_request_number' pairs: (426, 9)[0m
[32m2025-06-02 15:07:19.527[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m24[0m - [1m💾 Cleaned DataFrame saved to ../../data/commit_id-linking/pr-issue/linked_pr_commit_issue_data_cleaned.csv[0m


Cleaned DataFrame head:
    pull_request_number                                  pull_request_link  \
5                 42717  https://github.com/ballerina-platform/ballerin...   
8                 43067  https://github.com/ballerina-platform/ballerin...   
14                38624  https://github.com/ballerina-platform/ballerin...   
22                38590  https://github.com/ballerina-platform/ballerin...   
31                38563  https://github.com/ballerina-platform/ballerin...   

                                   pull_request_title  \
5         Warn about corruption in Dependencies.toml.   
8            Improve the caching of lang gradle build   
14  Fix completions in the field access expression...   
22  Fix missing Fill required fields code action w...   
31  Fix completions for quote identifier given wit...   

                                    pull_request_body  \
5   ## Purpose\r\n> Describe the problems, issues,...   
8   ## Purpose\r\n> Fix the up-to-date checking of