In [2]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

import re
import pandas as pd
from tqdm import tqdm
from loguru import logger
from pymongo import MongoClient
import requests 
import time 
import concurrent.futures

# Optional GitHub personal access token, read from the environment
# (populated from .env by load_dotenv() above). Requests still work
# without it, but a warning is logged because rate limits may apply.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
if GITHUB_TOKEN in (None, ""):
    logger.warning("GitHub token not found. Rate limits may apply.")

# Intended number of records to process before writing to CSV.
BATCH_SIZE = 500

# Logger config: register a file sink in addition to the existing handlers.
logger.add("logs/issues-linking-commite.log")


1

## MongoDB configuration


In [2]:
# Mongo connection
client = MongoClient("mongodb://localhost:27017/")

# MongoClient connects lazily: constructing it does not contact the server.
# Force one round trip so the "connected" log below is only emitted when a
# server is actually reachable; this raises if none responds.
client.admin.command("ping")

db = client.github_data
issues_col = db.issues

logger.info("🔌 MongoDB connected.")

[32m2025-05-29 07:41:13.538[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1m🔌 MongoDB connected.[0m


In [3]:
# We are interested in issues that are closed and referenced in commits.
# This dict is the filter later passed to issues_col.find().
query = {
    "issue.state": "closed",
    "timeline.event": "referenced"
}

# Backward-compatible alias: other cells refer to the filter by this
# (misspelled) name.
qurey = query

GitHub APIs


In [6]:
def get_commit_data(url):
    """
    Fetch one commit from the GitHub API and return its web URL and message.

    Args:
        url: GitHub API URL of the commit (the callers pass a timeline
            event's ``commit_url``).

    Returns:
        tuple: ``(html_url, message)`` for a 200 response. ``message`` is
        ``None`` when the payload has no commit message (an error is logged).
        ``(None, None)`` for any non-200 response.

    Raises:
        requests.exceptions.RequestException: network failures and timeouts
            are not caught here; the caller is expected to handle them.
    """
    # Only send the Authorization header when a token is configured.
    headers = {}
    if GITHUB_TOKEN:
        headers["Authorization"] = f"token {GITHUB_TOKEN}"

    # Without a timeout a stalled connection would block its worker thread forever.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code != 200:
        logger.error(f"Error fetching commit data from {url}: {response.status_code} - {response.text}")
        # Two values are returned because callers unpack the result.
        return None, None

    data = response.json()
    html_url = data.get("html_url")
    # "commit" may be absent or null; treat that like a missing message
    # instead of raising AttributeError.
    message = (data.get("commit") or {}).get("message")
    if not message:
        logger.error(f"Missing commit message in response: {data}")
    logger.success(f"✅ Successfully fetched commit data from {url}")
    return html_url, message

In [7]:
results_commite_id = []

csv_path = "../../data/commit_id-linking/issue-pr/linked_issues_commite_id.csv"

# Column order of the issue/commit CSV.
headers_issue_commite_id = [
    "issue_number", "issue_link", "issue_title", "issue_body",
    "linked_commit_id", "linked_commit_url", "linked_commit_message",
]

MAX_WORKERS_COMMIT_DATA = 10  # Number of threads to use for fetching commit data

# to_csv does not create missing directories, so make sure the target
# folder exists before writing the header-only file.
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
pd.DataFrame(columns=headers_issue_commite_id).to_csv(csv_path, index=False)
logger.info(f"Initialized CSV file at {csv_path}.")

# Fetch all records into a list to prevent cursor timeout during processing
logger.info("Fetching all records from MongoDB...")
all_records_list = list(issues_col.find(qurey))
toal_issue_count = len(all_records_list) # Actual number of records fetched
logger.info(f"Successfully fetched {toal_issue_count} records. Starting processing.")


[32m2025-05-29 07:41:31.569[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mInitialized CSV file at ../../data/commit_id-linking/issue-pr/linked_issues_commite_id.csv.[0m
[32m2025-05-29 07:41:31.569[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mFetching all records from MongoDB...[0m
[32m2025-05-29 07:41:34.877[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m19[0m - [1mSuccessfully fetched 1497 records. Starting processing.[0m


In [21]:

logger.info("Starting to process issues and fetch initial commit data concurrently...")

# Each entry is (record_idx, event_idx, row_dict). The indices are kept so the
# rows can be put back into submission order after the futures complete.
indexed_event_rows = []

with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS_COMMIT_DATA) as executor:
    future_to_event = {}
    for record_idx, record in enumerate(tqdm(all_records_list, total=toal_issue_count, desc="Submitting commit data tasks")):
        issue_data = record["issue"]
        # `or []` also covers a timeline stored as null.
        for event_idx, event in enumerate(record.get("timeline") or []):
            if event.get("event") != "referenced":
                continue
            linked_commit_url_api = event.get("commit_url") # This is the API URL for the commit
            if not linked_commit_url_api:
                continue
            future = executor.submit(get_commit_data, linked_commit_url_api)
            future_to_event[future] = (record_idx, event_idx, issue_data, event)

    logger.info(f"Submitted {len(future_to_event)} tasks to fetch commit data. Waiting for completion...")
    for future in tqdm(concurrent.futures.as_completed(future_to_event), total=len(future_to_event), desc="Processing commit data results"):
        record_idx, event_idx, issue_data, event = future_to_event[future]
        try:
            linked_commit_html_url, linked_commit_message = future.result()
            linked_commit_id = event.get("commit_id")

            if linked_commit_id and linked_commit_html_url: # Ensure we have the essential data
                indexed_event_rows.append((record_idx, event_idx, {
                    "issue_number": issue_data["number"],
                    "issue_link": issue_data.get("html_url"),
                    "issue_title": issue_data.get("title"),
                    "issue_body": issue_data.get("body"),
                    "linked_commit_id": linked_commit_id,
                    "linked_commit_url": linked_commit_html_url,
                    "linked_commit_message": linked_commit_message,
                }))
            elif linked_commit_id:
                logger.warning(f"Commit data fetched for commit ID {linked_commit_id} but HTML URL was missing. API URL was {event.get('commit_url')}")

        except Exception as exc:
            original_commit_url = event.get("commit_url")
            logger.error(f"Fetching commit data for {original_commit_url} (event in issue {issue_data.get('number')}) generated an exception: {exc}")

# as_completed yields futures in completion order, which differs between runs.
# Sorting by the submission position makes the CSV (and any later
# "keep first" de-duplication) reproducible.
indexed_event_rows.sort(key=lambda item: (item[0], item[1]))
processed_event_data = [event_row for _, _, event_row in indexed_event_rows]

logger.info(f"Finished fetching commit data. {len(processed_event_data)} events processed into commit details.")

# Convert processed_event_data to DataFrame and save to CSV
if processed_event_data:
    df_commits = pd.DataFrame(processed_event_data)
    # `columns=` selects columns by name, so the header can never be
    # misaligned with the data.
    df_commits.to_csv(csv_path, mode='w', columns=headers_issue_commite_id, index=False)
    logger.info(f"Saved {len(df_commits)} records with commit data to {csv_path}.")
else:
    logger.info("No commit data was successfully processed to save.")

logger.info("✅ Initial commit data processing complete.")

[32m2025-05-29 10:41:40.088[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mStarting to process issues and fetch initial commit data concurrently...[0m
Submitting commit data tasks: 100%|██████████| 1497/1497 [00:00<00:00, 5096.96it/s]
[32m2025-05-29 10:41:40.449[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mSubmitted 4244 tasks to fetch commit data. Waiting for completion...[0m
Submitting commit data tasks: 100%|██████████| 1497/1497 [00:00<00:00, 5096.96it/s]
[32m2025-05-29 10:41:40.449[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mSubmitted 4244 tasks to fetch commit data. Waiting for completion...[0m
Processing commit data results:   0%|          | 0/4244 [00:00<?, ?it/s][32m2025-05-29 10:41:41.266[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mget_commit_data[0m:[36m17[0m - [32m[1m✅ Successfully fetched commit data from https://api.github.com/repos/Chamika36/ballerina-

Loading the saved data


In [36]:
# Load the CSV file at csv_path into a DataFrame.
# csv_path is not assigned in this cell; it must already be defined in the kernel.
df = pd.read_csv(csv_path)

# Function to check if the commit message contains the issue number
def contains_issue_number(row):
    """
    Report whether a commit message references the row's issue.

    Args:
        row: mapping-like object (e.g. a DataFrame row) providing
            ``issue_number``, ``issue_link`` and ``linked_commit_message``.

    Returns:
        bool: True when the message contains the issue number in one of the
        recognised forms (``#123``, ``fixes 123``, ``gh-123``, a keyword
        followed by a GitHub issue URL ending in the number) or contains the
        issue link itself; False otherwise, including when the message or
        the issue number is missing.
    """
    raw_message = row['linked_commit_message']
    raw_number = row['issue_number']

    # Missing values must be detected before str(): str(NaN) is the ordinary
    # string "nan", which pd.isna() no longer recognises.
    if pd.isna(raw_message) or pd.isna(raw_number):
        return False
    commit_message = str(raw_message)

    # A number stored as a float (e.g. 123.0) must be searched for as "123".
    if isinstance(raw_number, float) and raw_number.is_integer():
        raw_number = int(raw_number)
    issue_num = re.escape(str(raw_number))

    keyword = r'(?:issue|fix(?:es|ed|ing|\s+for)?|close(?:s|d)?|resolve(?:s|d)?)'
    patterns = [
        rf'#{issue_num}\b',
        rf'\b{keyword}\s+#?{issue_num}\b',
        rf'\bgh-{issue_num}\b',  # Common GitHub reference
        rf'\b(?:Fix|Fixes|Fixing|Fix\s+for|Resolve|Resolved|Resolves|Closes|Closing|Issue|Issue\s+:)\s+https?://github\.com/[A-Za-z0-9_.-]+/[A-Za-z0-9_.-]+/issues/{issue_num}\b',
    ]

    # Match the issue link anywhere in the message, but only when a real link
    # is present (a missing link would otherwise turn into the pattern "nan").
    # The link is escaped because URLs contain regex metacharacters, and the
    # negative lookahead stops ".../issues/12" from matching ".../issues/123".
    raw_link = row['issue_link']
    if not pd.isna(raw_link) and str(raw_link).strip():
        patterns.append(re.escape(str(raw_link).strip()) + r'(?!\d)')

    return any(
        re.search(pattern, commit_message, re.IGNORECASE) is not None
        for pattern in patterns
    )

# Apply the function to create the new column
df['message_contains_issue_number'] = df.apply(contains_issue_number, axis=1)

# Save the flagged rows to their own CSV. The path is held in one variable so
# the write and the log line cannot drift apart.
issue_reference_csv_path = "../../data/commit_id-linking/issue-pr/message_contains_issue_reference.csv"
df.to_csv(issue_reference_csv_path, index=False)
# The log names the column that is actually written ('message_contains_issue_number').
logger.info(f"Saved DataFrame with 'message_contains_issue_number' to {issue_reference_csv_path}")

[32m2025-05-29 12:20:22.566[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m38[0m - [1mSaved DataFrame with 'message_contains_issue_reference' to ../../data/commit_id-linking/issue-pr/message_contains_issue_reference.csv[0m


In [37]:
# Preview the first 100 rows of df, including the 'message_contains_issue_number' column.
df.head(100)

Unnamed: 0,issue_number,issue_link,issue_title,issue_body,linked_commit_id,linked_commit_url,linked_commit_message,message_contains_issue_number
0,43957,https://github.com/ballerina-platform/ballerin...,[Bug]: function '****' is called before module...,### Description\n\nWith the Test - https://git...,55e504f7222fb117483d57148cd7f63b6c17b727,https://github.com/Chamika36/ballerina-lang/co...,hack: BRunUtil changed (to resolve https://git...,True
1,43957,https://github.com/ballerina-platform/ballerin...,[Bug]: function '****' is called before module...,### Description\n\nWith the Test - https://git...,d8fa5ae831f7be09f8db0eccebb5f4eb64087844,https://github.com/ballerina-platform/ballerin...,hack: BRunUtil changed (to resolve https://git...,True
2,43957,https://github.com/ballerina-platform/ballerin...,[Bug]: function '****' is called before module...,### Description\n\nWith the Test - https://git...,aba69565ee92dfd81365c2ad18393f814d061fcc,https://github.com/Chamika36/ballerina-lang/co...,Change BRunUtil (to resolve https://github.com...,True
3,43348,https://github.com/ballerina-platform/ballerin...,[Task]: Investigate into the ballerina library...,### Description\r\n\r\n$subject.\r\n\r\nQuotin...,dce36aeb7758486689ead2e62693c091aed2ed58,https://github.com/lochana-chathura/ballerina-...,Add tmp hack to stop sharing `allocatedIds` ac...,True
4,43855,https://github.com/ballerina-platform/ballerin...,[Bug]: Cannot Read Annotation Value at Runtime...,### Description\n\nIn the `ballerinax/ai.agent...,16b566888cae24f1b73c3c2bb5d8c4c1027c8069,https://github.com/ballerina-platform/ballerin...,Fix https://github.com/ballerina-platform/ball...,True
...,...,...,...,...,...,...,...,...
95,37315,https://github.com/ballerina-platform/ballerin...,On fail should only be allowed with regular-co...,**Description:**\r\nRef https://ballerina.io/s...,2591d7bd584f23cdb91a0c3d5f3561f06900a04e,https://github.com/SasinduDilshara/ballerina-l...,Merge branch 'master' of https://github.com/ba...,True
96,37450,https://github.com/ballerina-platform/ballerin...,Incorrect diagnostic property in query expression,**Description:**\r\nConsider the diagnostic `v...,00c725ce3d25fc376f9676cf7eb6b41ace82a191,https://github.com/SasinduDilshara/ballerina-l...,Merge branch 'master' of https://github.com/ba...,True
97,36714,https://github.com/ballerina-platform/ballerin...,Distinct anonymous objects don't work as expected,**Description:**\r\n$title.\r\n\r\nWe seem to ...,b51784d189ab2506e9ee289e94f13d1ec3e94437,https://github.com/SasinduDilshara/ballerina-l...,Merge branch 'master' of https://github.com/ba...,True
98,37450,https://github.com/ballerina-platform/ballerin...,Incorrect diagnostic property in query expression,**Description:**\r\nConsider the diagnostic `v...,190fc57697b855cbb713ffbdffa2deffa354d726,https://github.com/SasinduDilshara/ballerina-l...,Merge branch 'master' of https://github.com/ba...,True


In [38]:
# Function to parse owner and repo from a GitHub commit URL
def parse_commit_url(commit_url):
    """
    Extract ``(owner, repo)`` from a GitHub commit URL.

    Two URL shapes are tried in order: the HTML form
    ``https://github.com/owner/repo/commit/sha`` and then, as a fallback,
    the API form ``https://api.github.com/repos/owner/repo/commits/sha``.

    Returns:
        tuple: ``(owner, repo)`` on a match; ``(None, None)`` (with a warning
        logged) when the value is missing, not a string, or matches neither shape.
    """
    is_usable = not pd.isna(commit_url) and isinstance(commit_url, str)
    if not is_usable:
        logger.warning(f"Invalid commit_url for parsing: {commit_url}")
        return None, None

    url_patterns = (
        # HTML URLs like https://github.com/owner/repo/commit/sha
        r"https://github.com/([^/]+)/([^/]+)/commit/.*",
        # API URLs like https://api.github.com/repos/owner/repo/commits/sha
        r"https://api.github.com/repos/([^/]+)/([^/]+)/commits/.*",
    )
    for url_pattern in url_patterns:
        found = re.search(url_pattern, commit_url)
        if found:
            owner, repo = found.groups()
            return owner, repo

    logger.warning(f"Could not parse owner/repo from URL: {commit_url}")
    return None, None


In [39]:

# Helper: seconds to pause after a rate-limited GitHub response
def _rate_limit_wait_seconds(response, buffer_seconds):
    """
    Decide how long to sleep after a rate-limited response.

    Preference order: the ``Retry-After`` header; otherwise, when
    ``X-RateLimit-Remaining`` is ``0``, the time until ``X-RateLimit-Reset``
    (300 seconds if that header is missing or malformed); otherwise 60 seconds.
    ``buffer_seconds`` is added to whichever value is chosen.
    """
    retry_after = response.headers.get('Retry-After')
    if retry_after is not None:
        try:
            return max(0.0, float(retry_after)) + buffer_seconds
        except ValueError:
            pass  # not a number of seconds; fall through to the other headers

    if response.headers.get('X-RateLimit-Remaining') == '0':
        try:
            reset_time = int(response.headers['X-RateLimit-Reset'])
        except (KeyError, ValueError):
            reset_time = time.time() + 300
        return max(0, reset_time - time.time()) + buffer_seconds

    return 60 + buffer_seconds


# Function to fetch PR data for a commit
def get_pr_data_for_commit(owner, repo, commit_sha, max_retries=2):
    """
    Look up the pull requests associated with a commit via the GitHub API.

    Args:
        owner: repository owner.
        repo: repository name.
        commit_sha: SHA of the commit to look up.
        max_retries: how many times to retry after a rate-limited response
            (403/429). Each retry is preceded by a sleep.

    Returns:
        tuple: ``(number, html_url, title, body)`` of the first pull request in
        the response, or ``(None, None, None, None)`` when an argument is
        missing, the commit has no pull requests, the request fails or times
        out, the API returns an error, or the rate limit persists after all
        retries. Request errors are logged, never raised.
    """
    no_pr_data = (None, None, None, None)

    if not all([owner, repo, commit_sha]):
        logger.error(f"Missing owner, repo, or commit_sha for API call. Owner: {owner}, Repo: {repo}, SHA: {commit_sha}")
        return no_pr_data

    api_url = f"https://api.github.com/repos/{owner}/{repo}/commits/{commit_sha}/pulls"
    headers = {"Accept": "application/vnd.github.v3+json"}
    # Only send the Authorization header when a token is configured.
    if GITHUB_TOKEN:
        headers["Authorization"] = f"token {GITHUB_TOKEN}"

    total_attempts = max(0, max_retries) + 1
    for attempt in range(1, total_attempts + 1):
        try:
            response = requests.get(api_url, headers=headers, timeout=30)
        except requests.exceptions.Timeout:
            logger.error(f"Request timed out for {api_url}")
            return no_pr_data
        except requests.exceptions.RequestException as e:
            logger.error(f"Request failed for {api_url}: {e}")
            return no_pr_data

        remaining = response.headers.get('X-RateLimit-Remaining')

        # Rate-limited responses are retried instead of being recorded as
        # "no PR", which would silently drop the commit from the dataset.
        rate_limited = response.status_code in (403, 429) and (
            "rate limit" in response.text.lower() or remaining == "0"
        )
        if rate_limited:
            sleep_duration = _rate_limit_wait_seconds(response, buffer_seconds=15)
            logger.warning(f"Rate limit exceeded for {api_url} (attempt {attempt}/{total_attempts}). Sleeping for {sleep_duration:.0f} seconds.")
            time.sleep(sleep_duration)
            continue

        # Handle rate limits proactively: pause when the remaining quota is low.
        if remaining is not None and remaining.isdigit() and int(remaining) < 20:
            reset_time = int(response.headers.get('X-RateLimit-Reset', time.time() + 60))
            sleep_duration = max(0, reset_time - time.time()) + 10
            logger.warning(f"Rate limit low ({remaining}). Sleeping for {sleep_duration:.0f} seconds.")
            time.sleep(sleep_duration)

        if response.status_code != 200:
            logger.error(f"Error fetching PR data for {commit_sha[:7]} from {api_url}: {response.status_code} - {response.text[:200]}")
            return no_pr_data

        try:
            prs_data = response.json()
        except ValueError as e:
            # A 200 response whose body is not valid JSON.
            logger.error(f"Request failed for {api_url}: {e}")
            return no_pr_data

        if not prs_data:
            logger.info(f"ℹ️ No PRs found for commit {commit_sha[:7]} in {owner}/{repo}.")
            return no_pr_data

        pr = prs_data[0] # Take the first PR associated with the commit
        logger.success(f"✅ PR data for commit {commit_sha[:7]} in {owner}/{repo}: PR #{pr.get('number')}")
        return pr.get("number"), pr.get("html_url"), pr.get("title"), pr.get("body")

    # Every attempt was rate-limited.
    logger.error(f"Rate limit still exceeded for {api_url} after {total_attempts} attempts; giving up.")
    return no_pr_data


In [40]:
# Fetch pull-request data for every commit whose message references its issue.

logger.info("🚀 Starting to fetch PR data for commits concurrently...")

# Ensure df is loaded if this cell is run without the earlier cells.
try:
    df
except NameError:
    logger.info("DataFrame 'df' not found, loading from CSV...")
    # Load the CSV that carries the 'message_contains_issue_number' column;
    # the plain commit CSV lacks it, which would silently disable the filter below.
    df = pd.read_csv("../../data/commit_id-linking/issue-pr/message_contains_issue_reference.csv")

# To store results from concurrent execution. Each item will be a dict to update the DataFrame.
pr_data_results = []
MAX_WORKERS_PR_DATA = 10  # Number of threads to use for fetching PR data

with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS_PR_DATA) as executor:
    future_to_row_index = {}
    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Submitting PR data tasks"):
        commit_url = row.get('linked_commit_url') # HTML URL of the commit
        commit_sha = row.get('linked_commit_id')

        if pd.isna(commit_url) or pd.isna(commit_sha):
            # No task is submitted for these rows; they keep empty PR columns.
            continue

        # Compare by value, not identity: `is False` does not match numpy.bool_.
        # A missing flag (column absent or NaN) does not cause a skip.
        has_issue_reference = row.get('message_contains_issue_number')
        flag_present = has_issue_reference is not None and not pd.isna(has_issue_reference)
        if flag_present and not bool(has_issue_reference):
            logger.info(f"Skipping PR data fetch for commit {str(commit_sha)[:7]} as message does not contain issue number.")
            continue

        owner, repo = parse_commit_url(str(commit_url)) # parse_commit_url expects HTML URL

        # When parsing fails, parse_commit_url has already logged a warning
        # and no PR data is fetched for the row.
        if owner and repo:
            future = executor.submit(get_pr_data_for_commit, owner, repo, str(commit_sha))
            future_to_row_index[future] = index # Store original DataFrame index

    logger.info(f"Submitted {len(future_to_row_index)} tasks to fetch PR data. Waiting for completion...")
    for future in tqdm(concurrent.futures.as_completed(future_to_row_index), total=len(future_to_row_index), desc="Processing PR data results"):
        original_df_index = future_to_row_index[future]
        try:
            pr_number_val, pr_link_val, pr_title_val, pr_body_val = future.result()
        except Exception as exc:
            commit_sha_for_error = df.loc[original_df_index, 'linked_commit_id']
            logger.error(f"Fetching PR data for commit SHA {commit_sha_for_error} (DataFrame index {original_df_index}) generated an exception: {exc}")
            # Record an all-None result so the row still has an entry.
            pr_number_val = pr_link_val = pr_title_val = pr_body_val = None

        pr_data_results.append({
            'original_index': original_df_index,
            'pr_number': pr_number_val,
            'pr_link': pr_link_val,
            'pr_title': pr_title_val,
            'pr_body': pr_body_val
        })

logger.info(f"Finished fetching PR data. {len(pr_data_results)} results received.")

# Update the DataFrame with the fetched PR data
pr_columns = ['pr_number', 'pr_link', 'pr_title', 'pr_body']

# Create the PR columns unconditionally so later steps that select
# 'pr_number' still work when no PR data was fetched.
for col in pr_columns:
    df[col] = None

if pr_data_results:
    # Index the results by the original DataFrame index so .loc aligns rows.
    df_pr_updates = pd.DataFrame(pr_data_results).set_index('original_index')
    for col in pr_columns:
        df.loc[df_pr_updates.index, col] = df_pr_updates[col]

    logger.info("✅ PR data merged into DataFrame.")
else:
    logger.info("No PR data was successfully fetched to update the DataFrame.")

# Display some info about the new columns
print("\nDataFrame with new PR columns (first 5 rows with relevant columns):")
relevant_cols = ['issue_number', 'linked_commit_id', 'pr_number', 'pr_link', 'pr_title']
print(df[relevant_cols].head())

pr_found_count = df['pr_number'].notna().sum()
print(f"\nNumber of commits for which PR data was successfully found: {pr_found_count} out of {df.shape[0]}")

# The updated DataFrame is written to CSV in the next cell.


[32m2025-05-29 14:02:47.459[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1m🚀 Starting to fetch PR data for commits concurrently...[0m


Submitting PR data tasks:  33%|███▎      | 1407/4218 [00:00<00:00, 4148.31it/s][32m2025-05-29 14:02:48.118[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mSkipping PR data fetch for commit 69075a6 as message does not contain issue number.[0m
[32m2025-05-29 14:02:48.118[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mSkipping PR data fetch for commit 69075a6 as message does not contain issue number.[0m
Submitting PR data tasks:  47%|████▋     | 1968/4218 [00:00<00:00, 4666.63it/s][32m2025-05-29 14:02:48.222[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mSkipping PR data fetch for commit 3ef5e3c as message does not contain issue number.[0m
Submitting PR data tasks:  47%|████▋     | 1968/4218 [00:00<00:00, 4666.63it/s][32m2025-05-29 14:02:48.222[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1mSkipping PR data fetch for commit 3ef5e3c as message does not conta


DataFrame with new PR columns (first 5 rows with relevant columns):
   issue_number                          linked_commit_id pr_number  \
0         43957  55e504f7222fb117483d57148cd7f63b6c17b727   43958.0   
1         43957  d8fa5ae831f7be09f8db0eccebb5f4eb64087844       NaN   
2         43957  aba69565ee92dfd81365c2ad18393f814d061fcc      10.0   
3         43348  dce36aeb7758486689ead2e62693c091aed2ed58   43303.0   
4         43855  16b566888cae24f1b73c3c2bb5d8c4c1027c8069   43858.0   

                                             pr_link  \
0  https://github.com/ballerina-platform/ballerin...   
1                                               None   
2  https://github.com/Chamika36/ballerina-lang/pu...   
3  https://github.com/ballerina-platform/ballerin...   
4  https://github.com/ballerina-platform/ballerin...   

                                            pr_title  
0              Java Runtime Implementation for Query  
1                                               None  
2 

In [41]:
# Write df to its own CSV file, without the index column.
output_csv_path_with_pr = "../../data/commit_id-linking/issue-pr/linked_issues_commit_pr_data.csv"
df.to_csv(output_csv_path_with_pr, index=False)
# Logged only after to_csv has returned.
logger.info(f"💾 Updated DataFrame potentially saved to {output_csv_path_with_pr}")

[32m2025-05-29 15:40:50.632[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1m💾 Updated DataFrame potentially saved to ../../data/commit_id-linking/issue-pr/linked_issues_commit_pr_data.csv[0m


In [4]:
output_csv_path_with_pr = "../../data/commit_id-linking/issue-pr/linked_issues_commit_pr_data.csv"
# Load the dataset
df_loaded = pd.read_csv(output_csv_path_with_pr)
logger.info(f"Loaded dataset from {output_csv_path_with_pr}. Shape: {df_loaded.shape}")

# Remove rows where 'issue_number' or 'pr_number' is missing
df_cleaned = df_loaded.dropna(subset=['issue_number', 'pr_number'])
logger.info(f"Shape after dropping rows with missing 'issue_number' or 'pr_number': {df_cleaned.shape}")

# Drop duplicate issue/PR pairs, keeping the first occurrence.
# A PR number is only unique within one repository, and the linked commits
# come from several repositories (forks included), so the PR is identified
# by its link; otherwise two different PRs sharing a number would be merged.
df_cleaned = df_cleaned.drop_duplicates(subset=['issue_number', 'pr_link'], keep='first')
logger.info(f"Shape after dropping duplicate 'issue_number'-'pr_number' pairs: {df_cleaned.shape}")

# Display the first few rows of the cleaned DataFrame and its info
print("Cleaned DataFrame head:")
print(df_cleaned.head())
print("\nCleaned DataFrame info:")
df_cleaned.info()

# Save the cleaned DataFrame to a new CSV file
cleaned_csv_path = "../../data/commit_id-linking/issue-pr/linked_issues_commit_pr_data_cleaned.csv"
df_cleaned.to_csv(cleaned_csv_path, index=False)
logger.info(f"💾 Cleaned DataFrame saved to {cleaned_csv_path}")

[32m2025-05-29 18:12:33.120[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mLoaded dataset from ../../data/commit_id-linking/issue-pr/linked_issues_commit_pr_data.csv. Shape: (4218, 12)[0m
[32m2025-05-29 18:12:33.137[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mShape after dropping rows with missing 'issue_number' or 'pr_number': (2276, 12)[0m
[32m2025-05-29 18:12:33.137[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mShape after dropping rows with missing 'issue_number' or 'pr_number': (2276, 12)[0m
[32m2025-05-29 18:12:33.142[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mShape after dropping duplicate 'issue_number'-'pr_number' pairs: (1858, 12)[0m
[32m2025-05-29 18:12:33.142[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mShape after dropping duplicate 'issue_number'-'pr_number' pairs: (1858, 12)[0m


Cleaned DataFrame head:
    issue_number                                         issue_link  \
0          43957  https://github.com/ballerina-platform/ballerin...   
2          43957  https://github.com/ballerina-platform/ballerin...   
3          43348  https://github.com/ballerina-platform/ballerin...   
4          43855  https://github.com/ballerina-platform/ballerin...   
12         43344  https://github.com/ballerina-platform/ballerin...   

                                          issue_title  \
0   [Bug]: function '****' is called before module...   
2   [Bug]: function '****' is called before module...   
3   [Task]: Investigate into the ballerina library...   
4   [Bug]: Cannot Read Annotation Value at Runtime...   
12  [Improvement]: Refactor `BUnionType`'s  `getMe...   

                                           issue_body  \
0   ### Description\n\nWith the Test - https://git...   
2   ### Description\n\nWith the Test - https://git...   
3   ### Description\r\n\r\n$subject

[32m2025-05-29 18:12:33.708[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m24[0m - [1m💾 Cleaned DataFrame saved to ../../data/commit_id-linking/issue-pr/linked_issues_commit_pr_data_cleaned.csv[0m
