In [11]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

import re
import pandas as pd
from tqdm import tqdm
from loguru import logger
from pymongo import MongoClient
import requests 
import time 
import concurrent.futures

# GitHub Personal Access Token (optional, but recommended for higher rate limits)
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
if not GITHUB_TOKEN:
    logger.warning("GitHub token not found. Rate limits may apply.")

BATCH_SIZE = 500 # Number of records to process before writing to CSV

# Logger config
logger.add("logs/pr-to-issues-keyword.log")


3

# Initial data set


mongoDB connection


In [12]:
# Mongo connection
client = MongoClient("mongodb://localhost:27017/")
db = client.github_data
pull_request_collection = db.pull_requests
logger.info("üîå MongoDB connected.")

[32m2025-06-03 15:08:35.844[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1müîå MongoDB connected.[0m


In [13]:
# Query to filter closed pull requests
qurey = {
    "pull_request.state": "closed",
}

# Fetch all records into a list to prevent cursor timeout during processing
logger.info("Fetching all records from MongoDB...")
all_records_list = list(pull_request_collection.find(qurey))
toal_pull_request_count = len(all_records_list) # Actual number of records fetched
logger.info(f"Successfully fetched {toal_pull_request_count} records.")


[32m2025-06-03 15:08:41.198[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mFetching all records from MongoDB...[0m
[32m2025-06-03 15:09:04.007[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mSuccessfully fetched 25042 records.[0m


In [14]:
pr_details_list = [] # List to store dictionaries of PR details

def extract_pr_details(record):
    """
    Extracts relevant details (number, link, title, body) from a pull request record.
    """
    pr_data = record.get("pull_request", {})
    # Handle cases where 'pull_request' key might be missing or empty
    if not pr_data: 
        logger.warning(f"Skipping record due to missing 'pull_request' data. Record ID: {record.get('_id', 'N/A')}")
        return None

    pr_number = pr_data.get("number")
    pr_link = pr_data.get("html_url", None)
    pr_title = pr_data.get("title", None)
    # Use "body" for description as it's the common field for PR description
    pr_body = pr_data.get("body", None) 

    if pr_number is None:
        logger.warning(f"Skipping record due to missing PR number. Record ID: {record.get('_id', 'N/A')}")
        return None

    return {
        "pr_number": pr_number,
        "pr_link": pr_link,
        "pr_title": pr_title,
        "pr_body": pr_body, # "description" in user request, mapped to "body"
    }

# Use ThreadPoolExecutor for processing records concurrently
# os.cpu_count() provides a sensible default for max_workers
# Adjust max_workers based on specific I/O vs CPU characteristics if needed
logger.info(f"Starting extraction of PR details for {toal_pull_request_count} records using concurrent.futures...")

with concurrent.futures.ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    # Submit all tasks to the executor
    future_to_record = {executor.submit(extract_pr_details, record): record for record in all_records_list}
    
    # Process futures as they complete, with a progress bar
    for future in tqdm(concurrent.futures.as_completed(future_to_record), total=len(all_records_list), desc="Extracting PR Details"):
        record_for_future = future_to_record[future] # Get original record for logging in case of error
        try:
            result = future.result()
            if result:  # Add to list if extraction was successful (result is not None)
                pr_details_list.append(result)
        except Exception as exc:
            record_id = record_for_future.get("_id", "N/A")
            logger.error(f"Record ID {record_id} generated an exception during extraction: {exc}")

logger.info(f"Successfully processed all records. Extracted details for {len(pr_details_list)} pull requests.")

[32m2025-06-03 15:09:36.337[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m33[0m - [1mStarting extraction of PR details for 25042 records using concurrent.futures...[0m
Extracting PR Details: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 25042/25042 [00:00<00:00, 147144.03it/s]
[32m2025-06-03 15:09:42.987[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m50[0m - [1mSuccessfully processed all records. Extracted details for 25042 pull requests.[0m


In [15]:
# Define the path for the output CSV file
output_csv_path = "../data/keyword-linking/pullrequestToIssue/closed_prs_summary.csv"

# Convert the list of dictionaries to a pandas DataFrame and save to CSV
if pr_details_list:
    df_prs = pd.DataFrame(pr_details_list)
    
    # Ensure the columns are in the desired order for the CSV
    # "description" from user request is mapped to "pr_body"
    csv_headers = ["pr_number", "pr_link", "pr_title", "pr_body"]
    df_prs = df_prs[csv_headers]

    try:
        # Save the DataFrame to CSV, overwriting if the file exists
        df_prs.to_csv(output_csv_path, index=False, mode='w')
        logger.info(f"Successfully saved PR details to {output_csv_path}")
    except Exception as e:
        logger.error(f"Failed to save PR details to CSV at {output_csv_path}: {e}")
else:
    logger.warning("No PR details were extracted (or all records were skipped). CSV file will not be created.")

[32m2025-06-03 15:10:25.477[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mSuccessfully saved PR details to ../data/keyword-linking/pullrequestToIssue/closed_prs_summary.csv[0m


# Key word processing


## Title


In [14]:
import pandas as pd
import re
from loguru import logger
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from typing import List, Tuple, Optional, Dict
import time


# Thread-safe lock for writing results
write_lock = threading.Lock()

csv_path = "../data/keyword-linking/pullrequestToIssue/closed_prs_summary.csv"

def extract_all_issue_numbers_from_title(title: str) -> List[Dict[str, str]]:
    """
    Extracts all issue numbers from pull request title.
    Returns a list of dictionaries with issue_number and issue_link (if found as URL).
    """
    if pd.isna(title) or not title.strip():
        return []

    # Covers: fix(es/ed/ing), clos(e/es/ed), resolv(e/es/ed), issue(s), Backport, Revert(s)
    keyword_pattern = r'\b(?:fix(?:es|ed|ing)?|clos(?:e|es|ed)|resolv(?:e|es|ed)|issue|Backport|Revert(?:s)?)\b'

    # Comprehensive patterns to find issue numbers, ordered by specificity
    issue_patterns = [
        # Priority 1: Keywords followed by colon and GitHub URL (most specific)
        rf'{keyword_pattern}:\s+(https?://github\.com/[^/\s]+/[^/\s]+/(?:issues|pull)/(\d+))',
        
        # Priority 2: Keywords followed by full GitHub issue URL
        rf'{keyword_pattern}\s+(https?://github\.com/[^/\s]+/[^/\s]+/(?:issues|pull)/(\d+))',
        
        # Priority 3: Keywords followed by repo path and issue number
        rf'{keyword_pattern}\s+[^/\s]+/[^/\s]+/issues/(\d+)',
        
        # Priority 4: Keywords + <text> + GitHub issue links + optional plain number
        rf'{keyword_pattern}.*<.*?>\s+(https?://github\.com/[^/\s]+/[^/\s]+/(?:issues|pull)/(\d+))(?:\s+(\d+))*',
        
        # Priority 5: Keywords + <text> + multiple issue numbers
        rf'{keyword_pattern}.*<.*?>\s+#(\d+)(?:\s+#(\d+))*',
        
        # Priority 6: Keywords followed by square brackets
        rf'{keyword_pattern}.*\[#(\d+)\]',
        
        # Priority 7: Square brackets followed by keywords
        rf'\[#(\d+)\].*{keyword_pattern}',
        
        # Priority 8: Keywords followed by (from #number)
        rf'{keyword_pattern}.*\(from\s+#(\d+)\)',
        
        # Priority 9: Keywords followed by "part of #number" or "part of issue #number"
        rf'{keyword_pattern}.*part\s+of\s+(?:issue\s+)?#(\d+)',
        
        # Priority 10: Keywords followed by "/fix-#number" or "/fix-number"
        rf'{keyword_pattern}.*(?:/fix-#(\d+)|/fix-(\d+))',
        
        # Priority 11: Multiple issues in one line (e.g., "fixes #123, #456, #789")
        rf'{keyword_pattern}[^#]*?((?:#\d+(?:\s*,\s*#\d+)*)+)',
        
        # Priority 12: Keywords followed by any text and #number
        rf'{keyword_pattern}.*#(\d+)',
        
        # Priority 13: Keywords followed by #number (direct)
        rf'{keyword_pattern}\s+#(\d+)',
        
        # Priority 14: Keywords followed by plain number (no #)
        rf'{keyword_pattern}\s+(\d+)\b',
        
        # Priority 15: #number followed by keyword
        rf'#(\d+)\s+{keyword_pattern}',
        
        # Priority 16: Specific "Address review suggestions" pattern
        rf'Address\s+review\s+suggestions.*#(\d+)',
        
        # Priority 17: Multiple issue numbers after "issues" (e.g., "issues #1159 #947")
        r'\bissues?\s+#(\d+)(?:\s+#(\d+))*',
        
        # Priority 18: GitHub URLs without keywords (issues only, not pull requests)
        r'(https?://github\.com/[^/\s]+/[^/\s]+/issues/(\d+))',
    ]

    found_issues = []
    
    for pattern in issue_patterns:
        matches = re.finditer(pattern, title, re.IGNORECASE)
        for match in matches:
            groups = match.groups()
            
            # Check if this is a URL pattern (contains full GitHub URL)
            full_url = None
            issue_number = None
            
            for group in groups:
                if group and group.startswith('http'):
                    full_url = group
                elif group and group.isdigit():
                    issue_number = group
                    
            if full_url and issue_number:
                # Found a full URL with issue number
                if not any(item['issue_number'] == issue_number for item in found_issues):
                    found_issues.append({
                        'issue_number': issue_number,
                        'issue_link': full_url
                    })
                continue
            
            # Handle non-URL patterns
            for group in groups:
                if group:
                    # Handle multiple issues in one match (e.g., "#123, #456")
                    if '#' in group:
                        issue_nums = re.findall(r'#(\d+)', group)
                        for issue_num in issue_nums:
                            if not any(item['issue_number'] == issue_num for item in found_issues):
                                found_issues.append({
                                    'issue_number': issue_num,
                                    'issue_link': None
                                })
                    elif group.isdigit():
                        if not any(item['issue_number'] == group for item in found_issues):
                            found_issues.append({
                                'issue_number': group,
                                'issue_link': None
                            })
    
    return found_issues

def generate_issue_link(issue_number: str, pr_link: str) -> str:
    """
    Generate GitHub issue link based on issue number and PR link.
    Assumes same repository as the PR.
    """
    if not issue_number or not pr_link:
        return ""
    
    # Extract repository info from PR link
    # Example: https://github.com/owner/repo/pull/123 -> https://github.com/owner/repo/issues/456
    match = re.match(r'(https?://github\.com/[^/]+/[^/]+)/', pr_link)
    if match:
        repo_base = match.group(1)
        return f"{repo_base}/issues/{issue_number}"
    
    return ""

def process_row(row_data: Tuple[int, pd.Series]) -> List[dict]:
    """
    Process a single row and return list of records (one per issue found).
    """
    idx, row = row_data
    
    try:
        title = str(row.get('pr_title', ''))
        pr_link = str(row.get('pr_link', ''))
        
        # Extract all issue numbers from title with their URLs (if found)
        issue_data = extract_all_issue_numbers_from_title(title)
        
        results = []
        
        if issue_data:
            # Create separate record for each issue found
            for issue_info in issue_data:
                issue_number = issue_info['issue_number']
                found_url = issue_info['issue_link']
                
                # Use found URL if available, otherwise generate one
                if found_url:
                    issue_link = found_url
                else:
                    issue_link = generate_issue_link(issue_number, pr_link)
                
                # Create new record with all original data plus extracted info
                new_record = row.to_dict()
                new_record.update({
                    'extracted_issue_number': issue_number,
                    'extracted_issue_link': issue_link,
                    'found_in_title': True,
                    'original_row_index': idx
                })
                results.append(new_record)
                
            logger.info(f"Row {idx}: Found {len(issue_data)} issues in title: {[item['issue_number'] for item in issue_data]}")
        else:
            # No issues found, keep original record with null values
            new_record = row.to_dict()
            new_record.update({
                'extracted_issue_number': None,
                'extracted_issue_link': None,
                'found_in_title': False,
                'original_row_index': idx
            })
            results.append(new_record)
            
        return results
        
    except Exception as e:
        logger.error(f"Error processing row {idx}: {str(e)}")
        # Return original record with error info
        new_record = row.to_dict()
        new_record.update({
            'extracted_issue_number': None,
            'extracted_issue_link': None,
            'found_in_title': False,
            'original_row_index': idx,
            'processing_error': str(e)
        })
        return [new_record]

def run():
    """
    Main function to process the CSV and create new output file.
    """
    logger.info("Starting issue number extraction process")
    
    try:
        # Load the CSV file
        logger.info(f"Loading CSV file: {csv_path}")
        df = pd.read_csv(csv_path)
        logger.info(f"Loaded {len(df)} records from CSV")
        
        # Prepare data for parallel processing
        row_data = list(df.iterrows())
        
        # Process rows in parallel
        all_results = []
        
        logger.info("Starting parallel processing...")
        with ThreadPoolExecutor(max_workers=4) as executor:
            # Submit all tasks
            future_to_row = {executor.submit(process_row, row): row[0] for row in row_data}
            
            # Process completed tasks with progress bar
            for future in tqdm(as_completed(future_to_row), total=len(future_to_row), desc="Processing rows"):
                try:
                    row_results = future.result()
                    with write_lock:
                        all_results.extend(row_results)
                except Exception as e:
                    row_idx = future_to_row[future]
                    logger.error(f"Failed to process row {row_idx}: {str(e)}")
        
        # Create new DataFrame from results
        logger.info(f"Creating new DataFrame with {len(all_results)} records")
        new_df = pd.DataFrame(all_results)
        
        # Generate output filename
        output_path = csv_path.replace('.csv', '_with_extracted_issues.csv')
        
        # Save to new CSV
        logger.info(f"Saving results to: {output_path}")
        new_df.to_csv(output_path, index=False)
        
        # Log statistics
        total_original_rows = len(df)
        total_new_rows = len(new_df)
        rows_with_issues = len(new_df[new_df['found_in_title'] == True])
        unique_issues_found = len(new_df[new_df['extracted_issue_number'].notna()]['extracted_issue_number'].unique())
        
        # Additional statistics for link sources
        extracted_links = len(new_df[(new_df['extracted_issue_link'].notna()) & 
                                    (new_df['extracted_issue_link'].str.startswith('http'))])
        generated_links = total_new_rows - extracted_links - len(new_df[new_df['extracted_issue_link'].isna()])
        
        logger.info("="*50)
        logger.info("PROCESSING SUMMARY")
        logger.info("="*50)
        logger.info(f"Original rows: {total_original_rows}")
        logger.info(f"New rows: {total_new_rows}")
        logger.info(f"Rows with issues found: {rows_with_issues}")
        logger.info(f"Unique issues extracted: {unique_issues_found}")
        logger.info(f"Links used from title: {extracted_links}")
        logger.info(f"Links generated: {generated_links}")
        logger.info(f"Expansion ratio: {total_new_rows/total_original_rows:.2f}x")
        logger.info(f"Output file: {output_path}")
        logger.info("="*50)
        
        print(f"\n‚úÖ Processing completed successfully!")
        print(f"üìä Original rows: {total_original_rows}")
        print(f"üìä New rows: {total_new_rows}")
        print(f"üìä Rows with issues: {rows_with_issues}")
        print(f"üìä Unique issues found: {unique_issues_found}")
        print(f"üîó Links from title: {extracted_links}")
        print(f"üîó Generated links: {generated_links}")
        print(f"üíæ Output saved to: {output_path}")
        
        return new_df
        
    except Exception as e:
        logger.error(f"Fatal error in main process: {str(e)}")
        raise

run()

[32m2025-06-06 15:18:02.008[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun[0m:[36m216[0m - [1mStarting issue number extraction process[0m
[32m2025-06-06 15:18:02.012[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun[0m:[36m220[0m - [1mLoading CSV file: ../data/keyword-linking/pullrequestToIssue/closed_prs_summary.csv[0m
[32m2025-06-06 15:18:02.370[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun[0m:[36m222[0m - [1mLoaded 25042 records from CSV[0m
[32m2025-06-06 15:18:03.987[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun[0m:[36m230[0m - [1mStarting parallel processing...[0m
[32m2025-06-06 15:18:04.015[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_row[0m:[36m185[0m - [1mRow 41: Found 1 issues in title: ['12492'][0m
[32m2025-06-06 15:18:04.026[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_row[0m:[36m185[0m - [1mRow 89: Found 1 issues in title: ['12492'][0m
[32m2025-06-06 15:18:04.033[0m | [1mINFO    [0m | [36m__ma


‚úÖ Processing completed successfully!
üìä Original rows: 25042
üìä New rows: 25065
üìä Rows with issues: 313
üìä Unique issues found: 291
üîó Links from title: 313
üîó Generated links: 0
üíæ Output saved to: ../data/keyword-linking/pullrequestToIssue/closed_prs_summary_with_extracted_issues.csv


Unnamed: 0,pr_number,pr_link,pr_title,pr_body,extracted_issue_number,extracted_issue_link,found_in_title,original_row_index
0,3160,https://github.com/ballerina-platform/ballerin...,Update web socket sample for action invoation ...,,,,False,10799
1,42658,https://github.com/ballerina-platform/ballerin...,Create resource jars separately to read the re...,"## Purpose\r\n> Describe the problems, issues,...",,,False,7855
2,8108,https://github.com/ballerina-platform/ballerin...,Fix issues in list command help,## Purpose\r\n> This PR fixes issues in list c...,,,False,7854
3,43690,https://github.com/ballerina-platform/ballerin...,Update full build pipeline of PR build,## Purpose\r\n> $title.\r\n\r\nFixes #<Issue N...,,,False,7853
4,9535,https://github.com/ballerina-platform/ballerin...,Update keyword for composer syntax highlighting,## Purpose\r\nUpdate entries keywords in monar...,,,False,7852
...,...,...,...,...,...,...,...,...
25060,8235,https://github.com/ballerina-platform/ballerin...,Fix WebSub doc issues,## Purpose\r\nFix WebSub doc issues,,,False,17195
25061,12351,https://github.com/ballerina-platform/ballerin...,Update BBE with sync send and flush,## Purpose\r\n> Add sync send and flush to the...,,,False,14039
25062,25356,https://github.com/ballerina-platform/ballerin...,"Fix New Line Formatting in ""java.jdbc"" API Docs",## Purpose\r\nFix the new line formatting in j...,,,False,22298
25063,39163,https://github.com/ballerina-platform/ballerin...,Move architecture model generator implementati...,## Purpose\r\n> This PR removes the architectu...,,,False,14038


## Discription

In [5]:
import pandas as pd
import os  
import re
from loguru import logger
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from typing import List, Tuple, Optional, Dict
from urllib.parse import urlparse

# Initialize the write lock for thread safety
write_lock = threading.Lock()

def generate_issue_link(issue_number: str, pr_link: str) -> str:
    """
    Generate an issue link based on the issue number and PR link.
    Extracts the repository information from the PR link and constructs the issue URL.
    """
    try:
        if not pr_link or pd.isna(pr_link):
            return f"https://github.com/unknown/unknown/issues/{issue_number}"
        
        # Parse the PR link to extract owner and repo
        # Expected format: https://github.com/owner/repo/pull/123
        parsed_url = urlparse(str(pr_link))
        path_parts = parsed_url.path.strip('/').split('/')
        
        if len(path_parts) >= 2:
            owner = path_parts[0]
            repo = path_parts[1]
            return f"https://github.com/{owner}/{repo}/issues/{issue_number}"
        else:
            return f"https://github.com/unknown/unknown/issues/{issue_number}"
            
    except Exception as e:
        logger.warning(f"Error generating issue link for issue {issue_number} from PR {pr_link}: {str(e)}")
        return f"https://github.com/unknown/unknown/issues/{issue_number}"

def extract_all_issue_numbers_and_links_from_text(text: str) -> List[Dict[str, str]]:
    """
    Extracts all issue numbers and their corresponding links from a given text string.
    EXCLUDES issues found in remarks/notes sections.
    
    This function uses two methods:
    1. A list of regular expressions to find standard issue references (e.g., "fixes #123").
    2. A specific procedural search for lists of issues that appear after the phrase 
       "Following issues".
       
    Returns a list of dictionaries with 'issue_number' and 'issue_link' keys.
    If an issue is referenced as a link, the existing link is preserved.
    If an issue is referenced as just a number, issue_link will be None (to be generated later).
    """
    if pd.isna(text) or not text.strip():
        return []

    # First, identify and exclude remarks sections
    filtered_text = exclude_remarks_sections(text)
    
    found_issues = {}  # Use dict to store issue_number -> link mapping

    # --- Part 1: General Regex-Based Extraction (Modified to capture links) ---
    
    # Covers: fix(es/ed/ing), clos(e/es/ed), resolv(e/es/ed), issue(s)
    keyword_pattern = r'\b(?:fix(?:es|ed|ing)?|clos(?:e|es|ed)|resolv(?:e|es|ed)|issue(?:s)?)\b'

    issue_patterns = [
        # Matches: keyword + full GitHub issue/PR URL - CAPTURE THE FULL URL
        (rf'{keyword_pattern}\s+(https?://github\.com/[^/\s]+/[^/\s]+/(?:issues|pull)/(\d+))', 'full_url'),
        # Matches: "Fixes: " + full GitHub issue URL - CAPTURE THE FULL URL
        (rf'Fixes:\s+(https?://github\.com/[^/\s]+/[^/\s]+/issues/(\d+))', 'full_url'),
        # Matches: standalone full GitHub issue URL - CAPTURE THE FULL URL
        (r'(https?://github\.com/[^/\s]+/[^/\s]+/issues/(\d+))', 'full_url'),
        # Matches: "Fix " + org/repo/issues/issue_number - CAPTURE THE PARTIAL PATH
        (rf'Fix\s+([^/\s]+/[^/\s]+/issues/(\d+))', 'partial_path'),
        # Matches: keyword + #issue_number (e.g., "closes #456", "Fixes #32160")
        (rf'{keyword_pattern}\s+#(\d+)\b', 'hash_only'),
        # Matches: keyword + issue_number
        (rf'{keyword_pattern}\s+(\d+)\b', 'number_only'),
        # Matches: keyword + text + (from #issue_number)
        (rf'{keyword_pattern}.*\(from\s+#(\d+)\)', 'hash_only'),
        # Matches: keyword + text + #issue_number (a bit broad, but can be useful)
        (rf'{keyword_pattern}.*?#(\d+)\b', 'hash_only'),
        # Matches: keyword + text + (part of #issue_number)
        (rf'{keyword_pattern}.*?(?:part of\s+(?:issue\s+)?#(\d+)\b)', 'hash_only'),
        # Matches: keyword + text + /fix-#issue_number
        (rf'{keyword_pattern}.*?(?:/fix-#(\d+)\b|/fix-(\d+)\b)', 'number_only'),
        # Matches: keyword + multiple #issue_numbers separated by commas
        (rf'{keyword_pattern}[^#]*?((?:#\d+(?:\s*,\s*#\d+)*)+)', 'multiple_hash'),
    ]

    # Apply patterns to filtered text only
    for pattern_info in issue_patterns:
        pattern, pattern_type = pattern_info
        # Using re.IGNORECASE to match keywords like "Fixes" or "fixes"
        matches = re.finditer(pattern, filtered_text, re.IGNORECASE)
        for match in matches:
            if pattern_type == 'full_url':
                # Full URL captured - use it directly
                full_url = match.group(1)
                issue_number = match.group(2)
                found_issues[issue_number] = full_url
                
            elif pattern_type == 'partial_path':
                # Partial path captured - construct full URL
                partial_path = match.group(1)
                issue_number = match.group(2)
                full_url = f"https://github.com/{partial_path}"
                found_issues[issue_number] = full_url
                
            elif pattern_type == 'multiple_hash':
                # Multiple issues in one match (e.g., "#123, #456")
                group_val = match.group(1)
                if group_val and '#' in group_val:
                    issue_nums_in_group = re.findall(r'#(\d+)', group_val)
                    for issue_num in issue_nums_in_group:
                        if issue_num not in found_issues:
                            found_issues[issue_num] = None  # No link provided
                            
            else:  # hash_only, number_only
                # Just issue number captured - no link provided
                for group_val in match.groups():
                    if group_val and group_val.isdigit():
                        if group_val not in found_issues:
                            found_issues[group_val] = None  # No link provided

    # --- Part 1.5: Capture standalone issue references from filtered text ---
    # This captures any #issue_number that wasn't caught by keyword patterns
    # but only from the filtered text (excluding remarks sections)
    standalone_issue_matches = re.findall(r'#(\d+)\b', filtered_text)
    for issue_num in standalone_issue_matches:
        if issue_num not in found_issues:
            found_issues[issue_num] = None  # No link provided

    # --- Part 2: Procedural Extraction for "Following issues" List ---
    
    # Process only the filtered text for "Following issues" sections
    process_following_issues_section(filtered_text, found_issues)
    
    # Convert to list of dictionaries and sort
    result = []
    for issue_number in sorted(found_issues.keys(), key=int):
        result.append({
            'issue_number': issue_number,
            'issue_link': found_issues[issue_number]
        })
    
    return result

def exclude_remarks_sections(text: str) -> str:
    """
    Removes remarks/notes sections from the text to prevent extracting issues from them.
    This handles both markdown-style sections (##) and other formats.
    """
    lines = text.splitlines()
    filtered_lines = []
    in_remarks_section = False
    
    # Keywords that indicate a remarks/notes section
    remarks_keywords = [
        'remarks', 'remark', 'note', 'notes', 'comment', 'comments', 
        'todo', 'todos', 'known issues', 'limitations', 'caveats'
    ]
    
    for line in lines:
        line_stripped = line.strip().lower()
        
        # Check if this line starts a remarks section
        if line.strip().startswith('##') or line.strip().startswith('#'):
            # Extract the header text after the ## or #
            header_text = re.sub(r'^#+\s*', '', line.strip(), flags=re.IGNORECASE).lower()
            
            # Check if this is a remarks section
            if any(keyword in header_text for keyword in remarks_keywords):
                in_remarks_section = True
                continue  # Skip this line
            else:
                # This is a different section, we're no longer in remarks
                in_remarks_section = False
                filtered_lines.append(line)
        elif in_remarks_section:
            # Skip lines that are in the remarks section
            continue
        else:
            # Regular line, not in remarks section
            filtered_lines.append(line)
    
    return '\n'.join(filtered_lines)

def process_following_issues_section(text: str, found_issues: Dict[str, Optional[str]]) -> None:
    """
    Process text to find "Following issues" sections and extract issue numbers from them.
    This modifies the found_issues dictionary in place.
    """
    is_in_following_issues_section = False
    
    for line in text.splitlines():
        line_stripped = line.strip()
        
        # Turn on the flag when the trigger phrase is found with keyword validation
        if "following issues" in line.lower():
            # Check if the line contains relevant keywords
            relevant_keywords = [
                "fix", "fixes", "fixed", "fixing",
                "close", "closes", "closed", "closing",
                "resolve", "resolves", "resolved", "resolving",
                "address", "addresses", "addressed", "addressing",
                "implement", "implements", "implemented", "implementing",
                "handle", "handles", "handled", "handling",
                "track", "tracking", "tracked"
            ]
            
            # Check if any relevant keyword is present in the line
            if any(keyword in line.lower() for keyword in relevant_keywords):
                is_in_following_issues_section = True
                continue

        # If the flag is on, start extracting numbers and links
        if is_in_following_issues_section:
            # If the line is empty or starts a new section, the list is over
            if not line_stripped or line_stripped.startswith("##") or line_stripped.startswith("#"):
                is_in_following_issues_section = False
                continue

            # First, look for full GitHub URLs in the line
            url_matches = re.finditer(r'(https?://github\.com/[^/\s]+/[^/\s]+/issues/(\d+))', line)
            for url_match in url_matches:
                issue_number = url_match.group(2)
                issue_url = url_match.group(1)
                found_issues[issue_number] = issue_url

            # Then find all #<number> patterns on the line that weren't already captured as URLs
            hash_matches = re.findall(r'#(\d+)\b', line)
            for issue_num in hash_matches:
                if issue_num not in found_issues:
                    found_issues[issue_num] = None  # No link provided

def process_row_for_body(row_data: Tuple[int, pd.Series]) -> List[dict]:
    """
    Process a single row to extract issue numbers and links from 'pr_body'.
    Returns a list of dictionaries, one for each issue found, or one original if none found.
    """
    idx, row = row_data
    
    try:
        pr_body = str(row.get('pr_body', ''))
        pr_link = str(row.get('pr_link', ''))
        
        # Extract all issue numbers and links from pr_body
        issues_data = extract_all_issue_numbers_and_links_from_text(pr_body)
        
        results = []
        
        if issues_data:
            # Create separate record for each issue found in body
            for issue_data in issues_data:
                issue_num = issue_data['issue_number']
                existing_link = issue_data['issue_link']
                
                # Use existing link if provided, otherwise generate one
                if existing_link:
                    final_issue_link = existing_link
                    link_source = 'extracted_from_text'
                else:
                    # Generate link using the function defined above
                    final_issue_link = generate_issue_link(issue_num, pr_link)
                    link_source = 'generated'
                
                new_record = row.to_dict()
                new_record.update({
                    'extracted_issue_number_body': issue_num,
                    'extracted_issue_link_body': final_issue_link,
                    'issue_link_source': link_source,  # Track whether link was extracted or generated
                    'found_in_body': True,
                    'original_row_index': idx  # Keep track of the original row
                })
                results.append(new_record)
                
            logger.info(f"Row {idx}: Found {len(issues_data)} issues in body: {[item['issue_number'] for item in issues_data]}")
        else:
            # No issues found in body, keep original record with null/false values for body extraction
            new_record = row.to_dict()
            new_record.update({
                'extracted_issue_number_body': None,
                'extracted_issue_link_body': None,
                'issue_link_source': None,
                'found_in_body': False,
                'original_row_index': idx
            })
            results.append(new_record)
            
        return results
        
    except Exception as e:
        logger.error(f"Error processing row {idx} for body extraction: {str(e)}")
        # Return original record with error info for body extraction
        new_record = row.to_dict()
        new_record.update({
            'extracted_issue_number_body': None,
            'extracted_issue_link_body': None,
            'issue_link_source': None,
            'found_in_body': False,
            'original_row_index': idx,
            'processing_error_body': str(e)
        })
        return [new_record]

def run_body_extraction(csv_file_path: str = None):
    """
    Main function to process the CSV for issues in 'pr_body' and create a new output file.
    
    Args:
        csv_file_path: Path to the CSV file to process. If None, will look for common default paths.
    """
    logger.info("Starting issue number extraction from PR BODY content")
    
    try:
        # Determine CSV path
        if csv_file_path:
            current_csv_path = csv_file_path
        elif 'csv_path' in globals() and globals()['csv_path']:
            current_csv_path = globals()['csv_path']
        else:
            # Try common default paths
            possible_paths = [
                "../data/keyword-linking/pullrequestToIssue/closed_prs_summary.csv",
                "./closed_prs_summary.csv",
                "closed_prs_summary.csv"
            ]
            current_csv_path = None
            for path in possible_paths:
                if os.path.exists(path):
                    current_csv_path = path
                    break
            
            if not current_csv_path:
                raise FileNotFoundError(f"Could not find CSV file. Tried paths: {possible_paths}")

        logger.info(f"Loading CSV file: {current_csv_path}")
        df = pd.read_csv(current_csv_path)
        # df = df.head(1000)  # Limit to first 1000 rows for testing; remove in production
        logger.info(f"Loaded {len(df)} records from CSV for body extraction")
        
        # Prepare data for parallel processing
        row_data_for_body = list(df.iterrows())
        
        all_results_body = []
        
        logger.info("Starting parallel processing for PR bodies...")
        # Using CPU count for optimal performance
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
            future_to_row_body = {executor.submit(process_row_for_body, row_bd): row_bd[0] for row_bd in row_data_for_body}
            
            for future_bd in tqdm(as_completed(future_to_row_body), total=len(future_to_row_body), desc="Processing PR Bodies"):
                try:
                    row_results_bd = future_bd.result()
                    with write_lock: 
                        all_results_body.extend(row_results_bd)
                except Exception as e_bd:
                    row_idx_bd = future_to_row_body[future_bd]
                    logger.error(f"Failed to process row {row_idx_bd} for body extraction in main loop: {str(e_bd)}")
        
        logger.info(f"Creating new DataFrame with {len(all_results_body)} records from body extraction")
        new_df_body = pd.DataFrame(all_results_body)
        
        # Generate output filename for body extraction results
        output_path_body = current_csv_path.replace('.csv', '_with_extracted_issues_from_body.csv')
        
        logger.info(f"Saving body extraction results to: {output_path_body}")
        new_df_body.to_csv(output_path_body, index=False)
        
        # Log statistics for body extraction - with safe column checking
        total_original_rows_bd = len(df)
        total_new_rows_bd = len(new_df_body)
        
        # Safe column checking to avoid KeyError
        rows_with_issues_in_body = 0
        unique_issues_found_in_body = 0
        extracted_links_count = 0
        generated_links_count = 0
        
        if 'found_in_body' in new_df_body.columns:
            rows_with_issues_in_body = len(new_df_body[new_df_body['found_in_body'] == True])
        
        if 'extracted_issue_number_body' in new_df_body.columns:
            unique_issues_found_in_body = len(new_df_body[new_df_body['extracted_issue_number_body'].notna()]['extracted_issue_number_body'].unique())
        
        if 'issue_link_source' in new_df_body.columns:
            extracted_links_count = len(new_df_body[new_df_body['issue_link_source'] == 'extracted_from_text'])
            generated_links_count = len(new_df_body[new_df_body['issue_link_source'] == 'generated'])
        
        logger.info("="*50)
        logger.info("PR BODY EXTRACTION PROCESSING SUMMARY")
        logger.info("="*50)
        logger.info(f"Original rows processed: {total_original_rows_bd}")
        logger.info(f"New rows generated (from body): {total_new_rows_bd}")
        logger.info(f"Rows with issues found in body: {rows_with_issues_in_body}")
        logger.info(f"Unique issues extracted from body: {unique_issues_found_in_body}")
        logger.info(f"Links extracted from text: {extracted_links_count}")
        logger.info(f"Links generated: {generated_links_count}")
        if total_original_rows_bd > 0:
            logger.info(f"Expansion ratio (body): {total_new_rows_bd/total_original_rows_bd:.2f}x")
        logger.info(f"Output file (body extraction): {output_path_body}")
        logger.info("="*50)
        
        print(f"\n‚úÖ PR Body issue extraction completed successfully!")
        print(f"üìä Original rows: {total_original_rows_bd}")
        print(f"üìä New rows (from body): {total_new_rows_bd}")
        print(f"üìä Rows with issues found in body: {rows_with_issues_in_body}")
        print(f"üìä Unique issues found in body: {unique_issues_found_in_body}")
        print(f"üîó Links extracted from text: {extracted_links_count}")
        print(f"üîó Links generated: {generated_links_count}")
        print(f"üíæ Output saved to: {output_path_body}")
        
        return new_df_body
        
    except FileNotFoundError as e:
        logger.error(f"Error: {str(e)}")
        print(f"‚ùå Error: {str(e)}")
        return None
    except Exception as e:
        logger.error(f"Fatal error in PR body extraction process: {str(e)}")
        print(f"‚ùå Fatal error in PR body extraction process: {str(e)}")
        raise

# Execute the body extraction process
# You can specify a custom CSV path or let it auto-detect
result_df_body = run_body_extraction()

# If you want to specify a custom path, use:
# result_df_body = run_body_extraction("path/to/your/csv/file.csv")

[32m2025-06-11 10:37:51.158[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_body_extraction[0m:[36m314[0m - [1mStarting issue number extraction from PR BODY content[0m
[32m2025-06-11 10:37:51.163[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_body_extraction[0m:[36m338[0m - [1mLoading CSV file: ../data/keyword-linking/pullrequestToIssue/closed_prs_summary.csv[0m
[32m2025-06-11 10:37:51.739[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_body_extraction[0m:[36m341[0m - [1mLoaded 25042 records from CSV for body extraction[0m
[32m2025-06-11 10:37:53.882[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_body_extraction[0m:[36m348[0m - [1mStarting parallel processing for PR bodies...[0m
[32m2025-06-11 10:37:53.916[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_row_for_body[0m:[36m278[0m - [1mRow 16: Found 4 issues in body: ['6490', '6882', '6883', '6892'][0m
[32m2025-06-11 10:37:53.932[0m | [1mINFO    [0m | [36m__main__[0m:[36mpro


‚úÖ PR Body issue extraction completed successfully!
üìä Original rows: 25042
üìä New rows (from body): 29715
üìä Rows with issues found in body: 15176
üìä Unique issues found in body: 11072
üîó Links extracted from text: 3977
üîó Links generated: 11199
üíæ Output saved to: ../data/keyword-linking/pullrequestToIssue/closed_prs_summary_with_extracted_issues_from_body.csv
