# SZZ analysis


## Data preprocessing


In [23]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

import pandas as pd
from tqdm import tqdm
from loguru import logger
import concurrent.futures
from pymongo import MongoClient

# GitHub Personal Access Token (optional, but recommended for higher rate limits)
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
if not GITHUB_TOKEN:
    logger.warning("GitHub token not found. Rate limits may apply.")

# Logger config
logger.add("logs/szz_analysis.log")

# --- Configuration ---
REPO = "ballerina-platform/ballerina-lang"


Mongo db connection

In [9]:
# Mongo connection
client = MongoClient("mongodb://localhost:27017/")
db = client.github_data
pull_request_collection = db.pull_requests
logger.info("🔌 MongoDB connected.")

[32m2025-06-19 11:25:39.694[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1m🔌 MongoDB connected.[0m


In [10]:
df = pd.read_csv("../data/keyword-linking/pullrequestToIssue/final.csv");
df.head()

Unnamed: 0,pr_number,pr_link,pr_title,pr_body,issue_number,issue_link
0,12762,https://github.com/ballerina-platform/ballerin...,Add examples for anonymous records and anonymo...,## Purpose\r\n> Fix #10825,10825.0,https://github.com/ballerina-platform/ballerin...
1,29650,https://github.com/ballerina-platform/ballerin...,[LS] Fix invalid type cast being applied for n...,## Purpose\r\n$subject\r\n\r\nFixes #29573\r\n...,29428.0,https://github.com/ballerina-platform/ballerin...
2,29650,https://github.com/ballerina-platform/ballerin...,[LS] Fix invalid type cast being applied for n...,## Purpose\r\n$subject\r\n\r\nFixes #29573\r\n...,29573.0,https://github.com/ballerina-platform/ballerin...
3,31849,https://github.com/ballerina-platform/ballerin...,Fix xml equals check in TypeChecker,## Purpose\r\n$subject\r\nFixes #25709 \r\n\r\...,25709.0,https://github.com/ballerina-platform/ballerin...
4,29642,https://github.com/ballerina-platform/ballerin...,Fix type check for `ordered` types,## Purpose\r\n$title\r\n\r\nFixes #29643\r\n\r...,29643.0,https://github.com/ballerina-platform/ballerin...


### adding the mergecommit


In [11]:
def find_merge_commit(pr_number, collection):
    """
    Queries MongoDB for a given PR number and returns its merge_commit_sha.

    Args:
        pr_number (int): The pull request number to look up.
        collection: The pymongo collection object to query.

    Returns:
        The merge commit hash as a string, or None if not found or not merged.
    """
    try:
        # Find the PR document in the collection. The 'number' field should store the PR number.
        pr_data = collection.find_one({"pull_request.number": pr_number})

        if not pr_data:
            logger.warning(f"PR #{pr_number}: Not found in the database.")
            return None

        # The 'merged_at' field is a reliable indicator of a merged PR. It's null if not merged.
        if not pr_data["pull_request"].get("merged_at"):
            logger.info(f"PR #{pr_number}: Found in DB, but is not merged. Skipping.")
            return None

        merge_commit_sha = pr_data["pull_request"].get("merge_commit_sha")
        if not merge_commit_sha:
            logger.warning(f"PR #{pr_number}: Is merged, but 'merge_commit_sha' field is missing.")
            return None

        return merge_commit_sha

    except Exception as e:
        logger.error(f"An error occurred while processing PR #{pr_number}: {e}")
        return None

In [12]:
find_merge_commit(44121, pull_request_collection)

'b72f76eaa3db59ee54ecfad1d68efde3e4e5d768'

In [13]:
from concurrent.futures import ThreadPoolExecutor, as_completed

logger.info("--- Starting PR Data Enrichment Process ---")

# df = df.head(100)  # Limit to first 1000 rows for testing; remove this line for full dataset

pr_numbers = df['pr_number'].unique().tolist()  # Use unique PR numbers to avoid duplicates
results = {}

# Use ThreadPoolExecutor for concurrent I/O-bound tasks (querying DB)
with ThreadPoolExecutor(max_workers=6) as executor:
    # Create a dictionary to map future to pr_number
    future_to_pr = {executor.submit(find_merge_commit, int(pr), pull_request_collection): pr for pr in pr_numbers}
    
    # Process futures as they complete, with a tqdm progress bar
    for future in tqdm(as_completed(future_to_pr), total=len(pr_numbers), desc="Querying PRs from DB"):
        pr_num = future_to_pr[future]
        try:
            merge_hash = future.result()
            if merge_hash:
                results[pr_num] = merge_hash
            else:
                logger.debug(f"No merge hash found for PR #{pr_num}")
        except Exception as e:
            logger.error(f"A future raised an exception for PR #{pr_num}: {e}")

logger.info(f"Successfully processed all PRs. Found merge hashes for {len(results)} of them.")

# Map the results back to the DataFrame
df['merge_commit_hash'] = df['pr_number'].map(lambda x: results.get(x))

# Filter out rows where the merge commit couldn't be found
original_count = len(df)
df.dropna(subset=['merge_commit_hash'], inplace=True)
final_count = len(df)
logger.success(f"Enrichment complete. Filtered down to {final_count} merged PRs (from {original_count} total).")

# Save the result
output_path = "../data/final/data_with_merge_commit.csv"
df.to_csv(output_path, index=False)
logger.success(f"Data with merge commits saved to {output_path}")


[32m2025-06-19 11:25:41.101[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1m--- Starting PR Data Enrichment Process ---[0m
[32m2025-06-19 11:25:41.991[0m | [1mINFO    [0m | [36m__main__[0m:[36mfind_merge_commit[0m:[36m22[0m - [1mPR #16627: Found in DB, but is not merged. Skipping.[0m
Querying PRs from DB:   0%|          | 0/10635 [00:00<?, ?it/s][32m2025-06-19 11:25:42.087[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [34m[1mNo merge hash found for PR #16627[0m
Querying PRs from DB:   0%|          | 17/10635 [00:00<01:04, 164.49it/s][32m2025-06-19 11:25:42.185[0m | [1mINFO    [0m | [36m__main__[0m:[36mfind_merge_commit[0m:[36m22[0m - [1mPR #12762: Found in DB, but is not merged. Skipping.[0m
[32m2025-06-19 11:25:42.195[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [34m[1mNo merge hash found for PR #12762[0m
[32m2025-06-19 11:25:42.199[0m | [1mINFO    [

## SZZ phase 2 
### Identifying the bugitrodusing prs

In [15]:
import pandas as pd
import git
import os
import re
from loguru import logger
from tqdm.notebook import tqdm

# --- Logging Setup ---
# Logger config
logger.add("logs/szz_analysis.log")

logger.info("Libraries imported and logger configured.")

[32m2025-06-20 11:09:04.665[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mLibraries imported and logger configured.[0m


In [20]:
# --- UPDATE THESE VALUES ---

# 1. The local file path to the Git repository you want to analyze.
#    Make sure this repository is up-to-date.
REPO_PATH = "../../ballerina-lang/"

# 2. The enriched CSV file created in the previous step.
INPUT_CSV = "../data/final/data_with_merge_commit.csv"

# 3. The name of the final output file that will contain the SZZ results.
OUTPUT_CSV = "../data/final/szz_bug_introducing_commits.csv"

# 4. (Optional) List of file extensions to analyze.
#    This helps focus on source code and ignore other files.
SOURCE_CODE_EXTENSIONS = [
    # Primary programming languages
    '.java', '.bal', '.js', '.py', 
    
    # Configuration files
    '.yml', '.yaml', '.xml', '.json', '.toml', '.properties',
    
    # Documentation and markup
    '.md', '.html', '.css',
    
    # Build and script files
    '.gradle', '.sh', '.bat'
]

IGNORE_PATTERNS = [
    # Compiled files
    '*.class',
    
    # Log files
    '*.log', '*.log.*',
    
    # Ballerina specific
    'Ballerina.lock',
    
    # Java package files
    '*.jar', '*.war', '*.ear',
    
    # IDE files
    '.idea/', '*.iml', '*.ipr', '*.iws',
    '.classpath', '.project', '.settings/',
    
    # Generated directories
    'target/', 'results/', '.ballerina/', '/gen/',
    
    # Gradle related
    '.gradle/', 'build/',
    
    # Other common ignores
    '.DS_Store',
    '.mtj.tmp/',
    'velocity.log',
    'extractedDistribution/',
    'node_modules/'
]

# --- End of Configuration ---

# Validate the repository path
if not os.path.exists(REPO_PATH):
    logger.error(f"Repository path not found: {REPO_PATH}")
else:
    logger.success(f"Repository found at: {REPO_PATH}")

if not os.path.exists(INPUT_CSV):
    logger.error(f"Input CSV not found: {INPUT_CSV}")
else:
    logger.success(f"Input CSV found at: {INPUT_CSV}")

[32m2025-06-20 12:06:54.520[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m66[0m - [32m[1mRepository found at: ../../ballerina-lang/[0m
[32m2025-06-20 12:06:54.525[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m71[0m - [32m[1mInput CSV found at: ../data/final/data_with_merge_commit.csv[0m


In [25]:
def parse_diff_to_get_deleted_lines(diff_text: str) -> list[int]:
    """
    Parses the text of a diff to find the line numbers that were deleted from the 'A' file.
    """
    deleted_line_nums = []
    lines = diff_text.split('\n')
    current_line_num_a = 0

    for line in lines:
        if line.startswith('@@'):
            match = re.search(r'@@ -(\d+),?\d* \+', line)
            if match:
                current_line_num_a = int(match.group(1))
        elif line.startswith('-') and not line.startswith('---'):
            deleted_line_nums.append(current_line_num_a)
            current_line_num_a += 1
        elif not line.startswith('+') and not line.startswith('@@'):
            current_line_num_a += 1
            
    return deleted_line_nums


# --- THIS IS THE CORRECTED FUNCTION ---
def run_szz_for_commit(repo: git.Repo, fix_commit_hash: str, extensions: list) -> list[dict]:
    """
    Performs the SZZ analysis for a single bug-fixing (merge) commit.
    
    Args:
        repo: The GitPython Repo object.
        fix_commit_hash: The hash of the bug-fixing merge commit.
        extensions: A list of file extensions to analyze.

    Returns:
        A list of dictionaries, each representing a potential bug-introducing commit.
    """
    bug_introducing_candidates = []
    try:
        fix_commit = repo.commit(fix_commit_hash)
    except Exception as e:
        logger.warning(f"Could not access commit {fix_commit_hash}. Skipping. Error: {e}")
        return []

    if len(fix_commit.parents) < 1:
        logger.info(f"Skipping commit {fix_commit_hash}: not a merge commit or has no parents.")
        return []
    
    parent_commit = fix_commit.parents[0]
    diffs = parent_commit.diff(fix_commit, create_patch=True)

    for diff in diffs:
        if diff.a_path is None or not any(diff.a_path.endswith(ext) for ext in extensions):
            continue

        deleted_lines_in_parent = parse_diff_to_get_deleted_lines(diff.diff.decode('utf-8', 'ignore'))
        
        if not deleted_lines_in_parent:
            continue
            
        try:
            blame_output = repo.blame(parent_commit, diff.a_path)
        except git.exc.GitCommandError as e:
            logger.warning(f"Blame failed for file {diff.a_path} in commit {parent_commit.hexsha}. Skipping file. Error: {e}")
            continue

        # **FIX STARTS HERE**
        # We need to manually track the line number, as the blame output doesn't contain it directly.
        current_line_number = 1
        for commit, lines_content in blame_output:
            for _ in lines_content: # We iterate for each line of content
                # Check if the current line number is one of the lines that was fixed
                if current_line_number in deleted_lines_in_parent:
                    candidate = {
                        'bug_introducing_commit': commit.hexsha,
                        'commit_message': commit.message.strip().split('\n')[0],
                        'author': commit.author.name,
                        'date': commit.authored_datetime.isoformat(),
                        'file_path': diff.a_path,
                    }
                    if candidate not in bug_introducing_candidates:
                        bug_introducing_candidates.append(candidate)
                
                # Increment the line counter for the next line
                current_line_number += 1
        # **FIX ENDS HERE**

    return bug_introducing_candidates

logger.info("SZZ helper functions defined (Corrected Version).")

[32m2025-06-20 12:22:01.745[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m88[0m - [1mSZZ helper functions defined (Corrected Version).[0m


In [27]:
logger.info("--- Starting SZZ Analysis ---")

# Load the enriched data from the previous step
try:
    df = pd.read_csv(INPUT_CSV)
    df = df.head(10)
    logger.info(f"Loaded {len(df)} merged PRs from '{INPUT_CSV}'.")
except FileNotFoundError:
    logger.error(f"Could not execute: Input file '{INPUT_CSV}' not found.")
    # Stop execution if file not found
    raise

# Initialize the Git Repo object
try:
    repo = git.Repo(REPO_PATH)
    logger.success("Git repository initialized successfully.")
except Exception as e:
    logger.error(f"Failed to initialize Git repo at '{REPO_PATH}'. Error: {e}")
    # Stop execution if repo is invalid
    raise


all_bics = [] # This list will store all found bug-introducing commits (BICs)

# Use tqdm to create a progress bar for the loop
# We iterate over a list of dictionaries for easier access to row data
pr_fix_list = df.to_dict('records')

for row in tqdm(pr_fix_list, desc="Analyzing Commits"):
    merge_hash = row.get('merge_commit_hash')
    if not merge_hash or pd.isna(merge_hash):
        continue

    # Run the core SZZ analysis for the current merge commit
    bics = run_szz_for_commit(repo, merge_hash, SOURCE_CODE_EXTENSIONS)
    
    # Add original PR and Issue info back to the results
    for bic in bics:
        bic['blamed_by_pr_number'] = row.get('pr_number')
        bic['blamed_by_merge_commit'] = merge_hash
        bic['original_issue_number'] = row.get('issue_number', 'N/A')
        all_bics.append(bic)

logger.success(f"\n--- Analysis Complete ---")
logger.info(f"Found {len(all_bics)} potential bug-introducing change instances.")

# Convert the list of results into a pandas DataFrame for easy viewing and saving
if all_bics:
    results_df = pd.DataFrame(all_bics)
    
    # Save the final results to a CSV file
    results_df.to_csv(OUTPUT_CSV, index=False)
    logger.success(f"Results saved to '{OUTPUT_CSV}'.")
    
    # Display the first few rows of the result right here in the notebook
    print("\n--- SZZ Analysis Results (Sample) ---")
    display(results_df.head())
else:
    logger.info("No bug-introducing commits were found with the current criteria.")

[32m2025-06-20 12:29:51.771[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1m--- Starting SZZ Analysis ---[0m


[32m2025-06-20 12:29:52.788[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mLoaded 10 merged PRs from '../data/final/data_with_merge_commit.csv'.[0m
[32m2025-06-20 12:29:52.796[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [32m[1mGit repository initialized successfully.[0m
Analyzing Commits: 100%|██████████| 10/10 [01:19<00:00,  7.99s/it]
[32m2025-06-20 12:31:12.770[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m44[0m - [32m[1m
--- Analysis Complete ---[0m
[32m2025-06-20 12:31:12.773[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m45[0m - [1mFound 372 potential bug-introducing change instances.[0m
[32m2025-06-20 12:31:12.812[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m53[0m - [32m[1mResults saved to '../data/final/szz_bug_introducing_commits.csv'.[0m



--- SZZ Analysis Results (Sample) ---


Unnamed: 0,bug_introducing_commit,commit_message,author,date,file_path,blamed_by_pr_number,blamed_by_merge_commit,original_issue_number
0,bc0d77d31770ecc0b86ae8deafafd58038dbc9a5,Move code-actions into new structure,Rasika,2020-10-28T15:58:53+05:30,language-server/modules/langserver-core/src/ma...,29650,00bca10f96884fb7ed2e1eb3a6d46e1173e74276,29428.0
1,fcdc84257666653f4bb9f16f30a5bf0c8dcdc7ba,Move code-actions' type-infer logic into diagn...,Rasika Perera,2021-02-24T11:15:29+05:30,language-server/modules/langserver-core/src/ma...,29650,00bca10f96884fb7ed2e1eb3a6d46e1173e74276,29428.0
2,bb98d5334c0f38ad49ebaeb478e7f37fd33761fd,Update type cast code action to use diagnostic...,Imesha Sudasingha,2021-03-04T23:38:04+05:30,language-server/modules/langserver-core/src/ma...,29650,00bca10f96884fb7ed2e1eb3a6d46e1173e74276,29428.0
3,974ec063c5743f31e89ff0a712d103ae5a7cc452,Restructure code-action tests,Rasika,2020-11-20T19:48:56+05:30,language-server/modules/langserver-core/src/te...,29650,00bca10f96884fb7ed2e1eb3a6d46e1173e74276,29428.0
4,bc0d77d31770ecc0b86ae8deafafd58038dbc9a5,Move code-actions into new structure,Rasika,2020-10-28T15:58:53+05:30,language-server/modules/langserver-core/src/ma...,29650,00bca10f96884fb7ed2e1eb3a6d46e1173e74276,29573.0
