# Labeling

In [1]:
import pandas as pd
import os

# --- Configuration ---
# Input: the extracted feature table; must contain a 'commit_hash' column.
FEATURES_PATH = "../data/final/final_training_dataset.csv"
# Input: the SZZ analysis output; its 'bug_introducing_commit' column holds the positive hashes.
SZZ_RESULTS_PATH = "../data/final/szz_bug_introducing_commits.csv"
# Output: the feature table plus the binary 'is_bug_introducing' label, used for training.
FINAL_TRAINING_DATASET_PATH = "../data/final/final_labeled_training_dataset.csv"

HASH_COLUMN = "commit_hash"
SZZ_HASH_COLUMN = "bug_introducing_commit"
LABEL_COLUMN = "is_bug_introducing"


def require_column(df, column, source):
    """Raise ValueError if *df* lacks *column*; *source* names the file in the message."""
    if column not in df.columns:
        raise ValueError(
            f"Column '{column}' not found in '{source}'. "
            f"Available columns: {list(df.columns)}"
        )


def drop_stray_index_columns(df):
    """Return *df* without any 'Unnamed: N' columns.

    pandas assigns that name to CSV columns with an empty header, which is
    what a CSV saved with its index produces. Left in place, the old row
    index would be written to the training file as if it were a feature.
    """
    stray = [col for col in df.columns if str(col).startswith("Unnamed:")]
    return df.drop(columns=stray) if stray else df


def collect_bug_hashes(szz_hashes):
    """Return the set of distinct, non-missing hashes in the Series *szz_hashes*."""
    # dropna() first: a NaN in the lookup set would inflate the count and,
    # because isin() matches NaN against NaN, mislabel rows with no hash.
    return set(szz_hashes.dropna().unique())


def label_commits(commit_hashes, bug_hashes):
    """Return an int Series: 1 where the hash is in *bug_hashes*, else 0.

    Rows whose hash is missing are always labeled 0, even if *bug_hashes*
    itself happens to contain NaN.
    """
    is_bug = commit_hashes.isin(bug_hashes) & commit_hashes.notna()
    return is_bug.astype(int)


print("--- Assembling Final Labeled Dataset ---")

try:
    # --- Step 1: Load the two datasets ---
    print(f"Loading features from '{FEATURES_PATH}'...")
    features_df = pd.read_csv(FEATURES_PATH)

    print(f"Loading SZZ labels from '{SZZ_RESULTS_PATH}'...")
    szz_results_df = pd.read_csv(SZZ_RESULTS_PATH)
    print("Data loaded successfully.")

    # Fail early with a readable message instead of a bare KeyError later on.
    require_column(features_df, HASH_COLUMN, FEATURES_PATH)
    require_column(szz_results_df, SZZ_HASH_COLUMN, SZZ_RESULTS_PATH)

    # Remove a leftover saved-index column so it is not trained on.
    features_df = drop_stray_index_columns(features_df)

    # --- Step 2: Build the lookup set of bug-introducing commit hashes ---
    bug_introducing_hashes = collect_bug_hashes(szz_results_df[SZZ_HASH_COLUMN])
    print(f"Found {len(bug_introducing_hashes)} unique bug-introducing commits to be used as labels.")

    # --- Step 3: Add the 'is_bug_introducing' label (1 = bug-introducing, 0 = not) ---
    print("Labeling the dataset...")
    features_df[LABEL_COLUMN] = label_commits(features_df[HASH_COLUMN], bug_introducing_hashes)

    # Surface SZZ hashes that never matched a feature row, so a gap between
    # the count above and the number of positive rows is explained.
    unmatched_count = len(bug_introducing_hashes.difference(features_df[HASH_COLUMN].dropna()))
    if unmatched_count:
        print(f"Note: {unmatched_count} bug-introducing commits were not found in the feature table.")

    # Same object under the name later cells use for the finished dataset.
    final_labeled_df = features_df

    # --- Step 4: Save the final dataset to a new file ---
    final_labeled_df.to_csv(FINAL_TRAINING_DATASET_PATH, index=False)

    print("\n--- Process Complete ---")
    print(f"Final labeled dataset saved to: '{FINAL_TRAINING_DATASET_PATH}'")

    print("\nClass distribution in the final dataset:")
    print(final_labeled_df[LABEL_COLUMN].value_counts())

    print("\nSample of the final labeled dataset:")
    display(final_labeled_df.head())

except FileNotFoundError as e:
    print(f"\nERROR: A required file was not found. Please check your file paths.")
    print(f"Details: {e}")
except Exception as e:
    print(f"\nAn error occurred: {e}")

--- Assembling Final Labeled Dataset ---
Loading features from '../data/final/final_training_dataset.csv'...
Loading SZZ labels from '../data/final/szz_bug_introducing_commits.csv'...
Data loaded successfully.
Found 43186 unique bug-introducing commits to be used as labels.
Labeling the dataset...

--- Process Complete ---
Final labeled dataset saved to: '../data/final/final_labeled_training_dataset.csv'

Class distribution in the final dataset:
is_bug_introducing
0    83782
1    42043
Name: count, dtype: int64

Sample of the final labeled dataset:


Unnamed: 0,commit_hash,author_email,commit_date,lines_added,lines_deleted,files_changed,num_modified_subsystems,num_modified_dirs,entropy,previous_total_size,author_total_commits,time_since_last_commit,recent_commits,prior_committers,ft14_highly_coupled_files,ft15_any_coupled_files,ft16_non_modified_coupled_files,is_bug_introducing
0,cfa03556d2a50f5d3381bd12a97f25a1e9e8476b,tharik.kanaka@gmail.com,2025-06-14 08:32:25+05:30,1,1,1,1,1,0.0,18750,1714,76501.0,5,20,0,0,0,0
1,ecef3657b88330f2cae320f14ef30407dcb67b5a,azinneera@gmail.com,2025-06-13 15:11:12+05:30,4655,1983,156,5,57,5.454747,788109,1079,669696.0,16,199,8,19,13,0
2,ac47d27abdb60e0b1a3146324183513aadb8fa27,tharik.kanaka@gmail.com,2025-06-13 11:17:24+05:30,2,2,2,2,2,1.0,17327,1713,954112.0,4,166,0,0,1,0
3,b2b3f93fba80d28c5b244baf3b6754b662eb872f,azinneera@gmail.com,2025-06-05 21:09:36+05:30,1565,999,34,3,12,3.406617,485212,1078,692500.0,16,120,0,2,2,0
4,e965df23ea6af12075af0901c2ea13880bbfdede,azinneera@gmail.com,2025-05-28 20:47:56+05:30,6,6,4,2,3,1.625815,129048,1077,89930.0,7,116,0,0,2,0
