# Feature extraction

In [1]:
import pandas as pd
import git
import os
from loguru import logger
from tqdm import tqdm
import math
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed # New import

# Logger config
logger.add("logs/szz_analysis.log")

logger.info("Libraries imported and logger configured for Feature Extraction.")

[32m2025-06-27 11:34:09.254[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mLibraries imported and logger configured for Feature Extraction.[0m


## configurations

In [2]:
os.makedirs('./data', exist_ok=True)

# 1. The local file path to the Git repository you want to analyze.
REPO_PATH = "../../ballerina-lang/"

# 2. The branch you want to analyze (e.g., 'main', 'master', 'develop').
TARGET_BRANCH = 'master' 

# 3. The name of the output file for the extracted features.
OUTPUT_CSV = "/data/feature_extraction_results.csv"

# --- New Configuration for Caching ---
PRECOMPUTED_DATA_FILE = "author_history_cache.pkl"

# 4. (Optional) List of file extensions to analyze.
SOURCE_CODE_EXTENSIONS = [
    # Primary programming languages
    '.java', '.bal', '.js', '.py', 
    
    # Configuration files
    '.yml', '.yaml', '.xml', '.json', '.toml', '.properties',
    
    # Documentation and markup
    '.md', '.html', '.css',
    
    # Build and script files
    '.gradle', '.sh', '.bat'
]

IGNORE_PATTERNS = [
    # Compiled files
    '*.class',
    
    # Log files
    '*.log', '*.log.*',
    
    # Ballerina specific
    'Ballerina.lock',
    
    # Java package files
    '*.jar', '*.war', '*.ear',
    
    # IDE files
    '.idea/', '*.iml', '*.ipr', '*.iws',
    '.classpath', '.project', '.settings/',
    
    # Generated directories
    'target/', 'results/', '.ballerina/', '/gen/',
    
    # Gradle related
    '.gradle/', 'build/',
    
    # Other common ignores
    '.DS_Store',
    '.mtj.tmp/',
    'velocity.log',
    'extractedDistribution/',
    'node_modules/'
]

# --- New Configuration for Performance and Fault Tolerance ---

# 5. Number of parallel processes to use. A good starting point is os.cpu_count() - 1.
MAX_WORKERS = os.cpu_count() - 2 if os.cpu_count() else 1

# 6. How many results to collect before saving a batch to the CSV.
BATCH_SIZE = 500

# --- End of Configuration ---

logger.info(f"Configuration set. Using up to {MAX_WORKERS} worker processes.")

[32m2025-06-27 11:34:11.271[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m71[0m - [1mConfiguration set. Using up to 6 worker processes.[0m


## precomputing to get user experiance matrix

In [10]:
import pickle
from collections import defaultdict
import bisect

precomputed_data = {}

# --- Step 1: Check for a cached version first ---
if os.path.exists(PRECOMPUTED_DATA_FILE):
    logger.info(f"Found cached history file at '{PRECOMPUTED_DATA_FILE}'. Loading...")
    with open(PRECOMPUTED_DATA_FILE, 'rb') as f:
        precomputed_data = pickle.load(f)
    logger.success("Successfully loaded pre-computed history from cache.")
else:
    logger.info(f"No cached history file found. Starting full pre-computation...")
    
    # --- Step 2: Run the one-time computation if no cache exists ---
    repo = git.Repo(REPO_PATH)
    commits_in_order = list(repo.iter_commits(TARGET_BRANCH, reverse=True))

    author_history = defaultdict(list)
    file_history = defaultdict(set)

    for commit in tqdm(commits_in_order, desc="Building Author & File History"):
        author_email = commit.author.email
        commit_time = commit.authored_datetime
        
        # Calculate metrics based on history *before* this commit
        past_commits_timestamps = author_history[author_email]
        time_since_last = (commit_time - past_commits_timestamps[-1]).total_seconds() if past_commits_timestamps else 0
        
        sixty_days_ago = commit_time - pd.Timedelta(days=60)
        start_index = bisect.bisect_left(past_commits_timestamps, sixty_days_ago)
        recent_commits_count = len(past_commits_timestamps) - start_index

        previous_committers = set()
        if commit.parents:
            for diff in commit.diff(commit.parents[0]):
                file_path = diff.a_path or diff.b_path
                if file_path:
                    previous_committers.update(file_history[file_path])
        
        precomputed_data[commit.hexsha] = {
            'author_total_commits': len(past_commits_timestamps),
            'time_since_last_commit_seconds': time_since_last,
            'recent_commits_60d': recent_commits_count,
            'unique_prior_committers_on_files': len(previous_committers),
        }
        
        # Update the history databases *after* processing
        author_history[author_email].append(commit_time)
        if commit.parents:
            for diff in commit.diff(commit.parents[0]):
                file_path = diff.a_path or diff.b_path
                if file_path:
                    file_history[file_path].add(author_email)

    # --- Step 3: Save the results to the cache file for next time ---
    with open(PRECOMPUTED_DATA_FILE, 'wb') as f:
        pickle.dump(precomputed_data, f)
    logger.success(f"History pre-computation complete and saved to '{PRECOMPUTED_DATA_FILE}'.")

NameError: name 'os' is not defined

## worker function

In [None]:
def extract_features_for_commit_worker(commit, history_data):
    """
    Worker function that takes a live Commit object and the pre-computed history data.
    """
    try:
        # --- Basic Features (as before) ---
        stats = commit.stats.total
        lines_added = stats['insertions']
        lines_deleted = stats['deletions']
        files_changed = stats['files']
        subsystems = {path.split('/')[0] for path in commit.stats.files.keys()}
        modified_dirs = {os.path.dirname(f) for f in commit.stats.files.keys()}
        
        total_lines_changed = lines_added + lines_deleted
        entropy = 0.0
        if total_lines_changed > 0:
            for file_path, file_stats in commit.stats.files.items():
                file_lines_changed = file_stats['insertions'] + file_stats['deletions']
                if file_lines_changed > 0:
                    change_proportion = file_lines_changed / total_lines_changed
                    entropy -= change_proportion * math.log2(change_proportion)
        
        previous_total_size = 0
        if commit.parents:
            parent = commit.parents[0]
            for diff in commit.diff(parent):
                if diff.a_blob:
                    previous_total_size += diff.a_blob.size

        # --- NEW: Look up pre-computed experience features ---
        experience_metrics = history_data.get(commit.hexsha, {})

        feature_dict = {
            'commit_hash': commit.hexsha,
            'author_email': commit.author.email,
            'commit_date': commit.authored_datetime,
            'lines_added': lines_added,
            'lines_deleted': lines_deleted,
            'files_changed': files_changed,
            'num_modified_subsystems': len(subsystems),          # Ft5
            'num_modified_dirs': len(modified_dirs),             # Ft6
            'entropy': entropy,                                  # Ft7
            'previous_total_size': previous_total_size,        # Ft4
            # Add the experience features from our pre-computed dictionary
            'author_total_commits': experience_metrics.get('author_total_commits', 0), # Ft11 & Ft12
            'time_since_last_commit': experience_metrics.get('time_since_last_commit_seconds', 0), # Ft10
            'recent_commits': experience_metrics.get('recent_commits_60d', 0), # Ft13
            'prior_committers': experience_metrics.get('unique_prior_committers_on_files', 0), # Ft9
        }
        return feature_dict

    except Exception as e:
        logger.warning(f"Could not process commit {commit.hexsha}. Error: {e}")
        return None

logger.info("Final worker function defined, ready to use pre-computed history.")

[32m2025-06-27 11:34:58.127[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m56[0m - [1mFinal worker function (v3) defined, ready to use pre-computed history.[0m


In [None]:
import pandas as pd
import git
import os
from tqdm import tqdm

logger.info("--- Starting Final Feature Extraction using Pre-computed History ---")

# --- Step 1: Checkpointing - Load Already Processed Commits ---
# This ensures we can resume if the script is interrupted.
processed_hashes = set()
if os.path.exists(OUTPUT_CSV):
    try:
        logger.info(f"Output file '{OUTPUT_CSV}' found. Loading processed commits to resume.")
        processed_df = pd.read_csv(OUTPUT_CSV)
        processed_hashes = set(processed_df['commit_hash'])
        logger.success(f"Found {len(processed_hashes)} commits already processed. They will be skipped.")
    except Exception as e:
        logger.error(f"Could not read existing CSV file. Starting from scratch. Error: {e}")
        processed_hashes = set()

# --- Step 2: Get the List of Commits to Process ---
# We assume the 'repo' object is already initialized from the pre-computation step.
# If not, uncomment the following line:
repo = git.Repo(REPO_PATH) 

logger.info(f"Fetching commits from branch '{TARGET_BRANCH}'...")
all_commits = list(repo.iter_commits(TARGET_BRANCH))

# Filter out commits that have already been processed
# This uses the 'commit' objects directly, which is efficient.
commits_to_process = [c for c in all_commits if c.hexsha not in processed_hashes]
logger.info(f"Total commits in repo: {len(all_commits)}. New commits to process: {len(commits_to_process)}.")

[32m2025-06-27 11:35:20.000[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1m--- Starting Final Feature Extraction using Pre-computed History ---[0m
[32m2025-06-27 11:35:20.007[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m28[0m - [1mFetching commits from branch 'master'...[0m
[32m2025-06-27 11:35:23.384[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m34[0m - [1mTotal commits in repo: 126808. New commits to process: 126808.[0m


In [None]:
# --- TEMPORARY DEBUGGING CELL ---
# This runs the code sequentially to find the exact point of failure.

logger.info("--- Starting Feature Extraction in DEBUG MODE (Sequential) ---")

# --- We assume 'precomputed_data' and other variables are already loaded from previous cells ---

all_features = []
error_count = 0
problematic_commits = []

# Use the 'commits_to_process' list you already created in the previous cell
for commit in tqdm(commits_to_process, desc="Extracting Features (Debug Mode)"):
    try:
        # Call the worker function directly
        result = extract_features_for_commit_worker(commit, precomputed_data)
        if result:
            all_features.append(result)
        else:
            # This handles cases where the worker function returns None intentionally
            error_count += 1
            problematic_commits.append(commit.hexsha)

    except Exception as e:
        # This will catch the crash and tell us exactly which commit failed
        logger.error(f"FATAL ERROR on commit: {commit.hexsha}")
        logger.error(f"Error details: {e}")
        # We re-raise the exception to see the full traceback
        raise e

logger.success("--- Sequential Processing Finished ---")
if problematic_commits:
    logger.warning(f"Found {len(problematic_commits)} commits that returned None. Hashes: {problematic_commits}")

if all_features:
    results_df = pd.DataFrame(all_features)
    display(results_df.head())

[32m2025-06-27 11:35:30.017[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1m--- Starting Feature Extraction in DEBUG MODE (Sequential) ---[0m
Extracting Features (Debug Mode): 100%|██████████| 126808/126808 [46:48:46<00:00,  1.33s/it]  
[32m2025-06-29 10:24:16.152[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [32m[1m--- Sequential Processing Finished ---[0m


Unnamed: 0,commit_hash,author_email,commit_date,lines_added,lines_deleted,files_changed,num_modified_subsystems,num_modified_dirs,entropy,previous_total_size,author_total_commits,time_since_last_commit,recent_commits,prior_committers
0,cfa03556d2a50f5d3381bd12a97f25a1e9e8476b,tharik.kanaka@gmail.com,2025-06-14 08:32:25+05:30,1,1,1,1,1,0.0,18750,1714,76501.0,5,20
1,ecef3657b88330f2cae320f14ef30407dcb67b5a,azinneera@gmail.com,2025-06-13 15:11:12+05:30,4655,1983,156,5,57,5.454747,788109,1079,669696.0,16,199
2,ac47d27abdb60e0b1a3146324183513aadb8fa27,tharik.kanaka@gmail.com,2025-06-13 11:17:24+05:30,2,2,2,2,2,1.0,17327,1713,954112.0,4,166
3,b2b3f93fba80d28c5b244baf3b6754b662eb872f,azinneera@gmail.com,2025-06-05 21:09:36+05:30,1565,999,34,3,12,3.406617,485212,1078,692500.0,16,120
4,e965df23ea6af12075af0901c2ea13880bbfdede,azinneera@gmail.com,2025-05-28 20:47:56+05:30,6,6,4,2,3,1.625815,129048,1077,89930.0,7,116


In [11]:
results_df.to_csv("data/extract_features.csv", index=False)

NameError: name 'results_df' is not defined

## code-maat data processing

before process the data here we need to perfrom data extraction using [code-maat](https://github.com/adamtornhill/code-maat) tool 
 - generate git log file 

    - first clone the repository
    - then generate the git log file **use gitbash**
    >`git log --pretty=format:'[%h] %aN %ad %s' --date=short --numstat --all > gitlog.log`
 - perform code-maat coupling analysis
    - download the latest [release](https://github.com/adamtornhill/code-maat/releases) stanalone jar
    - place that jar in the repository location
    >run the tool `java -jar code-maat-1.0.4-standalone.jar -l gitlog.log -c git -a coupling -o coupling.csv`

### extracting features from the code-maat output

I'm assuming  Number of highly coupled files Ft14 as the threshold level 75

In [1]:
import pandas as pd
import git
from collections import defaultdict
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from loguru import logger

# Logger config
logger.add("logs/code_maat.log")

# --- Configuration (ensure these are set correctly) ---
COUPLING_RESULTS_PATH = "../data/final/coupling.csv"
MAIN_FEATURES_PATH = "../data/final/extract_features.csv"
FINAL_OUTPUT_PATH = "../data/final/final_training_dataset.csv"
REPO_PATH = "../../ballerina-lang/"
TARGET_BRANCH = 'master'
HIGH_COUPLING_THRESHOLD = 75
MAX_WORKERS = 6 # Number of threads to use


In [2]:

# --- Step 1: Load and Process code-maat's Coupling Data (Sequential - this is fast) ---
logger.info(f"Loading coupling data from '{COUPLING_RESULTS_PATH}'...")
try:
    coupling_df = pd.read_csv(COUPLING_RESULTS_PATH)
    coupling_df.dropna(inplace=True)
    
    coupling_map = defaultdict(set)
    high_coupling_map = defaultdict(set)

    for _, row in tqdm(coupling_df.iterrows(), total=len(coupling_df), desc="Building coupling map"):
        coupling_map[row['entity']].add(row['coupled'])
        coupling_map[row['coupled']].add(row['entity'])
        if row['degree'] >= HIGH_COUPLING_THRESHOLD:
            high_coupling_map[row['entity']].add(row['coupled'])
            high_coupling_map[row['coupled']].add(row['entity'])
            
    logger.success("Built coupling lookup maps for fast processing.")

except FileNotFoundError:
    logger.error(f"Coupling results file not found at '{COUPLING_RESULTS_PATH}'. Please run code-maat first.")
    raise


[32m2025-07-03 10:10:01.936[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLoading coupling data from '../data/final/coupling.csv'...[0m
Building coupling map: 100%|██████████| 15899/15899 [00:00<00:00, 16202.81it/s]
[32m2025-07-03 10:10:02.982[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m17[0m - [32m[1mBuilt coupling lookup maps for fast processing.[0m


In [3]:

# --- Step 2: Define the Worker Function for Parallel Execution ---
def calculate_coupling_for_commit(commit, coupling_map, high_coupling_map):
    """
    Worker function to calculate coupling features for a single commit.
    This function is safe to be called from multiple threads.
    """
    if not commit.parents:
        return None

    try:
        modified_files = set(commit.stats.files.keys())
        highly_coupled_in_commit = set()
        any_coupled_in_commit = set()
        files_with_external_coupling = set()

        for file_path in modified_files:
            # Check for any coupling (for Ft15)
            all_coupled_files = coupling_map.get(file_path, set())
            if all_coupled_files.intersection(modified_files):
                any_coupled_in_commit.add(file_path)

            # Check for high coupling (for Ft14)
            high_coupled_files = high_coupling_map.get(file_path, set())
            if high_coupled_files.intersection(modified_files):
                highly_coupled_in_commit.add(file_path)

            # Check for non-modified coupled files (for Ft16)
            if all_coupled_files.difference(modified_files):
                files_with_external_coupling.add(file_path)

        return {
            'commit_hash': commit.hexsha,
            'ft14_highly_coupled_files': len(highly_coupled_in_commit),
            'ft15_any_coupled_files': len(any_coupled_in_commit),
            'ft16_non_modified_coupled_files': len(files_with_external_coupling)
        }
    except Exception as e:
        logger.warning(f"Could not process commit {commit.hexsha} for coupling. Error: {e}")
        return None


In [None]:

# --- Step 3: Run Per-Commit Calculations in Parallel ---
logger.info("Calculating per-commit coupling features using parallel processing...")
repo = git.Repo(REPO_PATH)
commits_to_process = list(repo.iter_commits(TARGET_BRANCH))
coupling_features_list = []

# Use functools.partial to "pre-fill" the map arguments for the worker
worker_with_maps = partial(calculate_coupling_for_commit, 
                           coupling_map=coupling_map, 
                           high_coupling_map=high_coupling_map)

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Use executor.map to apply the worker function to all commits
    results_iterator = executor.map(worker_with_maps, commits_to_process)
    
    # Process results as they complete with a tqdm progress bar
    for result in tqdm(results_iterator, total=len(commits_to_process), desc="Analyzing Commits for Coupling"):
        if result:
            coupling_features_list.append(result)

coupling_features_df = pd.DataFrame(coupling_features_list)
logger.success("Finished calculating per-commit coupling features.")


[32m2025-07-03 10:10:16.035[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mCalculating per-commit coupling features using parallel processing...[0m
Analyzing Commits for Coupling:   0%|          | 0/126808 [00:00<?, ?it/s]


In [None]:

# --- Step 4: Merge Coupling Features with Main Feature Set (Sequential - this is fast) ---
logger.info("Merging coupling features into the main dataset...")
try:
    main_df = pd.read_csv(MAIN_FEATURES_PATH)
    final_df = pd.merge(main_df, coupling_features_df, on='commit_hash', how='left')
    
    coupling_cols = ['ft14_highly_coupled_files', 'ft15_any_coupled_files', 'ft16_non_modified_coupled_files']
    final_df[coupling_cols] = final_df[coupling_cols].fillna(0).astype(int)

    final_df.to_csv(FINAL_OUTPUT_PATH, index=False)
    
    logger.success(f"Successfully merged coupling features. Final dataset saved to '{FINAL_OUTPUT_PATH}'")
    display(final_df.head())
    
except FileNotFoundError:
    logger.error(f"Main features file not found at '{MAIN_FEATURES_PATH}'. Please ensure it has been generated.")
except Exception as e:
    logger.error(f"An error occurred during merging: {e}")

In [7]:
import pandas as pd
import git
from collections import defaultdict
from tqdm import tqdm

# --- Configuration ---
COUPLING_RESULTS_PATH = "../data/final/coupling.csv"
MAIN_FEATURES_PATH = "../data/final/extract_features.csv"
FINAL_OUTPUT_PATH = "../data/final/final_training_dataset.csv"
REPO_PATH = "../../ballerina-lang/"
TARGET_BRANCH = 'master'
HIGH_COUPLING_THRESHOLD = 75

# --- Step 1: Load code-maat data and create a fast lookup map ---
# This part is still necessary to avoid re-reading the coupling file for every commit.
print("Loading coupling data and building lookup map...")
coupling_df = pd.read_csv(COUPLING_RESULTS_PATH)
coupling_df.dropna(inplace=True)

coupling_map = defaultdict(set)
high_coupling_map = defaultdict(set)

for _, row in coupling_df.iterrows():
    coupling_map[row['entity']].add(row['coupled'])
    coupling_map[row['coupled']].add(row['entity'])
    if row['degree'] >= HIGH_COUPLING_THRESHOLD:
        high_coupling_map[row['entity']].add(row['coupled'])
        high_coupling_map[row['coupled']].add(row['entity'])
print("Coupling map ready.")


# --- Step 2: Process each commit one by one (Your proposed logic) ---
print("Analyzing each commit sequentially...")
repo = git.Repo(REPO_PATH)
# commits_to_process = list(repo.iter_commits(TARGET_BRANCH, max_count=1000)) # testing with 1000 commits
commits_to_process = list(repo.iter_commits(TARGET_BRANCH))

coupling_features_list = []

for commit in tqdm(commits_to_process, desc="Analyzing Commits"):
    if not commit.parents:
        continue

    # Get the set of files modified in this commit
    modified_files = set(commit.stats.files.keys())

    # Calculate the three coupling features for this single commit
    highly_coupled_in_commit = {f for f in modified_files if high_coupling_map.get(f, set()).intersection(modified_files)}
    any_coupled_in_commit = {f for f in modified_files if coupling_map.get(f, set()).intersection(modified_files)}
    files_with_external_coupling = {f for f in modified_files if coupling_map.get(f, set()).difference(modified_files)}
    
    # Append the results for this commit to our list
    coupling_features_list.append({
        'commit_hash': commit.hexsha,
        'ft14_highly_coupled_files': len(highly_coupled_in_commit),
        'ft15_any_coupled_files': len(any_coupled_in_commit),
        'ft16_non_modified_coupled_files': len(files_with_external_coupling)
    })

coupling_features_df = pd.DataFrame(coupling_features_list)
print("Finished calculating all per-commit coupling features.")


# --- Step 3: Merge the new features into your main dataset ---
print("Merging coupling features into the main dataset...")
main_df = pd.read_csv(MAIN_FEATURES_PATH)
final_df = pd.merge(main_df, coupling_features_df, on='commit_hash', how='left')

coupling_cols = ['ft14_highly_coupled_files', 'ft15_any_coupled_files', 'ft16_non_modified_coupled_files']
final_df[coupling_cols] = final_df[coupling_cols].fillna(0).astype(int)

final_df.to_csv(FINAL_OUTPUT_PATH, index=False)
print(f"Success! Final dataset with all features saved to '{FINAL_OUTPUT_PATH}'")
display(final_df.head())

Loading coupling data and building lookup map...
Coupling map ready.
Analyzing each commit sequentially...


Analyzing Commits: 100%|██████████| 126808/126808 [8:37:41<00:00,  4.08it/s]       


Finished calculating all per-commit coupling features.
Merging coupling features into the main dataset...
Success! Final dataset with all features saved to '../data/final/final_training_dataset.csv'


Unnamed: 0,commit_hash,author_email,commit_date,lines_added,lines_deleted,files_changed,num_modified_subsystems,num_modified_dirs,entropy,previous_total_size,author_total_commits,time_since_last_commit,recent_commits,prior_committers,ft14_highly_coupled_files,ft15_any_coupled_files,ft16_non_modified_coupled_files
0,cfa03556d2a50f5d3381bd12a97f25a1e9e8476b,tharik.kanaka@gmail.com,2025-06-14 08:32:25+05:30,1,1,1,1,1,0.0,18750,1714,76501.0,5,20,0,0,0
1,ecef3657b88330f2cae320f14ef30407dcb67b5a,azinneera@gmail.com,2025-06-13 15:11:12+05:30,4655,1983,156,5,57,5.454747,788109,1079,669696.0,16,199,8,19,13
2,ac47d27abdb60e0b1a3146324183513aadb8fa27,tharik.kanaka@gmail.com,2025-06-13 11:17:24+05:30,2,2,2,2,2,1.0,17327,1713,954112.0,4,166,0,0,1
3,b2b3f93fba80d28c5b244baf3b6754b662eb872f,azinneera@gmail.com,2025-06-05 21:09:36+05:30,1565,999,34,3,12,3.406617,485212,1078,692500.0,16,120,0,2,2
4,e965df23ea6af12075af0901c2ea13880bbfdede,azinneera@gmail.com,2025-05-28 20:47:56+05:30,6,6,4,2,3,1.625815,129048,1077,89930.0,7,116,0,0,2


In [5]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

# --- Configuration ---
# All paths should be correct as per your setup
COUPLING_RESULTS_PATH = "../data/final/coupling.csv"
MAIN_FEATURES_PATH = "../data/final/extract_features.csv"
LOG_WITH_FILES_PATH = "../data/final/commit_gitlog.txt" # The new file from Step 1
FINAL_OUTPUT_PATH = "../data/final/final_training_dataset.csv"

HIGH_COUPLING_THRESHOLD = 75

# --- Step 1: Load code-maat data (Same as before) ---
print("Loading coupling data and building lookup map...")
coupling_df = pd.read_csv(COUPLING_RESULTS_PATH) # Assuming this is tab-separated from previous steps
coupling_df.dropna(inplace=True)
coupling_map = defaultdict(set)
high_coupling_map = defaultdict(set)
for _, row in coupling_df.iterrows():
    coupling_map[row['entity']].add(row['coupled'])
    coupling_map[row['coupled']].add(row['entity'])
    if row['degree'] >= HIGH_COUPLING_THRESHOLD:
        high_coupling_map[row['entity']].add(row['coupled'])
        high_coupling_map[row['coupled']].add(row['entity'])
print("Coupling map ready.")

# --- Step 2: Parse the Pre-generated Log File ---
print(f"Parsing the pre-generated log file: '{LOG_WITH_FILES_PATH}'...")
all_commit_data = []
current_commit_hash = None
current_files = []

with open(LOG_WITH_FILES_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        if line.startswith('--'): # This marks a new commit
            # Save the previous commit's data before starting a new one
            if current_commit_hash:
                all_commit_data.append({'commit_hash': current_commit_hash, 'modified_files': set(current_files)})
            
            current_commit_hash = line.replace('--', '')
            current_files = []
        else: # This is a file path
            current_files.append(line)
# Add the very last commit in the file
if current_commit_hash:
    all_commit_data.append({'commit_hash': current_commit_hash, 'modified_files': set(current_files)})
print(f"Parsed {len(all_commit_data)} commits from the log file.")

# --- Step 3: Calculate Coupling Features Sequentially ---
coupling_features_list = []
for commit_data in tqdm(all_commit_data, desc="Calculating Coupling Features"):
    modified_files = commit_data['modified_files']
    
    highly_coupled_in_commit = {f for f in modified_files if high_coupling_map.get(f, set()).intersection(modified_files)}
    any_coupled_in_commit = {f for f in modified_files if coupling_map.get(f, set()).intersection(modified_files)}
    files_with_external_coupling = {f for f in modified_files if coupling_map.get(f, set()).difference(modified_files)}
    
    coupling_features_list.append({
        'commit_hash': commit_data['commit_hash'],
        'ft14_highly_coupled_files': len(highly_coupled_in_commit),
        'ft15_any_coupled_files': len(any_coupled_in_commit),
        'ft16_non_modified_coupled_files': len(files_with_external_coupling)
    })

coupling_features_df = pd.DataFrame(coupling_features_list)
print("Finished calculating coupling features.")

# --- Step 4: Merge and Save (Same as before) ---
print("Merging all features into the final dataset...")
main_df = pd.read_csv(MAIN_FEATURES_PATH)
final_df = pd.merge(main_df, coupling_features_df, on='commit_hash', how='left')
coupling_cols = ['ft14_highly_coupled_files', 'ft15_any_coupled_files', 'ft16_non_modified_coupled_files']
final_df[coupling_cols] = final_df[coupling_cols].fillna(0).astype(int)
final_df.to_csv(FINAL_OUTPUT_PATH, index=False)
print(f"Success! Final dataset with all features saved to '{FINAL_OUTPUT_PATH}'")
display(final_df.head())

Loading coupling data and building lookup map...
Coupling map ready.
Parsing the pre-generated log file: '../data/final/commit_gitlog.txt'...
Parsed 136013 commits from the log file.


Calculating Coupling Features: 100%|██████████| 136013/136013 [00:00<00:00, 764303.39it/s]


Finished calculating coupling features.
Merging all features into the final dataset...
Success! Final dataset with all features saved to '../data/final/final_training_dataset.csv'


Unnamed: 0,commit_hash,author_email,commit_date,lines_added,lines_deleted,files_changed,num_modified_subsystems,num_modified_dirs,entropy,previous_total_size,author_total_commits,time_since_last_commit,recent_commits,prior_committers,ft14_highly_coupled_files,ft15_any_coupled_files,ft16_non_modified_coupled_files
0,cfa03556d2a50f5d3381bd12a97f25a1e9e8476b,tharik.kanaka@gmail.com,2025-06-14 08:32:25+05:30,1,1,1,1,1,0.0,18750,1714,76501.0,5,20,0,0,0
1,ecef3657b88330f2cae320f14ef30407dcb67b5a,azinneera@gmail.com,2025-06-13 15:11:12+05:30,4655,1983,156,5,57,5.454747,788109,1079,669696.0,16,199,0,0,0
2,ac47d27abdb60e0b1a3146324183513aadb8fa27,tharik.kanaka@gmail.com,2025-06-13 11:17:24+05:30,2,2,2,2,2,1.0,17327,1713,954112.0,4,166,0,0,0
3,b2b3f93fba80d28c5b244baf3b6754b662eb872f,azinneera@gmail.com,2025-06-05 21:09:36+05:30,1565,999,34,3,12,3.406617,485212,1078,692500.0,16,120,0,0,0
4,e965df23ea6af12075af0901c2ea13880bbfdede,azinneera@gmail.com,2025-05-28 20:47:56+05:30,6,6,4,2,3,1.625815,129048,1077,89930.0,7,116,0,0,0
