In [3]:
import pandas as pd
import git
import os
from loguru import logger
from tqdm import tqdm

# Logger config
# Re-running this cell must not stack a second file sink: loguru keeps every
# sink ever added, so each extra `logger.add` would make every message appear
# one more time in the log file. Remove the sink registered by a previous run
# of this cell (if any) before adding it again.
_previous_log_sink_id = globals().get("_LOG_SINK_ID")
if _previous_log_sink_id is not None:
    try:
        logger.remove(_previous_log_sink_id)
    except ValueError:
        # The sink is already gone (e.g. a bare `logger.remove()` elsewhere).
        pass
_LOG_SINK_ID = logger.add("logs/precomputing.log")

logger.info("Libraries imported and logger configured for Feature Extraction.")

[32m2025-06-26 14:13:58.470[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mLibraries imported and logger configured for Feature Extraction.[0m


In [7]:
# Final Optimized Cell: Pre-computation with Caching
#
# Builds `precomputed_data`, a dict mapping each commit's hexsha to author/file
# history metrics computed from the commits processed *before* it:
#   - 'author_total_commits'             : prior commits by the same author email
#   - 'time_since_last_commit_seconds'   : seconds since that author's previously
#                                          processed commit (0 if there is none)
#   - 'recent_commits_60d'               : prior commits by that author authored
#                                          at or after (this commit - 60 days)
#   - 'unique_prior_committers_on_files' : distinct author emails that previously
#                                          touched any file this commit touches
# The result is pickled to PRECOMPUTED_DATA_FILE and reloaded on later runs.

import pickle
from collections import defaultdict
import bisect

# 1. The local file path to the Git repository you want to analyze.
REPO_PATH = "../../ballerina-lang/"

# --- New Configuration for Caching ---
# NOTE(review): the cache is keyed only by this file name; delete the file
# after changing REPO_PATH or TARGET_BRANCH, otherwise stale data is loaded.
PRECOMPUTED_DATA_FILE = "author_history_cache.pkl"

# 2. The branch you want to analyze (e.g., 'main', 'master', 'develop').
TARGET_BRANCH = 'master'

# 3. (Optional) List of file extensions to analyze.
# Not referenced in this cell; presumably consumed by later cells.
SOURCE_CODE_EXTENSIONS = [
    # Primary programming languages
    '.java', '.bal', '.js', '.py',

    # Configuration files
    '.yml', '.yaml', '.xml', '.json', '.toml', '.properties',

    # Documentation and markup
    '.md', '.html', '.css',

    # Build and script files
    '.gradle', '.sh', '.bat'
]

precomputed_data = {}

# --- Step 1: Check for a cached version first ---
if os.path.exists(PRECOMPUTED_DATA_FILE):
    logger.info(f"Found cached history file at '{PRECOMPUTED_DATA_FILE}'. Loading...")
    # SECURITY: pickle.load can execute arbitrary code. Only load a cache file
    # that this notebook produced itself; never one obtained from elsewhere.
    with open(PRECOMPUTED_DATA_FILE, 'rb') as f:
        precomputed_data = pickle.load(f)
    logger.success("Successfully loaded pre-computed history from cache.")
    # NOTE(review): `repo` and the history dicts are only defined in the
    # else-branch below; later cells must not rely on them when the cache hits.
else:
    logger.info("No cached history file found. Starting full pre-computation...")

    # --- Step 2: Run the one-time computation if no cache exists ---
    repo = git.Repo(REPO_PATH)
    # reverse=True flips git's default listing so history is replayed from the
    # oldest listed commit towards the newest.
    commits_in_order = list(repo.iter_commits(TARGET_BRANCH, reverse=True))

    # author email -> authored datetimes in processing order
    author_history = defaultdict(list)
    # author email -> the same datetimes kept sorted. bisect requires sorted
    # input, and authored timestamps are not guaranteed to increase along the
    # traversal (rebases, cherry-picks, clock skew), so a separate sorted index
    # is maintained instead of bisecting `author_history` directly.
    author_sorted_history = defaultdict(list)
    # file path -> set of author emails that touched it so far
    file_history = defaultdict(set)

    for commit in tqdm(commits_in_order, desc="Building Author & File History"):
        author_email = commit.author.email
        commit_time = commit.authored_datetime

        # Calculate metrics based on history *before* this commit
        past_commits_timestamps = author_history[author_email]
        sorted_past_timestamps = author_sorted_history[author_email]
        if past_commits_timestamps:
            time_since_last = (commit_time - past_commits_timestamps[-1]).total_seconds()
        else:
            time_since_last = 0

        sixty_days_ago = commit_time - pd.Timedelta(days=60)
        start_index = bisect.bisect_left(sorted_past_timestamps, sixty_days_ago)
        recent_commits_count = len(sorted_past_timestamps) - start_index

        # Diff against the first parent exactly once; this is by far the most
        # expensive step of the loop and its result is needed both for the
        # lookup and for the history update below. Commits without parents
        # contribute no paths.
        touched_paths = []
        if commit.parents:
            for diff in commit.diff(commit.parents[0]):
                file_path = diff.a_path or diff.b_path
                if file_path:
                    touched_paths.append(file_path)

        previous_committers = set()
        for file_path in touched_paths:
            previous_committers.update(file_history[file_path])

        precomputed_data[commit.hexsha] = {
            'author_total_commits': len(past_commits_timestamps),
            'time_since_last_commit_seconds': time_since_last,
            'recent_commits_60d': recent_commits_count,
            'unique_prior_committers_on_files': len(previous_committers),
        }

        # Update the history databases *after* processing
        past_commits_timestamps.append(commit_time)
        bisect.insort(sorted_past_timestamps, commit_time)
        for file_path in touched_paths:
            file_history[file_path].add(author_email)

    # --- Step 3: Save the results to the cache file for next time ---
    # Write to a temporary file and rename it into place so an interrupted
    # dump can never leave a truncated cache that Step 1 would try to load.
    temp_cache_file = PRECOMPUTED_DATA_FILE + ".tmp"
    with open(temp_cache_file, 'wb') as f:
        pickle.dump(precomputed_data, f)
    os.replace(temp_cache_file, PRECOMPUTED_DATA_FILE)
    logger.success(f"History pre-computation complete and saved to '{PRECOMPUTED_DATA_FILE}'.")

[32m2025-06-26 14:15:37.223[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m40[0m - [1mNo cached history file found. Starting full pre-computation...[0m
Building Author & File History: 100%|██████████| 126808/126808 [20:30:36<00:00,  1.72it/s]      
[32m2025-06-27 10:46:15.303[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m86[0m - [32m[1mHistory pre-computation complete and saved to 'author_history_cache.pkl'.[0m
