In [8]:
# ===================================================================
# CELL 1: SETUP, PATH CORRECTION, AND REPOSITORY DIAGNOSTICS
# ===================================================================

import os
import git
import pandas as pd
from IPython.display import display

# --- Part 1: Path Correction ---
# If the kernel was started inside a `notebooks` directory, step up one level
# so the paths built below are relative to the parent directory.
print(f"Initial Current Working Directory: {os.getcwd()}")
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')
    print(f"Changed directory to project root: {os.getcwd()}")

# --- Part 2: Configuration ---
PROJECT_ROOT = os.getcwd()
REPO_NAME = 'redis'
REPO_PATH = os.path.join(PROJECT_ROOT, 'data', '01_raw', 'repositories', REPO_NAME)
TARGET_FILE = 'src/server.c'
TARGET_BRANCH = 'unstable'

# Later cells decide whether to run by checking that `commit_hash` exists.
# Drop any value left behind by a previous execution of this cell so that a
# failed re-run (bad path, renamed branch, ...) cannot leave a stale hash that
# downstream cells would silently analyse.
globals().pop('commit_hash', None)

# --- Part 3: Import Custom Modules ---
# Imported after the working-directory correction above.
# NOTE(review): presumably the package is resolved relative to the project
# root -- confirm how `feature_extraction` is made importable.
from feature_extraction.repo_metrics import get_file_history_metrics

# --- Part 4: Diagnostics ---
print("\n--- Running Diagnostics ---")
print(f"Project Root is: {PROJECT_ROOT}")
print(f"Targeting repository at: {REPO_PATH}")

# --- A) Check if the repository can be opened ---
# `repo` stays None when opening fails, which skips the remaining checks.
repo = None
try:
    repo = git.Repo(REPO_PATH)
    print("✅ Repository opened successfully.")
except Exception as e:
    print("---!!! FATAL ERROR !!!---")
    print(f"Could not open the repository. Error: {e}")

# Compare against None explicitly rather than relying on the truthiness of
# the Repo object.
if repo is not None:
    # --- B) List available branches for user ---
    print("\nAvailable branches in this repository:")
    try:
        branches = [b.name for b in repo.branches]
        print(branches)
    except Exception as e:
        print(f"Could not list branches. Error: {e}")

    # --- C) Try to get the specific commit ---
    # On success this defines `target_commit` and `commit_hash`; on failure
    # `commit_hash` stays undefined so the analysis cells are skipped.
    print(f"\nAttempting to get latest commit from branch: '{TARGET_BRANCH}'...")
    try:
        target_commit = repo.commit(TARGET_BRANCH)
        commit_hash = target_commit.hexsha
        print("\n✅ Successfully found commit.")
        print(f"   Target file for analysis: {TARGET_FILE}")
        print(f"   Analysis will run up to commit: {commit_hash[:10]}")
    except git.exc.BadName:
        print("\n---!!! ACTION REQUIRED !!!---")
        print(f"ERROR: The branch named '{TARGET_BRANCH}' does not exist in this repository.")
        print("Please look at the 'Available branches' list and change the TARGET_BRANCH variable in this cell.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

Initial Current Working Directory: D:\Work\zero_day_predictor

--- Running Diagnostics ---
Project Root is: D:\Work\zero_day_predictor
Targeting repository at: D:\Work\zero_day_predictor\data\01_raw\repositories\redis
✅ Repository opened successfully.

Available branches in this repository:
['unstable']

Attempting to get latest commit from branch: 'unstable'...

✅ Successfully found commit.
   Target file for analysis: src/server.c
   Analysis will run up to commit: 66b3d2d98e


In [9]:
# Step 2: Run the History Analysis
# Call `get_file_history_metrics` for TARGET_FILE at the commit resolved by the
# setup cell. The result is treated as a dictionary of metric name -> value; a
# falsy result is reported as a failure.

# `commit_hash` only exists when the setup cell resolved TARGET_BRANCH. Using
# .get() also rejects a None/empty value, and the else branch makes the skip
# visible instead of leaving the cell silently empty.
if globals().get('commit_hash'):
    history_metrics = get_file_history_metrics(REPO_PATH, TARGET_FILE, commit_hash)

    if history_metrics:
        print("\nSuccessfully calculated history metrics:")
        # Pretty print the dictionary, one metric per line
        for key, value in history_metrics.items():
            print(f"- {key}: {value}")
    else:
        print(f"\nCould not calculate metrics for {TARGET_FILE}. It might not exist at this commit or an error occurred.")
else:
    print("\nSkipping history analysis: 'commit_hash' is not defined. Run the setup cell successfully first.")


Successfully calculated history metrics:
- commit_count: 1014
- author_count: 151
- lines_added: 10801
- lines_deleted: 20417
- days_since_first_commit: 3609
- days_since_last_commit: 8


In [None]:
# Step 3: Analyze Another File for Comparison
# Run the same history analysis on a second file so its metrics can be compared
# with those of the first target.
# `src/crc64.c` is a utility file for calculating checksums.

TARGET_FILE_2 = 'src/crc64.c'
print(f"\n--- Analyzing second file: {TARGET_FILE_2} ---")

# `commit_hash` only exists when the setup cell resolved TARGET_BRANCH. Using
# .get() also rejects a None/empty value, and the else branch makes the skip
# visible instead of printing only the header above.
if globals().get('commit_hash'):
    history_metrics_2 = get_file_history_metrics(REPO_PATH, TARGET_FILE_2, commit_hash)

    if history_metrics_2:
        print("\nSuccessfully calculated history metrics:")
        for key, value in history_metrics_2.items():
            print(f"- {key}: {value}")
    else:
        print(f"\nCould not calculate metrics for {TARGET_FILE_2}.")
else:
    print("\nSkipping history analysis: 'commit_hash' is not defined. Run the setup cell successfully first.")

In [7]:
### Conclusion

Comparing the two files, you will likely see that `src/server.c` has a much higher `commit_count` and `author_count`, and larger `lines_added`/`lines_deleted` totals, than `src/crc64.c`.

This indicates it is a more central, "hot" part of the codebase. Our hypothesis is that such files are more likely to contain vulnerabilities. We have now successfully extracted these signals from the repository's history.

SyntaxError: unterminated string literal (detected at line 5) (3932515958.py, line 5)