In [8]:
import os
import git
import pandas as pd
from IPython.display import display

# Capture the directory the kernel started in so the path handling is traceable.
launch_dir = os.getcwd()
print(f"Initial Current Working Directory: {launch_dir}")

# --- Working-directory normalisation ---
# When the kernel was started inside a folder named 'notebooks', step up one
# level so that relative paths resolve from its parent directory instead.
if os.path.basename(launch_dir) == 'notebooks':
    os.chdir(os.pardir)
    print(f"Changed directory to project root: {os.getcwd()}")
# ----------------------------------------

# Whatever the working directory is at this point is treated as the project root.
PROJECT_ROOT = os.getcwd()
REPO_NAME = 'redis'
repositories_dir = os.path.join(PROJECT_ROOT, 'data', '01_raw', 'repositories')
REPO_PATH = os.path.join(repositories_dir, REPO_NAME)

print(f"Project Root is: {PROJECT_ROOT}")
print(f"Using repository at: {REPO_PATH}")

# Report whether the cloned repository is present; this only prints a
# diagnostic and does not stop the notebook when the path is missing.
if os.path.exists(REPO_PATH):
    print("\nRepository path confirmed to exist. Ready to proceed.")
else:
    print("\n---!!! ERROR !!!---")
    print(f"The repository path does not exist: {REPO_PATH}")
    print("Please ensure you have run the 'clone_repos.py' script successfully.")

# --- Import our custom modules AFTER setting the correct path ---
from feature_extraction.code_metrics import analyze_repo_at_commit
from feature_extraction.repo_metrics import get_file_history_metrics

Initial Current Working Directory: D:\Work\zero_day_predictor\notebooks
Changed directory to project root: D:\Work\zero_day_predictor
Project Root is: D:\Work\zero_day_predictor
Using repository at: D:\Work\zero_day_predictor\data\01_raw\repositories\redis

Repository path confirmed to exist. Ready to proceed.


In [9]:
# Step 1: Select a Commit to Analyze
# Resolve the commit that the `unstable` branch of the Redis clone points to and
# use it as the snapshot to analyse. `commit_hash` is the value the analysis
# cell consumes.

# Forget results from any earlier execution of this cell. The next cell decides
# whether to run via `'commit_hash' in locals()`, so a hash left over from a
# previous successful run must not survive a failure below.
globals().pop('target_commit', None)
globals().pop('commit_hash', None)

try:
    repo = git.Repo(REPO_PATH)
    # Get the latest commit from the 'unstable' branch as an example
    target_commit = repo.commit('unstable')
    commit_hash = target_commit.hexsha
    print(f"Selected repository: {REPO_NAME}")
    print(f"Target commit hash: {commit_hash}")
    print(f"Commit date: {target_commit.committed_datetime}")
    print(f"Commit message: {target_commit.message.strip()}")
except Exception as e:
    # Best-effort by design: report the problem instead of raising, and make
    # sure `commit_hash` is undefined so the downstream guard skips the analysis
    # (it may already have been assigned if one of the prints above failed).
    globals().pop('commit_hash', None)
    print(f"Error accessing repo at {REPO_PATH}: {e}")

Selected repository: redis
Target commit hash: 66b3d2d98e4f5d79cac467266f17a4319cda6f50
Commit date: 2025-06-13 16:59:34+08:00
Commit message: Add 2K software prefetch to improve BITCOUNT performance (#14103)

Adds a software prefetch with a 2K stride to the scalar popcount loop in
redisPopcount().
Prefetching improved BITCOUNT throughput by up to 41.6%, reduced p50
latency by up to 43.9%, and significantly lowered L3 memory stalls,
confirming effective mitigation of memory-bound bottlenecks, with no
negative impact on L1/L2 usage or cache pollution (confirmed with HW
counters).

Note: The 2K stride was the best starting from 128,256,512,1024,2048,4096.
4K gave the same outcome so it's best to avoid larger strides without reason.


In [10]:
# Step 2: Run the Analysis
# Now we call our main function from `code_metrics.py`. This will check out the commit,
# walk through all files, analyze them with `lizard`, and then restore the repo to its original state.

# Clear the results of any previous execution first. The exploration cells only
# check `'df_metrics' in locals()`, so if this run is skipped or the analysis
# raises, an old DataFrame must not be left behind to be displayed as if current.
globals().pop('file_metrics', None)
globals().pop('df_metrics', None)

# Only proceed when the commit-selection cell actually produced a hash.
if 'commit_hash' in locals():
    file_metrics = analyze_repo_at_commit(REPO_PATH, commit_hash)

    # Convert the list of dictionaries to a pandas DataFrame for easy analysis
    df_metrics = pd.DataFrame(file_metrics)

    print(f"\nAnalysis complete. Found metrics for {len(df_metrics)} files.")
else:
    print("\nSkipping analysis because a target commit could not be found.")


Checking out commit 66b3d2d...
Analyzing files in commit 66b3d2d...


Analyzing files: 100%|████████████████████████████████████████████████| 765/765 [00:10<00:00, 70.91file/s]



Returning repo to original state (66b3d2d)...
Repo state restored.

Analysis complete. Found metrics for 765 files.


In [11]:
# Step 3: Explore the Results
# With the per-file metrics available, start with a quick look at the table
# itself; later cells rank the files by complexity and by size.

# Preview the leading rows, but only when an analysis result exists and is non-empty.
if 'df_metrics' in locals():
    if not df_metrics.empty:
        display(df_metrics.head())

Unnamed: 0,nloc,complexity,token_count,function_count,average_complexity,file_path
0,405,139,2739,87,1.6,deps/fast_float/fast_float.h
1,11,3,77,1,3.0,deps/fast_float/fast_float_strtod.cpp
2,0,0,0,0,0.0,deps/fast_float/fast_float_strtod.h
3,216,43,1618,11,3.91,deps/fpconv/fpconv_dtoa.c
4,0,0,0,0,0.0,deps/fpconv/fpconv_dtoa.h


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# metric columns; skipped when no non-empty analysis result is available.
if 'df_metrics' in locals() and not df_metrics.empty:
    summary_stats = df_metrics.describe()
    display(summary_stats)

Unnamed: 0,nloc,complexity,token_count,function_count,average_complexity
count,765.0,765.0,765.0,765.0,765.0
mean,238.777778,61.203922,1719.645752,13.47451,2.786379
std,625.023459,178.712074,4507.007136,30.634536,3.870473
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,62.0,13.0,437.0,5.0,2.19
75%,196.0,44.0,1409.0,14.0,3.64
max,8511.0,2477.0,60202.0,545.0,59.0


In [13]:
# Find the top 10 most complex files in this commit, ranked by the file's total
# cyclomatic complexity (the `complexity` column), highest first.
if 'df_metrics' in locals() and not df_metrics.empty:
    print("\n--- Top 10 Most Complex Files (by total cyclomatic complexity) ---")
    # The default sort algorithm (quicksort) gives no guarantee about the order
    # of rows with equal values, so a tie near the cut-off could change which
    # rows are shown between runs. A stable sort makes the ranking reproducible.
    display(
        df_metrics
        .sort_values(by='complexity', ascending=False, kind='stable')
        .head(10)
    )


--- Top 10 Most Complex Files (by total cyclomatic complexity) ---


Unnamed: 0,nloc,complexity,token_count,function_count,average_complexity,file_path
648,8511,2477,60202,231,10.72,src/redis-cli.c
622,7178,2164,51001,545,3.97,src/module.c
666,4992,1432,36318,221,6.48,src/server.c
553,4069,1215,28559,205,5.93,src/cluster_legacy.c
665,3520,1064,26010,125,8.51,src/sentinel.c
698,3075,958,22749,143,6.7,src/t_zset.c
630,3075,951,22099,158,6.02,src/networking.c
643,3015,886,21077,81,10.94,src/rdb.c
653,3112,821,19717,103,7.97,src/replication.c
696,2582,721,20109,77,9.36,src/t_stream.c


In [14]:
# Find the top 10 largest files by Non-Comment Lines of Code (NLOC), largest first.
if 'df_metrics' in locals() and not df_metrics.empty:
    print("\n--- Top 10 Largest Files (by NLOC) ---")
    # Ties do occur here (two files share 3075 NLOC in the recorded output), and
    # the default quicksort does not guarantee their relative order. A stable
    # sort keeps the displayed ranking reproducible from run to run.
    display(
        df_metrics
        .sort_values(by='nloc', ascending=False, kind='stable')
        .head(10)
    )


--- Top 10 Largest Files (by NLOC) ---


Unnamed: 0,nloc,complexity,token_count,function_count,average_complexity,file_path
648,8511,2477,60202,231,10.72,src/redis-cli.c
622,7178,2164,51001,545,3.97,src/module.c
666,4992,1432,36318,221,6.48,src/server.c
553,4069,1215,28559,205,5.93,src/cluster_legacy.c
665,3520,1064,26010,125,8.51,src/sentinel.c
653,3112,821,19717,103,7.97,src/replication.c
630,3075,951,22099,158,6.02,src/networking.c
698,3075,958,22749,143,6.7,src/t_zset.c
643,3015,886,21077,81,10.94,src/rdb.c
222,2796,706,16757,121,5.83,deps/jemalloc/src/jemalloc.c
