# Feature Analysis

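This notebook extracts features from the labeled functions dataset, examines how each feature correlates with the quality targets (`quality_score`, `has_long_method`, `has_high_complexity`), and ranks the features by random-forest importance for predicting `quality_score`.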

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Put the parent directory on the import path so the `src` package can be imported.
# Paths are relative to the kernel's working directory.
parent_dir = os.path.abspath('../')
sys.path.append(parent_dir)
from src.features.feature_extractor import FeatureExtractor

# Read the processed CSV into the base frame; later cells use `df.columns`
# to tell original columns apart from extracted ones.
labeled_csv_path = '../data/processed/labeled_functions_clean.csv'
df = pd.read_csv(labeled_csv_path)

# Run the project's extractor over the whole dataset.
# NOTE(review): presumably returns a DataFrame holding the original columns
# plus the new feature columns -- confirm against FeatureExtractor.
extractor = FeatureExtractor()
df_features = extractor.extract_features_from_dataset(df)

# Quick sanity check: dimensions, then the first few rows.
print(f"Features shape: {df_features.shape}")
preview_rows = df_features.head()
display(preview_rows)

## Feature Correlations

In [None]:
# Label columns that every feature is correlated against.
target_cols = ['quality_score', 'has_long_method', 'has_high_complexity']

# A feature is a column that (a) was added by the extractor, i.e. is not in the
# raw dataset, (b) is not itself a target -- otherwise it would be duplicated in
# the selection below and leak the label into any model built on `feature_cols` --
# and (c) is numeric/bool, because DataFrame.corr() raises on non-numeric columns
# in pandas >= 2.0 (older versions silently drop them, which then breaks the
# `.loc[feature_cols, ...]` lookup).
feature_cols = [
    col for col in df_features.columns
    if col not in df.columns
    and col not in target_cols
    and pd.api.types.is_numeric_dtype(df_features[col])
]

# Full pairwise (Pearson) correlation matrix over features and targets.
correlations = df_features[feature_cols + target_cols].corr()

# Rows = features, columns = targets. The colour scale is pinned to [-1, 1] and
# centred on 0 so the diverging palette maps directly to sign and strength.
# Constant-valued features produce NaN correlations and render as blank cells.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlations.loc[feature_cols, target_cols],
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    ax=ax,
)
ax.set_title('Feature-Target Correlations')
plt.show()

## Feature Importance (Random Forest)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# Only rows with a known quality score can be used for fitting; the regressor
# rejects NaN targets.
has_target = df_features['quality_score'].notna()

# Drop features that have no observed value at all in the training rows.
# SimpleImputer(strategy='median') discards such columns on its own, which would
# leave X_imputed with fewer columns than `feature_cols` and break the
# name/importance pairing below. Dropping them explicitly keeps names aligned.
X = df_features.loc[has_target, feature_cols].dropna(axis=1, how='all')
y = df_features.loc[has_target, 'quality_score']
model_features = list(X.columns)

# Fill the remaining gaps with each column's median.
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
assert X_imputed.shape[1] == len(model_features), "imputer changed the number of feature columns"

# Fixed seed so the importance ranking is reproducible across runs.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_imputed, y)

# Pair every fitted column with its impurity-based importance, highest first.
importance_df = pd.DataFrame({
    'feature': model_features,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

# Horizontal bar chart of the (up to) 15 highest-ranked features.
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='importance', y='feature', data=importance_df.head(15), ax=ax)
ax.set_title('Top 15 Important Features for Quality Score')
plt.show()