# Exploratory Data Analysis (EDA) Deep Dive

This notebook performs a comprehensive analysis of the CodeGuard dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the cleaned, labeled function table from its path relative to this notebook.
DATA_PATH = '../data/processed/labeled_functions_clean.csv'
df = pd.read_csv(DATA_PATH)

# Quick sanity check: (rows, columns) followed by a preview of the first rows.
print(f"Dataset shape: {df.shape}")
preview = df.head()
display(preview)

## Code Smell Distribution

In [None]:
# Columns treated as code-smell flags; later cells reuse this list.
smells = [
    'has_long_method',
    'has_high_complexity',
    'has_too_many_params',
    'has_deep_nesting',
    'has_no_docstring',
]

# Per-column sums, largest first.
# NOTE(review): presumably each column is a 0/1 (or boolean) flag, so the sum
# is the number of flagged rows - confirm against the dataset schema.
counts = df[smells].sum().sort_values(ascending=False)

# Horizontal bars: one bar per smell column, length = column sum.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=counts.values, y=counts.index, ax=ax)
ax.set_title('Code Smell Counts')
ax.set_xlabel('Count')
plt.show()

## Co-occurrence Matrix (Pairwise Correlation)

In [None]:
# Pairwise Pearson correlation between the smell columns.
# DataFrame.corr() does not return co-occurrence counts, so the title and
# colour bar name the metric actually shown.
# NOTE(review): if the columns are 0/1 flags, Pearson r here is the phi
# coefficient - confirm the column dtypes.
smell_corr = df[smells].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    smell_corr,
    annot=True,
    fmt='.2f',  # fixed two decimals so every annotation has the same width
    cmap='coolwarm',
    vmin=-1,  # pin the colour scale to the full correlation range so that
    vmax=1,   # zero always maps to the neutral midpoint of the palette
    cbar_kws={'label': 'Pearson r'},
    ax=ax,
)
ax.set_title('Code Smell Co-occurrence (Pearson correlation)')
plt.show()

## Quality Score Distribution

In [None]:
# Histogram (20 bins) of quality_score with a KDE overlay, plus a dashed red
# vertical line marking the column mean.
scores = df['quality_score']

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(scores, bins=20, kde=True, ax=ax)
ax.axvline(scores.mean(), color='r', linestyle='--', label='Mean')
ax.set_title('Quality Score Distribution')
ax.legend()
plt.show()