# Code Quality Label Analysis

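This notebook analyzes the labeled function dataset: how prevalent each code smell is, how the quality scores are distributed, and how the code smells correlate with one another.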

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the labeled-function table from the processed-data directory
# (path is relative to the notebook's working directory), then report
# how many rows were loaded and preview the first few of them.
csv_path = '../data/processed/labeled_functions.csv'
df = pd.read_csv(csv_path)

print(f"Total functions: {len(df)}")
display(df.head())

## Code Smell Distribution

In [None]:
# Columns flagging each tracked code smell. They are summed below, so they
# are presumably boolean / 0-1 indicators -- TODO confirm against the
# labeling step that produced the CSV.
smells = [
    'has_long_method',
    'has_high_complexity',
    'has_too_many_params',
    'has_deep_nesting',
    'has_no_docstring',
]

# Per-smell totals, most frequent first.
counts = df[smells].sum().sort_values(ascending=False)

# Bar chart with the totals on the x-axis and the smell names on the y-axis.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=counts.values, y=counts.index, ax=ax)
ax.set_title('Prevalence of Code Smells')
ax.set_xlabel('Count')
plt.show()

# The same totals as a share of all rows, rounded to one decimal place.
print("Percentage of functions with each smell:")
print(counts.div(len(df)).mul(100).round(1))

## Quality Score Distribution

In [None]:
# Histogram (20 bins) of the `quality_score` column with a KDE curve overlaid.
# The axis label states a 0-100 range; that range is not enforced here.
scores = df['quality_score']

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(scores, bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Quality Scores')
ax.set_xlabel('Score (0-100)')
plt.show()

# Print both summary statistics with the same two-decimal precision so the
# numbers line up and the median is not shown with raw float repr.
print(f"Average Score: {scores.mean():.2f}")
print(f"Median Score: {scores.median():.2f}")

## Correlation Analysis

In [None]:
# Heatmap of the pairwise correlations between the smell indicator columns
# (uses `df` and the `smells` list defined in earlier cells; `DataFrame.corr`
# is called with its defaults).
plt.figure(figsize=(8, 6))
sns.heatmap(
    df[smells].corr(),
    annot=True,
    cmap='coolwarm',
    # Pin the color scale to the full correlation range. Without this,
    # seaborn scales to the observed min/max, so the neutral midpoint of the
    # diverging colormap would not correspond to zero correlation.
    vmin=-1,
    vmax=1,
)
plt.title('Correlation between Code Smells')
plt.show()