# Feature Exploratory Data Analysis
## Mammography Dataset - Feature Distribution and Correlation Analysis


In [6]:
import sys
from pathlib import Path

# Make the project's `src` directory (sibling of the notebook's working directory)
# importable. The membership check keeps the cell idempotent: re-running it does not
# pile duplicate entries onto sys.path.
SRC_DIR = Path().absolute().parent / 'src'
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data_processing.data_loader import load_mammography_data

# Notebook-wide plotting defaults: seaborn's 'whitegrid' theme and a 12x6 default
# figure size for any figure that does not pass its own figsize.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})



ModuleNotFoundError: No module named 'pandas'

## 1. Load Data and Create DataFrame


In [None]:
# Load the mammography dataset as a feature matrix X and a label vector y.
# NOTE(review): the loader lives in data_processing.data_loader; the exact types it
# returns are not visible here — confirm X has one column per entry in feature_names.
X, y = load_mammography_data()

# Column labels applied to X.
# NOTE(review): assumed to match the loader's column order — verify against the loader.
feature_names = ['radius', 'texture', 'perimeter', 'area', 'smoothness', 'compactness']

# Human-readable class name for each encoded target value.
class_labels = {0: 'Benign', 1: 'Malignant'}

# Assemble the working frame: features, numeric target, readable class label.
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y
df['class'] = df['target'].map(class_labels)

# Series.map produces NaN for any value missing from the mapping. Fail fast here
# instead of letting unlabeled rows silently drop out of the per-class plots and
# group-by tables further down.
unmapped_targets = list(df.loc[df['class'].isna(), 'target'].unique())
assert not unmapped_targets, (
    f"Target values without a class label: {unmapped_targets}"
)

print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
df.head()


## 2. Feature Statistics Summary


In [None]:
# Print descriptive statistics for the feature columns, framed by a banner.
banner = "=" * 70

print(banner)
print("FEATURE STATISTICS SUMMARY")
print(banner)

feature_summary = df[feature_names].describe()
print(feature_summary)

print("\n" + banner)


FEATURE STATISTICS SUMMARY


NameError: name 'df' is not defined

## 3. Boxplots: Feature Distributions by Class


In [None]:
# One boxplot panel per feature, comparing the Benign and Malignant classes.
# The grid is sized from the number of features (three panels per row) so the cell
# keeps working if feature_names changes; six features give the original 2x3 grid
# at 18x12.
n_cols = 3
n_rows = max(1, -(-len(feature_names) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 6 * n_rows), squeeze=False)
axes = axes.ravel()

# Color palette for classes
palette = {'Benign': '#3498db', 'Malignant': '#e74c3c'}

# White diamond marker used to draw each box's mean (enabled by showmeans=True).
mean_marker = {'marker': 'D', 'markerfacecolor': 'white',
               'markeredgecolor': 'black', 'markersize': 8}

for ax, feature in zip(axes, feature_names):
    sns.boxplot(
        data=df,
        x='class',
        y=feature,
        hue='class',
        palette=palette,
        ax=ax,
        width=0.6,
        showmeans=True,
        meanprops=mean_marker
    )
    ax.set_title(f'{feature.capitalize()} Distribution by Class',
                 fontsize=14, fontweight='bold', pad=10)
    ax.set_xlabel('Class', fontsize=12, fontweight='bold')
    ax.set_ylabel(f'{feature.capitalize()}', fontsize=12, fontweight='bold')
    ax.tick_params(labelsize=10)
    ax.grid(axis='y', alpha=0.3, linestyle='--')

    # The x axis already names the classes, so a per-panel legend is redundant.
    # Remove one only if it exists: calling ax.legend() just to delete it would
    # build a new legend (and warn when there are no labeled artists).
    panel_legend = ax.get_legend()
    if panel_legend is not None:
        panel_legend.remove()

# Hide grid cells left over when the feature count is not a multiple of n_cols.
for unused_ax in axes[len(feature_names):]:
    unused_ax.set_visible(False)

fig.suptitle('Feature Distributions: Benign vs Malignant Comparison',
             fontsize=16, fontweight='bold', y=0.995)
fig.tight_layout()
plt.show()


## 4. Correlation Heatmap


In [None]:
# Pairwise correlations among the feature columns plus the numeric target column.
columns_for_corr = feature_names + ['target']
correlation_matrix = df[columns_for_corr].corr()

# Boolean mask covering the upper triangle (diagonal included), so each pair is
# drawn only once in the lower triangle.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

# Heatmap appearance, collected in one place.
heatmap_style = dict(
    mask=mask,
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1.5,
    cbar_kws={'shrink': 0.8, 'label': 'Correlation Coefficient'},
    vmin=-1,
    vmax=1,
    annot_kws={'size': 11, 'weight': 'bold'},
)

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_style)
plt.title('Feature Correlation Heatmap\n(Including Target Variable)',
          fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Text table: each feature's correlation with the target, largest value first.
banner = "=" * 70
print(banner)
print("CORRELATION WITH TARGET VARIABLE")
print(banner)

target_column = correlation_matrix['target']
features_only = target_column.drop('target')  # drop the target's self-correlation
target_corr = features_only.sort_values(ascending=False)

for name, value in target_corr.items():
    print(f"{name:15s}: {value:7.4f}")
print(banner)


## 5. Feature-to-Feature Correlation Analysis


In [None]:
# Absolute-correlation cutoff above which a feature pair is reported as highly
# correlated. Defined once so the filter and the printed messages cannot drift apart.
HIGH_CORR_THRESHOLD = 0.7

# Correlation matrix for features only (excluding target)
feature_correlation = df[feature_names].corr()

# Create heatmap for features only
plt.figure(figsize=(10, 8))
mask = np.triu(np.ones_like(feature_correlation, dtype=bool))  # Mask upper triangle

sns.heatmap(
    feature_correlation,
    mask=mask,
    annot=True,
    fmt='.3f',
    cmap='RdYlBu_r',
    center=0,
    square=True,
    linewidths=2,
    cbar_kws={'shrink': 0.8, 'label': 'Correlation Coefficient'},
    vmin=-1,
    vmax=1,
    annot_kws={'size': 12, 'weight': 'bold'}
)

plt.title('Feature-to-Feature Correlation Heatmap',
          fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Identify highly correlated features
banner = "=" * 70
print(banner)
print(f"HIGHLY CORRELATED FEATURE PAIRS (|correlation| > {HIGH_CORR_THRESHOLD})")
print(banner)

# Visit each unordered pair once. Values are looked up by label rather than by
# position, so the reported names always belong to the reported coefficient even
# if the matrix's row/column order differs from feature_names.
high_corr_pairs = []
for pos, first_feature in enumerate(feature_names):
    for second_feature in feature_names[pos + 1:]:
        corr_val = feature_correlation.loc[first_feature, second_feature]
        if abs(corr_val) > HIGH_CORR_THRESHOLD:
            high_corr_pairs.append((first_feature, second_feature, corr_val))

if high_corr_pairs:
    for feat1, feat2, pair_corr in high_corr_pairs:
        print(f"{feat1:15s} <-> {feat2:15s}: {pair_corr:7.4f}")
else:
    print(f"No highly correlated feature pairs found (threshold: {HIGH_CORR_THRESHOLD})")
print(banner)


## 6. Summary Statistics by Class


In [None]:
# Per-class summary of every feature: one small table per feature, framed by banners.
banner = "=" * 70
summary_functions = ['mean', 'std', 'min', 'max', 'median']
rows_by_class = df.groupby('class')  # grouped once, reused for every feature

print(banner)
print("FEATURE STATISTICS BY CLASS")
print(banner)

for feature in feature_names:
    print(f"\n{feature.upper()}:")
    print("-" * 70)
    stats_by_class = rows_by_class[feature].agg(summary_functions)
    print(stats_by_class)

print("\n" + banner)
