# Exploratory Data Analysis
## Privacy-Preserving Medical AI Pipeline

This notebook explores the medical datasets that will be used for training ML models.

**Datasets:**
1. Breast Cancer Wisconsin - Binary classification (Malignant/Benign)
2. Cleveland Heart Disease - Binary classification (Disease/No Disease)

**Goal:** Understand the data before training models that will run on encrypted data.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and 3.8 removed the old names, so use whichever spelling this install
# provides. If neither is available we fall back to the modern name so that
# plt.style.use() still raises its usual error instead of failing silently.
style_candidates = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
plot_style = next(
    (name for name in style_candidates if name in plt.style.available),
    style_candidates[0],
)
plt.style.use(plot_style)
sns.set_palette('husl')

print('Libraries loaded successfully!')

## 1. Breast Cancer Dataset

In [None]:
# Read the four pre-split breast cancer CSVs (features/targets x train/test).
# The tuple order below must match the unpacking order on the next statement.
bc_csv_paths = (
    '../data/breast_cancer/X_train.csv',
    '../data/breast_cancer/X_test.csv',
    '../data/breast_cancer/y_train.csv',
    '../data/breast_cancer/y_test.csv',
)
X_train_bc, X_test_bc, y_train_bc, y_test_bc = map(pd.read_csv, bc_csv_paths)

# Quick size report for the split.
print(f'Training samples: {len(X_train_bc)}')
print(f'Test samples: {len(X_test_bc)}')
print(f'Features: {len(X_train_bc.columns)}')

# Last expression: rich preview of the first training rows.
X_train_bc.head()

In [None]:
# Target distribution
# Class encoding follows the bar-chart axis label below: 0 = Malignant, 1 = Benign.
bc_class_names = {0: 'Malignant', 1: 'Benign'}
bc_class_colors = {0: '#e74c3c', 1: '#2ecc71'}

# value_counts() orders classes by frequency, not by class value, so fixed
# positional label/colour lists end up on the wrong class whenever the
# majority class changes. Sort by class value and derive labels and colours
# from the index instead. Unknown class values fall back to their raw value
# and a neutral grey.
bc_counts = y_train_bc['target'].value_counts().sort_index()
bc_labels = [bc_class_names.get(cls, str(cls)) for cls in bc_counts.index]
bc_colors = [bc_class_colors.get(cls, '#95a5a6') for cls in bc_counts.index]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Pie chart
bc_counts.plot.pie(
    ax=axes[0],
    labels=bc_labels,
    autopct='%1.1f%%',
    colors=bc_colors
)
axes[0].set_title('Breast Cancer - Target Distribution')
axes[0].set_ylabel('')

# Bar chart (same class order and colours as the pie chart)
bc_counts.plot.bar(ax=axes[1], color=bc_colors, rot=0)
axes[1].set_title('Breast Cancer - Sample Counts')
axes[1].set_xlabel('Target (0=Malignant, 1=Benign)')
axes[1].set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Correlation of every feature column with the target, strongest positive first.
bc_feature_corr = X_train_bc.corrwith(y_train_bc['target'])
correlations = bc_feature_corr.sort_values(ascending=False)

# Red bars for negative correlations, green for everything else.
bc_bar_colors = correlations.lt(0).map({True: '#e74c3c', False: '#2ecc71'}).tolist()

fig_corr, ax_corr = plt.subplots(figsize=(10, 8))
correlations.plot.barh(ax=ax_corr, color=bc_bar_colors)
ax_corr.set_title('Feature Correlations with Target (Breast Cancer)')
ax_corr.set_xlabel('Correlation')
plt.tight_layout()
plt.show()

# Both ends of the sorted series: head() is the largest values, tail() the smallest.
print('\nTop 5 positive correlations:')
print(correlations.head())
print('\nTop 5 negative correlations:')
print(correlations.tail())

## 2. Heart Disease Dataset

In [None]:
# Read the four pre-split heart disease CSVs (features/targets x train/test).
# The tuple order below must match the unpacking order on the next statement.
hd_csv_paths = (
    '../data/heart_disease/X_train.csv',
    '../data/heart_disease/X_test.csv',
    '../data/heart_disease/y_train.csv',
    '../data/heart_disease/y_test.csv',
)
X_train_hd, X_test_hd, y_train_hd, y_test_hd = map(pd.read_csv, hd_csv_paths)

# Companion table describing the feature columns.
feature_info = pd.read_csv('../data/heart_disease/feature_info.csv')

# Quick size report for the split.
print(f'Training samples: {len(X_train_hd)}')
print(f'Test samples: {len(X_test_hd)}')
print(f'Features: {len(X_train_hd.columns)}')
print('\nFeature descriptions:')

# Last expression: rich display of the feature description table.
feature_info

In [None]:
# Target distribution
# Class encoding follows the bar-chart axis label below: 0 = No Disease, 1 = Disease.
hd_class_names = {0: 'No Disease', 1: 'Disease'}
hd_class_colors = {0: '#2ecc71', 1: '#e74c3c'}

# value_counts() orders classes by frequency, not by class value, so fixed
# positional label/colour lists end up on the wrong class whenever the
# majority class changes. Sort by class value and derive labels and colours
# from the index instead. Unknown class values fall back to their raw value
# and a neutral grey.
hd_counts = y_train_hd['target'].value_counts().sort_index()
hd_labels = [hd_class_names.get(cls, str(cls)) for cls in hd_counts.index]
hd_colors = [hd_class_colors.get(cls, '#95a5a6') for cls in hd_counts.index]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Pie chart
hd_counts.plot.pie(
    ax=axes[0],
    labels=hd_labels,
    autopct='%1.1f%%',
    colors=hd_colors
)
axes[0].set_title('Heart Disease - Target Distribution')
axes[0].set_ylabel('')

# Bar chart (same class order and colours as the pie chart)
hd_counts.plot.bar(ax=axes[1], color=hd_colors, rot=0)
axes[1].set_title('Heart Disease - Sample Counts')
axes[1].set_xlabel('Target (0=No Disease, 1=Disease)')
axes[1].set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Correlation of every feature column with the target, strongest positive first.
hd_feature_corr = X_train_hd.corrwith(y_train_hd['target'])
correlations_hd = hd_feature_corr.sort_values(ascending=False)

# Red bars for negative correlations, green for everything else.
hd_bar_colors = correlations_hd.lt(0).map({True: '#e74c3c', False: '#2ecc71'}).tolist()

fig_corr_hd, ax_corr_hd = plt.subplots(figsize=(10, 6))
correlations_hd.plot.barh(ax=ax_corr_hd, color=hd_bar_colors)
ax_corr_hd.set_title('Feature Correlations with Target (Heart Disease)')
ax_corr_hd.set_xlabel('Correlation')
plt.tight_layout()
plt.show()

## 3. Summary Statistics

In [None]:
# Text recap of both datasets: total size, feature count, split sizes and
# the training-set class counts, framed by a 60-character rule.
banner = '=' * 60

print(banner)
print('DATASET SUMMARY')
print(banner)
print(f'''
Breast Cancer Wisconsin:
  - Samples: {len(X_train_bc) + len(X_test_bc)}
  - Features: {len(X_train_bc.columns)}
  - Train/Test: {len(X_train_bc)}/{len(X_test_bc)}
  - Class balance: {y_train_bc['target'].value_counts().to_dict()}

Cleveland Heart Disease:
  - Samples: {len(X_train_hd) + len(X_test_hd)}
  - Features: {len(X_train_hd.columns)}
  - Train/Test: {len(X_train_hd)}/{len(X_test_hd)}
  - Class balance: {y_train_hd['target'].value_counts().to_dict()}
''')
print(banner)
print('\nNext: Train models with scripts/train_model.py')