# EDA: AMI Mortality / Arrhythmia
Set the `DATASET_PATH` environment variable, or edit the default path in the first code cell below.

In [None]:
# Updated to use new modular EDA structure
import os
import pandas as pd
from src.data import load_dataset
from src.eda import EDAAnalyzer, quick_eda

# Resolve the dataset location: the DATASET_PATH environment variable wins,
# otherwise fall back to the bundled fragment.
DATASET_PATH = os.getenv('DATASET_PATH', '../DATA/recuima-020425-fragment.csv')
df = load_dataset(DATASET_PATH)

# Basic orientation: dimensions and column names.
column_names = list(df.columns)
print(f"Dataset shape: {df.shape}")
print(f"Columns: {column_names}")

# Run the quick EDA pass; PCA is skipped here (run_pca=False).
print("\nPerforming quick EDA...")
analyzer = quick_eda(df, run_pca=False)

# Dump the analyzer's summary report, one "key: value" entry per line.
summary = analyzer.generate_summary_report()
print("\nSummary Report:")
for report_key, report_value in summary.items():
    print(f"  {report_key}: {report_value}")

# Display univariate stats for first few columns
def _format_stat(value):
    """Return *value* formatted to two decimals, or 'N/A' when it is missing.

    A conditional expression cannot be placed inside an f-string format spec:
    everything after the colon is handed to ``format()`` verbatim, so
    ``f"{x:.2f if x else 'N/A'}"`` raises ``ValueError`` at runtime. The
    missing-value check therefore lives here.

    "Missing" means ``None`` or NaN; a legitimate value of 0 is formatted as
    ``0.00`` rather than being reported as 'N/A'.
    """
    if value is None or pd.isna(value):
        return 'N/A'
    return f"{value:.2f}"


print("\nUnivariate Statistics (first 5 columns):")
for col_name, stats in list(analyzer.univariate_results.items())[:5]:
    print(f"\n{col_name} ({stats.variable_type}):")
    if stats.variable_type == 'numerical':
        # Numerical columns: central tendency and spread.
        print(f"  Mean: {_format_stat(stats.mean)}")
        print(f"  Median: {_format_stat(stats.median)}")
        print(f"  Std: {_format_stat(stats.std)}")
    else:
        # Any other variable type: category count and most frequent value.
        print(f"  Categories: {stats.n_categories}")
        print(f"  Mode: {stats.mode}")


In [None]:
# Histogram of the first numerical column, when there is at least one.
numeric_cols = analyzer.numeric_cols
if len(numeric_cols) > 0:
    col = numeric_cols[0]
    print(f"\nPlotting distribution for: {col}")
    fig = analyzer.plot_distribution(col, plot_type='histogram')
    fig.show()

# A correlation matrix needs at least two numerical columns.
if len(analyzer.numeric_cols) >= 2:
    print("\nGenerating correlation matrix...")
    fig = analyzer.plot_correlation_matrix(method='pearson')
    fig.show()

# PCA is only attempted with two or more numerical variables.
n_numeric = len(analyzer.numeric_cols)
if n_numeric >= 2:
    print(f"\nPerforming PCA on {n_numeric} numerical variables...")
    try:
        # n_components=None together with variance_threshold=0.95 is passed
        # straight to the analyzer, which decides how many components to keep.
        pca_results = analyzer.perform_pca(n_components=None, variance_threshold=0.95)
        print(f"  Components selected: {pca_results.n_components}")
        total_variance = sum(pca_results.explained_variance_ratio)
        print(f"  Total variance explained: {total_variance:.2%}")

        # Scree plot of the fitted components.
        fig = analyzer.plot_pca_scree()
        fig.show()

        # Feature importance over the first three components.
        importance_df = analyzer.get_feature_importance_pca(n_components=3)
        print("\nTop 10 important features:")
        print(importance_df.head(10))
    except Exception as err:
        # Best-effort step in an exploratory notebook: report and carry on.
        print(f"PCA failed: {err}")
