In [None]:
# eda_model_comparison.py
# Performs EDA and model comparison for the fitness and wellness platform

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Select matplotlib's non-interactive 'agg' backend. Every figure in this
# notebook is written to a PNG with savefig() and then closed, so nothing is
# rendered inline; open the saved files to inspect the plots.
plt.switch_backend('agg')  # file-only rendering, no display required

In [None]:
# 1. Load and Inspect Datasets
# Reads the two CSV inputs, keeps the three sleep columns used later, and
# prints schema and missing-value summaries for both frames.
print("Loading datasets...")
heart_df = pd.read_csv('../backend/data/heart_failure.csv')

# Rename through an explicit mapping rather than positional assignment so each
# new name is visibly tied to its source column.
# NOTE(review): 'Heart Rate' is relabelled 'MaxHR' to line up with heart_df's
# column name; confirm the two measurements are actually comparable.
sleep_df = pd.read_csv('../backend/data/sleep_health.csv')[
    ['Heart Rate', 'Daily Steps', 'Sleep Duration']
].rename(columns={
    'Heart Rate': 'MaxHR',
    'Daily Steps': 'Steps',
    'Sleep Duration': 'SleepHours',
})

# Display basic info.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
print("\nHeart Failure Dataset Info:")
heart_df.info()
print("\nSleep Health Dataset Info:")
sleep_df.info()

# Check for missing values (per-column NaN counts)
print("\nMissing Values in Heart Failure:")
print(heart_df.isnull().sum())
print("\nMissing Values in Sleep Health:")
print(sleep_df.isnull().sum())

In [None]:
# 2. Exploratory Data Analysis (EDA)
# Saves three PNGs to the working directory: feature histograms for the heart
# dataset, its correlation heatmap, and feature histograms for the sleep dataset.
print("\nGenerating EDA visualizations...")


def _save_histogram_grid(frame, panels, nrows, ncols, figsize, out_path):
    """Draw one KDE-overlaid histogram per panel and save the grid to a file.

    Args:
        frame: DataFrame holding the columns to plot.
        panels: iterable of (column, color, title) tuples, one per subplot,
            filled in row-major order.
        nrows, ncols: subplot grid shape.
        figsize: (width, height) of the figure in inches.
        out_path: destination path passed to ``Figure.savefig``.
    """
    fig, axes = plt.subplots(nrows, ncols, figsize=figsize)
    # np.ravel copes with 2-D grids, 1-D rows and the single-Axes case alike.
    for ax, (column, color, title) in zip(np.ravel(axes), panels):
        sns.histplot(frame[column], kde=True, color=color, ax=ax)
        ax.set_title(title)
    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)  # release the figure so repeated runs do not accumulate memory


# Distribution of key features in heart failure dataset
_save_histogram_grid(
    heart_df,
    [
        ('Age', 'blue', 'Age Distribution'),
        ('RestingBP', 'green', 'Resting BP Distribution'),
        ('Cholesterol', 'red', 'Cholesterol Distribution'),
        ('MaxHR', 'purple', 'Max Heart Rate Distribution'),
    ],
    nrows=2, ncols=2, figsize=(12, 8),
    out_path='heart_failure_distributions.png',
)

# Correlation matrix for heart failure dataset.
# Restrict to numeric/bool columns first: on pandas >= 2.0 DataFrame.corr()
# raises if the frame contains any non-numeric column, whereas older versions
# dropped such columns silently. If every column is already numeric this
# selection is a no-op.
numeric_heart_df = heart_df.select_dtypes(include=['number', 'bool'])
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(numeric_heart_df.corr(), annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix - Heart Failure Dataset')
fig.savefig('heart_failure_correlation.png')
plt.close(fig)

# Distribution of sleep dataset features
_save_histogram_grid(
    sleep_df,
    [
        ('MaxHR', 'blue', 'Heart Rate (Sleep Dataset)'),
        ('Steps', 'green', 'Daily Steps'),
        ('SleepHours', 'purple', 'Sleep Duration'),
    ],
    nrows=1, ncols=3, figsize=(12, 4),
    out_path='sleep_health_distributions.png',
)

In [None]:
# 3. Data Preprocessing
# Builds the feature frame, splits it into train/test, then imputes and scales
# using statistics learned from the training rows only.
print("\nPreprocessing data...")
X = heart_df[['Age', 'RestingBP', 'Cholesterol', 'MaxHR']].copy()

# Attach the sleep-study columns by row index. Missing sleep values are filled
# with the sleep column's own mean; any heart_df row whose index has no
# counterpart in sleep_df is left NaN here and handled by the imputer below.
# NOTE(review): heart_df and sleep_df are loaded from separate files, so this
# pairs rows purely by position; confirm the resulting Steps/SleepHours values
# are meant to be synthetic rather than per-patient measurements.
for sleep_col in ('Steps', 'SleepHours'):
    X[sleep_col] = sleep_df[sleep_col].iloc[:len(X)].fillna(sleep_df[sleep_col].mean())
y = heart_df['HeartDisease']

# Split data BEFORE fitting any transformer. Fitting the imputer/scaler on the
# full dataset would leak test-set means and variances into training and make
# the reported scores optimistic. The split itself is unchanged (same
# test_size and random_state), so the same rows land in each partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Impute missing values: learn column means on the training rows only
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Scale features: learn mean/std on the training rows only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# 4. Model Training and Comparison
# Fits each candidate classifier on the training split, scores it on the test
# split, tabulates the scores and saves two comparison charts as PNGs.
print("\nTraining and evaluating models...")
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Support Vector Machine': SVC(probability=True, random_state=42),
}

# Column label -> scoring function; insertion order fixes the table's column order.
scorers = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1-Score': f1_score,
}

# Train and evaluate models: one result row per classifier
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    row = {'Model': name}
    for label, score_fn in scorers.items():
        row[label] = score_fn(y_test, y_pred)
    results.append(row)

# Display results
results_df = pd.DataFrame(results)
print("\nModel Performance Comparison:")
print(results_df)

# Visualize accuracy comparison
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Model', y='Accuracy', data=results_df, palette='viridis', ax=ax)
ax.set_title('Model Accuracy Comparison')
ax.set_ylabel('Accuracy')
fig.savefig('model_accuracy_comparison.png')
plt.close(fig)

# Visualize other metrics: one panel per metric on a 2x2 grid
metrics = list(scorers)
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for ax, metric in zip(axes.flat, metrics):
    sns.barplot(x='Model', y=metric, data=results_df, palette='magma', ax=ax)
    ax.set_title(f'{metric} Comparison')
    ax.set_ylabel(metric)
fig.tight_layout()
fig.savefig('model_metrics_comparison.png')
plt.close(fig)

In [None]:
# 5. Conclusion
# Derive the recommendation from the measured results instead of hard-coding a
# model name, so the printed conclusion always matches the table above it.
print("\nConclusion:")

# F1-Score is used as the selection criterion because it balances precision and
# recall; idxmax() returns the first row on ties, i.e. the order models were listed.
selection_metric = 'F1-Score'
best_row = results_df.loc[results_df[selection_metric].idxmax()]
best_model_name = best_row['Model']

print(
    f"Based on the performance metrics, {best_model_name} is recommended for the fitness platform: "
    f"it achieved the highest {selection_metric} ({best_row[selection_metric]:.3f}) "
    f"with an accuracy of {best_row['Accuracy']:.3f} on the held-out test set. "
    f"Update `ml_model.py` to use {best_model_name}; re-run this notebook whenever the data changes, "
    "as the ranking may differ."
)