In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
import shap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from imblearn.over_sampling import SMOTE
import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# session, not only the noisy ones, so warnings emitted by the libraries
# imported above will not be shown. Consider narrowing the filter to specific
# categories/modules once the noisy sources are known.
warnings.filterwarnings('ignore')

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
# Set up paths
# The project root defaults to the original workspace location but can be
# overridden with the COMP3608_PROJECT_PATH environment variable, so the
# notebook also runs on machines where the repository lives elsewhere.
# An unset or empty variable falls back to the default.
BASE_PATH = os.environ.get('COMP3608_PROJECT_PATH') or '/workspace/COMP-3608---PROJECT'
DATA_PATH = os.path.join(BASE_PATH, 'data')                # input datasets
MODELS_PATH = os.path.join(BASE_PATH, 'models')            # models directory
RESULTS_PATH = os.path.join(BASE_PATH, 'results')          # results directory
# Sub-directory of RESULTS_PATH used for feature analysis outputs.
FEATURE_ANALYSIS_PATH = os.path.join(RESULTS_PATH, 'feature_analysis')

In [3]:
# Create feature analysis directory if it doesn't exist.
# exist_ok=True makes this idempotent on re-runs and avoids the race between
# a separate existence check and the creation call.
os.makedirs(FEATURE_ANALYSIS_PATH, exist_ok=True)


In [4]:
# Load datasets
# Read the three feature-engineered tables from DATA_PATH in one pass; the
# relative file names are given exactly as they are stored on disk.
print("Loading datasets...")
diabetes_df, stroke_df, heart_df = (
    pd.read_csv(os.path.join(DATA_PATH, relative_file))
    for relative_file in (
        'feature_engineering/diabetes_feature_engineering',
        'feature_engineering/Stroke_feature_engineering',
        'feature_engineering/heart_feature_engineering',
    )
)


Loading datasets...


In [5]:
# Report the (rows, columns) shape of each loaded dataset, one per line.
print(
    f"Diabetes dataset shape: {diabetes_df.shape}",
    f"Stroke dataset shape: {stroke_df.shape}",
    f"Heart Disease dataset shape: {heart_df.shape}",
    sep="\n",
)

Diabetes dataset shape: (100000, 12)
Stroke dataset shape: (5110, 20)
Heart Disease dataset shape: (920, 16)


In [None]:
# Helper: decide whether a column name denotes an identifier column
def _is_identifier_column(col):
    """Return True when ``col`` looks like an ID column (e.g. 'id', 'patient_id', 'patientId').

    The name is split into tokens on '_', '-', ' ' and '.', and matched on the
    whole token 'id'. A plain substring test would also discard legitimate
    features whose names merely contain the letters "id" (for example
    'Residence_type' or 'humidity').
    """
    name = str(col)
    tokens = (name.lower()
              .replace('-', '_').replace(' ', '_').replace('.', '_')
              .split('_'))
    if 'id' in tokens:
        return True
    # camelCase identifiers such as 'patientId' / 'PassengerId'
    return len(name) > 2 and name.endswith('Id') and name[-3].islower()


# Function to prepare data for feature importance analysis
def prepare_data(df, target_column, categorical_cols=None, numerical_cols=None):
    """Prepare data for feature importance analysis with SMOTE.

    Steps: infer feature columns (when not given), make a stratified 80/20
    train/test split (random_state=42), fit a StandardScaler / OneHotEncoder
    ColumnTransformer on the training split only, transform both splits, and
    oversample the processed training split with SMOTE (random_state=42).

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset including the target column.
    target_column : str
        Name of the label column in ``df``.
    categorical_cols : list of str, optional
        Columns to one-hot encode. Defaults to all 'object'/'category' columns.
    numerical_cols : list of str, optional
        Columns to standard-scale. Defaults to all numeric and boolean columns
        except identifier-like columns. Explicitly passed lists are used as
        given (only the target is removed).

    Returns
    -------
    tuple
        ``(X_train, X_test, X_train_processed, X_test_processed,
        X_train_resampled, y_train_resampled, y_train, y_test,
        feature_names, preprocessor)`` where ``feature_names`` lists the
        numerical columns followed by the one-hot encoded column names, in
        the column order of the processed matrices.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``df``.

    Notes
    -----
    Columns of ``df`` that end up in neither list are dropped by the
    ColumnTransformer (its default ``remainder`` behaviour).
    """
    if target_column not in df.columns:
        raise KeyError(f"Target column {target_column!r} not found in DataFrame")

    if categorical_cols is None:
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

    if numerical_cols is None:
        # 'number' covers every integer/float width (not just int64/float64) and
        # 'bool' keeps dummy-encoded columns; otherwise such columns would match
        # neither list and be silently discarded by the ColumnTransformer.
        inferred = df.select_dtypes(include=['number', 'bool']).columns.tolist()
        # Remove ID columns from numerical features if present
        numerical_cols = [col for col in inferred if not _is_identifier_column(col)]

    # Ensure target column is removed from features
    categorical_cols = [col for col in categorical_cols if col != target_column]
    numerical_cols = [col for col in numerical_cols if col != target_column]

    # Create preprocessing pipeline
    numerical_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Split data (stratified so both splits keep the class proportions)
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    # Fit the preprocessing on the training split only, then reuse it on the
    # test split so no test-set statistics leak into the scaler/encoder.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Apply SMOTE for balancing the training data (test split is left untouched)
    # NOTE(review): SMOTE is applied to the one-hot encoded matrix here; confirm
    # that synthetic values in the encoded categorical columns are acceptable
    # for the downstream analysis.
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)

    # Feature names follow the ColumnTransformer output order: numerical
    # columns first, then the one-hot encoded categorical columns.
    feature_names = list(numerical_cols)
    if categorical_cols:
        cat_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
        feature_names.extend(cat_feature_names)

    # Per-class counts in sorted label order. np.unique works for any label
    # dtype, whereas np.bincount only accepts non-negative integers.
    original_counts = np.unique(y_train, return_counts=True)[1]
    resampled_counts = np.unique(y_train_resampled, return_counts=True)[1]
    print(f"Original class distribution: {original_counts}")
    print(f"After SMOTE: {resampled_counts}")
    print(f"Feature count: {len(feature_names)}")

    return (X_train, X_test, X_train_processed, X_test_processed,
            X_train_resampled, y_train_resampled,
            y_train, y_test, feature_names, preprocessor)
