<a href="https://colab.research.google.com/github/Sg134-ch/Machine-Learning-Projects-/blob/main/ML_LAB_EXP_05_HEART_DISEASE_PREDICTION_BY_23102C0051.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
import os
warnings.filterwarnings('ignore')

print("HEART DISEASE PREDICTION SOLUTION")

def find_file(filename, search_paths):
    for path in search_paths:
        full_path = os.path.join(path, filename)
        if os.path.exists(full_path):
            return full_path
    return None

search_paths = ['.', '/mnt/user-data/uploads', '/home/claude']

print("\nLoading data...")

train_path = find_file('train.csv', search_paths)
if not train_path:
    print("ERROR: train.csv not found!")
    exit(1)
train = pd.read_csv(train_path)
print(f"✓ Train: {train.shape}")

test_path = find_file('test.csv', search_paths)
if not test_path:
    print("ERROR: test.csv not found!")
    exit(1)
test = pd.read_csv(test_path)
print(f"✓ Test: {test.shape}")

print("DATA PREPARATION")


target_col = 'Heart Disease'
if target_col not in train.columns:
    for col in ['heart_disease', 'HeartDisease', 'target', 'y']:
        if col in train.columns:
            target_col = col
            break

print(f"\nTarget column: '{target_col}'")
print(f"Unique values: {train[target_col].unique()}")

# Separate features FIRST
X_train = train.drop([target_col, 'id'], axis=1, errors='ignore')
X_test = test.drop(['id'], axis=1, errors='ignore')
test_ids = test['id']

# Encode target separately
y_train_raw = train[target_col]
if y_train_raw.dtype == 'object':
    print(f"\n⚠ Encoding categorical target...")
    le_target = LabelEncoder()
    y_train = le_target.fit_transform(y_train_raw)
    print(f"Mapping: {dict(zip(le_target.classes_, range(len(le_target.classes_))))}")
else:
    y_train = y_train_raw.values

print(f"\nTarget distribution:")
unique, counts = np.unique(y_train, return_counts=True)
for val, count in zip(unique, counts):
    print(f"  Class {val}: {count} ({count/len(y_train)*100:.1f}%)")

print("FEATURE ENGINEERING")

def create_features(df):
    df = df.copy()

    # Heart disease features
    if 'age' in df.columns:
        df['age_squared'] = df['age'] ** 2
        df['age_risk'] = (df['age'] > 60).astype(int)

    if 'cholesterol' in df.columns or 'chol' in df.columns:
        chol_col = 'cholesterol' if 'cholesterol' in df.columns else 'chol'
        df['high_cholesterol'] = (df[chol_col] > 240).astype(int)

    if 'trestbps' in df.columns:
        df['high_bp'] = (df['trestbps'] > 140).astype(int)

    if 'thalach' in df.columns:
        df['low_heart_rate'] = (df['thalach'] < 100).astype(int)

    if 'age' in df.columns and 'thalach' in df.columns:
        df['age_hr'] = df['age'] * df['thalach']

    # Statistical features
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_cols) > 0:
        df['num_mean'] = df[numeric_cols].mean(axis=1)
        df['num_std'] = df[numeric_cols].std(axis=1).fillna(0)

    return df

X_train = create_features(X_train)
X_test = create_features(X_test)

# Encode categorical features
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
if categorical_cols:
    print(f"Encoding {len(categorical_cols)} categorical features...")
    for col in categorical_cols:
        le = LabelEncoder()
        X_train[col] = le.fit_transform(X_train[col].astype(str))
        X_test[col] = le.transform(X_test[col].astype(str))

# Fill missing
X_train = X_train.fillna(X_train.median())
X_test = X_test.fillna(X_test.median())

print(f"Final features: {X_train.shape[1]}")

print("MODEL TRAINING")

class_ratio = np.sum(y_train == 0) / np.sum(y_train == 1)
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

models = {
    'LightGBM': LGBMClassifier(
        n_estimators=150,
        learning_rate=0.08,
        max_depth=6,
        num_leaves=25,
        class_weight='balanced',
        random_state=42,
        verbose=-1,
        n_jobs=-1
    ),
    'XGBoost': XGBClassifier(
        n_estimators=150,
        learning_rate=0.08,
        max_depth=6,
        scale_pos_weight=class_ratio,
        random_state=42,
        eval_metric='logloss',
        verbosity=0,
        n_jobs=-1,
        use_label_encoder=False
    ),
    'RandomForest': RandomForestClassifier(
        n_estimators=150,
        max_depth=10,
        min_samples_split=20,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )
}

predictions = {}
cv_scores = {}

for idx, (name, model) in enumerate(models.items(), 1):
    print(f"\n[{idx}/{len(models)}] Training {name}...")

    fold_scores = []
    for train_idx, val_idx in cv.split(X_train, y_train):
        X_tr = X_train.iloc[train_idx]
        X_val = X_train.iloc[val_idx]
        y_tr = y_train[train_idx]
        y_val = y_train[val_idx]

        # Clone model
        model_clone = model.__class__(**model.get_params())

        # Fit model
        model_clone.fit(X_tr, y_tr)

        # Predict
        pred = model_clone.predict_proba(X_val)[:, 1]
        score = roc_auc_score(y_val, pred)
        fold_scores.append(score)

    cv_score = np.mean(fold_scores)
    cv_scores[name] = cv_score
    print(f"  CV ROC AUC: {cv_score:.5f} (+/- {np.std(fold_scores):.5f})")

    # Train on full data
    model.fit(X_train, y_train)
    predictions[name] = model.predict_proba(X_test)[:, 1]

print("ENSEMBLE")

weights = {name: score for name, score in cv_scores.items()}
total = sum(weights.values())
weights = {k: v/total for k, v in weights.items()}

print("\nWeights:")
for name, weight in sorted(weights.items(), key=lambda x: x[1], reverse=True):
    print(f"  {name:15s}: {weight:.4f}")

final_pred = sum(weights[name] * predictions[name] for name in predictions)

print("SAVING SUBMISSION")


submission = pd.DataFrame({
    'id': test_ids,
    'Heart Disease': final_pred
})

output_dir = '/mnt/user-data/outputs'
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, 'submission.csv')

submission.to_csv(output_path, index=False)
submission.to_csv('submission.csv', index=False)

print(f"\n✓ Saved: {output_path}")
print("STATISTICS")
print(f"Range: [{final_pred.min():.5f}, {final_pred.max():.5f}]")
print(f"Mean: {final_pred.mean():.5f}")
print(f"Expected CV AUC: {np.mean(list(cv_scores.values())):.5f}")

HEART DISEASE PREDICTION SOLUTION

Loading data...
✓ Train: (630000, 15)
✓ Test: (270000, 14)
DATA PREPARATION

Target column: 'Heart Disease'
Unique values: ['Presence' 'Absence']

⚠ Encoding categorical target...
Mapping: {'Absence': 0, 'Presence': 1}

Target distribution:
  Class 0: 347546 (55.2%)
  Class 1: 282454 (44.8%)
FEATURE ENGINEERING
Final features: 15
MODEL TRAINING

[1/3] Training LightGBM...
  CV ROC AUC: 0.95469 (+/- 0.00013)

[2/3] Training XGBoost...
  CV ROC AUC: 0.95469 (+/- 0.00019)

[3/3] Training RandomForest...
  CV ROC AUC: 0.95206 (+/- 0.00015)
ENSEMBLE

Weights:
  LightGBM       : 0.3336
  XGBoost        : 0.3336
  RandomForest   : 0.3327
SAVING SUBMISSION

✓ Saved: /mnt/user-data/outputs/submission.csv
STATISTICS
Range: [0.00351, 0.99880]
Mean: 0.46768
Expected CV AUC: 0.95381
