In [None]:
!pip install xgboost scikit-learn pandas numpy matplotlib --quiet


# Imports & Config

In [78]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import xgboost as xgb

TRAIN_CSV = "/content/Heart_Attack_training_dataset.csv"
TEST_CSV = "/content/Hear_Attack_evaluation_dataset.csv"
TEAM_CODE = "EM06"
TEAM_NAME = "Sindorai"
OUT_PRED_CSV = f"{TEAM_CODE}_{TEAM_NAME}_Task1_Predictions.csv"

TARGET_COL = "heart_attack_risk"
ID_COL = "patient_id"

# Preprocessing Functions

In [67]:
def parse_bp(df):
    sys_vals, dia_vals = [], []
    for val in df["bp"].astype(str).tolist():
        if "/" in val:
            try:
                s, d = val.split("/")
                sys_vals.append(float(s))
                dia_vals.append(float(d))
            except:
                sys_vals.append(np.nan)
                dia_vals.append(np.nan)
        else:
            sys_vals.append(np.nan)
            dia_vals.append(np.nan)
    df["bp_sys"] = sys_vals
    df["bp_dia"] = dia_vals
    return df.drop(columns=["bp"])

# Load
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

print("Original shapes:")
print(f"Train: {train_df.shape}, Test: {test_df.shape}")
print(f"\nClass distribution:")
print(train_df[TARGET_COL].value_counts(normalize=True))

# Parse BP
train_df = parse_bp(train_df)
test_df = parse_bp(test_df)


Original shapes:
Train: (7963, 26), Test: (800, 25)

Class distribution:
heart_attack_risk
0    0.656034
1    0.343966
Name: proportion, dtype: float64


# Feature Selection

In [68]:
# REMOVED: country, continent, hemisphere (likely noise)
# KEPT: Only medically relevant features

# Binary categorical (keep as 0/1)
binary_cats = ["sex", "diabetes", "family_history", "smoking",
               "obesity", "alcohol", "prev_heart_prob", "med_use"]

# Ordinal categorical (needs proper ordering)
ordinal_cats = ["diet"]  # Poor < Average < Healthy

# Numeric features
num_cols = ["age", "chol", "hr", "exercise_hr_wk", "stress_lvl",
            "sedentary_hr", "income", "bmi", "triglycerides",
            "phys_act_days", "sleep_hr", "bp_sys", "bp_dia"]


In [69]:
diet_mapping = {
    'Poor': 0,
    'Average': 1,
    'Healthy': 2,
    'Unhealthy': 0,  # Treat as Poor
    np.nan: 1  # Default to Average
}

for df in [train_df, test_df]:
    df['diet'] = df['diet'].map(diet_mapping).fillna(1)

for col in binary_cats:
    for df in [train_df, test_df]:
        # Handle any text values
        if df[col].dtype == 'object':
            df[col] = df[col].map({'Male': 1, 'Female': 0,
                                   'Yes': 1, 'No': 0,
                                   '1': 1, '0': 0,
                                   1: 1, 0: 0})
        # Fill missing with mode
        mode_val = train_df[col].mode()[0] if len(train_df[col].mode()) > 0 else 0
        df[col].fillna(mode_val, inplace=True)
        df[col] = df[col].astype(int)



for col in num_cols:
    median_val = train_df[col].median()
    train_df[col].fillna(median_val, inplace=True)
    test_df[col].fillna(median_val, inplace=True)

# Scale numeric
scaler = StandardScaler()
train_scaled = pd.DataFrame(
    scaler.fit_transform(train_df[num_cols]),
    columns=num_cols,
    index=train_df.index
)
test_scaled = pd.DataFrame(
    scaler.transform(test_df[num_cols]),
    columns=num_cols,
    index=test_df.index
)


all_features = num_cols + binary_cats + ordinal_cats

X = pd.concat([
    train_scaled,
    train_df[binary_cats + ordinal_cats].reset_index(drop=True)
], axis=1)

X_test_final = pd.concat([
    test_scaled,
    test_df[binary_cats + ordinal_cats].reset_index(drop=True)
], axis=1)

y = train_df[TARGET_COL]

print(f"\nFinal features: {X.shape[1]}")
print(f"Features used: {list(X.columns)}")


Final features: 22
Features used: ['age', 'chol', 'hr', 'exercise_hr_wk', 'stress_lvl', 'sedentary_hr', 'income', 'bmi', 'triglycerides', 'phys_act_days', 'sleep_hr', 'bp_sys', 'bp_dia', 'sex', 'diabetes', 'family_history', 'smoking', 'obesity', 'alcohol', 'prev_heart_prob', 'med_use', 'diet']


# Dataset & DataLoader

In [70]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTrain: {X_train.shape}, Val: {X_val.shape}")


Train: (6370, 22), Val: (1593, 22)


#  XGBoost Model

In [71]:

print("\n" + "="*50)
print("Training XGBoost")
print("="*50)

scale_pos = (len(y_train) - sum(y_train)) / sum(y_train)
print(f"Scale pos weight: {scale_pos:.2f}")

params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "eta": 0.05,
    "max_depth": 5,
    "min_child_weight": 2,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "gamma": 0.5,
    "reg_alpha": 0.1,
    "reg_lambda": 1,
    "scale_pos_weight": scale_pos,
    "seed": 42
}

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
evals = [(dtrain, "train"), (dval, "val")]

model = xgb.train(
    params,
    dtrain,
    num_boost_round=500,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=50
)

print(f"\nBest iteration: {model.best_iteration}")
print(f"Best val AUC: {model.best_score:.4f}")


Training XGBoost
Scale pos weight: 1.91
[0]	train-auc:0.56997	val-auc:0.49475
[50]	train-auc:0.82163	val-auc:0.49670
[73]	train-auc:0.85213	val-auc:0.49743

Best iteration: 24
Best val AUC: 0.5137


# Evaluate

In [72]:

val_probs = model.predict(dval)
val_preds = (val_probs >= 0.5).astype(int)

print("\n" + "="*50)
print("Validation Metrics (threshold=0.5)")
print("="*50)
print(f"AUC      : {roc_auc_score(y_val, val_probs):.4f}")
print(f"Accuracy : {accuracy_score(y_val, val_preds):.4f}")
print(f"Precision: {precision_score(y_val, val_preds):.4f}")
print(f"Recall   : {recall_score(y_val, val_preds):.4f}")
print(f"F1       : {f1_score(y_val, val_preds):.4f}")

print(f"\nVal prob distribution:")
print(f"  Min: {val_probs.min():.4f}")
print(f"  Max: {val_probs.max():.4f}")
print(f"  Mean: {val_probs.mean():.4f}")
print(f"  Std: {val_probs.std():.4f}")



Validation Metrics (threshold=0.5)
AUC      : 0.4976
Accuracy : 0.5229
Precision: 0.3455
Recall   : 0.4325
F1       : 0.3841

Val prob distribution:
  Min: 0.2376
  Max: 0.7236
  Mean: 0.4922
  Std: 0.0616


# Feature Importance

In [73]:
importance = model.get_score(importance_type='gain')
importance_df = pd.DataFrame({
    'feature': importance.keys(),
    'importance': importance.values()
}).sort_values('importance', ascending=False)

print("\n" + "="*50)
print("Top 10 Most Important Features")
print("="*50)
print(importance_df.head(10).to_string(index=False))


Top 10 Most Important Features
       feature  importance
    stress_lvl    8.465065
           bmi    8.146898
        income    7.559237
exercise_hr_wk    7.554323
        bp_sys    7.547535
 triglycerides    7.508315
        bp_dia    7.496134
  sedentary_hr    7.494213
          chol    7.362843
            hr    7.289238


# Threshold optimization for recall

In [76]:
print("\n" + "="*50)
print("Optimizing Threshold for Recall")
print("="*50)

best_recall = 0
best_threshold = 0.5
best_f1 = 0

# Try different thresholds and track metrics
threshold_results = []
for thr in np.linspace(0.3, 0.7, 41):
    preds = (val_probs >= thr).astype(int)
    rec = recall_score(y_val, preds)
    f1 = f1_score(y_val, preds)
    prec = precision_score(y_val, preds, zero_division=0)

    threshold_results.append({
        'threshold': thr,
        'recall': rec,
        'f1': f1,
        'precision': prec
    })

    # Optimize: High recall but keep F1 reasonable
    if rec > best_recall and f1 > 0.40:
        best_recall = rec
        best_threshold = thr
        best_f1 = f1

# Show top options
results_df = pd.DataFrame(threshold_results).sort_values('recall', ascending=False)
print("\nTop 5 thresholds by recall:")
print(results_df.head(5).to_string(index=False))

print(f"\nSelected threshold: {best_threshold:.3f}")
optimized_preds = (val_probs >= best_threshold).astype(int)
print(f"  Recall:    {recall_score(y_val, optimized_preds):.4f}")
print(f"  F1:        {f1_score(y_val, optimized_preds):.4f}")
print(f"  Precision: {precision_score(y_val, optimized_preds):.4f}")
print(f"  Accuracy:  {accuracy_score(y_val, optimized_preds):.4f}")


Optimizing Threshold for Recall

Top 5 thresholds by recall:
 threshold   recall       f1  precision
      0.30 0.998175 0.512172   0.344458
      0.31 0.998175 0.512412   0.344675
      0.32 0.998175 0.512893   0.345110
      0.33 0.998175 0.513133   0.345328
      0.34 0.996350 0.513158   0.345570

Selected threshold: 0.300
  Recall:    0.9982
  F1:        0.5122
  Precision: 0.3445
  Accuracy:  0.3459


# Final Prediction

In [79]:

print("\n" + "="*50)
print("Final Prediction on Test")
print("="*50)

dtrain_full = xgb.DMatrix(X, label=y)
dtest = xgb.DMatrix(X_test_final)

final_model = xgb.train(params, dtrain_full, num_boost_round=model.best_iteration)
test_probs = final_model.predict(dtest)

print(f"Test prob distribution:")
print(f"  Min: {test_probs.min():.4f}")
print(f"  Max: {test_probs.max():.4f}")
print(f"  Mean: {test_probs.mean():.4f}")
print(f"  Std: {test_probs.std():.4f}")

prob_spread = test_probs.max() - test_probs.min()
print(f"  Spread: {prob_spread:.4f}")

# Decision logic
if prob_spread > 0.15:
    # Model has reasonable confidence
    test_preds = (test_probs >= best_threshold).astype(int)
    print(f"\nUsing optimized threshold: {best_threshold:.3f}")

    # Safety check - if predicting too many positives, adjust
    predicted_pos_rate = test_preds.mean()
    val_pos_rate = y_val.mean()

    print(f"Predicted positive rate: {predicted_pos_rate:.2%}")
    print(f"Validation positive rate: {val_pos_rate:.2%}")

    # If predicting >80% positive, use more conservative threshold
    if predicted_pos_rate > 0.8:
        print("Warning: Predicting too many positives, adjusting threshold")
        # Find threshold that gives reasonable positive rate
        sorted_probs = np.sort(test_probs)
        target_positives = int(len(test_probs) * min(val_pos_rate * 1.2, 0.6))
        adjusted_threshold = sorted_probs[-target_positives]
        test_preds = (test_probs >= adjusted_threshold).astype(int)
        print(f"Adjusted threshold: {adjusted_threshold:.3f}")
        print(f"New positive rate: {test_preds.mean():.2%}")

else:
    # Model still guessing - use safer approach
    print(f"\nWarning: Low spread ({prob_spread:.4f}), using distribution matching")
    val_pos_rate = y_val.mean()
    num_positive = int(len(test_probs) * val_pos_rate)
    test_preds = np.zeros(len(test_probs), dtype=int)
    top_indices = np.argsort(test_probs)[-num_positive:]
    test_preds[top_indices] = 1

# Save
submission = pd.DataFrame({
    ID_COL: test_df[ID_COL],
    TARGET_COL: test_preds
})

submission.to_csv(OUT_PRED_CSV, index=False)
print(f"\nSaved: {OUT_PRED_CSV}")
print(f"Predictions: Class 0={np.sum(test_preds==0)}, Class 1={np.sum(test_preds==1)}")
print(f"Positive rate: {test_preds.mean():.2%}")


Final Prediction on Test
Test prob distribution:
  Min: 0.3550
  Max: 0.6005
  Mean: 0.4951
  Std: 0.0264
  Spread: 0.2456

Using optimized threshold: 0.300
Predicted positive rate: 100.00%
Validation positive rate: 34.40%
Adjusted threshold: 0.500
New positive rate: 41.25%

Saved: EM06_Sindorai_Task1_Predictions.csv
Predictions: Class 0=470, Class 1=330
Positive rate: 41.25%
