In [29]:
import pandas as pd

In [30]:

# Load raw data
# cardio_train.csv separates its fields with semicolons rather than commas.
df = pd.read_csv("cardio_train.csv", delimiter=';')


In [31]:
# --- 1. Basic sanity checks ---
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
# Summary statistics (count/mean/std/quantiles) for every numeric column.
print(df.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB
None
                 id           age        gender        height        weight  \
count  70000.000000  70000.000000  70000.000000  70000.000000  70000.000000   
mean   49972.419900  19468.865814      1.349571

In [32]:
# --- 2. Convert and engineer features ---
# NOTE: this cell is not idempotent -- running it twice without reloading the
# raw CSV divides `age` by 365 a second time.
df = df.assign(
    # age: days -> whole years
    age=df['age'].div(365).round().astype(int),
    # BMI = weight / (height / 100)^2, rounded to two decimals
    BMI=df['weight'].div(df['height'].div(100).pow(2)).round(2),
    # pulse pressure = ap_hi - ap_lo
    pulse_pressure=df['ap_hi'].sub(df['ap_lo']),
)

In [33]:
# --- 3. Remove impossible / extreme values ---
# Build one boolean mask from all range checks (every bound is inclusive) and
# keep only the rows that satisfy all of them.
in_range = (
    df['height'].between(120, 210)
    & df['weight'].between(30, 180)
    & df['ap_hi'].between(80, 240)
    & df['ap_lo'].between(40, 200)
    & df['pulse_pressure'].ge(0)      # ap_hi must not be below ap_lo
)
df = df[in_range]

In [34]:
# --- 4. Handle missing / duplicates ---
# Drop exact duplicate rows first, then any row that still contains a NaN.
# NOTE(review): `id` is still a column here, so two rows only count as
# duplicates when their ids match as well -- confirm that is intended.
df = df.drop_duplicates().dropna()

In [35]:
# --- 5. Save cleaned data ---
# Write the filtered frame to cardio_clean.csv; index=False keeps the pandas
# row index out of the file.
df.to_csv("cardio_clean.csv", index=False)

In [36]:
# Confirm the export and show the (rows, columns) shape of the cleaned frame.
print("‚úÖ Cleaned dataset saved as cardio_clean.csv", df.shape, sep="\n")

‚úÖ Cleaned dataset saved as cardio_clean.csv
(68610, 15)


In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

In [38]:
# Load cleaned data
# Read cardio_clean.csv from disk (default comma separator) into `df`.
df = pd.read_csv("cardio_clean.csv")

In [39]:
# Target is the `cardio` column; predictors are every other column except `id`.
y = df['cardio']
X = df.drop(columns=['cardio', 'id'])

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise features: statistics are learned on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline classifier: logistic regression, allowing up to 500 solver iterations.
model = LogisticRegression(max_iter=500)
model.fit(X_train_scaled, y_train)

In [40]:
# Evaluate
# Predict on the held-out split, then print accuracy (3 decimals), the
# confusion matrix and the per-class precision/recall/F1 report.
y_pred = model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
print("‚úÖ Accuracy:", round(test_accuracy, 3))
for summary in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(summary)

‚úÖ Accuracy: 0.73
[[5469 1464]
 [2237 4552]]
              precision    recall  f1-score   support

           0       0.71      0.79      0.75      6933
           1       0.76      0.67      0.71      6789

    accuracy                           0.73     13722
   macro avg       0.73      0.73      0.73     13722
weighted avg       0.73      0.73      0.73     13722



In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [42]:
# --- 1. Load cleaned dataset ---
# Re-read cardio_clean.csv so this section starts from the file on disk and
# replaces whatever `df` currently holds.
df = pd.read_csv("cardio_clean.csv")

In [43]:
# --- 2. Feature engineering ---
# NOTE(review): `age_years` is a plain copy of `age`, so both columns end up in
# the feature matrix with identical values -- confirm that is intended.
df = df.assign(
    age_years=df['age'],
    # BMI = weight / (height / 100)^2, rounded to two decimals
    BMI=df['weight'].div(df['height'].div(100).pow(2)).round(2),
    pulse_pressure=df['ap_hi'].sub(df['ap_lo']),
    bp_ratio=df['ap_hi'].div(df['ap_lo']).round(2),
    # 1 when the BMI computed above exceeds 30, else 0
    obesity_flag=lambda frame: frame['BMI'].gt(30).astype(int),
)

In [44]:
# --- 3. Define features/target ---
# Target is `cardio`; predictors are every other column except `id`.
y = df['cardio']
X = df.drop(columns=['cardio', 'id'])

# --- 4. Split dataset ---
# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- 5. Scale features ---
# Scaling statistics come from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- 6. Base Random Forest model ---
rf_params = {
    'n_estimators': 300,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 3,
    'class_weight': 'balanced',
    'random_state': 42,
    'n_jobs': -1,          # use all available cores
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train_scaled, y_train)

In [45]:
# --- 7. Evaluate performance ---
# Predict on the held-out split, then print accuracy (3 decimals), the
# confusion matrix and the per-class report.
y_pred = rf.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
print("‚úÖ Accuracy:", round(test_accuracy, 3))
for summary in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(summary)

‚úÖ Accuracy: 0.74
[[5402 1531]
 [2036 4753]]
              precision    recall  f1-score   support

           0       0.73      0.78      0.75      6933
           1       0.76      0.70      0.73      6789

    accuracy                           0.74     13722
   macro avg       0.74      0.74      0.74     13722
weighted avg       0.74      0.74      0.74     13722



In [46]:
from sklearn.base import clone
from sklearn.preprocessing import PolynomialFeatures

# Expand the scaled features with degree-2 interaction terms only
# (interaction_only=True, no bias column).
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly  = poly.transform(X_test_scaled)

# Fit an unfitted copy of the baseline forest (same hyper-parameters and seed)
# on the expanded features. Refitting `rf` itself would leave it expecting the
# polynomial feature count, so re-running the baseline evaluation cell on
# X_test_scaled would then fail.
rf_poly = clone(rf)
rf_poly.fit(X_train_poly, y_train)
y_pred = rf_poly.predict(X_test_poly)
print("New Accuracy:", round(accuracy_score(y_test, y_pred), 3))


New Accuracy: 0.738


In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from scipy.stats import randint
import joblib





In [48]:
# --- 1. Load cleaned dataset ---
# Re-read cardio_clean.csv so this section starts from the file on disk and
# replaces whatever `df` currently holds.
df = pd.read_csv("cardio_clean.csv")

In [49]:
# --- 2. Feature Engineering ---
# NOTE(review): `age_years` is a plain copy of `age`, so both columns end up in
# the feature matrix with identical values -- confirm that is intended.
df = df.assign(
    age_years=df['age'],
    # BMI = weight / (height / 100)^2, rounded to two decimals
    BMI=df['weight'].div(df['height'].div(100).pow(2)).round(2),
    pulse_pressure=df['ap_hi'].sub(df['ap_lo']),
    bp_ratio=df['ap_hi'].div(df['ap_lo']).round(2),
    # 1 when the BMI computed above exceeds 30, else 0
    obesity_flag=lambda frame: frame['BMI'].gt(30).astype(int),
)

# --- 3. Define features/target ---
y = df['cardio']
X = df.drop(columns=['cardio', 'id'])

# --- 4. Split dataset ---
# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- 5. Scale features ---
# Scaling statistics come from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- 6. Optional Polynomial Features ---
# Off by default. When switched on, the scaled matrices are replaced in place
# by their degree-2 interaction expansion.
use_polynomial_features = False
if use_polynomial_features:
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    poly.fit(X_train_scaled)
    X_train_scaled = poly.transform(X_train_scaled)
    X_test_scaled = poly.transform(X_test_scaled)
    print("‚úÖ Polynomial features applied")



In [50]:
# --- 7. Quick randomized search for Random Forest hyper-parameters ---
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# scipy.stats.randint(low, high) samples from [low, high): the upper bound is
# EXCLUSIVE. `randint(1, 2)` therefore always returned 1 and min_samples_leaf
# was never actually searched; it now covers {1, 2}.
param_dist = {
    'n_estimators': randint(150, 250),     # 150..249
    'max_depth': randint(6, 10),           # 6..9
    'min_samples_split': randint(2, 4),    # 2..3
    'min_samples_leaf': randint(1, 3),     # 1..2
}

random_search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=10,   # only 10 random trials
    cv=2,        # only 2-fold CV
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)

# Tune on a random subsample to keep the search fast. The draw uses a seeded
# generator so the same rows (and therefore the same best params) come back on
# every run, and the size is capped so a training set under 5000 rows does not
# raise.
rng = np.random.default_rng(42)
n_tune = min(5000, len(X_train_scaled))
sample_idx = rng.choice(len(X_train_scaled), n_tune, replace=False)
random_search.fit(X_train_scaled[sample_idx], y_train.iloc[sample_idx])

rf_best = random_search.best_estimator_
print(f"‚úÖ Best RF Params (quick): {random_search.best_params_}")

# Retrain on full data with these params
rf_best.fit(X_train_scaled, y_train)


‚úÖ Best RF Params (quick): {'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 232}


In [51]:
# --- 8. XGBoost Model ---
# Gradient-boosted trees trained on the same scaled matrix as the forest.
xgb_params = {
    'n_estimators': 400,
    'max_depth': 8,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'eval_metric': 'logloss',
}
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train_scaled, y_train)



In [52]:
# --- 9. Evaluate models ---
# Hard predictions and accuracy on the held-out split, one model at a time.
rf_pred = rf_best.predict(X_test_scaled)
rf_acc = accuracy_score(y_test, rf_pred)

xgb_pred = xgb.predict(X_test_scaled)
xgb_acc = accuracy_score(y_test, xgb_pred)

print("\nüéØ Model Performances:")
print(f"Random Forest Accuracy: {rf_acc:.3f}")
print(f"XGBoost Accuracy:       {xgb_acc:.3f}")




üéØ Model Performances:
Random Forest Accuracy: 0.735
XGBoost Accuracy:       0.736


In [53]:
# --- 10. Ensemble (average probabilities) ---
# Positive-class probability (column 1 of predict_proba) from each model.
rf_probs, xgb_probs = (
    clf.predict_proba(X_test_scaled)[:, 1] for clf in (rf_best, xgb)
)
# Unweighted mean of the two probabilities, cut at a strict > 0.5 threshold.
ensemble_probs = (rf_probs + xgb_probs) / 2
ensemble_pred = (ensemble_probs > 0.5).astype(int)
ensemble_acc = accuracy_score(y_test, ensemble_pred)

print(f"Ensemble Accuracy:      {ensemble_acc:.3f}")

Ensemble Accuracy:      0.739


In [54]:
# --- 11. Pick Best Model ---
# Selection order: the ensemble wins ties against the single models; otherwise
# XGBoost must strictly beat the forest, and the forest is the fallback.
# `best_model` is a (tag, payload) pair; for the ensemble the payload is the
# (rf_best, xgb) tuple.
if ensemble_acc >= max(rf_acc, xgb_acc):
    best_model, best_acc, best_pred = ("ensemble", (rf_best, xgb)), ensemble_acc, ensemble_pred
    print("‚úÖ Best model: Ensemble")
elif xgb_acc > rf_acc:
    best_model, best_acc, best_pred = ("xgb", xgb), xgb_acc, xgb_pred
    print("‚úÖ Best model: XGBoost")
else:
    best_model, best_acc, best_pred = ("rf", rf_best), rf_acc, rf_pred
    print("‚úÖ Best model: Random Forest")

print(f"\nüî• Final Accuracy: {best_acc:.3f}")
print("\nüìã Classification Report (best model):")
# Report for whichever candidate's predictions were selected above.
print(classification_report(y_test, best_pred))

‚úÖ Best model: Ensemble

üî• Final Accuracy: 0.739

üìã Classification Report (best model):
              precision    recall  f1-score   support

           0       0.72      0.78      0.75      6933
           1       0.76      0.69      0.72      6789

    accuracy                           0.74     13722
   macro avg       0.74      0.74      0.74     13722
weighted avg       0.74      0.74      0.74     13722



In [55]:
# --- 12. Save Models + Scaler ---
# `best_model` is a (tag, payload) pair. For the ensemble the payload is the
# (random forest, xgboost) tuple and each member goes to its own file;
# otherwise the single winning model is saved under its tag.
if best_model[0] == "ensemble":
    joblib.dump(best_model[1][0], "cardio_rf_model.pkl")
    joblib.dump(best_model[1][1], "cardio_xgb_model.pkl")
else:
    joblib.dump(best_model[1], f"cardio_{best_model[0]}_model.pkl")

# The scaler is needed at inference time to reproduce the training-time scaling.
joblib.dump(scaler, "scaler.pkl")

# If the optional interaction expansion was enabled, the models were fitted on
# its output, so the fitted transformer must be saved too; without it the
# saved models cannot be fed correctly shaped input.
if use_polynomial_features:
    joblib.dump(poly, "poly_features.pkl")

print("üíæ Saved model(s) and scaler successfully!")

üíæ Saved model(s) and scaler successfully!
