# CVD Risk Classification — Clean, High-Accuracy Pipeline
**Auto-generated** on 2025-08-21 02:58:54.  
This notebook rebuilds your workflow into clear, ordered cells and aims for **>85% accuracy** using robust preprocessing and a strong tree-based model (HistGradientBoosting / RandomForest / optional XGBoost).  
It keeps class imbalance in mind and saves a ready-to-use pipeline.


## 0. Environment & Imports
If you're running locally, ensure you have `scikit-learn`, `pandas`, `numpy`, and optionally `xgboost` installed.


In [17]:
# !pip install -q numpy pandas scikit-learn xgboost

import os, re, math, json, warnings
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.inspection import permutation_importance

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Optional: XGBoost if available
try:
    from xgboost import XGBClassifier
    XGB_AVAILABLE = True
except Exception:
    XGB_AVAILABLE = False

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)


## 1. Load Data

In [18]:
import pandas as pd
url_string = 'https://github.com/Bishwaprotapi/Cardiovascular-Disease-Risk-Classification-Using-Machine-Learning-Techniques/blob/main/CVD_Dataset.csv'

In [19]:
# Reading Data Into Pandas Dataframe
df = pd.read_csv('CVD_Dataset.csv')
df

Unnamed: 0,Sex,Age,Weight (kg),Height (m),BMI,Abdominal Circumference (cm),Blood Pressure (mmHg),Total Cholesterol (mg/dL),HDL (mg/dL),Fasting Blood Sugar (mg/dL),...,Physical Activity Level,Family History of CVD,CVD Risk Level,Height (cm),Waist-to-Height Ratio,Systolic BP,Diastolic BP,Blood Pressure Category,Estimated LDL (mg/dL),CVD Risk Score
0,F,32.0,69.100,1.710,23.600,86.200,125/79,248.0,78.0,111.0,...,Low,N,INTERMEDIARY,171.000,0.504,125.0,79.0,Elevated,140.0,17.930
1,F,55.0,118.700,1.690,41.600,82.500,139/70,162.0,50.0,135.0,...,High,Y,HIGH,169.000,0.488,139.0,70.0,Hypertension Stage 1,82.0,20.510
2,M,,,1.830,26.900,106.700,104/77,103.0,73.0,114.0,...,High,Y,INTERMEDIARY,183.000,0.583,104.0,77.0,Normal,0.0,12.640
3,M,44.0,108.300,1.800,33.400,96.600,140/83,134.0,46.0,91.0,...,High,Y,INTERMEDIARY,,0.537,140.0,83.0,Hypertension Stage 1,58.0,16.360
4,F,32.0,99.500,1.860,28.800,102.700,144/83,146.0,64.0,141.0,...,High,N,INTERMEDIARY,186.000,0.552,144.0,83.0,Hypertension Stage 1,52.0,17.880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1524,F,40.0,72.070,1.889,,95.326,119/66,157.0,60.0,93.0,...,Moderate,Y,LOW,188.894,0.505,119.0,66.0,Normal,67.0,14.300
1525,F,78.0,85.877,1.825,24.426,112.340,102/115,241.0,84.0,,...,Low,N,INTERMEDIARY,182.485,0.616,102.0,115.0,Hypertension Stage 2,127.0,14.805
1526,M,39.0,98.626,1.521,20.055,77.193,150/90,237.0,82.0,147.0,...,High,N,INTERMEDIARY,152.119,0.507,150.0,90.0,Hypertension Stage 2,125.0,18.251
1527,M,71.0,116.163,1.841,29.279,114.197,112/63,193.0,84.0,123.0,...,High,Y,INTERMEDIARY,184.059,0.620,112.0,63.0,Normal,79.0,15.316


## 2. Inspect & Define Target

In [20]:
# In your original notebook the target was inferred as 'CVD Risk Level'
TARGET = 'CVD Risk Level'
assert TARGET in df.columns, f"Target column '{TARGET}' not found. Columns: {df.columns.tolist()}"

y = df[TARGET].astype(str).str.strip()
X = df.drop(columns=[TARGET])

print('Target distribution:\n', y.value_counts())
print('\nColumns by dtype:')
print(df.dtypes.value_counts())


Target distribution:
 CVD Risk Level
HIGH            728
INTERMEDIARY    581
LOW             220
Name: count, dtype: int64

Columns by dtype:
float64    14
object      8
Name: count, dtype: int64


## 3. Preprocessing
- Numeric: median imputation + standard scaling  
- Categorical: most-frequent imputation + one-hot encoding  
We also automatically handle sparse/dense output differences across scikit-learn versions.


In [21]:
from sklearn import __version__ as skver
sk_major, sk_minor = [int(x) for x in skver.split('.')[:2]]
ohe_kwargs = {'handle_unknown': 'ignore'}
if (sk_major, sk_minor) >= (1, 2):
    ohe_kwargs['sparse_output'] = False
else:
    ohe_kwargs['sparse'] = False

cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = [c for c in X.columns if c not in cat_cols]

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(**ohe_kwargs))
])

preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ]
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

len(num_cols), len(cat_cols), X_train.shape, X_test.shape

(14, 7, (1223, 21), (306, 21))

In [22]:
# Add this import
from sklearn.preprocessing import LabelEncoder

# Add this after defining y
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Use encoded labels for training
y = y_encoded

## 4. Quick Baselines
We'll try a few fast models to get a sense of achievable accuracy.


In [23]:
candidates = {
    'HistGradientBoosting': HistGradientBoostingClassifier(random_state=RANDOM_STATE),
    'RandomForest': RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE, class_weight='balanced_subsample'),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=400, random_state=RANDOM_STATE, class_weight='balanced'),
    'LogisticRegression': LogisticRegression(max_iter=2000, class_weight='balanced'),
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

rows = []
for name, clf in candidates.items():
    pipe = Pipeline(steps=[('prep', preprocess), ('clf', clf)])
    scores = cross_val_score(pipe, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
    rows.append({'model': name, 'cv_mean_acc': scores.mean(), 'cv_std': scores.std(), 'cv_scores': scores})
    
if XGB_AVAILABLE:
    xgb = XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        max_depth=6,
        n_estimators=500,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=RANDOM_STATE
    )
    pipe = Pipeline(steps=[('prep', preprocess), ('clf', xgb)])
    scores = cross_val_score(pipe, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
    rows.append({'model': 'XGBoost', 'cv_mean_acc': scores.mean(), 'cv_std': scores.std(), 'cv_scores': scores})

baseline_df = pd.DataFrame(rows).sort_values('cv_mean_acc', ascending=False).reset_index(drop=True)
baseline_df

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "d:\ML_ASIF_OWN PROJCET\CardioVascular_Disease_Risk_Prediction\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\ML_ASIF_OWN PROJCET\CardioVascular_Disease_Risk_Prediction\.venv\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\ML_ASIF_OWN PROJCET\CardioVascular_Disease_Risk_Prediction\.venv\Lib\site-packages\sklearn\pipeline.py", line 663, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "d:\ML_ASIF_OWN PROJCET\CardioVascular_Disease_Risk_Prediction\.venv\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "d:\ML_ASIF_OWN PROJCET\CardioVascular_Disease_Risk_Prediction\.venv\Lib\site-packages\xgboost\sklearn.py", line 1641, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got ['HIGH' 'INTERMEDIARY' 'LOW']


## 5. Hyperparameter Tuning (Top Model)
We tune the best-performing model from above. For speed, we use `RandomizedSearchCV` with a compact search space.


In [None]:
# Choose top model automatically from baseline results
top_name = baseline_df.iloc[0]['model']
print('Top baseline model:', top_name)

if top_name == 'HistGradientBoosting':
    base = HistGradientBoostingClassifier(random_state=RANDOM_STATE)
    param_distributions = {
        'clf__max_depth': [None, 3, 5, 7],
        'clf__learning_rate': [0.05, 0.1, 0.2],
        'clf__max_leaf_nodes': [15, 31, 63],
        'clf__min_samples_leaf': [10, 20, 30],
    }
elif top_name == 'RandomForest':
    base = RandomForestClassifier(random_state=RANDOM_STATE, class_weight='balanced_subsample')
    param_distributions = {
        'clf__n_estimators': [200, 300, 400, 600],
        'clf__max_depth': [None, 8, 12, 16],
        'clf__min_samples_split': [2, 5, 10],
        'clf__min_samples_leaf': [1, 2, 4],
        'clf__max_features': ['auto', 'sqrt', 0.5],
    }
elif top_name == 'ExtraTrees':
    base = ExtraTreesClassifier(random_state=RANDOM_STATE, class_weight='balanced')
    param_distributions = {
        'clf__n_estimators': [300, 400, 600, 800],
        'clf__max_depth': [None, 8, 12, 16],
        'clf__min_samples_split': [2, 5, 10],
        'clf__min_samples_leaf': [1, 2, 4],
        'clf__max_features': ['auto', 'sqrt', 0.5],
    }
elif top_name == 'LogisticRegression':
    base = LogisticRegression(max_iter=4000, class_weight='balanced')
    param_distributions = {
        'clf__C': np.logspace(-2, 2, 10),
        'clf__penalty': ['l2'],
        'clf__solver': ['lbfgs', 'liblinear', 'saga'],
    }
elif top_name == 'XGBoost' and XGB_AVAILABLE:
    base = XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        random_state=RANDOM_STATE,
        tree_method='hist'
    )
    param_distributions = {
        'clf__max_depth': [4, 6, 8],
        'clf__n_estimators': [300, 500, 800],
        'clf__learning_rate': [0.03, 0.05, 0.1],
        'clf__subsample': [0.8, 1.0],
        'clf__colsample_bytree': [0.7, 0.9, 1.0],
    }
else:
    # Fallback to HistGradientBoosting if XGBoost wasn't available
    top_name = 'HistGradientBoosting'
    base = HistGradientBoostingClassifier(random_state=RANDOM_STATE)
    param_distributions = {
        'clf__max_depth': [None, 3, 5, 7],
        'clf__learning_rate': [0.05, 0.1, 0.2],
        'clf__max_leaf_nodes': [15, 31, 63],
        'clf__min_samples_leaf': [10, 20, 30],
    }

pipe = Pipeline(steps=[('prep', preprocess), ('clf', base)])

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_distributions,
    n_iter=30,
    scoring='accuracy',
    n_jobs=-1,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
    verbose=1,
    random_state=RANDOM_STATE
)
search.fit(X_train, y_train)

print('Best CV accuracy:', search.best_score_)
print('Best params:', search.best_params_)

best_model = search.best_estimator_


## 6. Final Evaluation on Holdout Test Set

In [None]:
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {acc:.4f}\n')
print(classification_report(y_test, y_pred))

# Confusion matrix
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(5,4))
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.title('Confusion Matrix')
plt.show()

## 7. Feature Importance (Permutation)

In [None]:
r = permutation_importance(best_model, X_test, y_test, n_repeats=10, random_state=RANDOM_STATE, n_jobs=-1)
importances = pd.DataFrame({
    'feature': best_model.named_steps['prep'].get_feature_names_out(),
    'importance_mean': r.importances_mean,
    'importance_std': r.importances_std
}).sort_values('importance_mean', ascending=False)
importances.head(20)

## 8. Save Trained Pipeline

In [None]:
import joblib, os
out_path = '/mnt/data/cvd_best_model.joblib'
joblib.dump(best_model, out_path)
print('Saved:', out_path)

## 9. Inference Helper
Use this snippet to load the model and predict on new data.


In [None]:
import joblib, pandas as pd
pipe = joblib.load('/mnt/data/cvd_best_model.joblib')

# Example: take first 3 rows from the original dataset (without target)
sample = X_test.iloc[:3].copy()
print('Predictions:', pipe.predict(sample).tolist())