In [13]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [16]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [17]:
DATA_PATH = "/content/drive/MyDrive/diabites/cardiac_lab.xlsx"

df = pd.read_excel(DATA_PATH)
df.head()


Unnamed: 0,Patient_ID,Age,Gender,BMI,BP_Systolic,BP_Diastolic,Heart_Rate,Troponin_T,CK_MB,BNP,...,Sodium,Potassium,Calcium,ALT,AST,CRP,ESR,Patient_Category,Cardiac_Risk_Level,Recommended_Tests
0,10000,43,Female,31.3,134,82,59,0.006,5.3,35,...,145,4.3,9.8,17,51,2.5,13,At_Risk,0.213,"Serial Cardiac Markers, ECG, Stress Test"
1,10001,32,Male,33.9,108,57,126,0.164,19.1,637,...,129,3.4,10.7,62,82,29.0,104,,1.0,"Serial Troponin/CK-MB, ECG/ECHO, Cardiac Cathe..."
2,10002,34,Male,27.6,127,61,75,0.056,10.0,243,...,139,4.1,9.7,20,58,2.5,16,,1.0,"Serial Troponin/CK-MB, ECG/ECHO, Cardiac Cathe..."
3,10003,37,Female,25.3,122,86,109,0.041,9.5,213,...,140,5.3,9.3,74,37,15.1,35,,1.0,"Serial Troponin/CK-MB, ECG/ECHO, Cardiac Cathe..."
4,10004,38,Male,25.7,105,81,77,0.014,3.5,64,...,144,3.9,9.8,42,30,2.8,8,Normal,0.0,Routine Annual Check-up


In [18]:
TARGET = "Recommended_Tests"   # change only if column name differs

X = df.drop(columns=[TARGET])
y_raw = df[TARGET].astype(str)


In [19]:
is_multilabel = y_raw.str.contains(",|;|\|").any()
print("Multi-label target:", is_multilabel)


Multi-label target: True


  is_multilabel = y_raw.str.contains(",|;|\|").any()


In [20]:
numeric_cols = X.select_dtypes(include=["number"]).columns
categorical_cols = X.select_dtypes(exclude=["number"]).columns

numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean"))
])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numeric_cols),
    ("cat", categorical_pipeline, categorical_cols)
])


In [21]:
if is_multilabel:
    y_list = y_raw.str.replace("|", ",", regex=False)\
                  .str.replace(";", ",", regex=False)\
                  .str.split(",")

    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(y_list)
else:
    le = LabelEncoder()
    y = le.fit_transform(y_raw)


In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [23]:
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

if is_multilabel:
    model = OneVsRestClassifier(rf)
else:
    model = rf

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", model)
])

pipeline.fit(X_train, y_train)


In [24]:
y_pred = pipeline.predict(X_test)

if is_multilabel:
    exact_match = (y_pred == y_test).all(axis=1).mean()
    print("Exact Match Accuracy:", exact_match)
else:
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))


Exact Match Accuracy: 0.99825


In [25]:
joblib.dump(pipeline, "recommendation_model.joblib")

helpers = {"is_multilabel": is_multilabel}

if is_multilabel:
    helpers["classes"] = mlb.classes_.tolist()
else:
    helpers["classes"] = le.classes_.tolist()

joblib.dump(helpers, "model_helpers.joblib")

print("Model saved successfully")


Model saved successfully
