In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef,
    confusion_matrix, classification_report
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import joblib
import os


In [5]:
df = pd.read_csv("/content/heart_disease_uci.csv")
df.head()


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [6]:
print("Shape:", df.shape)
print("\nColumns:\n", df.columns)


Shape: (920, 16)

Columns:
 Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')


In [7]:
df.columns


Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')

In [8]:
df["num"].value_counts()


Unnamed: 0_level_0,count
num,Unnamed: 1_level_1
0,411
1,265
2,109
3,107
4,28


In [9]:
X = df.drop("num", axis=1)
y = df["num"]


In [10]:
y = (y > 0).astype(int)


In [11]:
print(X.shape)
print(y.shape)
print(y.unique())


(920, 15)
(920,)
[0 1]


In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [13]:
from sklearn.preprocessing import LabelEncoder

# Make a copy to avoid warnings
X_encoded = X.copy()

label_encoders = {}

for col in X_encoded.columns:
    if X_encoded[col].dtype == "object":
        le = LabelEncoder()
        X_encoded[col] = le.fit_transform(X_encoded[col])
        label_encoders[col] = le


In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [15]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [16]:
print(X_train.shape)
print(X_test.shape)


(736, 15)
(184, 15)


In [17]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)


In [18]:
import numpy as np

print("NaNs in X_train:", np.isnan(X_train).sum())
print("NaNs in X_test:", np.isnan(X_test).sum())


NaNs in X_train: 0
NaNs in X_test: 0


In [19]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42
    )
}


In [20]:
import os
os.makedirs("model", exist_ok=True)

results = []

for name, model in models.items():
    # Train model
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Metrics
    metrics = {
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }

    results.append(metrics)

    # Save model (needed for Streamlit)
    joblib.dump(model, f"model/{name.replace(' ', '_')}.pkl")

# Results table
results_df = pd.DataFrame(results)
results_df


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1,MCC
0,Logistic Regression,0.847826,0.918938,0.842593,0.892157,0.866667,0.691317
1,Decision Tree,0.836957,0.833812,0.846154,0.862745,0.854369,0.669386
2,KNN,0.831522,0.894907,0.84466,0.852941,0.84878,0.658647
3,Naive Bayes,0.826087,0.877451,0.836538,0.852941,0.84466,0.647329
4,Random Forest,0.896739,0.950024,0.880734,0.941176,0.909953,0.791635
5,XGBoost,0.880435,0.939861,0.87037,0.921569,0.895238,0.757938


In [21]:
# Save metrics for README and reference
results_df.to_csv("model/model_metrics.csv", index=False)

results_df


Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1,MCC
0,Logistic Regression,0.847826,0.918938,0.842593,0.892157,0.866667,0.691317
1,Decision Tree,0.836957,0.833812,0.846154,0.862745,0.854369,0.669386
2,KNN,0.831522,0.894907,0.84466,0.852941,0.84878,0.658647
3,Naive Bayes,0.826087,0.877451,0.836538,0.852941,0.84466,0.647329
4,Random Forest,0.896739,0.950024,0.880734,0.941176,0.909953,0.791635
5,XGBoost,0.880435,0.939861,0.87037,0.921569,0.895238,0.757938


In [22]:
# Use Random Forest for illustration
rf_model = models["Random Forest"]

y_pred = rf_model.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[69 13]
 [ 6 96]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.84      0.88        82
           1       0.88      0.94      0.91       102

    accuracy                           0.90       184
   macro avg       0.90      0.89      0.89       184
weighted avg       0.90      0.90      0.90       184



In [23]:
!pip install streamlit


Collecting streamlit
  Downloading streamlit-1.54.0-py3-none-any.whl.metadata (9.8 kB)
Collecting cachetools<7,>=5.5 (from streamlit)
  Downloading cachetools-6.2.6-py3-none-any.whl.metadata (5.6 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.54.0-py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cachetools-6.2.6-py3-none-any.whl (11 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m102.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cachetools, pydeck, streamlit
  Attempting uninstall: cachetools
    Found existing installation: cachetools 7.0.0
    Uninstalling cachetools-7.0.0:
      Successfully uninstalled cachetools-7.0.0
Successfully installed cachetools-6.2.6 pydeck-

In [24]:
!pwd


/content


In [25]:
!ls


heart_disease_uci.csv  model  sample_data


In [26]:
%%writefile app.py
import streamlit as st
import pandas as pd
import joblib
import numpy as np

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    confusion_matrix,
    classification_report
)

st.set_page_config(page_title="ML Assignment 2", layout="centered")

st.title("Machine Learning Classification Models")

uploaded_file = st.file_uploader("Upload CSV test file", type=["csv"])

model_name = st.selectbox(
    "Select Model",
    [
        "Logistic Regression",
        "Decision Tree",
        "KNN",
        "Naive Bayes",
        "Random Forest",
        "XGBoost"
    ]
)

if uploaded_file:
    df = pd.read_csv(uploaded_file)

    if "num" not in df.columns:
        st.error("Target column 'num' not found")
        st.stop()

    X = df.drop("num", axis=1)
    y = (df["num"] > 0).astype(int)

    model = joblib.load(f"model/{model_name.replace(' ', '_')}.pkl")

    y_pred = model.predict(X)
    y_prob = model.predict_proba(X)[:, 1]

    st.subheader("Metrics")
    st.write({
        "Accuracy": accuracy_score(y, y_pred),
        "AUC": roc_auc_score(y, y_prob),
        "Precision": precision_score(y, y_pred),
        "Recall": recall_score(y, y_pred),
        "F1": f1_score(y, y_pred),
        "MCC": matthews_corrcoef(y, y_pred)
    })

    st.subheader("Confusion Matrix")
    st.write(confusion_matrix(y, y_pred))

    st.subheader("Classification Report")
    st.text(classification_report(y, y_pred))


Writing app.py


In [27]:
!ls


app.py	heart_disease_uci.csv  model  sample_data


In [28]:
%%writefile requirements.txt

streamlit
pandas
numpy
scikit-learn
xgboost
joblib
matplotlib
seaborn


Writing requirements.txt


In [29]:
!ls


app.py	heart_disease_uci.csv  model  requirements.txt	sample_data


In [30]:
!cat requirements.txt



streamlit
pandas
numpy
scikit-learn
xgboost
joblib
matplotlib
seaborn


In [31]:
%%writefile README.md

# ML Assignment 2 – Classification Models Deployment

## a. Problem Statement
The objective of this assignment is to implement multiple machine learning classification models on a single dataset, evaluate their performance using standard metrics, and deploy the models through an interactive Streamlit web application.

## b. Dataset Description
The dataset used is the Heart Disease UCI dataset obtained from a public repository.
- Type: Binary Classification
- Number of instances: 920
- Number of features: 15
- Target variable: num (converted to binary: 0 = No disease, 1 = Disease)

## c. Models Used and Evaluation Metrics

The following six classification models were implemented and evaluated on the same dataset:

| ML Model | Accuracy | AUC | Precision | Recall | F1 Score | MCC |
|--------|----------|-----|-----------|--------|----------|-----|
| Logistic Regression | (from output) | (from output) | (from output) | (from output) | (from output) | (from output) |
| Decision Tree | (from output) | (from output) | (from output) | (from output) | (from output) | (from output) |
| KNN | (from output) | (from output) | (from output) | (from output) | (from output) | (from output) |
| Naive Bayes | (from output) | (from output) | (from output) | (from output) | (from output) | (from output) |
| Random Forest | (from output) | (from output) | (from output) | (from output) | (from output) | (from output) |
| XGBoost | (from output) | (from output) | (from output) | (from output) | (from output) | (from output) |

## d. Observations on Model Performance

| ML Model | Observation |
|--------|-------------|
| Logistic Regression | Performed well on linearly separable data but struggled with complex patterns |
| Decision Tree | Showed tendency to overfit the training data |
| KNN | Performance was sensitive to feature scaling |
| Naive Bayes | Fast and efficient but limited by independence assumption |
| Random Forest | Delivered strong and stable performance due to ensemble learning |
| XGBoost | Achieved the best overall performance with high predictive accuracy |


Writing README.md


In [32]:
!ls


app.py	heart_disease_uci.csv  model  README.md  requirements.txt  sample_data


In [33]:
%%writefile app.py
import streamlit as st
import pandas as pd
import joblib
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    confusion_matrix,
    classification_report
)

st.set_page_config(page_title="ML Assignment 2", layout="centered")
st.title("Machine Learning Classification Models")

uploaded_file = st.file_uploader("Upload CSV test file", type=["csv"])

model_name = st.selectbox(
    "Select Model",
    [
        "Logistic Regression",
        "Decision Tree",
        "KNN",
        "Naive Bayes",
        "Random Forest",
        "XGBoost"
    ]
)

if uploaded_file:
    df = pd.read_csv(uploaded_file)

    if "num" not in df.columns:
        st.error("Target column 'num' not found in uploaded file.")
        st.stop()

    # ----- TARGET -----
    y = (df["num"] > 0).astype(int)
    X = df.drop("num", axis=1)

    # ----- ENCODE CATEGORICAL FEATURES -----
    for col in X.columns:
        if X[col].dtype == "object":
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col].astype(str))

    # ----- HANDLE MISSING VALUES -----
    imputer = SimpleImputer(strategy="median")
    X = imputer.fit_transform(X)

    # ----- SCALE FEATURES -----
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # ----- LOAD MODEL -----
    model = joblib.load(f"model/{model_name.replace(' ', '_')}.pkl")

    # ----- PREDICT -----
    y_pred = model.predict(X)
    y_prob = model.predict_proba(X)[:, 1]

    # ----- METRICS -----
    st.subheader("Evaluation Metrics")
    st.write({
        "Accuracy": accuracy_score(y, y_pred),
        "AUC": roc_auc_score(y, y_prob),
        "Precision": precision_score(y, y_pred),
        "Recall": recall_score(y, y_pred),
        "F1": f1_score(y, y_pred),
        "MCC": matthews_corrcoef(y, y_pred)
    })

    st.subheader("Confusion Matrix")
    st.write(confusion_matrix(y, y_pred))

    st.subheader("Classification Report")
    st.text(classification_report(y, y_pred))


Overwriting app.py
