<a href="https://colab.research.google.com/github/Rajeshwari-2025aa05647/ML_Assignment_2/blob/main/ML_Assignment2_2025aa05647.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef,
    confusion_matrix, classification_report
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import joblib
import os


In [None]:
# Load the heart-disease CSV from the working directory. A relative path keeps
# the notebook runnable outside Colab's /content mount (the file sits next to
# the notebook, as the later `!ls` output shows).
df = pd.read_csv("heart_disease_uci.csv")
df.head()


Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [None]:
# Report the dataset dimensions and the column labels.
print("Shape:", df.shape)
print("\nColumns:\n", df.columns)


Shape: (920, 16)

Columns:
 Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')


In [None]:
# Display the column index as the cell's rich output.
df.columns


Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')

In [None]:
# Frequency of each raw value of the target column `num` (before binarisation).
df["num"].value_counts()


Unnamed: 0_level_0,count
num,Unnamed: 1_level_1
0,411
1,265
2,109
3,107
4,28


In [None]:
# Features: every column except the target `num`.
# NOTE(review): `id` and `dataset` remain in X — confirm that a row identifier
# and the dataset-origin column are meant to be model inputs.
X = df.drop(columns=["num"])
# Target: binarise `num` — 0 stays 0, any value above 0 becomes 1.
y = df["num"].gt(0).astype(int)

In [None]:
# Sanity check: feature matrix shape, target shape, and the distinct target
# values after binarisation.
print(X.shape)
print(y.shape)
print(y.unique())


(920, 15)
(920,)
[0 1]


In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the raw feature frame X stays untouched.
X_encoded = X.copy()
label_encoders = {}

# Only object-dtype columns are integer-encoded; numeric columns pass through.
# NOTE(review): missing values in these columns appear to receive their own
# integer code (e.g. exang == 2 in the saved test data), so the later median
# imputer would not treat them as missing — confirm this is intended.
object_columns = [
    column for column in X_encoded.columns
    if X_encoded[column].dtype == "object"
]

for col in object_columns:
    # Keep each fitted encoder, keyed by column name.
    le = label_encoders[col] = LabelEncoder()
    X_encoded[col] = le.fit_transform(X_encoded[col])


In [None]:
# Stratified 80/20 hold-out split with a fixed seed so the partition is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
import os

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import joblib

# Preprocessing: fill missing values with the column median, then standardise.
preprocess_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]
)

# Fit on the training split only; the test split is transformed later with
# these fitted statistics.
preprocess_pipeline.fit(X_train)

# The output directory must exist before joblib can write into it. Without
# this, a fresh "Restart & Run All" fails here because `model/` is otherwise
# only created in a later cell.
os.makedirs("model", exist_ok=True)
joblib.dump(preprocess_pipeline, "model/preprocess_pipeline.pkl")

print("Correct preprocessing pipeline saved")


Correct preprocessing pipeline saved


In [None]:
from sklearn.model_selection import train_test_split

# Repeats the earlier split with identical arguments (same inputs, test_size,
# random_state and stratify).
# NOTE(review): with the same seed this should reproduce the same partition,
# which makes this cell redundant — confirm before removing.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [None]:
# Confirm the sizes of the train and test feature matrices.
print(X_train.shape)
print(X_test.shape)


(736, 15)
(184, 15)


In [None]:
# List the contents of the `model` directory.
!ls model

Decision_Tree.pkl	 model_metrics.csv	  scaler.pkl
imputer.pkl		 Naive_Bayes.pkl	  XGBoost.pkl
KNN.pkl			 preprocess_pipeline.pkl
Logistic_Regression.pkl  Random_Forest.pkl


In [None]:
# Per-column count of missing values in each split. Every column is numeric at
# this point because the object columns were label-encoded earlier.
print("NaNs in X_train:", X_train.isna().sum())
print("NaNs in X_test:", X_test.isna().sum())


In [None]:
# Candidate classifiers, keyed by display name. The name is also used to build
# the saved model's file name. Seeds are fixed where the estimator accepts one.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` was dropped: the installed XGBoost ignores it and
    # warns 'Parameters: { "use_label_encoder" } are not used.' on fit.
    "XGBoost": XGBClassifier(
        eval_metric="logloss",
        random_state=42
    )
}


In [None]:
import os

# Ensure the artifact directory exists before any model is written to it.
os.makedirs("model", exist_ok=True)

# Run the already-fitted preprocessing pipeline over both splits.
X_train_proc = preprocess_pipeline.transform(X_train)
X_test_proc = preprocess_pipeline.transform(X_test)

results = []

for name, model in models.items():
    # Fit on the processed training data.
    model.fit(X_train_proc, y_train)

    # Hard labels, plus the positive-class probability (column 1) for AUC.
    y_pred = model.predict(X_test_proc)
    y_prob = model.predict_proba(X_test_proc)[:, 1]

    # One row of the comparison table per model.
    results.append(
        {
            "Model": name,
            "Accuracy": accuracy_score(y_test, y_pred),
            "AUC": roc_auc_score(y_test, y_prob),
            "Precision": precision_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "F1": f1_score(y_test, y_pred),
            "MCC": matthews_corrcoef(y_test, y_pred),
        }
    )

    # Persist the fitted model for the Streamlit app; spaces in the display
    # name become underscores in the file name.
    model_path = "model/" + name.replace(" ", "_") + ".pkl"
    joblib.dump(model, model_path)

# Comparison table, shown as the cell output.
results_df = pd.DataFrame(results)
results_df


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1,MCC
0,Logistic Regression,0.847826,0.918101,0.842593,0.892157,0.866667,0.691317
1,Decision Tree,0.836957,0.833812,0.846154,0.862745,0.854369,0.669386
2,KNN,0.836957,0.895863,0.852941,0.852941,0.852941,0.670014
3,Naive Bayes,0.826087,0.877451,0.836538,0.852941,0.84466,0.647329
4,Random Forest,0.896739,0.950263,0.880734,0.941176,0.909953,0.791635
5,XGBoost,0.880435,0.939861,0.87037,0.921569,0.895238,0.757938


In [None]:
# -------------------------------------------------
# Persist the encoded (not imputed/scaled) test split for Streamlit
# -------------------------------------------------

# Attach the binary labels as a `num` column on a copy of the held-out
# features; X_test itself is left unchanged.
X_test_df = X_test.assign(num=y_test.values)

# Write the evaluation set without the index column.
X_test_df.to_csv("test_data.csv", index=False)

print("test_data.csv saved successfully")
print(X_test_df.head())


test_data.csv saved successfully
      id  age  sex  dataset  cp  trestbps   chol  fbs  restecg  thalch  exang  \
514  515   49    1        1   0     130.0  206.0    0        1   170.0      0   
825  826   61    1        3   0     120.0  282.0    0        2   135.0      1   
854  855   55    1        3   0     172.0  260.0    0        1    73.0      0   
804  805   65    1        3   0     136.0  248.0    0        1   140.0      1   
887  888   69    1        3   2       NaN  271.0    0        0     NaN      2   

     oldpeak  slope  ca  thal  num  
514      0.0      3 NaN     3    1  
825      4.0      0 NaN     0    1  
854      2.0      3 NaN     3    1  
804      4.0      0 NaN     3    1  
887      NaN      3 NaN     3    0  


In [None]:
# List the working directory (test_data.csv should now be present).
!ls

app.py		       model	  requirements.txt
heart_disease_uci.csv  README.md  test_data.csv


In [None]:
# Save metrics for README and reference
# (written without the index column, alongside the saved models in `model/`).
results_df.to_csv("model/model_metrics.csv", index=False)

# Show the table again as the cell output.
results_df


Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1,MCC
0,Logistic Regression,0.847826,0.918101,0.842593,0.892157,0.866667,0.691317
1,Decision Tree,0.836957,0.833812,0.846154,0.862745,0.854369,0.669386
2,KNN,0.836957,0.895863,0.852941,0.852941,0.852941,0.670014
3,Naive Bayes,0.826087,0.877451,0.836538,0.852941,0.84466,0.647329
4,Random Forest,0.896739,0.950263,0.880734,0.941176,0.909953,0.791635
5,XGBoost,0.880435,0.939861,0.87037,0.921569,0.895238,0.757938


In [None]:
# Take the fitted Random Forest as the model to inspect in detail.
rf_model = models["Random Forest"]

# Predict on the preprocessed test features.
y_pred = rf_model.predict(X_test_proc)

# Each heading is followed by its report on the next line.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")


Confusion Matrix:
[[69 13]
 [ 6 96]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.84      0.88        82
           1       0.88      0.94      0.91       102

    accuracy                           0.90       184
   macro avg       0.90      0.89      0.89       184
weighted avg       0.90      0.90      0.90       184



In [None]:
!pip install streamlit




In [None]:
# List the working directory to check the project files.
!ls


app.py		       model	  requirements.txt
heart_disease_uci.csv  README.md  test_data.csv


In [None]:
# Create requirements file for Streamlit deployment

%%writefile requirements.txt

streamlit
pandas
numpy
scikit-learn
xgboost
joblib
matplotlib
seaborn


Overwriting requirements.txt


In [None]:
# List the working directory (requirements.txt should be present).
!ls


app.py		       model	  requirements.txt
heart_disease_uci.csv  README.md  test_data.csv


In [None]:
%%writefile README.md

# ML Assignment 2 – Classification Models Deployment

## a. Problem Statement
The objective of this assignment is to design, implement, and evaluate multiple machine learning classification models on a single dataset. The models are compared using standard performance metrics, and the complete solution is deployed through an interactive Streamlit web application. This assignment demonstrates an end-to-end machine learning workflow including data preprocessing, model training, evaluation, and deployment.

---

## b. Dataset Description
The dataset used for this assignment is the **Heart Disease UCI dataset**, obtained from a public repository. The task is formulated as a **binary classification problem** to predict the presence of heart disease.

**Dataset Summary:**
- Type: Binary Classification
- Number of instances: 920
- Number of features: 15
- Target variable: `num`
  - `0` → No heart disease
  - `1` → Presence of heart disease

The dataset contains a mix of numerical and categorical health indicators. The target variable was converted into a binary format to enable consistent evaluation across all models.

---

## c. Models Used and Evaluation Metrics
Six different classification models were implemented and evaluated using the same preprocessing pipeline and test dataset to ensure a fair comparison.

**Evaluation Metrics Used:**
- Accuracy
- Area Under ROC Curve (AUC)
- Precision
- Recall
- F1 Score
- Matthews Correlation Coefficient (MCC)

### Performance Comparison

| ML Model | Accuracy | AUC | Precision | Recall | F1 Score | MCC |
|---------|----------|-----|-----------|--------|----------|-----|
| Logistic Regression | 0.8478 | 0.9181 | 0.8426 | 0.8922 | 0.8667 | 0.6913 |
| Decision Tree | 0.8370 | 0.8338 | 0.8462 | 0.8627 | 0.8544 | 0.6694 |
| KNN | 0.8370 | 0.8959 | 0.8529 | 0.8529 | 0.8529 | 0.6700 |
| Naive Bayes | 0.8261 | 0.8775 | 0.8365 | 0.8529 | 0.8447 | 0.6473 |
| Random Forest | 0.8967 | 0.9503 | 0.8807 | 0.9412 | 0.9100 | 0.7916 |
| XGBoost | 0.8804 | 0.9399 | 0.8704 | 0.9216 | 0.8952 | 0.7579 |

---

## d. Observations on Model Performance

| ML Model | Observation |
|--------|-------------|
| Logistic Regression | Logistic Regression showed stable and interpretable performance with good accuracy and AUC values. It performed well when the relationship between features and the target variable was approximately linear. However, it was limited in capturing complex non-linear patterns present in the dataset. |
| Decision Tree | The Decision Tree model effectively captured non-linear relationships but exhibited signs of overfitting. Small changes in the data led to noticeable variations in predictions. This reduced its generalization ability compared to ensemble-based methods. |
| KNN | K-Nearest Neighbors produced competitive results after proper feature scaling was applied. Its performance was highly sensitive to the choice of distance metric and the number of neighbors. Additionally, it can be computationally expensive and sensitive to noisy data points. |
| Naive Bayes | Naive Bayes provided fast and computationally efficient predictions. Its assumption of feature independence simplifies model training but may not fully represent real-world relationships. Despite this limitation, it achieved reasonable performance on the heart disease dataset. |
| Random Forest | Random Forest achieved the strongest and most consistent performance across all evaluation metrics. Its ensemble learning approach reduced overfitting by aggregating multiple decision trees. This made it highly robust and well-suited for complex medical classification tasks. |
| XGBoost | XGBoost delivered high predictive accuracy and strong AUC values, closely matching Random Forest. Its gradient boosting framework effectively captured complex feature interactions. However, careful hyperparameter tuning is required to balance performance and computational cost. |

---

## e. Deployment Overview
All trained models were deployed using **Streamlit Community Cloud**. The web application allows users to:
- Select a classification model dynamically
- View evaluation metrics in a structured and visual format
- Analyze confusion matrices and classification reports
- Download and evaluate results using a fixed test dataset for consistency

This deployment completes the end-to-end machine learning pipeline and ensures reproducibility between training results and deployed model evaluation.

---

## f. Conclusion
All six classification models demonstrated strong performance on the given dataset. Ensemble-based methods such as **Random Forest** and **XGBoost** achieved superior results due to their ability to model complex, non-linear feature interactions. The deployed Streamlit application successfully integrates model evaluation and visualization, providing a practical demonstration of machine learning model deployment.


Overwriting README.md


In [None]:
# Final listing of the working directory after writing README.md.
!ls


app.py		       model	  requirements.txt
heart_disease_uci.csv  README.md  test_data.csv
