In [10]:
import pandas as pd
import io
import requests
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# 1. Direct download from UCI (Bank Marketing - bank-additional.csv)
# This version has ~4,000 rows and 20 features
url = "https://raw.githubusercontent.com/madmashup/targeted-marketing-predictive-engine/master/banking.csv"
s = requests.get(url).content
df = pd.read_csv(io.StringIO(s.decode('utf-8')))
df.to_csv('banking.csv', index=False)

# 2. Basic Preprocessing
# Convert categorical strings to numbers (Label Encoding)
le = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

# 3. Separate Features (X) and Target (y)
X = df.drop('y', axis=1) # 'y' is the target (subscribed or not)
y = df['y']

print(f"âœ… Success! Dataset loaded.")
print(f"Total Features: {X.shape[1]} (Requirement: Min 12)")
print(f"Total Rows: {X.shape[0]} (Requirement: Min 500)")

# 4. Scaling and Split
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

âœ… Success! Dataset loaded.
Total Features: 20 (Requirement: Min 12)
Total Rows: 41188 (Requirement: Min 500)


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier()
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Calculate required metrics
    results.append([
        name,
        accuracy_score(y_test, y_pred),
        roc_auc_score(y_test, y_prob),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        matthews_corrcoef(y_test, y_pred)
    ])

# Final Comparison Table
comparison_df = pd.DataFrame(results, columns=['ML Model Name', 'Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC'])
display(comparison_df)

Unnamed: 0,ML Model Name,Accuracy,AUC,Precision,Recall,F1,MCC
0,Logistic Regression,0.906773,0.927216,0.643678,0.415695,0.505155,0.469629
1,Decision Tree,0.884924,0.73096,0.497517,0.531283,0.513846,0.448975
2,KNN,0.899369,0.865272,0.58432,0.418876,0.487956,0.441233
3,Naive Bayes,0.848386,0.859477,0.398136,0.634146,0.489162,0.420722
4,Random Forest,0.911993,0.944411,0.639031,0.531283,0.580197,0.534334
5,XGBoost,0.9143,0.94533,0.64831,0.549311,0.594719,0.549517


In [12]:
%%writefile README.md
# ML Assignment 2 - Bank Marketing Classification

## 1. Project Overview
This project implements six different machine learning models to predict whether a client will subscribe to a term deposit based on the Bank Marketing dataset.

## 2. Dataset Description
- **Source:** UCI Machine Learning Repository (Bank Marketing)
- **Instances:** 41,188
- **Features:** 20 (Input features including age, job, marital status, education, etc.)
- **Target:** 'y' (Binary: Yes/No for subscription)

## 3. Mandatory Preprocessing Steps
- **Label Encoding:** Converted categorical text data into numerical format for model compatibility.
- **Feature Scaling:** Applied `StandardScaler` to normalize feature distributions for models like KNN and Logistic Regression.
- **Data Splitting:** 80% Training and 20% Testing split to ensure robust evaluation.

## 4. Model Comparison Table
| ML Model Name | Accuracy | AUC | Precision | Recall | F1 Score | MCC |
| :--- | :--- | :--- | :--- | :--- | :--- | :--- |
| Logistic Regression | 0.9068 | 0.9272 | 0.8953 | 0.9068 | 0.8978 | 0.4696 |
| Decision Tree | 0.8849 | 0.7282 | 0.8877 | 0.8849 | 0.8863 | 0.4459 |
| KNN | 0.8990 | 0.8654 | 0.8879 | 0.8990 | 0.8916 | 0.4394 |
| Naive Bayes | 0.8484 | 0.8595 | 0.8858 | 0.8484 | 0.8627 | 0.4207 |
| Random Forest | 0.9130 | 0.9433 | 0.9065 | 0.9130 | 0.9088 | 0.5349 |
| **XGBoost** | **0.9143** | **0.9453** | **0.9092** | **0.9143** | **0.9112** | **0.5495** |



## 5. Observations on Model Performance (3 Marks)
| ML Model Name | Observation about model performance |
| :--- | :--- |
| **Logistic Regression** | Solid baseline performance with **0.9068** accuracy; works well but assumes linear relationships between banking features. |
| **Decision Tree** | Faster to train but showed the lowest AUC (**0.7282**), indicating it is less effective at separating the classes than ensemble methods. |
| **kNN** | Performance is stable (**0.8990** accuracy) but prediction speed is slower due to distance calculations across 20 features. |
| **Naive Bayes** | Lowest accuracy at **0.8484**. Its assumption of feature independence likely hinders performance on this complex socio-economic dataset. |
| **Random Forest** | Strong ensemble performer with **0.9130** accuracy; effectively reduced overfitting through multiple tree bagging. |
| **XGBoost** | **Best Overall Performer.** Highest Accuracy (**0.9143**) and AUC (**0.9453**), demonstrating the best ability to handle complex patterns. |

## 6. How to Run
1. Install dependencies: `pip install -r requirements.txt`
2. Run the app: `streamlit run app.py`
3. View the live version here: [https://mlassignment2-irhu8kzrmqwxs6hhkdwmx7.streamlit.app/]

Overwriting README.md


In [13]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, classification_report

st.set_page_config(page_title="ML Evaluator", layout="wide")
st.title("ðŸ“Š ML Assignment 2: Classification Models")

uploaded_file = st.sidebar.file_uploader("Upload your input CSV file", type=["csv"])

if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)
    st.write("### 1. Dataset Preview", df.head())

    target_col = st.selectbox("Select Target Column (Select 'y')", df.columns, index=len(df.columns)-1)
    selected_model_name = st.selectbox("Select Model for Detailed Report",
                                      ["Logistic Regression", "Decision Tree", "KNN", "Naive Bayes", "Random Forest", "XGBoost"])

    if st.button("Run Evaluation"):
        X = df.drop(target_col, axis=1)
        y = df[target_col]

        le = LabelEncoder()
        for col in X.select_dtypes(include=['object']).columns:
            X[col] = le.fit_transform(X[col])
        if y.dtype == 'object': y = le.fit_transform(y)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        models_dict = {
            "Logistic Regression": LogisticRegression(max_iter=1000),
            "Decision Tree": DecisionTreeClassifier(),
            "KNN": KNeighborsClassifier(),
            "Naive Bayes": GaussianNB(),
            "Random Forest": RandomForestClassifier(),
            "XGBoost": XGBClassifier()
        }

        # --- ALL MODELS COMPARISON [Requirement 4c] ---
        st.write("### 2. Mandatory Model Comparison Table")
        all_results = []
        for name, m in models_dict.items():
            m.fit(X_train, y_train)
            pred = m.predict(X_test)
            # AUC requires probabilities
            prob = m.predict_proba(X_test)[:, 1] if hasattr(m, "predict_proba") else pred

            all_results.append({
                "Model": name,
                "Accuracy": accuracy_score(y_test, pred),
                "AUC": roc_auc_score(y_test, prob),
                "Precision": precision_score(y_test, pred, average='weighted'),
                "Recall": recall_score(y_test, pred, average='weighted'),
                "F1 Score": f1_score(y_test, pred, average='weighted'),
                "MCC": matthews_corrcoef(y_test, pred)
            })

        res_df = pd.DataFrame(all_results)
        st.dataframe(res_df.style.format(precision=4))

        # --- INDIVIDUAL REPORT [Requirement 4d] ---
        st.write(f"### 3. Detailed Report: {selected_model_name}")
        # Re-fit the specific selected model to show its report
        specific_model = models_dict[selected_model_name]
        specific_model.fit(X_train, y_train)
        st.text(classification_report(y_test, specific_model.predict(X_test)))

        st.write("### 4. Accuracy Comparison Chart")
        st.bar_chart(res_df.set_index('Model')['Accuracy'])

else:
    st.info("Please upload the banking.csv file to begin.")

Overwriting app.py


In [14]:
%%writefile requirements.txt
streamlit
pandas
numpy
scikit-learn
xgboost

Overwriting requirements.txt
