In [2]:
import pandas as pd
import io
import requests
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# 1. Direct download from UCI (Bank Marketing - bank-additional.csv)
# This version has ~4,000 rows and 20 features
url = "https://raw.githubusercontent.com/madmashup/targeted-marketing-predictive-engine/master/banking.csv"
s = requests.get(url).content
df = pd.read_csv(io.StringIO(s.decode('utf-8')))
df.to_csv('banking.csv', index=False)

# 2. Basic Preprocessing
# Convert categorical strings to numbers (Label Encoding)
le = LabelEncoder()
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

# 3. Separate Features (X) and Target (y)
X = df.drop('y', axis=1) # 'y' is the target (subscribed or not)
y = df['y']

print(f"âœ… Success! Dataset loaded.")
print(f"Total Features: {X.shape[1]} (Requirement: Min 12)")
print(f"Total Rows: {X.shape[0]} (Requirement: Min 500)")

# 4. Scaling and Split
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

âœ… Success! Dataset loaded.
Total Features: 20 (Requirement: Min 12)
Total Rows: 41188 (Requirement: Min 500)


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier()
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Calculate required metrics
    results.append([
        name,
        accuracy_score(y_test, y_pred),
        roc_auc_score(y_test, y_prob),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        matthews_corrcoef(y_test, y_pred)
    ])

# Final Comparison Table
comparison_df = pd.DataFrame(results, columns=['ML Model Name', 'Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC'])
display(comparison_df)

Unnamed: 0,ML Model Name,Accuracy,AUC,Precision,Recall,F1,MCC
0,Logistic Regression,0.906773,0.927216,0.643678,0.415695,0.505155,0.469629
1,Decision Tree,0.884317,0.724154,0.494919,0.516437,0.505449,0.440107
2,KNN,0.899369,0.865272,0.58432,0.418876,0.487956,0.441233
3,Naive Bayes,0.848386,0.859477,0.398136,0.634146,0.489162,0.420722
4,Random Forest,0.910901,0.943896,0.637319,0.514316,0.569249,0.523913
5,XGBoost,0.9143,0.94533,0.64831,0.549311,0.594719,0.549517


In [4]:
%%writefile README.md
# ML Assignment 2 - Bank Marketing Classification

## 1. Project Overview
This project implements six different machine learning models to predict whether a client will subscribe to a term deposit based on the Bank Marketing dataset.

## 2. Dataset Description
- **Source:** UCI Machine Learning Repository (Bank Marketing)
- **Instances:** 41,188
- **Features:** 20 (Input features including age, job, marital status, education, etc.)
- **Target:** 'y' (Binary: Yes/No for subscription)

## 3. Mandatory Preprocessing Steps
- **Label Encoding:** Converted categorical text data into numerical format.
- **Feature Scaling:** Applied `StandardScaler` to normalize feature distributions for models like KNN and Logistic Regression.
- **Data Splitting:** 80% Training and 20% Testing split.

## 4. Model Comparison Table
| ML Model Name | Accuracy | AUC | Precision | Recall | F1 | MCC |
|---|---|---|---|---|---|---|
| Logistic Regression | 0.9067 | 0.9272 | 0.6436 | 0.4156 | 0.5051 | 0.4696 |
| Decision Tree | 0.8843 | 0.7241 | 0.4949 | 0.5164 | 0.5054 | 0.4401 |
| KNN | 0.8993 | 0.8652 | 0.5843 | 0.4188 | 0.4879 | 0.4412 |
| Naive Bayes | 0.8483 | 0.8594 | 0.3981 | 0.6341 | 0.4891 | 0.4207 |
| Random Forest | 0.9109 | 0.9438 | 0.6373 | 0.5143 | 0.5692 | 0.5239 |
| XGBoost | 0.9143 | 0.9453 | 0.6483 | 0.5493 | 0.5947 | 0.5495 |

## 5. Observations
1. **Best Model:** XGBoost achieved the highest Accuracy (91.43%) and AUC (0.945), making it the most reliable model for this dataset.
2. **Recall vs Precision:** Naive Bayes showed the highest Recall (0.634), which is useful if the bank wants to minimize missing potential customers, though it has more false positives.
3. **Complexity:** Ensemble methods (Random Forest/XGBoost) significantly outperformed linear and distance-based models.

## 6. How to Run
1. Install dependencies: `pip install -r requirements.txt`
2. Run the app: `streamlit run app.py`

Writing README.md


In [1]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score

st.set_page_config(page_title="ML Model Evaluator", layout="wide")

st.title("ðŸ“Š ML Assignment 2: Classification Model Comparison")
st.markdown("This app trains 6 models on an uploaded dataset and compares their performance.")

# 1. Sidebar - File Upload
st.sidebar.header("1. Upload Data")
uploaded_file = st.sidebar.file_saver = st.sidebar.file_uploader("Upload your input CSV file", type=["csv"])

if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)
    st.write("### Dataset Preview", df.head())

    # Simple Preprocessing (Logic from our Lab)
    target_col = st.selectbox("Select Target Column", df.columns, index=len(df.columns)-1)

    if st.button("Train Models"):
        X = df.drop(target_col, axis=1)
        y = df[target_col]

        # Basic Encoding for Categorical
        le = LabelEncoder()
        for col in X.select_dtypes(include=['object']).columns:
            X[col] = le.fit_transform(X[col])

        # Split and Scale
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Models Dictionary
        models = {
            "Logistic Regression": LogisticRegression(),
            "Decision Tree": DecisionTreeClassifier(),
            "KNN": KNeighborsClassifier(),
            "Naive Bayes": GaussianNB(),
            "Random Forest": RandomForestClassifier(),
            "XGBoost": XGBClassifier()
        }

        results = []

        with st.spinner('Training 6 models...'):
            for name, model in models.items():
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                # Metrics
                acc = accuracy_score(y_test, y_pred)
                prec = precision_score(y_test, y_pred, average='weighted')
                rec = recall_score(y_test, y_pred, average='weighted')
                f1 = f1_score(y_test, y_pred, average='weighted')
                mcc = matthews_corrcoef(y_test, y_pred)

                results.append([name, acc, prec, rec, f1, mcc])

        # Display Results
        res_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'MCC'])
        st.write("### Model Comparison Results")
        st.dataframe(res_df.style.highlight_max(axis=0))

        # Mandatory UI Element: Bar Chart of Accuracy
        st.write("### Accuracy Comparison Chart")
        st.bar_chart(res_df.set_index('Model')['Accuracy'])
else:
    st.info("Awaiting CSV file upload. Please upload a dataset to begin.")

Writing app.py


In [3]:
%%writefile requirements.txt
streamlit
pandas
numpy
scikit-learn
xgboost

Writing requirements.txt
