In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Location of the raw mushroom data file. It is read with header=None, so the
# column names below are supplied by hand: the target ("class") first, then
# the 22 feature columns.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data"

columns = [
    "class",
    "cap-shape", "cap-surface", "cap-color",
    "bruises", "odor",
    "gill-attachment", "gill-spacing", "gill-size", "gill-color",
    "stalk-shape", "stalk-root",
    "stalk-surface-above-ring", "stalk-surface-below-ring",
    "stalk-color-above-ring", "stalk-color-below-ring",
    "veil-type", "veil-color",
    "ring-number", "ring-type",
    "spore-print-color",
    "population", "habitat",
]

# Read the header-less file and attach the column labels in one expression.
df = pd.read_csv(url, header=None).set_axis(columns, axis=1)

# Optional: keep a local copy of the labelled data.
# df.to_csv("mushroom_dataset.csv", index=False)

df.head()



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# The "class" column is the target; every remaining column is a feature.
y = df["class"]
X = df.drop(columns=["class"])

# Turn the string class labels into integer codes.
label_encoder_y = LabelEncoder()
y_encoded = label_encoder_y.fit_transform(y)

# Expand each categorical feature into indicator (one-hot) columns.
X_encoded = pd.get_dummies(X)

# Hold out 20% of the rows for testing; stratifying on the encoded target
# keeps the class proportions the same in both partitions.
split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y_encoded, **split_options
)


LOGISTIC REGRESSION MODEL

In [None]:
import os

import joblib
from sklearn.linear_model import LogisticRegression
# The metric functions imported here are also used by the later model cells.
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    roc_auc_score
)
from sklearn.metrics import confusion_matrix

# Logistic regression baseline on the one-hot encoded features.
log_model = LogisticRegression(
    max_iter=1000,
    C=0.1,
    solver='liblinear',
    random_state=42
)

log_model.fit(X_train, y_train)

# Hard class predictions drive the threshold-based metrics.
y_pred_log = log_model.predict(X_test)

# ROC AUC needs a continuous ranking score, not 0/1 labels: use the predicted
# probability of the positive class (column 1 of predict_proba).
y_proba_log = log_model.predict_proba(X_test)[:, 1]

accuracy_log = accuracy_score(y_test, y_pred_log)
precision_log = precision_score(y_test, y_pred_log)
recall_log = recall_score(y_test, y_pred_log)
f1_log = f1_score(y_test, y_pred_log)
mcc_log = matthews_corrcoef(y_test, y_pred_log)
auc_log = roc_auc_score(y_test, y_proba_log)


print("Logistic Regression Performance:")

print("Accuracy:", accuracy_log)
print("Precision:", precision_log)
print("Recall:", recall_log)
print("F1 Score:", f1_log)
print("MCC Score:", mcc_log)
print("AUC Score:", auc_log)


####### CONFUSION MATRIX

cm_log = confusion_matrix(y_test, y_pred_log)

print(cm_log)


# Persist the fitted model: the Streamlit app loads this exact path, and no
# other cell writes it.
os.makedirs("models", exist_ok=True)
joblib.dump(log_model, "models/logistic_regression_model.pkl")



DECISION TREE CLASSIFIER

In [None]:
# joblib/os are imported here because this is the first cell (in notebook
# order) that saves a model; without them a fresh "Restart & Run All" fails.
import os

import joblib
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyper-parameters; the seed fixes tie-breaking.
dt_model = DecisionTreeClassifier(
    random_state=42
)

dt_model.fit(X_train, y_train)

# Hard class predictions for the threshold-based metrics.
y_pred_dt = dt_model.predict(X_test)

# Positive-class probability, used as the ranking score for ROC AUC.
y_proba_dt = dt_model.predict_proba(X_test)[:, 1]

accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt)
recall_dt = recall_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)
mcc_dt = matthews_corrcoef(y_test, y_pred_dt)
auc_dt = roc_auc_score(y_test, y_proba_dt)

print("Decision Tree Performance:")
print("Accuracy:", accuracy_dt)
print("Precision:", precision_dt)
print("Recall:", recall_dt)
print("F1:", f1_dt)
print("MCC:", mcc_dt)
print("AUC:", auc_dt)


# Make sure the target folder exists before writing the model file.
os.makedirs("models", exist_ok=True)
joblib.dump(dt_model, "models/decision_tree_model.pkl")


K-Nearest Neighbors (KNN)


In [None]:
# Local imports keep this cell runnable on a fresh kernel: it saves a model
# before the notebook's later `import joblib` cell is reached.
import os

import joblib
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 5 closest training rows.
knn_model = KNeighborsClassifier(n_neighbors=5)

knn_model.fit(X_train, y_train)

# Hard class predictions for the threshold-based metrics.
y_pred_knn = knn_model.predict(X_test)

# Positive-class probability, used as the ranking score for ROC AUC.
y_proba_knn = knn_model.predict_proba(X_test)[:, 1]

accuracy_knn = accuracy_score(y_test, y_pred_knn)
precision_knn = precision_score(y_test, y_pred_knn)
recall_knn = recall_score(y_test, y_pred_knn)
f1_knn = f1_score(y_test, y_pred_knn)
mcc_knn = matthews_corrcoef(y_test, y_pred_knn)
auc_knn = roc_auc_score(y_test, y_proba_knn)

print("KNN Performance:")
print("Accuracy:", accuracy_knn)
print("Precision:", precision_knn)
print("Recall:", recall_knn)
print("F1 Score:", f1_knn)
print("MCC Score:", mcc_knn)
print("AUC Score:", auc_knn)

# Make sure the target folder exists before writing the model file.
os.makedirs("models", exist_ok=True)
joblib.dump(knn_model, "models/knn_model.pkl")


NAIVE BAYES MODEL


In [None]:
# Local imports keep this cell runnable on a fresh kernel: it saves a model
# before the notebook's later `import joblib` cell is reached.
import os

import joblib
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes fitted on the one-hot encoded features.
nb_model = GaussianNB()

nb_model.fit(X_train, y_train)

# Hard class predictions for the threshold-based metrics.
y_pred_nb = nb_model.predict(X_test)

# Positive-class probability, used as the ranking score for ROC AUC.
y_proba_nb = nb_model.predict_proba(X_test)[:, 1]

accuracy_nb = accuracy_score(y_test, y_pred_nb)
precision_nb = precision_score(y_test, y_pred_nb)
recall_nb = recall_score(y_test, y_pred_nb)
f1_nb = f1_score(y_test, y_pred_nb)
mcc_nb = matthews_corrcoef(y_test, y_pred_nb)
auc_nb = roc_auc_score(y_test, y_proba_nb)



print("Naive Bayes Performance:")

print("Accuracy:", accuracy_nb)
print("Precision:", precision_nb)
print("Recall:", recall_nb)
print("F1 Score:", f1_nb)
print("MCC Score:", mcc_nb)
print("AUC Score:", auc_nb)

# Make sure the target folder exists before writing the model file.
os.makedirs("models", exist_ok=True)
joblib.dump(nb_model, "models/naive_bayes_model.pkl")


RANDOM FOREST

In [None]:
# ================================
# Random Forest Classifier
# ================================

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    roc_auc_score
)

import joblib
import os


# Step 1: Create Random Forest model (100 trees, all CPU cores, fixed seed)
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)


# Step 2: Train the model
rf_model.fit(X_train, y_train)


# Step 3: Make predictions
# Hard labels feed the threshold-based metrics; the positive-class
# probability is the continuous score that ROC AUC is defined on.
y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]


# Step 4: Calculate ALL required metrics
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
mcc_rf = matthews_corrcoef(y_test, y_pred_rf)
auc_rf = roc_auc_score(y_test, y_proba_rf)


# Step 5: Print ALL metrics (Assignment Required Format)
print("\n========== Random Forest Performance ==========")

print(f"Accuracy        : {accuracy_rf:.6f}")
print(f"Precision       : {precision_rf:.6f}")
print(f"Recall          : {recall_rf:.6f}")
print(f"F1 Score        : {f1_rf:.6f}")
print(f"MCC Score       : {mcc_rf:.6f}")
print(f"AUC Score       : {auc_rf:.6f}")

print("===============================================")


# Step 6: Save the model into the models folder (created if missing)
os.makedirs("models", exist_ok=True)

joblib.dump(rf_model, "models/random_forest_model.pkl")

print("\nRandom Forest model saved successfully!")


XGBOOST MODEL

In [None]:
# =====================================
# XGBoost Classifier
# =====================================

from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    roc_auc_score
)

import joblib
import os


# Step 1: Create XGBoost model
xgb_model = XGBClassifier(
    eval_metric='logloss',
    random_state=42,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    n_jobs=-1
)


# Step 2: Train the model
xgb_model.fit(X_train, y_train)


# Step 3: Make predictions
# Hard labels feed the threshold-based metrics; the positive-class
# probability is the continuous score that ROC AUC is defined on.
y_pred_xgb = xgb_model.predict(X_test)
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]


# Step 4: Calculate ALL required metrics
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
mcc_xgb = matthews_corrcoef(y_test, y_pred_xgb)
auc_xgb = roc_auc_score(y_test, y_proba_xgb)


# Step 5: Print ALL metrics (Assignment Required)
print("\n========== XGBoost Performance ==========")

print(f"Accuracy        : {accuracy_xgb:.6f}")
print(f"Precision       : {precision_xgb:.6f}")
print(f"Recall          : {recall_xgb:.6f}")
print(f"F1 Score        : {f1_xgb:.6f}")
print(f"MCC Score       : {mcc_xgb:.6f}")
print(f"AUC Score       : {auc_xgb:.6f}")

print("=========================================")


# Step 6: Save the model to models folder (created if missing)
os.makedirs("models", exist_ok=True)

joblib.dump(xgb_model, "models/xgboost_model.pkl")

print("\nXGBoost model saved successfully!")


COMPARISON TABLE

In [None]:
import pandas as pd

# Column layout of the comparison table: model name followed by its metrics.
result_columns = [
    "Model", "Accuracy", "Precision", "Recall",
    "F1 Score", "MCC Score", "AUC Score",
]

# One row per model, values in the same order as result_columns.
result_rows = [
    ("Logistic Regression", accuracy_log, precision_log, recall_log, f1_log, mcc_log, auc_log),
    ("Decision Tree", accuracy_dt, precision_dt, recall_dt, f1_dt, mcc_dt, auc_dt),
    ("KNN", accuracy_knn, precision_knn, recall_knn, f1_knn, mcc_knn, auc_knn),
    ("Naive Bayes", accuracy_nb, precision_nb, recall_nb, f1_nb, mcc_nb, auc_nb),
    ("Random Forest", accuracy_rf, precision_rf, recall_rf, f1_rf, mcc_rf, auc_rf),
    ("XGBoost", accuracy_xgb, precision_xgb, recall_xgb, f1_xgb, mcc_xgb, auc_xgb),
]

# Create comparison table
results = pd.DataFrame(result_rows, columns=result_columns)

# Display table
results


SAVING RESULTS TO CSV

In [None]:
# Write the comparison table to disk without the positional row index.
results_csv_path = "model_comparison_results.csv"
results.to_csv(results_csv_path, index=False)


STREAMLIT APP

In [None]:
import streamlit as st
import pandas as pd
import joblib

# Page heading and a one-line usage hint
st.title("Mushroom Classification App")
st.write("Upload a CSV file to predict whether mushrooms are edible or poisonous.")

# Every model the user can choose from in the dropdown
available_models = [
    "Logistic Regression",
    "Decision Tree",
    "KNN",
    "Naive Bayes",
    "Random Forest",
    "XGBoost",
]

# Model selection dropdown
model_name = st.selectbox("Select Model", available_models)

# Function to load selected model
def load_model(name):
    """Load the persisted estimator that matches a dropdown model name.

    Parameters
    ----------
    name : str
        Display name of the model, e.g. "Random Forest".

    Returns
    -------
    object
        Whatever ``joblib.load`` deserialises from the matching file under
        ``models/``.

    Raises
    ------
    ValueError
        If ``name`` is not one of the known model names. Failing here is
        clearer than returning ``None`` and crashing later on attribute access.
    """
    model_files = {
        "Logistic Regression": "models/logistic_regression_model.pkl",
        "Decision Tree": "models/decision_tree_model.pkl",
        "KNN": "models/knn_model.pkl",
        "Naive Bayes": "models/naive_bayes_model.pkl",
        "Random Forest": "models/random_forest_model.pkl",
        "XGBoost": "models/xgboost_model.pkl",
    }

    if name not in model_files:
        known = ", ".join(sorted(model_files))
        raise ValueError(f"Unknown model name {name!r}; expected one of: {known}")

    return joblib.load(model_files[name])


# File uploader
uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])

if uploaded_file is not None:

    # Read uploaded dataset
    data = pd.read_csv(uploaded_file)

    st.write("Uploaded Dataset:")
    st.write(data.head())

    # Load model
    model = load_model(model_name)

    # Apply SAME encoding as training (one-hot via get_dummies)
    data_encoded = pd.get_dummies(data)

    # Feature columns the model was fitted on, in fitting order
    training_columns = model.feature_names_in_

    if data_encoded.columns.intersection(training_columns).empty:
        # Nothing in the upload matches a training feature: every input row
        # would be all zeros, so any prediction would be meaningless.
        st.error(
            "The uploaded file has no columns matching the features the model "
            "was trained on. Please upload a CSV with the original mushroom "
            "attribute columns."
        )
    else:
        # Align to the training layout in one step: indicator columns absent
        # from the upload are added as 0, columns the model never saw are
        # dropped, and the training column order is restored.
        data_encoded = data_encoded.reindex(columns=training_columns, fill_value=0)

        # Predict
        predictions = model.predict(data_encoded)

        # Add predictions to original dataset
        data["Prediction"] = predictions

        # Human-readable label next to the raw code.
        # NOTE(review): assumes training encoded 'e' -> 0 and 'p' -> 1 (the
        # sorted order LabelEncoder uses) — confirm against the training
        # notebook's label_encoder_y.classes_.
        class_names = {0: "edible", 1: "poisonous"}
        data["Predicted Class"] = [
            class_names.get(int(code), str(code)) for code in predictions
        ]

        st.write("Prediction Results:")
        st.write(data)