In [4]:
# ==========================================
# BITS Pilani M.Tech AI/ML - Assignment 2
# Mobile Price Classification
# ==========================================

import pandas as pd
import numpy as np
import joblib
import os
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn & XGBoost Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Metrics
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, matthews_corrcoef)

# 1. Setup & Configuration
# ------------------------------------------
# Silence every warning category so the cell output stays readable.
# NOTE(review): this also hides deprecation warnings raised by the ML
# libraries; consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

# Folder that receives the pickled scaler and models. exist_ok=True makes the
# call idempotent and removes the check-then-create race of testing
# os.path.exists() first.
os.makedirs('models', exist_ok=True)

# 2. Data Loading & Preprocessing
# ------------------------------------------
print("Loading dataset...")
# Ensure 'mobile_price_data.csv' is uploaded to the lab
df = pd.read_csv('mobile_price_data.csv')

# Handle missing values just in case.
# A row without a label cannot be used for supervised training, and filling
# the label with a column mean would invent a value that is not a real class,
# so such rows are dropped instead of imputed.
df = df.dropna(subset=['price_range'])
if df.isnull().values.any():
    # numeric_only=True keeps mean() from raising on non-numeric columns
    # (pandas >= 2.0); non-numeric columns are left untouched.
    df = df.fillna(df.mean(numeric_only=True))

# Separate Features (X) and Target (y)
X = df.drop(columns='price_range')
y = df['price_range']

# Split: 80% Training, 20% Testing (random_state fixes the shuffle so the
# split is reproducible between runs)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save the Test Data separately (for App testing).
# assign() returns a new frame, so X_test itself is not modified; the label
# column is aligned to the features on the shared row index.
test_data_export = X_test.assign(price_range=y_test)
test_data_export.to_csv('test_data.csv', index=False)
# The leading status marker is a check-mark emoji (it was previously stored
# as mis-decoded bytes and printed as garbage).
print("✅ Created 'test_data.csv' (Use this file to test your Streamlit App)")

# Feature Scaling (Crucial for KNN and Logistic Regression)
# The scaling statistics are learned from the training split only and then
# applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler so the App can apply the same transformation
joblib.dump(scaler, 'models/scaler.pkl')

# 3. Model Initialization
# ------------------------------------------
# Maps a display name to an unfitted estimator. The display name is reused
# for the results table and, lower-cased with underscores, for the pickle
# file name of each trained model.
models_dict = {
    # `multi_class` is intentionally not passed: it is deprecated since
    # scikit-learn 1.5, and the default solver already fits a multinomial
    # model for multi-class targets.
    "Logistic Regression": LogisticRegression(max_iter=2000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` is intentionally not passed: recent XGBoost
    # releases no longer use it and only warn when it is supplied.
    "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42),
}

# 4. Training & Evaluation Loop
# ------------------------------------------
# One dict per model (name + rounded metrics); turned into a table afterwards.
results_list = []

# Distance/linear based models are trained on the standardized features;
# every other model is trained on the raw feature values.
scaled_model_names = ("Logistic Regression", "KNN")

# Derive the count from the registry so the message stays correct when
# models are added or removed.
print(f"\nTraining {len(models_dict)} models...")

for name, model in models_dict.items():
    # Pick the matching train/test representation once, then run a single
    # fit/predict path for every model.
    if name in scaled_model_names:
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        # Tree-based models don't strictly need scaling
        train_features, test_features = X_train, X_test

    model.fit(train_features, y_train)
    y_pred = model.predict(test_features)
    y_prob = model.predict_proba(test_features)

    # Calculate all required metrics
    acc = accuracy_score(y_test, y_pred)
    # AUC (One-vs-Rest strategy for multi-class) needs class probabilities
    auc = roc_auc_score(y_test, y_prob, multi_class='ovr')
    # Weighted average for multi-class precision/recall/F1
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    mcc = matthews_corrcoef(y_test, y_pred)

    # Append to results (rounded to 3 decimals for the comparison table)
    results_list.append({
        "ML Model Name": name,
        "Accuracy": round(acc, 3),
        "AUC": round(auc, 3),
        "Precision": round(prec, 3),
        "Recall": round(rec, 3),
        "F1 Score": round(f1, 3),
        "MCC": round(mcc, 3),
    })

    # Save the trained model, e.g. "Random Forest" -> models/random_forest.pkl
    file_name = f"models/{name.replace(' ', '_').lower()}.pkl"
    joblib.dump(model, file_name)

# 5. Final Output & Comparison
# ------------------------------------------
results_df = pd.DataFrame(results_list)

print("\n" + "="*40)
print("FINAL MODEL COMPARISON")
print("="*40)
try:
    # `display` is injected by IPython/Jupyter and renders a rich table
    display(results_df)
except NameError:
    # Plain `python` run: no `display` builtin, fall back to a text table
    print(results_df.to_string())

# Identify Best Model: highest (rounded) accuracy; idxmax returns the first
# row on ties, i.e. the earliest model in the registry order.
best_acc = results_df['Accuracy'].max()
best_model_name = results_df.loc[results_df['Accuracy'].idxmax(), 'ML Model Name']
# The leading markers are trophy / check-mark emoji (they were previously
# stored as mis-decoded bytes and printed as garbage).
print(f"\n🏆 Best Model: {best_model_name} with {best_acc} Accuracy")
print("✅ All models saved in 'models/' folder.")

Loading dataset...
✅ Created 'test_data.csv' (Use this file to test your Streamlit App)

Training 6 models...

FINAL MODEL COMPARISON


Unnamed: 0,ML Model Name,Accuracy,AUC,Precision,Recall,F1 Score,MCC
0,Logistic Regression,0.975,1.0,0.976,0.975,0.975,0.967
1,Decision Tree,0.833,0.886,0.834,0.833,0.832,0.777
2,KNN,0.53,0.763,0.57,0.53,0.541,0.379
3,Naive Bayes,0.797,0.956,0.806,0.797,0.799,0.731
4,Random Forest,0.892,0.983,0.896,0.892,0.893,0.857
5,XGBoost,0.905,0.991,0.906,0.905,0.905,0.874



🏆 Best Model: Logistic Regression with 0.975 Accuracy
✅ All models saved in 'models/' folder.
