# ML Assignment 2 - Classification Models Training
## BITS Pilani M.Tech (AIML/DSE)

This notebook trains and evaluates 6 classification models:
1. Logistic Regression
2. Decision Tree Classifier
3. K-Nearest Neighbor Classifier
4. Naive Bayes Classifier
5. Random Forest (Ensemble)
6. XGBoost (Ensemble)

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, 
    recall_score, f1_score, matthews_corrcoef,
    confusion_matrix, classification_report
)
import joblib
import warnings
# Suppress every warning category for the rest of the session
warnings.simplefilter('ignore')

# Plot defaults: seaborn 'whitegrid' theme and a 12x6 default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load and Explore Dataset

In [None]:
# Dataset location and label column -- edit both before running
DATA_PATH = '../data/your_dataset.csv'  # Update this path
TARGET_COLUMN = 'target'  # Update this with your target column name

# Read the CSV into a dataframe, then report its shape and column summary
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print(f"Dataset Shape: {df.shape}")
print("\nDataset Info:")
df.info()

In [None]:
# Preview the first rows of the loaded dataframe (head() default row count)
df.head()

In [None]:
# Summary statistics of the dataframe as produced by DataFrame.describe()
df.describe()

In [None]:
# Count nulls per column once and reuse the result for both printouts
null_counts = df.isnull().sum()

print("Missing Values:")
print(null_counts)
print(f"\nTotal Missing Values: {null_counts.sum()}")

In [None]:
# Class frequencies of the target, printed and drawn as a bar chart
class_counts = df[TARGET_COLUMN].value_counts()

print("Target Distribution:")
print(class_counts)

fig, ax = plt.subplots(figsize=(8, 5))
class_counts.plot.bar(ax=ax, color='steelblue')
ax.set_title('Target Variable Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
plt.xticks(rotation=0)  # keep class labels horizontal
plt.show()

## 2. Data Preprocessing

In [None]:
# Split the dataframe into the label vector and the feature matrix
y = df[TARGET_COLUMN]
X = df.drop(TARGET_COLUMN, axis=1)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

In [None]:
# Handle categorical features
# Integer-encode every text-like feature column. 'category' columns are
# selected alongside 'object' ones, mirroring the target check below, so a
# categorical feature is not passed on un-encoded.
categorical_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns: {categorical_columns}")

# One fitted encoder per column, keyed by column name
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    # astype(str) turns every value into a string before fitting, so a
    # column holding mixed value types can still be encoded
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

# Encode the target when it is text/categorical, and also when its labels are
# not exactly 0..n_classes-1 (for example 1/2 or -1/1): XGBClassifier refuses
# to fit on labels that do not form that zero-based range.
observed_labels = set(pd.unique(y).tolist())
labels_are_zero_based = observed_labels == set(range(len(observed_labels)))

if y.dtype == 'object' or y.dtype.name == 'category' or not labels_are_zero_based:
    label_encoder_target = LabelEncoder()
    y = label_encoder_target.fit_transform(y)
    print(f"Target encoded: {label_encoder_target.classes_}")

In [None]:
# Hold out 20% of the rows for testing, stratified on the target, with a
# fixed random_state so the split is repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
print(f"Training set size: {n_train_rows}")
print(f"Test set size: {n_test_rows}")

In [None]:
# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully!")

## 3. Model Training and Evaluation

In [None]:
# Candidate classifiers keyed by display name.
# Estimators that accept a seed all receive the same random_state.
shared_seed = 42
models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=shared_seed)),
    ('Decision Tree', DecisionTreeClassifier(random_state=shared_seed)),
    ('K-Nearest Neighbor', KNeighborsClassifier(n_neighbors=5)),
    ('Naive Bayes', GaussianNB()),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=shared_seed)),
    ('XGBoost', XGBClassifier(random_state=shared_seed, eval_metric='logloss')),
])

In [None]:
# Function to calculate all metrics
def calculate_metrics(y_true, y_pred, y_pred_proba=None):
    """Compute the six evaluation metrics reported for a model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.
    y_pred_proba : array-like or None, optional
        Either a 2-D array of class scores with one column per class, or a
        1-D array of positive-class scores. When None, AUC is not computed.

    Returns
    -------
    dict
        Keys 'Accuracy', 'Precision', 'Recall', 'F1', 'MCC' and 'AUC'.
        Precision, recall and F1 use average='weighted' with
        zero_division=0. 'AUC' is 0.0 when no scores were supplied or when
        the AUC computation failed.
    """
    metrics = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'Recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
        'F1': f1_score(y_true, y_pred, average='weighted', zero_division=0),
        'MCC': matthews_corrcoef(y_true, y_pred)
    }

    # AUC is best-effort: a failure falls back to 0.0 instead of aborting the
    # whole evaluation, but the reason is printed rather than hidden.
    auc = 0.0
    if y_pred_proba is not None:
        try:
            proba = np.asarray(y_pred_proba)
            if proba.ndim == 1:
                # Already a single score per sample
                auc = roc_auc_score(y_true, proba)
            elif proba.shape[1] == 2:
                # Binary problem: score with the second column. The decision
                # is based on the score matrix width, not on how many classes
                # happen to appear in y_true.
                auc = roc_auc_score(y_true, proba[:, 1])
            else:
                auc = roc_auc_score(y_true, proba,
                                    multi_class='ovr', average='weighted')
        except Exception as exc:  # unlike a bare except, lets KeyboardInterrupt through
            print(f"  AUC could not be computed ({exc}); reporting 0.0")
    metrics['AUC'] = auc

    return metrics

In [None]:
# Train and evaluate all models
# For every classifier: fit on the scaled training split, score on the scaled
# test split, store the scores in `results`, and pickle the fitted model.
results = {}

for model_name, model in models.items():
    print(f"\n{'='*60}")
    print(f"Training {model_name}...")
    print('='*60)

    # Train
    model.fit(X_train_scaled, y_train)

    # Predictions
    y_pred = model.predict(X_test_scaled)

    # Probability predictions: only requested from estimators that expose
    # predict_proba. A failure inside predict_proba is a real error and is
    # allowed to surface instead of being silently turned into None.
    predict_proba = getattr(model, 'predict_proba', None)
    y_pred_proba = predict_proba(X_test_scaled) if callable(predict_proba) else None

    # Calculate metrics
    model_metrics = calculate_metrics(y_test, y_pred, y_pred_proba)

    # Store results
    results[model_name] = {
        'metrics': model_metrics,
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        # zero_division=0 matches the precision/recall/F1 calls in calculate_metrics
        'classification_report': classification_report(y_test, y_pred, zero_division=0)
    }

    # Print metrics
    print(f"\nMetrics for {model_name}:")
    for metric_name, value in model_metrics.items():
        print(f"  {metric_name}: {value:.4f}")

    # Save model under a lower-cased name with spaces replaced by underscores
    model_filename = f'{model_name.replace(" ", "_").lower()}.pkl'
    joblib.dump(model, model_filename)
    print(f"Model saved as: {model_filename}")

## 4. Results Comparison

In [None]:
# Build a table with one row per model and one column per metric
metric_order = ('Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC')
scores_by_model = {
    name: {key: outcome['metrics'][key] for key in metric_order}
    for name, outcome in results.items()
}
comparison_df = pd.DataFrame(scores_by_model).T

rule = "=" * 80
print("\n" + rule)
print("MODEL COMPARISON TABLE")
print(rule)
print(comparison_df.round(4))
print(rule)

# Write the unrounded table to disk
comparison_df.to_csv('model_comparison.csv')
print("\nComparison table saved as: model_comparison.csv")

In [None]:
# One bar chart per metric column, laid out on a 2x3 grid of axes
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
metrics = comparison_df.columns.tolist()

for position, metric in enumerate(metrics):
    # divmod maps the running position to (row, column) on the 3-wide grid
    ax = axes[divmod(position, 3)]
    scores = comparison_df[metric]

    ax.bar(comparison_df.index, scores,
           color='steelblue', alpha=0.7, edgecolor='black')
    ax.set_title(f'{metric} Comparison', fontsize=14, fontweight='bold')
    ax.set_xlabel('Models', fontsize=11)
    ax.set_ylabel(metric, fontsize=11)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', alpha=0.3)

    # Write each bar's value slightly above the bar height
    for bar_index, score in enumerate(scores):
        ax.text(bar_index, score + 0.01, f'{score:.3f}',
                ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("Comparison visualization saved as: model_comparison.png")

## 5. Confusion Matrices

In [None]:
# Draw each model's stored confusion matrix as a heatmap on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

for position, (model_name, result) in enumerate(results.items()):
    # divmod maps the running position to (row, column) on the 3-wide grid
    ax = axes[divmod(position, 3)]

    sns.heatmap(result['confusion_matrix'], annot=True, fmt='d',
                cmap='Blues', ax=ax, cbar=True)
    ax.set_title(f'{model_name}', fontsize=12, fontweight='bold')
    ax.set_ylabel('Actual', fontsize=10)
    ax.set_xlabel('Predicted', fontsize=10)

plt.tight_layout()
plt.savefig('confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()

print("Confusion matrices saved as: confusion_matrices.png")

## 6. Save Preprocessing Objects

In [None]:
# Save scaler and label encoders
joblib.dump(scaler, 'scaler.pkl')
print("Scaler saved as: scaler.pkl")

# Per-column feature encoders: without these the integer codes assigned to
# categorical features during training cannot be reproduced on new data.
# Skipped when no categorical feature columns were encoded.
feature_encoders = globals().get('label_encoders')
if feature_encoders:
    joblib.dump(feature_encoders, 'feature_label_encoders.pkl')
    print("Feature label encoders saved as: feature_label_encoders.pkl")

# The target encoder only exists when the target column was encoded
if 'label_encoder_target' in globals():
    joblib.dump(label_encoder_target, 'label_encoder.pkl')
    print("Label encoder saved as: label_encoder.pkl")

print("\n" + "="*60)
print("All models and preprocessing objects saved successfully!")
print("="*60)

## 7. Best Model Analysis

In [None]:
# Pick the model with the highest F1 in the comparison table
f1_by_model = comparison_df['F1']
best_model_name = f1_by_model.idxmax()
best_f1_score = f1_by_model.max()

print(f"\nBest Model: {best_model_name}")
print(f"Best F1 Score: {best_f1_score:.4f}")

# Full metric row for the winner, followed by its stored text report
best_row = comparison_df.loc[best_model_name]
print("\nDetailed metrics for best model:")
print(best_row)

best_report = results[best_model_name]['classification_report']
print("\nClassification Report:")
print(best_report)

## Summary

This notebook has:
1. Loaded and explored the dataset
2. Preprocessed the data (encoding, scaling, splitting)
3. Trained 6 classification models
4. Calculated 6 evaluation metrics for each model
5. Created comparison visualizations
6. Saved all models and preprocessing objects

**Next Steps:**
- Copy the saved model files (`*.pkl`) to the Streamlit app's `model/` directory
- Update the README.md with actual metrics
- Deploy the Streamlit app
- Submit the assignment