## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
import joblib
import time
import warnings
warnings.filterwarnings('ignore')

print("✓ Libraries imported successfully!")

✓ Libraries imported successfully!


## Load Preprocessed data

In [2]:
# Load the preprocessed data we saved in Phase 3
X_train = pd.read_csv('data/X_train_scaled.csv')
X_test = pd.read_csv('data/X_test_scaled.csv')
y_train = pd.read_csv('data/y_train.csv').values.ravel()
y_test = pd.read_csv('data/y_test.csv').values.ravel()

print("✓ Data loaded successfully!")
print(f"\nTraining samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")
print(f"Features: {X_train.shape[1]}")

✓ Data loaded successfully!

Training samples: 614
Testing samples: 154
Features: 8


## Initialize storage for results

In [3]:
# Dictionary to store trained models
models = {}

# List to store training results
results = []

print("✓ Storage initialized for models and results")

✓ Storage initialized for models and results


## Train Naive Bayes Model

In [4]:
print("=" * 70)
print("MODEL 1: NAÏVE BAYES CLASSIFIER")
print("=" * 70)

# Start timer
start_time = time.time()

# Create and train Naïve Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Perform 5-fold cross-validation
cv_scores = cross_val_score(nb_model, X_train, y_train, cv=5, scoring='accuracy')

# Calculate training time
training_time = time.time() - start_time

# Display results
print(f"\n✓ Training completed in {training_time:.2f} seconds")
print(f"\nCross-Validation Scores (5 folds):")
for i, score in enumerate(cv_scores, 1):
    print(f"  Fold {i}: {score:.4f} ({score*100:.2f}%)")

print(f"\n📊 Mean CV Accuracy: {cv_scores.mean():.4f} ({cv_scores.mean()*100:.2f}%)")
print(f"   Standard Deviation: ±{cv_scores.std():.4f}")

# Store model and results
models['Naive_Bayes'] = nb_model
results.append({
    'Model': 'Naive Bayes',
    'Mean CV Accuracy': cv_scores.mean(),
    'Std CV Accuracy': cv_scores.std(),
    'Training Time (sec)': training_time
})

print("\n✓ Naïve Bayes model saved to memory")

MODEL 1: NAÏVE BAYES CLASSIFIER

✓ Training completed in 0.06 seconds

Cross-Validation Scores (5 folds):
  Fold 1: 0.7642 (76.42%)
  Fold 2: 0.7561 (75.61%)
  Fold 3: 0.7561 (75.61%)
  Fold 4: 0.7480 (74.80%)
  Fold 5: 0.7705 (77.05%)

📊 Mean CV Accuracy: 0.7590 (75.90%)
   Standard Deviation: ±0.0077

✓ Naïve Bayes model saved to memory


## Train SVM model with hyperparameter tunning

In [5]:
print("\n" + "=" * 70)
print("MODEL 2: SUPPORT VECTOR MACHINE (SVM)")
print("=" * 70)

# Define hyperparameter grid to search
svm_param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.001, 0.01]
}

print("\nHyperparameter Search Space:")
print(f"  C (Regularization): {svm_param_grid['C']}")
print(f"  Kernel Types: {svm_param_grid['kernel']}")
print(f"  Gamma Values: {svm_param_grid['gamma']}")
print(f"\n  Total combinations to test: {len(svm_param_grid['C']) * len(svm_param_grid['kernel']) * len(svm_param_grid['gamma'])}")

print("\n⏳ Starting Grid Search (this may take 2-5 minutes)...")

# Start timer
start_time = time.time()

# Create SVM model
svm_base = SVC(random_state=42)

# Grid Search with Cross-Validation
svm_grid = GridSearchCV(
    svm_base,
    svm_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,  # Use all CPU cores
    verbose=1
)

# Train with all parameter combinations
svm_grid.fit(X_train, y_train)

# Calculate training time
training_time = time.time() - start_time

# Display results
print(f"\n✓ Training completed in {training_time:.2f} seconds")
print(f"\n🏆 Best Parameters Found:")
for param, value in svm_grid.best_params_.items():
    print(f"  {param}: {value}")

print(f"\n📊 Best Cross-Validation Score: {svm_grid.best_score_:.4f} ({svm_grid.best_score_*100:.2f}%)")

# Store best model and results
models['SVM'] = svm_grid.best_estimator_
results.append({
    'Model': 'SVM',
    'Mean CV Accuracy': svm_grid.best_score_,
    'Std CV Accuracy': 0,  # GridSearchCV doesn't provide std directly
    'Training Time (sec)': training_time,
    'Best Parameters': svm_grid.best_params_
})

print("\n✓ Best SVM model saved to memory")


MODEL 2: SUPPORT VECTOR MACHINE (SVM)

Hyperparameter Search Space:
  C (Regularization): [0.1, 1, 10, 100]
  Kernel Types: ['linear', 'rbf', 'poly']
  Gamma Values: ['scale', 'auto', 0.001, 0.01]

  Total combinations to test: 48

⏳ Starting Grid Search (this may take 2-5 minutes)...
Fitting 5 folds for each of 48 candidates, totalling 240 fits

✓ Training completed in 10.17 seconds

🏆 Best Parameters Found:
  C: 100
  gamma: 0.001
  kernel: rbf

📊 Best Cross-Validation Score: 0.7769 (77.69%)

✓ Best SVM model saved to memory


## Train Decision tree with hyperparameter tuning

In [6]:
print("\n" + "=" * 70)
print("MODEL 3: DECISION TREE CLASSIFIER")
print("=" * 70)

# Define hyperparameter grid
dt_param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

print("\nHyperparameter Search Space:")
print(f"  Max Depth: {dt_param_grid['max_depth']}")
print(f"  Min Samples Split: {dt_param_grid['min_samples_split']}")
print(f"  Min Samples Leaf: {dt_param_grid['min_samples_leaf']}")
print(f"  Criterion: {dt_param_grid['criterion']}")
print(f"\n  Total combinations to test: {len(dt_param_grid['max_depth']) * len(dt_param_grid['min_samples_split']) * len(dt_param_grid['min_samples_leaf']) * len(dt_param_grid['criterion'])}")

print("\n⏳ Starting Grid Search (this may take 1-3 minutes)...")

# Start timer
start_time = time.time()

# Create Decision Tree model
dt_base = DecisionTreeClassifier(random_state=42)

# Grid Search with Cross-Validation
dt_grid = GridSearchCV(
    dt_base,
    dt_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Train with all parameter combinations
dt_grid.fit(X_train, y_train)

# Calculate training time
training_time = time.time() - start_time

# Display results
print(f"\n✓ Training completed in {training_time:.2f} seconds")
print(f"\n🏆 Best Parameters Found:")
for param, value in dt_grid.best_params_.items():
    print(f"  {param}: {value}")

print(f"\n📊 Best Cross-Validation Score: {dt_grid.best_score_:.4f} ({dt_grid.best_score_*100:.2f}%)")

# Store best model and results
models['Decision_Tree'] = dt_grid.best_estimator_
results.append({
    'Model': 'Decision Tree',
    'Mean CV Accuracy': dt_grid.best_score_,
    'Std CV Accuracy': 0,
    'Training Time (sec)': training_time,
    'Best Parameters': dt_grid.best_params_
})

print("\n✓ Best Decision Tree model saved to memory")


MODEL 3: DECISION TREE CLASSIFIER

Hyperparameter Search Space:
  Max Depth: [3, 5, 7, 10, None]
  Min Samples Split: [2, 5, 10, 20]
  Min Samples Leaf: [1, 2, 4]
  Criterion: ['gini', 'entropy']

  Total combinations to test: 120

⏳ Starting Grid Search (this may take 1-3 minutes)...
Fitting 5 folds for each of 120 candidates, totalling 600 fits

✓ Training completed in 1.97 seconds

🏆 Best Parameters Found:
  criterion: entropy
  max_depth: 3
  min_samples_leaf: 1
  min_samples_split: 20

📊 Best Cross-Validation Score: 0.7558 (75.58%)

✓ Best Decision Tree model saved to memory


## Display Training Result Summary

In [8]:
print("\n" + "=" * 70)
print("TRAINING RESULTS SUMMARY")
print("=" * 70)

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Display table
print("\n")
print(results_df.to_string(index=False))

# Find best model
best_idx = results_df['Mean CV Accuracy'].idxmax()
best_model = results_df.loc[best_idx, 'Model']
best_accuracy = results_df.loc[best_idx, 'Mean CV Accuracy']

print("\n" + "=" * 70)
print(f"🏆 BEST MODEL (Based on Cross-Validation): {best_model}")
print(f"   Accuracy: {best_accuracy:.4f} ({best_accuracy*100:.2f}%)")
print("=" * 70)


TRAINING RESULTS SUMMARY


        Model  Mean CV Accuracy  Std CV Accuracy  Training Time (sec)                                                                          Best Parameters
  Naive Bayes          0.758976         0.007719             0.062002                                                                                      NaN
          SVM          0.776929         0.000000            10.174767                                              {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
Decision Tree          0.755791         0.000000             1.969543 {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 20}

🏆 BEST MODEL (Based on Cross-Validation): SVM
   Accuracy: 0.7769 (77.69%)


In [10]:
# Display results with better formatting
print("\n" + "=" * 70)
print("TRAINING RESULTS - DETAILED VIEW")
print("=" * 70)

for result in results:
    print(f"\n{'='*70}")
    print(f"Model: {result['Model']}")
    print(f"{'='*70}")
    print(f"Mean CV Accuracy: {result['Mean CV Accuracy']:.4f} ({result['Mean CV Accuracy']*100:.2f}%)")
    print(f"Std CV Accuracy:  {result['Std CV Accuracy']:.4f}")
    print(f"Training Time:    {result['Training Time (sec)']:.2f} seconds")
    
    if 'Best Parameters' in result:
        print(f"\nBest Parameters:")
        for param, value in result['Best Parameters'].items():
            print(f"  • {param}: {value}")



TRAINING RESULTS - DETAILED VIEW

Model: Naive Bayes
Mean CV Accuracy: 0.7590 (75.90%)
Std CV Accuracy:  0.0077
Training Time:    0.06 seconds

Model: SVM
Mean CV Accuracy: 0.7769 (77.69%)
Std CV Accuracy:  0.0000
Training Time:    10.17 seconds

Best Parameters:
  • C: 100
  • gamma: 0.001
  • kernel: rbf

Model: Decision Tree
Mean CV Accuracy: 0.7558 (75.58%)
Std CV Accuracy:  0.0000
Training Time:    1.97 seconds

Best Parameters:
  • criterion: entropy
  • max_depth: 3
  • min_samples_leaf: 1
  • min_samples_split: 20


## Save all Trained Models

In [11]:
print("\n" + "=" * 70)
print("SAVING TRAINED MODELS")
print("=" * 70)

# Create models folder if it doesn't exist
import os
os.makedirs('models', exist_ok=True)

# Save each model
for name, model in models.items():
    filename = f'models/{name}.pkl'
    joblib.dump(model, filename)
    print(f"✓ Saved: {filename}")

# Save training results
results_df.to_csv('results/training_results.csv', index=False)
print(f"✓ Saved: results/training_results.csv")

print("\n✓ All models and results saved successfully!")


SAVING TRAINED MODELS
✓ Saved: models/Naive_Bayes.pkl
✓ Saved: models/SVM.pkl
✓ Saved: models/Decision_Tree.pkl
✓ Saved: results/training_results.csv

✓ All models and results saved successfully!


## Summary

In [12]:
print("\n" + "=" * 70)
print("✅ PHASE 4 COMPLETE: MODEL TRAINING")
print("=" * 70)

print("\n📦 Saved Models:")
print("  ✓ models/Naive_Bayes.pkl")
print("  ✓ models/SVM.pkl")
print("  ✓ models/Decision_Tree.pkl")

print("\n📊 Training Results:")
print("  ✓ results/training_results.csv")

print("\n📈 Performance Summary:")
for _, row in results_df.iterrows():
    print(f"  • {row['Model']:15s}: {row['Mean CV Accuracy']*100:.2f}% accuracy")


✅ PHASE 4 COMPLETE: MODEL TRAINING

📦 Saved Models:
  ✓ models/Naive_Bayes.pkl
  ✓ models/SVM.pkl
  ✓ models/Decision_Tree.pkl

📊 Training Results:
  ✓ results/training_results.csv

📈 Performance Summary:
  • Naive Bayes    : 75.90% accuracy
  • SVM            : 77.69% accuracy
  • Decision Tree  : 75.58% accuracy
