Extra Trees Classifier
---------------------------------

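This model works like Random Forest but is more randomized: split thresholds are drawn at random for each candidate feature instead of being optimized. This usually makes training faster and lowers variance at the cost of slightly higher bias, so it is not guaranteed to be more accurate.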

In [1]:
# Getting the dataset from the .npz files
import numpy as np

np.random.seed(42)

# np.load on an .npz archive returns a lazy NpzFile that holds an open file
# handle; the context managers below close it once the arrays are read into
# memory. `data` / `data2` are therefore closed after their `with` blocks.

# Loading the data (NON-PCA)
with np.load("preprocessed_data.npz") as data:
    x_train_flat, y_train = data["x_train_flat"], data["y_train"]
    x_val_flat, y_val = data["x_val_flat"], data["y_val"]
    x_test_flat, y_test = data["x_test_flat"], data["y_test"]

# Loading the data (PCA)
# The PCA labels get their own names so they do not silently replace the
# labels loaded from the non-PCA archive.
with np.load("preprocessed_data_pca95.npz") as data2:
    x_train_pca95, y_train_pca95 = data2["x_train_pca95"], data2["y_train"]
    x_val_pca95, y_val_pca95 = data2["x_val_pca95"], data2["y_val"]
    x_test_pca95, y_test_pca95 = data2["x_test_pca95"], data2["y_test"]

# Later cells pass the same y_train / y_val / y_test with both feature sets,
# which is only valid when the two archives carry identical labels in the
# same order. Fail loudly here instead of training on mismatched labels.
labels_match = (
    np.array_equal(y_train, y_train_pca95)
    and np.array_equal(y_val, y_val_pca95)
    and np.array_equal(y_test, y_test_pca95)
)
if not labels_match:
    raise ValueError(
        "Labels in preprocessed_data.npz and preprocessed_data_pca95.npz differ; "
        "the two feature sets cannot share y_train / y_val / y_test."
    )

In [2]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
import time

def train_evaluate_extra_trees_fast(x_train, y_train, x_val, y_val, x_test, y_test, n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features):
    """Fit an ExtraTreesClassifier and report its accuracy on three splits.

    The classifier is built with the given hyperparameters plus a fixed
    ``random_state=42`` and ``n_jobs=-1``. Only the ``fit`` call is timed;
    prediction and scoring are excluded from the reported training time.

    Args:
        x_train, y_train: Features and labels the model is fitted on.
        x_val, y_val: Features and labels of the validation split.
        x_test, y_test: Features and labels of the test split.
        n_estimators, max_depth, min_samples_split, min_samples_leaf,
        max_features: Forwarded unchanged to ``ExtraTreesClassifier``.

    Returns:
        dict with keys ``'train_acc'``, ``'val_acc'``, ``'test_acc'``,
        ``'overfit_gap'`` (train accuracy minus validation accuracy) and
        ``'train_time'`` (seconds spent in ``fit``).

    Side effects:
        Prints a short summary of the same metrics to stdout.
    """
    model = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1
    )

    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be distorted by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    model.fit(x_train, y_train)
    train_time = time.perf_counter() - start_time

    train_acc = accuracy_score(y_train, model.predict(x_train))
    val_acc = accuracy_score(y_val, model.predict(x_val))
    test_acc = accuracy_score(y_test, model.predict(x_test))

    # Positive gap: the model scores better on data it was fitted on.
    overfit_gap = train_acc - val_acc

    print("\nExtra Trees Model (Fast):")
    print(f"Train Accuracy: {train_acc:.4f}")
    print(f"Val Accuracy:   {val_acc:.4f}")
    print(f"Test Accuracy:  {test_acc:.4f}")
    print(f"Overfitting Gap: {overfit_gap:.4f}")
    print(f"Training Time:   {train_time:.2f} seconds")

    return {
        'train_acc': train_acc,
        'val_acc': val_acc,
        'test_acc': test_acc,
        'overfit_gap': overfit_gap,
        'train_time': train_time
    }


In [3]:
# Fit and score Extra Trees on the flattened (non-PCA) features
etc_results = train_evaluate_extra_trees_fast(
    x_train_flat, y_train,
    x_val_flat, y_val,
    x_test_flat, y_test,
    n_estimators=60,
    max_depth=12,
    min_samples_split=8,
    min_samples_leaf=4,
    max_features=0.5,
)


Extra Trees Model (Fast):
Train Accuracy: 0.9194
Val Accuracy:   0.8664
Test Accuracy:  0.8720
Overfitting Gap: 0.0530
Training Time:   74.90 seconds


In [4]:
# Training on PCA Data
# Stored under its own name so the metrics of the non-PCA run are not
# overwritten and both result dicts stay available for comparison.
etc_results_pca95 = train_evaluate_extra_trees_fast(
    x_train_pca95, y_train,
    x_val_pca95, y_val,
    x_test_pca95, y_test,
    n_estimators=60,
    max_depth=12,
    min_samples_split=8,
    min_samples_leaf=4,
    max_features=0.5,
)


Extra Trees Model (Fast):
Train Accuracy: 0.8909
Val Accuracy:   0.8368
Test Accuracy:  0.8368
Overfitting Gap: 0.0541
Training Time:   11.93 seconds
