In [11]:
import time
import os
import polars as pl
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [12]:
# ==========================================
# 1. Data Loading & Preparation
# ==========================================

def load_data(filename):
    """Read a CSV from the data/tabular directory and split it into features and labels.

    Parameters
    ----------
    filename : str
        Name of the CSV file, joined onto the "data"/"tabular" directory.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds every column except the last, cast to
        float, and ``y`` is the last column, returned without conversion.
    """
    csv_path = os.path.join("data", "tabular", filename)
    table = pl.read_csv(csv_path).to_numpy()
    # The last column is treated as the target; everything before it is a feature.
    features, labels = table[:, :-1], table[:, -1]
    return features.astype(float), labels

# Load the train and test splits used by the rest of the notebook.
# A missing file is reported and then re-raised: swallowing the error would
# leave X_train / X_test undefined (or stale from an earlier kernel state) and
# the benchmark cell would fail later with a much less helpful error.
try:
    X_train, y_train = load_data("train_processed.csv")
    # X_val, y_val = load_data("validation_processed.csv")
    X_test, y_test = load_data("test_processed.csv")
except FileNotFoundError:
    print("Error: CSV for train and/or test not found! Please ensure their existence in this universe!")
    raise
else:
    print(f"Data loaded successfully:\nTrain shape: {X_train.shape}, Test shape: {X_test.shape}")

Data loaded successfully:
Train shape: (2791, 106), Test shape: (301, 106)


In [13]:
# ==========================================
# 2. Configuration: Models & Hyperparameters
# ==========================================

def _scaled(*steps):
    """Return a Pipeline that applies StandardScaler before the given (name, estimator) steps."""
    return Pipeline([('scaler', StandardScaler()), *steps])


def _entry(pipeline, **param_grid):
    """Bundle a pipeline with its GridSearchCV parameter grid in the layout the benchmark loop reads."""
    return {'pipeline': pipeline, 'params': param_grid}


# Maps a display name to {'pipeline': ..., 'params': ...}; grid keys use the
# '<step>__<parameter>' convention so they address the 'clf' step of each pipeline.
configs = {
    # 1. Logistic Regression
    'Logistic Regression': _entry(
        _scaled(('clf', LogisticRegression(max_iter=2000))),
        clf__C=[0.1, 1, 10],
        clf__solver=['lbfgs', 'liblinear'],
    ),

    # 2. Logistic Regression + Non-Linear Transformation
    'Log. Reg. (Non-Linear)': _entry(
        _scaled(
            ('poly', PolynomialFeatures(degree=2)),  # the non-linear transform
            ('clf', LogisticRegression(max_iter=2000)),
        ),
        clf__C=[0.1, 1, 10],
    ),

    # 3. Soft-margin SVM (Linear)
    'SVM (Linear)': _entry(
        _scaled(('clf', SVC(kernel='linear', max_iter=10000))),
        clf__C=[0.1, 1, 10],  # controls soft-margin strictness
    ),

    # 4. Soft-margin SVM (Kernel Trick)
    'SVM (Kernel RBF)': _entry(
        _scaled(('clf', SVC(kernel='rbf', max_iter=10000))),
        clf__C=[0.1, 1, 10],
        clf__gamma=['scale', 'auto'],
    ),

    # 5. k-Nearest Neighbors
    'k-NN': _entry(
        _scaled(('clf', KNeighborsClassifier())),
        clf__n_neighbors=[3, 5, 7, 9],
        clf__weights=['uniform', 'distance'],
    ),

    # 6. Naive Bayes (scaling is optional here, kept for parity with the other models)
    'Naive Bayes': _entry(
        _scaled(('clf', GaussianNB())),
        clf__var_smoothing=[1e-9, 1e-8],
    ),

    # 7. Random Forest (no scaler step: this pipeline contains the classifier only)
    'Random Forest': _entry(
        Pipeline([('clf', RandomForestClassifier(random_state=42))]),
        clf__n_estimators=[50, 100],
        clf__max_depth=[None, 10, 20],
    ),
}

In [14]:
# ==========================================
# 3. Training & Benchmarking Loop
# ==========================================

# One summary record per classifier; turned into a DataFrame by the reporting cell.
results_data = []

print("Start Benchmark\n" + "="*50)

for name, config in configs.items():
    print(f"Training {name}")

    # 5-fold cross-validated grid search over this model's parameter grid.
    grid = GridSearchCV(
            config['pipeline'],
            config['params'],
            cv=5,
            n_jobs=-1, # Use all CPU cores
            scoring='accuracy'
        )

    # Time the whole grid.fit call (i.e. the full hyperparameter search, not a
    # single model fit). perf_counter is a monotonic, high-resolution clock, so
    # the measured interval cannot be distorted by system clock adjustments the
    # way time.time() differences can.
    start_time = time.perf_counter()
    grid.fit(X_train, y_train)
    end_time = time.perf_counter()
    training_time = end_time - start_time

    # Best pipeline and the parameter combination that produced it.
    best_model = grid.best_estimator_
    best_params = grid.best_params_

    # Evaluate the selected model on the held-out test set.
    y_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    # Store results (parameters stringified so they serialize cleanly to CSV).
    results_data.append({
        'Classifier': name,
        'Best Hyperparameters': str(best_params),
        'Training Time (sec)': round(training_time, 4),
        'Test Accuracy': round(acc, 4)
    })

Start Benchmark
Training Logistic Regression




Training Log. Reg. (Non-Linear)
Training SVM (Linear)
Training SVM (Kernel RBF)
Training k-NN
Training Naive Bayes
Training Random Forest


In [15]:
# ==========================================
# 4. Reporting
# ==========================================

# Collect the per-classifier records into a single table.
results_df = pd.DataFrame(results_data)

print("\n" + "="*40)
print("FINAL BENCHMARKING RESULTS")
print("="*40)
# to_string() renders the complete table: a plain print(results_df) applies the
# pandas display limits, which cut long hyperparameter strings off with "..."
# and wrap the columns across several chunks.
print(results_df.to_string())

# Persist the full (untruncated) results next to the notebook.
results_df.to_csv('benchmark_results_task1.csv', index=False)
print("\nResults saved to 'benchmark_results_task1.csv'.")


FINAL BENCHMARKING RESULTS
               Classifier                               Best Hyperparameters  \
0     Logistic Regression          {'clf__C': 1, 'clf__solver': 'liblinear'}   
1  Log. Reg. (Non-Linear)                                    {'clf__C': 0.1}   
2            SVM (Linear)                                    {'clf__C': 0.1}   
3        SVM (Kernel RBF)               {'clf__C': 1, 'clf__gamma': 'scale'}   
4                    k-NN  {'clf__n_neighbors': 9, 'clf__weights': 'dista...   
5             Naive Bayes                      {'clf__var_smoothing': 1e-08}   
6           Random Forest   {'clf__max_depth': 20, 'clf__n_estimators': 100}   

   Training Time (sec)  Test Accuracy  
0               8.0565         0.9003  
1               9.5813         0.7708  
2               0.5366         0.9103  
3               1.6303         0.8405  
4               0.6283         0.6146  
5               0.2184         0.8571  
6               5.7214         0.7342  

Results sa