In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the data
train_data_path = '../data/clean/train_data.csv'
test_data_path = '../data/clean/test_data.csv'
train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

In [3]:
# Assuming 'reference' is the label column and all other columns are features
label_col = 'reference'

In [4]:
# Define features and labels
X_train = train_df.drop(columns=[label_col])
y_train = train_df[label_col]
X_test = test_df.drop(columns=[label_col])
y_test = test_df[label_col]

In [5]:
# Standardize the feature columns
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# Initialize classifiers
classifiers = {
    'Decision Tree': DecisionTreeClassifier(),
    'Support Vector Machine': SVC(verbose=True),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

In [7]:
# Train and evaluate each classifier with progress tracking
results = {}
for name, clf in tqdm(classifiers.items(), desc="Training classifiers"):
    print(f"Training {name}...")
    clf.fit(X_train_scaled, y_train)
    print(f"Finished training {name}. Predicting...")
    y_pred = clf.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    results[name] = {
        'accuracy': accuracy,
        'report': report
    }
    print(f"Results for {name}:")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)
    print("="*60)

Training classifiers:   0%|          | 0/3 [00:00<?, ?it/s]

Training Decision Tree...


Training classifiers:   0%|          | 0/3 [13:27<?, ?it/s]


MemoryError: could not allocate 24888999936 bytes