In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
from sklearn.tree import DecisionTreeClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
#from catboost import CatBoostClassifier

In [2]:
# Load the data
train_data_path = '../data/clean/train_data.csv'
test_data_path = '../data/clean/test_data.csv'
train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv(test_data_path)

In [3]:
# Assuming 'reference' is the label column and all other columns are features
label_col = 'reference'

In [4]:
# Define features and labels
X_train = train_df.drop(columns=[label_col])
y_train = train_df[label_col]
X_test = test_df.drop(columns=[label_col])
y_test = test_df[label_col]

In [5]:
X_train, X_validate, y_train, y_validate = train_test_split(pd.concat([X_train, X_test]), pd.concat([y_train, y_test]), test_size=0.2, random_state=42)

In [6]:
# Standardize the feature columns
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Define parameter grids for each classifier
param_grid_dt = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_rf = {
    'n_estimators': [10, 20, 50],
    'max_depth': [3, 6],
    'min_samples_split': [2, 6],
    'min_samples_leaf': [1, 2]
}

param_grid_lr = {
    'C': [0.1, 1, 5],
    'solver': ['liblinear', 'saga']
}

In [7]:
# Initialize classifiers
classifiers = {
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(),
}

In [8]:
# Train and evaluate each classifier with progress tracking
results = {}
for name, clf in tqdm(classifiers.items(), desc="Training classifiers"):
    print(f"Training {name}...")
    clf.fit(X_train_scaled, y_train)
    print(f"Finished training {name}. Predicting...")
    y_pred = clf.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    results[name] = {
        'accuracy': accuracy,
        'report': report
    }
    print(f"Results for {name}:")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)
    print("="*60)

Training classifiers:   0%|          | 0/3 [00:00<?, ?it/s]

Training Decision Tree...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Training classifiers:  33%|███▎      | 1/3 [00:06<00:12,  6.11s/it]

Finished training Decision Tree. Predicting...
Results for Decision Tree:
Accuracy: 0.9483204134366925
Classification Report:
                                   precision    recall  f1-score   support

                    0068136202989       1.00      1.00      1.00         4
                    0106076501615       0.00      0.00      0.00         0
                    0264625816112       1.00      1.00      1.00         1
                    0281865291974       1.00      1.00      1.00         1
                    0299974172879       1.00      1.00      1.00         1
                    0325257440233       1.00      1.00      1.00         2
                    0444531953105       0.00      0.00      0.00         0
                    0445561676590       1.00      0.67      0.80         3
                    0474506569192       1.00      1.00      1.00         2
                    0487127900312       1.00      1.00      1.00         1
                    0636655305100       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Training classifiers:  67%|██████▋   | 2/3 [02:20<01:21, 81.64s/it]

Results for Random Forest:
Accuracy: 0.9664082687338501
Classification Report:
                                   precision    recall  f1-score   support

                    0031242870397       0.00      0.00      0.00         0
                    0068136202989       1.00      1.00      1.00         4
                    0264625816112       1.00      1.00      1.00         1
                    0281865291974       1.00      1.00      1.00         1
                    0299974172879       1.00      1.00      1.00         1
                    0325257440233       1.00      1.00      1.00         2
                    0445561676590       1.00      1.00      1.00         3
                    0470140680576       0.00      0.00      0.00         0
                    0474506569192       1.00      1.00      1.00         2
                    0487127900312       1.00      1.00      1.00         1
                    0636655305100       1.00      1.00      1.00         1
                    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Training classifiers: 100%|██████████| 3/3 [02:44<00:00, 54.71s/it]

Finished training Logistic Regression. Predicting...
Results for Logistic Regression:
Accuracy: 0.6124031007751938
Classification Report:
                                   precision    recall  f1-score   support

                    0068136202989       0.50      1.00      0.67         4
                    0264625816112       1.00      1.00      1.00         1
                    0281865291974       0.00      0.00      0.00         1
                    0299974172879       0.00      0.00      0.00         1
                    0325257440233       1.00      1.00      1.00         2
                    0445561676590       1.00      1.00      1.00         3
                    0474506569192       0.00      0.00      0.00         2
                    0487127900312       0.00      0.00      0.00         1
                    0636655305100       0.00      0.00      0.00         1
                    0673961551647       0.00      0.00      0.00         1
                    0681112768530   


