In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold, cross_validate, train_test_split
from sklearn.svm import SVC
from xgboost import XGBClassifier

STATE = 42

In [2]:
data = np.load("processed.npy")
data.shape

(918, 43)

In [3]:
X = data[:, :-1]
y = data[:, -1]
X.shape, y.shape

((918, 42), (918,))

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=STATE)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((734, 42), (734,), (184, 42), (184,))

# Sklearn

In [5]:
cv = KFold(n_splits=5, random_state=STATE, shuffle=True)
scoring = {"acc": "accuracy", "f1": "f1_macro"}  # roc_auc_ovr


def train_clf(clf, X, y) -> dict:
    scores = cross_validate(clf, X, y, scoring=scoring, cv=cv, n_jobs=-1)
    for name in scoring:
        print(f"{name} {scores[f'test_{name}'].mean()} {scores[f'test_{name}'].std()}")

## LogReg

In [6]:
train_clf(LogisticRegression(random_state=STATE), X, y)

acc 0.9400629603231172 0.017318396862096696
f1 0.9390173509563404 0.01597776143360865


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

## SVM

In [7]:
train_clf(SVC(random_state=STATE), X, y)

acc 0.9237407935376574 0.01461774337556991
f1 0.9224893224087604 0.012199306514027564


## Forest

In [8]:
train_clf(RandomForestClassifier(random_state=STATE), X, y)

acc 0.9379128058921358 0.01564969092666756
f1 0.9368587063396007 0.014830681517364414


# Xgboost

In [9]:
train_clf(XGBClassifier(random_state=STATE), X, y)

acc 0.9368377286766452 0.013543651704257056
f1 0.9331904908049513 0.009557455010991615


# Catboost

In [10]:
from catboost import CatBoostClassifier

In [11]:
train_clf(CatBoostClassifier(random_state=STATE, verbose=False), X, y)

acc 0.9498990258968876 0.010541102249011645
f1 0.9477772473648457 0.010503361484246881
