In [45]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split

from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [46]:
# Attach Google Drive to the Colab filesystem so the dataset files are readable.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Disabled alternative input: the `data_1` variant of the dataset.
# Left commented out because this notebook loads `data_2` instead; enable only
# one of the two loaders at a time, since both assign to `X` and `y`.
# X = np.load('drive/MyDrive/DATA/data_1/X.npy').astype(np.int8)
# y = np.load('drive/MyDrive/DATA/data_1/y.npy').astype(np.int8)

In [47]:
# Load the feature matrix and the label vector of the `data_2` dataset.
# Both arrays are stored as int8 to keep the wide feature matrix small.
# NOTE(review): the int8 cast assumes every value fits in [-128, 127] - confirm.
data_paths = ('drive/MyDrive/DATA/data_2/X.npy',
              'drive/MyDrive/DATA/data_2/y.npy')
X, y = (np.load(path).astype(np.int8) for path in data_paths)

In [48]:
# Binarise the target: original label 1 becomes class 0, everything else class 1.
# NOTE(review): this overwrites `y` in place, so running the cell a second time
# on the already-binarised vector swaps the two classes back - run it exactly
# once after loading.
y = (y != 1).astype(int)
np.unique(y)

array([0, 1])

In [49]:
# Sanity check: feature matrix and label vector must share their first dimension.
tuple(arr.shape for arr in (X, y))

((1228, 67925), (1228,))

In [50]:
# Size and share of class 1, to quantify the class imbalance.
n_positive = int(np.count_nonzero(y == 1))
n_positive, n_positive / len(y)

(923, 0.751628664495114)

In [51]:
# Row labels of the results table: the class name of every model to be compared.
compared_models = (DummyClassifier, LogisticRegression, SVC,
                   DecisionTreeClassifier, RandomForestClassifier,
                   KNeighborsClassifier, XGBClassifier)
model_names = [estimator_cls.__name__ for estimator_cls in compared_models]

model_names

['DummyClassifier',
 'LogisticRegression',
 'SVC',
 'DecisionTreeClassifier',
 'RandomForestClassifier',
 'KNeighborsClassifier',
 'XGBClassifier']

In [52]:
# Metrics reported for every model. Plain string scorers treat class 1 as the
# positive class; the two entries suffixed with " -" score class 0 instead.
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'F1-score': 'f1',
}
scoring['recall -'] = make_scorer(recall_score, pos_label=0)
# zero_division=0: a fold in which class 0 is never predicted scores 0.
scoring['precision -'] = make_scorer(precision_score, pos_label=0, zero_division=0)

# Empty results table: one row per model, one column per metric.
scores_df = pd.DataFrame(columns=list(scoring), index=model_names)

In [53]:
# Chance-level baseline, scored with the same 5-fold CV as the real models.
# The strategy is pinned explicitly because the library default differs between
# scikit-learn releases ('stratified' before 0.24, 'prior' afterwards); the
# random label draws are seeded so the baseline row is reproducible.
clf = DummyClassifier(strategy='stratified', random_state=42)
scores = cross_validate(clf, X, y, cv=5, scoring=scoring, n_jobs=-1)
# Average every timing/metric column over the folds.
mean_scores = pd.DataFrame(scores).mean()

In [54]:
# Write the fold-averaged metrics of the current `clf` into its results row.
# Metrics are looked up by their `test_<metric>` key rather than sliced by
# position, so a stale or differently laid out `mean_scores` raises a KeyError
# instead of silently filling the wrong columns.
metric_keys = [f'test_{metric}' for metric in scores_df.columns]
scores_df.loc[type(clf).__name__] = mean_scores[metric_keys].to_numpy()
scores_df

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,0.635182,0.755443,0.760564,0.757538,0.255738,0.26358
LogisticRegression,,,,,,
SVC,,,,,,
DecisionTreeClassifier,,,,,,
RandomForestClassifier,,,,,,
KNeighborsClassifier,,,,,,
XGBClassifier,,,,,,


In [55]:
# L1-regularised logistic regression with class re-weighting, 5-fold CV.
# The liblinear solver shuffles the data internally, so it is seeded to make
# the reported scores repeatable.
clf = LogisticRegression(solver='liblinear', penalty='l1',
                         class_weight='balanced', random_state=42)
scores = cross_validate(clf, X, y, cv=5, scoring=scoring)
# Average every timing/metric column over the folds.
mean_scores = pd.DataFrame(scores).mean()
mean_scores

fit_time            4.666138
score_time          0.048508
test_accuracy       0.702778
test_precision      0.764414
test_recall         0.874371
test_F1-score       0.815481
test_recall -       0.183607
test_precision -    0.321612
dtype: float64

In [56]:
# Write the fold-averaged metrics of the current `clf` into its results row.
# Metrics are looked up by their `test_<metric>` key rather than sliced by
# position, so a stale or differently laid out `mean_scores` raises a KeyError
# instead of silently filling the wrong columns.
metric_keys = [f'test_{metric}' for metric in scores_df.columns]
scores_df.loc[type(clf).__name__] = mean_scores[metric_keys].to_numpy()
scores_df

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,0.635182,0.755443,0.760564,0.757538,0.255738,0.26358
LogisticRegression,0.702778,0.764414,0.874371,0.815481,0.183607,0.321612
SVC,,,,,,
DecisionTreeClassifier,,,,,,
RandomForestClassifier,,,,,,
KNeighborsClassifier,,,,,,
XGBClassifier,,,,,,


In [57]:
# Single decision tree with class re-weighting, 5-fold CV.
# Seeded so the randomised split selection inside the tree does not change the
# reported scores from run to run.
clf = DecisionTreeClassifier(class_weight='balanced', random_state=42)
scores = cross_validate(clf, X, y, cv=5, scoring=scoring, n_jobs=-1)
# Average every timing/metric column over the folds.
mean_scores = pd.DataFrame(scores).mean()
mean_scores

fit_time            19.235872
score_time           0.038547
test_accuracy        0.642535
test_precision       0.763299
test_recall          0.760652
test_F1-score        0.761256
test_recall -        0.285246
test_precision -     0.282338
dtype: float64

In [58]:
# Write the fold-averaged metrics of the current `clf` into its results row.
# Metrics are looked up by their `test_<metric>` key rather than sliced by
# position, so a stale or differently laid out `mean_scores` raises a KeyError
# instead of silently filling the wrong columns.
metric_keys = [f'test_{metric}' for metric in scores_df.columns]
scores_df.loc[type(clf).__name__] = mean_scores[metric_keys].to_numpy()
scores_df

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,0.635182,0.755443,0.760564,0.757538,0.255738,0.26358
LogisticRegression,0.702778,0.764414,0.874371,0.815481,0.183607,0.321612
SVC,,,,,,
DecisionTreeClassifier,0.642535,0.763299,0.760652,0.761256,0.285246,0.282338
RandomForestClassifier,,,,,,
KNeighborsClassifier,,,,,,
XGBClassifier,,,,,,


In [59]:
# Random forest with class re-weighting, 5-fold CV.
# Seeded because bootstrap sampling and feature sub-sampling are random;
# without a seed the row in the results table cannot be reproduced.
clf = RandomForestClassifier(class_weight='balanced', random_state=42)
scores = cross_validate(clf, X, y, cv=5, scoring=scoring, n_jobs=-1)
# Average every timing/metric column over the folds.
mean_scores = pd.DataFrame(scores).mean()
mean_scores

fit_time            5.483033
score_time          0.064686
test_accuracy       0.751628
test_precision      0.751628
test_recall         1.000000
test_F1-score       0.858205
test_recall -       0.000000
test_precision -    0.000000
dtype: float64

In [60]:
# Write the fold-averaged metrics of the current `clf` into its results row.
# Metrics are looked up by their `test_<metric>` key rather than sliced by
# position, so a stale or differently laid out `mean_scores` raises a KeyError
# instead of silently filling the wrong columns.
metric_keys = [f'test_{metric}' for metric in scores_df.columns]
scores_df.loc[type(clf).__name__] = mean_scores[metric_keys].to_numpy()
scores_df

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,0.635182,0.755443,0.760564,0.757538,0.255738,0.26358
LogisticRegression,0.702778,0.764414,0.874371,0.815481,0.183607,0.321612
SVC,,,,,,
DecisionTreeClassifier,0.642535,0.763299,0.760652,0.761256,0.285246,0.282338
RandomForestClassifier,0.751628,0.751628,1.0,0.858205,0.0,0.0
KNeighborsClassifier,,,,,,
XGBClassifier,,,,,,


In [61]:
# k-nearest neighbours with Manhattan distance (p=1) and distance-weighted
# votes, evaluated with the same 5-fold CV and metric set as the other models.
clf = KNeighborsClassifier(p=1, weights='distance')
scores = cross_validate(estimator=clf, X=X, y=y,
                        scoring=scoring, cv=5, n_jobs=-1)
# Average every timing/metric column over the folds.
fold_scores = pd.DataFrame(scores)
mean_scores = fold_scores.mean()
mean_scores



fit_time             8.846984
score_time          49.978125
test_accuracy        0.702014
test_precision       0.751402
test_recall          0.901475
test_F1-score        0.819552
test_recall -        0.098361
test_precision -     0.261227
dtype: float64

In [62]:
# Write the fold-averaged metrics of the current `clf` into its results row.
# Metrics are looked up by their `test_<metric>` key rather than sliced by
# position, so a stale or differently laid out `mean_scores` raises a KeyError
# instead of silently filling the wrong columns.
metric_keys = [f'test_{metric}' for metric in scores_df.columns]
scores_df.loc[type(clf).__name__] = mean_scores[metric_keys].to_numpy()
scores_df

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,0.635182,0.755443,0.760564,0.757538,0.255738,0.26358
LogisticRegression,0.702778,0.764414,0.874371,0.815481,0.183607,0.321612
SVC,,,,,,
DecisionTreeClassifier,0.642535,0.763299,0.760652,0.761256,0.285246,0.282338
RandomForestClassifier,0.751628,0.751628,1.0,0.858205,0.0,0.0
KNeighborsClassifier,0.702014,0.751402,0.901475,0.819552,0.0983607,0.261227
XGBClassifier,,,,,,


In [72]:
# Gradient-boosted trees, scored on one stratified 80/20 hold-out split
# (unlike the other models, which are scored with 5-fold cross-validation).
clf = XGBClassifier(n_estimators=150,
                    max_depth=3, # 1 - 5 (2-3)
                    # min_child_weight=30, # (? - 30) (30)
                    scale_pos_weight=0.3, # 0.2 - 0.5 (0.3)
                    learning_rate=0.1,
                    # tree_method='gpu_hist',
                    # predictor='gpu_predictor',
                    n_jobs=-1,
                    use_label_encoder=False)

# One list per metric, using the same keys and order as the `scoring` dict;
# each list receives one entry per evaluated split.
scores = {'accuracy': [],
          'precision': [],
          'recall': [],
          'F1-score': [],
          'recall -': [],
          'precision -': [],
          }

# Fixed seed + stratification: the split is reproducible and keeps the class
# ratio in both parts.
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.2,
                                                  shuffle=True,
                                                  stratify=y,
                                                  random_state=42)

# Early stopping monitors the last entry of `eval_set`, i.e. the validation part.
# NOTE(review): the same validation part is scored below, so the number of
# trees is tuned on the data the metrics are computed on - the scores are
# optimistic compared with the cross-validated rows of the other models.
# NOTE(review): xgboost >= 1.6 deprecates passing `eval_metric` and
# `early_stopping_rounds` to fit() in favour of the constructor - move them
# when upgrading.
clf.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        eval_metric='error',
        early_stopping_rounds=45,
        verbose=True)

print('___')

# After early stopping, predict() already restricts itself to the best
# iteration, so the deprecated `ntree_limit` argument (removed in xgboost 2.0)
# is not passed.
y_pred = clf.predict(X_val)

# Metrics with class 1 as the positive class ...
scores['accuracy'].append(accuracy_score(y_val, y_pred))
scores['precision'].append(precision_score(y_val, y_pred))
scores['recall'].append(recall_score(y_val, y_pred))
scores['F1-score'].append(f1_score(y_val, y_pred))
# ... and the same recall/precision with class 0 as the positive class.
scores['recall -'].append(recall_score(y_val, y_pred, pos_label=0))
scores['precision -'].append(precision_score(y_val, y_pred, pos_label=0, zero_division=0))

scores

[0]	validation_0-error:0.373727	validation_1-error:0.52439
Multiple eval metrics have been passed: 'validation_1-error' will be used for early stopping.

Will train until validation_1-error hasn't improved in 25 rounds.
[1]	validation_0-error:0.310591	validation_1-error:0.443089
[2]	validation_0-error:0.280041	validation_1-error:0.5
[3]	validation_0-error:0.307536	validation_1-error:0.512195
[4]	validation_0-error:0.271894	validation_1-error:0.455285
[5]	validation_0-error:0.255601	validation_1-error:0.48374
[6]	validation_0-error:0.213849	validation_1-error:0.49187
[7]	validation_0-error:0.218941	validation_1-error:0.504065
[8]	validation_0-error:0.211813	validation_1-error:0.471545
[9]	validation_0-error:0.186354	validation_1-error:0.471545
[10]	validation_0-error:0.180244	validation_1-error:0.471545
[11]	validation_0-error:0.170061	validation_1-error:0.479675
[12]	validation_0-error:0.155804	validation_1-error:0.463415
[13]	validation_0-error:0.150713	validation_1-error:0.46748
[14]

KeyboardInterrupt: ignored

In [78]:
# Store the XGBoost metrics in the results table.
# `scores` holds one list per metric; reduce each list to its mean so a 1-D row
# is assigned no matter how many splits were scored (the raw `.values` array is
# 2-D and only fits a single row when exactly one split was evaluated).
# Columns are selected by name so the values cannot end up under the wrong metric.
# NOTE(review): relies on `clf` and `scores` still being the ones produced by
# the XGBoost cell - other cells reuse both names.
xgb_mean_scores = pd.DataFrame(scores).mean()
scores_df.loc[type(clf).__name__] = xgb_mean_scores[list(scores_df.columns)].to_numpy()
scores_df

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,0.635182,0.755443,0.760564,0.757538,0.255738,0.26358
LogisticRegression,0.702778,0.764414,0.874371,0.815481,0.183607,0.321612
SVC,0.751628,0.751628,1.0,0.858205,0.0,0.0
DecisionTreeClassifier,0.642535,0.763299,0.760652,0.761256,0.285246,0.282338
RandomForestClassifier,0.751628,0.751628,1.0,0.858205,0.0,0.0
KNeighborsClassifier,0.702014,0.751402,0.901475,0.819552,0.0983607,0.261227
XGBClassifier,0.674797,0.751196,0.848649,0.796954,0.147541,0.243243


In [None]:
# Collapse the collected per-split values in `scores` to one mean per column.
per_split_scores = pd.DataFrame(scores)
mean_scores = per_split_scores.mean()
mean_scores

In [None]:
# Disabled alternative: would write all of `mean_scores` (no positional slice)
# into the results row of the current `clf`.
# NOTE(review): presumably meant for a `scores` dict without the fit/score time
# entries - confirm before re-enabling.
# scores_df.loc[type(clf).__name__] = mean_scores.values
# scores_df

In [67]:
# Support vector classifier with library-default settings, evaluated with the
# same 5-fold CV and metric set as the other models.
# NOTE(review): unlike the other estimators in this notebook, no class
# weighting is configured here.
clf = SVC()
scores = cross_validate(estimator=clf, X=X, y=y,
                        scoring=scoring, cv=5, n_jobs=-1)
# Average every timing/metric column over the folds.
fold_scores = pd.DataFrame(scores)
mean_scores = fold_scores.mean()
mean_scores



fit_time            122.383586
score_time           30.417684
test_accuracy         0.751628
test_precision        0.751628
test_recall           1.000000
test_F1-score         0.858205
test_recall -         0.000000
test_precision -      0.000000
dtype: float64

In [68]:
# Recompute the per-column means from the current `scores` dict.
per_fold_scores = pd.DataFrame(scores)
mean_scores = per_fold_scores.mean()
mean_scores

fit_time            122.383586
score_time           30.417684
test_accuracy         0.751628
test_precision        0.751628
test_recall           1.000000
test_F1-score         0.858205
test_recall -         0.000000
test_precision -      0.000000
dtype: float64

In [69]:
# Write the fold-averaged metrics of the current `clf` into its results row.
# Metrics are looked up by their `test_<metric>` key rather than sliced by
# position, so a stale or differently laid out `mean_scores` raises a KeyError
# instead of silently filling the wrong columns.
metric_keys = [f'test_{metric}' for metric in scores_df.columns]
scores_df.loc[type(clf).__name__] = mean_scores[metric_keys].to_numpy()
scores_df

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,0.635182,0.755443,0.760564,0.757538,0.255738,0.26358
LogisticRegression,0.702778,0.764414,0.874371,0.815481,0.183607,0.321612
SVC,0.751628,0.751628,1.0,0.858205,0.0,0.0
DecisionTreeClassifier,0.642535,0.763299,0.760652,0.761256,0.285246,0.282338
RandomForestClassifier,0.751628,0.751628,1.0,0.858205,0.0,0.0
KNeighborsClassifier,0.702014,0.751402,0.901475,0.819552,0.0983607,0.261227
XGBClassifier,0.580601,0.760491,0.641369,0.694419,0.396721,0.277471
