In [10]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split

from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
# Read the stored feature matrix and label vector and cast both to int8.
# NOTE(review): the cast does not range-check; assumes all stored values fit
# in int8 — confirm against how the .npy files were produced.
X, y = (
    np.load('data/data_1/X.npy').astype(np.int8),
    np.load('data/data_1/y.npy').astype(np.int8),
)

In [3]:
# Display the dimensions of the feature matrix and of the label vector.
np.shape(X), np.shape(y)

((1323, 85142), (1323,))

In [4]:
# Class balance: how many labels equal 1, and what share of all labels that is.
is_positive = y == 1
n_positive = len(y[is_positive])
n_positive, n_positive / len(y)

(910, 0.6878306878306878)

In [5]:
# Row labels for the results table: the class name of every estimator compared
# in this notebook, in the order the rows should appear.
candidate_classes = (DummyClassifier, LogisticRegression, SVC,
                     DecisionTreeClassifier, RandomForestClassifier,
                     KNeighborsClassifier, XGBClassifier)
model_names = [estimator_cls.__name__ for estimator_cls in candidate_classes]

model_names

['DummyClassifier',
 'LogisticRegression',
 'SVC',
 'DecisionTreeClassifier',
 'RandomForestClassifier',
 'KNeighborsClassifier',
 'XGBClassifier']

In [6]:
# Metrics handed to cross_validate, keyed by the column name used in scores_df.
# String values are scorer names resolved by scikit-learn; the two entries whose
# key ends in "-" are custom scorers that treat label 0 as the positive class.
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'F1-score': 'f1',
    'recall -': make_scorer(recall_score, pos_label=0),
    'precision -': make_scorer(precision_score, pos_label=0, zero_division=0),
}

# Empty results table: one row per model name, one column per metric.
scores_df = pd.DataFrame(columns=list(scoring), index=model_names)

In [7]:
# Baseline: DummyClassifier with default settings, evaluated with cv=5 on all
# metrics in `scoring`, using every available core for the folds.
clf = DummyClassifier()
scores = cross_validate(estimator=clf, X=X, y=y, scoring=scoring, cv=5, n_jobs=-1)
# Average each per-fold column (timings and test metrics) into a single value.
mean_scores = pd.DataFrame(scores).mean()

In [8]:
# Store the averaged test metrics of the current model in its results-table row.
# The values are picked by label ('test_<metric>') instead of by position, so
# extra or reordered cross_validate keys (timings, train scores, ...) cannot
# shift a value into the wrong column; a missing metric raises a KeyError.
metric_keys = [f'test_{metric}' for metric in scores_df.columns]
scores_df.loc[type(clf).__name__] = mean_scores[metric_keys].to_numpy()
scores_df

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,0.687833,0.687833,1.0,0.815048,0.0,0.0
LogisticRegression,,,,,,
SVC,,,,,,
DecisionTreeClassifier,,,,,,
RandomForestClassifier,,,,,,
KNeighborsClassifier,,,,,,
XGBClassifier,,,,,,


In [10]:
# Logistic regression with default hyper-parameters, evaluated with cv=5.
# NOTE(review): the estimator asks for all cores (n_jobs=-1) while cross_validate
# also runs 6 workers; this may oversubscribe the CPU — confirm it is intended.
clf = LogisticRegression(n_jobs=-1)
scores = cross_validate(estimator=clf, X=X, y=y, scoring=scoring, cv=5, n_jobs=6)
# Average each per-fold column (timings and test metrics) into a single value.
mean_scores = pd.DataFrame(scores).mean()

In [11]:
# Store the averaged test metrics of the current model in its results-table row.
# The values are picked by label ('test_<metric>') instead of by position, so
# extra or reordered cross_validate keys (timings, train scores, ...) cannot
# shift a value into the wrong column; a missing metric raises a KeyError.
metric_keys = [f'test_{metric}' for metric in scores_df.columns]
scores_df.loc[type(clf).__name__] = mean_scores[metric_keys].to_numpy()
scores_df

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,0.687833,0.687833,1.0,0.815048,0.0,0.0
LogisticRegression,0.776907,0.769918,1.0,0.865461,0.284514,0.8
SVC,,,,,,
DecisionTreeClassifier,,,,,,
RandomForestClassifier,,,,,,
KNeighborsClassifier,,,,,,
XGBClassifier,,,,,,


In [12]:
# L1-penalised logistic regression fitted with the liblinear solver, evaluated
# with cv=5 on two parallel workers.
clf = LogisticRegression(penalty='l1', solver='liblinear')
scores = cross_validate(estimator=clf, X=X, y=y, scoring=scoring, cv=5, n_jobs=2)
# Average each per-fold column (timings and test metrics) into a single value.
mean_scores = pd.DataFrame(scores).mean()
mean_scores

fit_time            5.982448
score_time          0.091822
test_accuracy       0.874531
test_precision      0.851439
test_recall         1.000000
test_F1-score       0.918207
test_recall -       0.598090
test_precision -    1.000000
dtype: float64

In [13]:
# Store the averaged test metrics of the current model in its results-table row.
# Rows are keyed by class name, so this overwrites the 'LogisticRegression' row
# written for the default-solver run.
# The values are picked by label ('test_<metric>') instead of by position, so
# extra or reordered cross_validate keys (timings, train scores, ...) cannot
# shift a value into the wrong column; a missing metric raises a KeyError.
metric_keys = [f'test_{metric}' for metric in scores_df.columns]
scores_df.loc[type(clf).__name__] = mean_scores[metric_keys].to_numpy()
scores_df

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,0.687833,0.687833,1.0,0.815048,0.0,0.0
LogisticRegression,0.874531,0.851439,1.0,0.918207,0.59809,1.0
SVC,,,,,,
DecisionTreeClassifier,,,,,,
RandomForestClassifier,,,,,,
KNeighborsClassifier,,,,,,
XGBClassifier,,,,,,


In [16]:
# Support vector classifier with default settings, evaluated with cv=5 on all
# metrics in `scoring`, using every available core for the folds.
clf = SVC()
scores = cross_validate(estimator=clf, X=X, y=y, scoring=scoring, cv=5, n_jobs=-1)
# Average each per-fold column (timings and test metrics) into a single value.
mean_scores = pd.DataFrame(scores).mean()
mean_scores

[LibSVM]

In [None]:
# Decision tree evaluated with cv=5 on all metrics in `scoring`.
# random_state is fixed so that re-running the notebook reproduces the same
# scores; 42 matches the seed already used for StratifiedKFold in this notebook.
clf = DecisionTreeClassifier(random_state=42)
scores = cross_validate(clf, X, y, cv=5, scoring=scoring, n_jobs=-1)
# Average each per-fold column (timings and test metrics) into a single value.
mean_scores = pd.DataFrame(scores).mean()
mean_scores

fit_time            14.343377
score_time           0.066416
test_accuracy        0.961509
test_precision       0.962332
test_recall          0.990110
test_F1-score        0.974318
test_recall -        0.898795
test_precision -     0.976898
dtype: float64

In [None]:
# Store the averaged test metrics of the current model in its results-table row.
# The values are picked by label ('test_<metric>') instead of by position, so
# extra or reordered cross_validate keys (timings, train scores, ...) cannot
# shift a value into the wrong column; a missing metric raises a KeyError.
metric_keys = [f'test_{metric}' for metric in scores_df.columns]
scores_df.loc[type(clf).__name__] = mean_scores[metric_keys].to_numpy()
scores_df

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,,,,,,
LogisticRegression,,,,,,
SVC,,,,,,
DecisionTreeClassifier,0.961509,0.962332,0.99011,0.974318,0.898795,0.976898
RandomForestClassifier,,,,,,
KNeighborsClassifier,,,,,,


In [None]:
# Random forest evaluated with cv=5 on all metrics in `scoring`.
# random_state is fixed so that re-running the notebook reproduces the same
# scores; 42 matches the seed already used for StratifiedKFold in this notebook.
clf = RandomForestClassifier(random_state=42)
scores = cross_validate(clf, X, y, cv=5, scoring=scoring, n_jobs=-1)
# Average each per-fold column (timings and test metrics) into a single value.
mean_scores = pd.DataFrame(scores).mean()
mean_scores

fit_time            10.252733
score_time           0.081300
test_accuracy        0.968302
test_precision       0.962500
test_recall          1.000000
test_F1-score        0.979310
test_recall -        0.898795
test_precision -     1.000000
dtype: float64

In [None]:
# Store the averaged test metrics of the current model in its results-table row.
# The values are picked by label ('test_<metric>') instead of by position, so
# extra or reordered cross_validate keys (timings, train scores, ...) cannot
# shift a value into the wrong column; a missing metric raises a KeyError.
metric_keys = [f'test_{metric}' for metric in scores_df.columns]
scores_df.loc[type(clf).__name__] = mean_scores[metric_keys].to_numpy()
scores_df

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,,,,,,
LogisticRegression,,,,,,
SVC,,,,,,
DecisionTreeClassifier,0.961509,0.962332,0.99011,0.974318,0.898795,0.976898
RandomForestClassifier,0.968302,0.9625,1.0,0.97931,0.898795,1.0
KNeighborsClassifier,,,,,,


In [None]:
# k-nearest-neighbours classifier with default settings, evaluated with cv=5 on
# all metrics in `scoring`, using every available core for the folds.
clf = KNeighborsClassifier()
scores = cross_validate(estimator=clf, X=X, y=y, scoring=scoring, cv=5, n_jobs=-1)
# Average each per-fold column (timings and test metrics) into a single value.
mean_scores = pd.DataFrame(scores).mean()
mean_scores

fit_time            3.273046
score_time          4.863502
test_accuracy       0.792058
test_precision      0.810056
test_recall         0.937363
test_F1-score       0.864396
test_recall -       0.471319
test_precision -    0.745593
dtype: float64

In [None]:
# Store the averaged test metrics of the current model in its results-table row.
# The values are picked by label ('test_<metric>') instead of by position, so
# extra or reordered cross_validate keys (timings, train scores, ...) cannot
# shift a value into the wrong column; a missing metric raises a KeyError.
metric_keys = [f'test_{metric}' for metric in scores_df.columns]
scores_df.loc[type(clf).__name__] = mean_scores[metric_keys].to_numpy()
scores_df

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,,,,,,
LogisticRegression,,,,,,
SVC,,,,,,
DecisionTreeClassifier,0.961509,0.962332,0.99011,0.974318,0.898795,0.976898
RandomForestClassifier,0.968302,0.9625,1.0,0.97931,0.898795,1.0
KNeighborsClassifier,0.792058,0.810056,0.937363,0.864396,0.471319,0.745593


In [1]:
# XGBoost is evaluated with a hand-written stratified 5-fold loop: every fold
# fits with an eval_set and early stopping, then is scored on its validation part.
clf = XGBClassifier(n_estimators=5,
                    use_label_encoder=False,
                    scale_pos_weight=3,
                    learning_rate=0.1,
                    max_depth=2)

# NOTE(review): this splitter shuffles with a fixed seed, while the other cells
# pass cv=5 to cross_validate; the folds may therefore differ between models —
# confirm the scores are meant to be compared side by side.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold metric values; the keys mirror the columns of scores_df.
scores = {'accuracy': [],
          'precision': [],
          'recall': [],
          'F1-score': [],
          'recall -': [],
          'precision -': [],
          }

for train_index, val_index in skf.split(X, y):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # NOTE(review): the validation fold is used for early stopping and is also
    # the fold scored below, so the reported scores may be optimistic — consider
    # carving a separate early-stopping split out of the training fold.
    # NOTE(review): passing eval_metric / early_stopping_rounds to fit() (and
    # use_label_encoder to the constructor) looks deprecated in newer xgboost
    # releases — confirm against the installed version.
    clf.fit(X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            eval_metric='error',
            early_stopping_rounds=4,
            verbose=True)

    print('___')

    # Predict once per fold and reuse the result for every metric instead of
    # re-running inference for each score.
    y_pred = clf.predict(X_val)

    scores['accuracy'].append(accuracy_score(y_val, y_pred))
    scores['precision'].append(precision_score(y_val, y_pred))
    scores['recall'].append(recall_score(y_val, y_pred))
    scores['F1-score'].append(f1_score(y_val, y_pred))
    # Same metrics with label 0 treated as the positive class.
    scores['recall -'].append(recall_score(y_val, y_pred, pos_label=0))
    scores['precision -'].append(precision_score(y_val, y_pred, pos_label=0, zero_division=0))

NameError: name 'XGBClassifier' is not defined

In [48]:
# Collapse the per-fold metric lists into one mean value per metric.
mean_scores = pd.DataFrame(scores).mean()
mean_scores

accuracy       0.889651
precision      0.864698
recall         0.995604
F1-score       0.925481
recall -       0.656244
precision -    0.986057
dtype: float64

In [16]:
# Write the averaged fold metrics of the current model into its results-table
# row. Values are aligned by metric name rather than by position, so the write
# does not depend on the key order of the hand-built `scores` dict; a missing
# metric (e.g. a stale mean_scores from another cell) raises a KeyError.
scores_df.loc[type(clf).__name__] = mean_scores[list(scores_df.columns)].to_numpy()
scores_df

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,0.687833,0.687833,1.0,0.815048,0.0,0.0
LogisticRegression,,,,,,
SVC,,,,,,
DecisionTreeClassifier,,,,,,
RandomForestClassifier,,,,,,
KNeighborsClassifier,,,,,,
XGBClassifier,0.996978,1.0,0.995604,0.997787,1.0,0.990614
