In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
X = np.load('drive/MyDrive/DATA/data_2/X.npy').astype(np.int8)
y = np.load('drive/MyDrive/DATA/data_2/y.npy').astype(np.int8)

In [None]:
y = np.where(y == 1, 0, 1)
np.unique(y)

array([0, 1])

In [None]:
scoring = {'accuracy': 'accuracy',
           'precision': 'precision',
           'recall': 'recall',
           'F1-score': 'f1',
           'recall -': make_scorer(recall_score, pos_label=0),
           'precision -': make_scorer(precision_score, pos_label=0, zero_division=0),
           }

scores_df = pd.DataFrame(index=['DummyClassifier', 2, 10, 50, 100, 1000, 'full'], columns=list(scoring.keys()))

In [None]:
Xs_pca = []
ns_components = [2, 10, 50, 100, 1000]
for n_components in ns_components:
    pca = PCA(n_components=n_components)
    Xs_pca.append(pca.fit_transform(X))

In [None]:
scores_pca = []

clf = LogisticRegression(penalty='l2',
                         C=0.01,
                         class_weight='balanced',
                         n_jobs=-1)

for X_pca in Xs_pca:
    scores = cross_validate(clf, X_pca, y, cv=5, scoring=scoring, n_jobs=2)
    mean_scores = pd.DataFrame(scores).apply(np.mean)
    scores_pca.append(mean_scores[2:])

In [None]:
scores_df.loc[[2, 10, 50, 100, 1000]] = scores_pca
scores_df

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,,,,,,
2,0.537438,0.775802,0.540564,0.636649,0.527869,0.275724
10,0.496768,0.734602,0.517926,0.60697,0.432787,0.228177
50,0.56435,0.77613,0.595893,0.672128,0.468852,0.272034
100,0.548824,0.75317,0.594736,0.664309,0.409836,0.250287
1000,0.683222,0.766569,0.832027,0.797706,0.232787,0.314855
full,,,,,,


In [None]:
clf = LogisticRegression(penalty='l2',
                         C=0.01,
                         class_weight='balanced',
                         n_jobs=-1)

scores = cross_validate(clf, X, y, cv=5, scoring=scoring, n_jobs=2)
mean_scores = pd.DataFrame(scores).apply(np.mean)



In [None]:
scores_df.loc['full'] = mean_scores[2:].values
scores_df.index.name = 'Number of components'
scores_df

Unnamed: 0_level_0,accuracy,precision,recall,F1-score,recall -,precision -
Number of components,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DummyClassifier,,,,,,
2,0.537438,0.775802,0.540564,0.636649,0.527869,0.275724
10,0.496768,0.734602,0.517926,0.60697,0.432787,0.228177
50,0.56435,0.77613,0.595893,0.672128,0.468852,0.272034
100,0.548824,0.75317,0.594736,0.664309,0.409836,0.250287
1000,0.683222,0.766569,0.832027,0.797706,0.232787,0.314855
full,0.75326,0.752859,1.0,0.859006,0.00655738,0.4


In [None]:
clf = DummyClassifier()
scores = cross_validate(clf, X, y, cv=5, scoring=scoring, n_jobs=-1)
mean_scores = pd.DataFrame(scores).apply(np.mean)

In [None]:
scores_df.loc['DummyClassifier'] = mean_scores.values[2:]
scores_df

Unnamed: 0_level_0,accuracy,precision,recall,F1-score,recall -,precision -
Number of components,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DummyClassifier,0.621377,0.755631,0.73245,0.743645,0.285246,0.265511
2,0.537438,0.775802,0.540564,0.636649,0.527869,0.275724
10,0.496768,0.734602,0.517926,0.60697,0.432787,0.228177
50,0.56435,0.77613,0.595893,0.672128,0.468852,0.272034
100,0.548824,0.75317,0.594736,0.664309,0.409836,0.250287
1000,0.683222,0.766569,0.832027,0.797706,0.232787,0.314855
full,0.75326,0.752859,1.0,0.859006,0.00655738,0.4


In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler



In [None]:
sampler = RandomUnderSampler(random_state=42)
X_res, y_res = sampler.fit_resample(X, y)



In [None]:
y_res.shape

(610,)

In [None]:
clf = LogisticRegression(penalty='l2',
                         C=0.01,
                         class_weight='balanced',
                         n_jobs=-1)

scores = cross_validate(clf, X_res, y_res, cv=5, scoring=scoring, n_jobs=2)
mean_scores = pd.DataFrame(scores).apply(np.mean)

In [None]:
mean_scores

fit_time            3.787232
score_time          0.042509
test_accuracy       0.526230
test_precision      0.520202
test_recall         0.590164
test_F1-score       0.551719
test_recall -       0.462295
test_precision -    0.536037
dtype: float64

In [None]:
scores_df

Unnamed: 0_level_0,accuracy,precision,recall,F1-score,recall -,precision -
Number of components,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DummyClassifier,0.621377,0.755631,0.73245,0.743645,0.285246,0.265511
2,0.75326,0.752859,1.0,0.859006,0.00655738,0.4
10,0.496768,0.734602,0.517926,0.60697,0.432787,0.228177
50,0.56435,0.77613,0.595893,0.672128,0.468852,0.272034
100,0.548824,0.75317,0.594736,0.664309,0.409836,0.250287
1000,0.683222,0.766569,0.832027,0.797706,0.232787,0.314855
full,0.75326,0.752859,1.0,0.859006,0.00655738,0.4


In [None]:
as_list = scores_df.index.tolist()
idx = as_list.index(2)
as_list[idx] = 'RandomUnderSampler'
scores_df.index = as_list

In [None]:
scores_df.loc[2] = mean_scores.values[2:]
scores_df

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,0.621377,0.755631,0.73245,0.743645,0.285246,0.265511
RandomUnderSampler,0.52623,0.520202,0.590164,0.551719,0.462295,0.536037
10,0.496768,0.734602,0.517926,0.60697,0.432787,0.228177
50,0.56435,0.77613,0.595893,0.672128,0.468852,0.272034
100,0.548824,0.75317,0.594736,0.664309,0.409836,0.250287
1000,0.683222,0.766569,0.832027,0.797706,0.232787,0.314855
full,0.75326,0.752859,1.0,0.859006,0.00655738,0.4
2,0.52623,0.520202,0.590164,0.551719,0.462295,0.536037


In [None]:
as_list = scores_df.index.tolist()
idx = as_list.index(10)
as_list[idx] = 'SMOTE'
scores_df.index = as_list

In [None]:
scores_df

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,0.621377,0.755631,0.73245,0.743645,0.285246,0.265511
RandomUnderSampler,0.52623,0.520202,0.590164,0.551719,0.462295,0.536037
SMOTE,0.496768,0.734602,0.517926,0.60697,0.432787,0.228177
50,0.56435,0.77613,0.595893,0.672128,0.468852,0.272034
100,0.548824,0.75317,0.594736,0.664309,0.409836,0.250287
1000,0.683222,0.766569,0.832027,0.797706,0.232787,0.314855
full,0.75326,0.752859,1.0,0.859006,0.00655738,0.4
2,0.52623,0.520202,0.590164,0.551719,0.462295,0.536037


In [None]:
scores_df.loc[50] = scores_df.loc['full']

In [None]:
as_list = scores_df.index.tolist()
idx = as_list.index(50)
as_list[idx] = 'LogisticRegression'
scores_df.index = as_list

In [None]:
test_index_0 = np.random.choice(np.argwhere(y == 0)[:, 0], size=100, replace=False)
test_index_1 = np.random.choice(np.argwhere(y == 1)[:, 0], size=100, replace=False)
test_index = np.append(test_index_0, test_index_1)
X_test, y_test = X[test_index], y[test_index]

In [None]:
mask = np.ones(len(X), bool)
mask[test_index] = 0
X_train, y_train = X[mask], y[mask]

In [None]:
X_train.shape, y_train.shape, np.unique(y_train, return_counts=True)    

((1028, 67925), (1028,), (array([0, 1]), array([205, 823])))

In [None]:
X_test.shape, y_test.shape, np.unique(y_test, return_counts=True)

((200, 67925), (200,), (array([0, 1]), array([100, 100])))

In [None]:
X_SMOTE = []

for k_neighbors in [1, 2, 3, 5, 10]:
    sampler = SMOTE(k_neighbors=k_neighbors, n_jobs=-1, random_state=42)
    X_train_res, y_train_res = sampler.fit_resample(X_train, y_train)
    X_SMOTE.append([X_train_res, y_train_res])



In [None]:
scores_SMOTE = []

clf = LogisticRegression(penalty='l2',
                         C=0.01,
                         n_jobs=-1)

for X_y in X_SMOTE:
    clf.fit(X_y[0], X_y[1])
    
    scores = {'accuracy': [],
              'precision': [],
              'recall': [],
              'F1-score': [],
              'recall -': [],
              'precision -': [],
              }

    scores['accuracy'].append(accuracy_score(y_test, clf.predict(X_test)))
    scores['precision'].append(precision_score(y_test, clf.predict(X_test)))
    scores['recall'].append(recall_score(y_test, clf.predict(X_test)))
    scores['F1-score'].append(f1_score(y_test, clf.predict(X_test)))
    scores['recall -'].append(recall_score(y_test, clf.predict(X_test), pos_label=0))
    scores['precision -'].append(precision_score(y_test, clf.predict(X_test), pos_label=0, zero_division=0))

    scores_SMOTE.append(scores)

In [None]:
scores_SMOTE

[{'F1-score': [0.6666666666666666],
  'accuracy': [0.5],
  'precision': [0.5],
  'precision -': [0.0],
  'recall': [1.0],
  'recall -': [0.0]},
 {'F1-score': [0.6666666666666666],
  'accuracy': [0.5],
  'precision': [0.5],
  'precision -': [0.0],
  'recall': [1.0],
  'recall -': [0.0]},
 {'F1-score': [0.6666666666666666],
  'accuracy': [0.5],
  'precision': [0.5],
  'precision -': [0.0],
  'recall': [1.0],
  'recall -': [0.0]},
 {'F1-score': [0.6666666666666666],
  'accuracy': [0.5],
  'precision': [0.5],
  'precision -': [0.0],
  'recall': [1.0],
  'recall -': [0.0]},
 {'F1-score': [0.6666666666666666],
  'accuracy': [0.5],
  'precision': [0.5],
  'precision -': [0.0],
  'recall': [1.0],
  'recall -': [0.0]}]

In [None]:
scores_df

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,0.621377,0.755631,0.73245,0.743645,0.285246,0.265511
RandomUnderSampler,0.52623,0.520202,0.590164,0.551719,0.462295,0.536037
SMOTE,0.496768,0.734602,0.517926,0.60697,0.432787,0.228177
LogisticRegression,0.75326,0.752859,1.0,0.859006,0.00655738,0.4
100,0.548824,0.75317,0.594736,0.664309,0.409836,0.250287
1000,0.683222,0.766569,0.832027,0.797706,0.232787,0.314855
LogisticRegression,0.75326,0.752859,1.0,0.859006,0.00655738,0.4
2,0.52623,0.520202,0.590164,0.551719,0.462295,0.536037


In [156]:
as_list = scores_df.index.tolist()
idx = as_list.index('SMOTE')
as_list[idx] = 'SMOTE k=1, 2, 3, 5, 10'
scores_df.index = as_list

In [157]:
scores_df

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,0.621377,0.755631,0.73245,0.743645,0.285246,0.265511
RandomUnderSampler,0.52623,0.520202,0.590164,0.551719,0.462295,0.536037
"SMOTE k=1, 2, 3, 5, 10",0.496768,0.734602,0.517926,0.60697,0.432787,0.228177
LogisticRegression,0.75326,0.752859,1.0,0.859006,0.00655738,0.4
100,0.548824,0.75317,0.594736,0.664309,0.409836,0.250287
1000,0.683222,0.766569,0.832027,0.797706,0.232787,0.314855
LogisticRegression,0.75326,0.752859,1.0,0.859006,0.00655738,0.4
2,0.52623,0.520202,0.590164,0.551719,0.462295,0.536037


In [162]:
scores_SMOTE_best = {k: v[0] for (k, v) in scores_SMOTE[0].items()}

In [163]:
scores_df.loc['SMOTE k=1, 2, 3, 5, 10'] = scores_SMOTE_best

In [164]:
scores_df.head(4)

Unnamed: 0,accuracy,precision,recall,F1-score,recall -,precision -
DummyClassifier,0.621377,0.755631,0.73245,0.743645,0.285246,0.265511
RandomUnderSampler,0.52623,0.520202,0.590164,0.551719,0.462295,0.536037
"SMOTE k=1, 2, 3, 5, 10",0.5,0.5,1.0,0.666667,0.0,0.0
LogisticRegression,0.75326,0.752859,1.0,0.859006,0.00655738,0.4
