In [1]:
import os
import random
from collections import Counter

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import confusion_matrix

pd.options.display.float_format = '{:.4f}'.format

In [2]:
from sklearn.linear_model import SGDClassifier, LogisticRegressionCV, LogisticRegression

In [3]:
def confusion_matrix_report(true, pred):
    [tn, fp, fn, tp] = confusion_matrix(true, pred).ravel()
    precision = tp/(fp + tp)
    recall = tp/(fn + tp)
    print(f"\t\tT\tF")
    print(f"\t1 [{tp:5d}, {fn:5d}]")
    print(f"\t0 [{fp:5d}, {tn:5d}]")
    print(f"accuracy : \t{ (tp+tn)/(tn + fp + fn + tp) :2.5f}")
    print(f"precision : \t{precision:2.5f}")
    print(f"recall : \t{recall:2.5f}")
    print(f"f1score : \t{(2*recall*precision)/(recall+precision):2.5f}")
    
    
def shuffle(x, y):
    idx = list(range(y_resample.shape[0]))
    random.shuffle(idx)
    return x[idx], y[idx]

In [4]:
dataset = pd.read_csv(os.path.join("/home/gruds/projects/chan/pico/dataset/credit_card","preprocessed.csv"), index_col = 0)

In [5]:
columns = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
           'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
           'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Time',
           'Class']
dataset = dataset[columns]

In [6]:
reduce_size = False
normalize = True

In [7]:
if reduce_size:
    columns = dataset.columns
    x = dataset.drop("Class",axis =1).values
    y = dataset.Class.values

    rus = RandomUnderSampler(random_state=42, sampling_strategy=0.01)
    x , y = rus.fit_resample(x, y)
    reduced = np.concatenate([x, y.reshape((-1,1))], axis =1)

    del x, y
    dataset = pd.DataFrame(data=reduced, columns=columns)

In [8]:
if normalize:
    is_fraud = dataset.Class.values.copy()

    scaler = StandardScaler() # RobustScaler # MinMaxScaler # StandardScaler

    transformed = dataset.drop("Class",axis =1)
    columns = transformed.columns
    transformed = scaler.fit_transform(transformed)

    del dataset
    dataset = pd.DataFrame(data=transformed, columns=columns)
    dataset = dataset.assign(Class = is_fraud)

In [9]:
x = dataset.drop('Class', axis =1)
y = dataset.Class

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# normal

In [11]:
Counter(y_train)

Counter({0: 227430, 1: 393})

In [12]:
logistic_regression = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True
                                        , intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100
                                        , multi_class='ovr', verbose=2, warm_start=False, n_jobs=6
                        )
logistic_regression.fit(x_train, y_train)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 out of   1 | elapsed:    9.4s finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=6, penalty='l2', random_state=None,
                   solver='lbfgs', tol=0.0001, verbose=2, warm_start=False)

In [13]:
y_pred = logistic_regression.predict(x_test)

In [14]:
confusion_matrix_report(y_pred, y_test)

		T	F
	1 [   66,    11]
	0 [   29, 56850]
accuracy : 	0.99930
precision : 	0.69474
recall : 	0.85714
f1score : 	0.76744


# random oversampling

In [15]:
from imblearn.over_sampling import RandomOverSampler

In [16]:
ros = RandomOverSampler(sampling_strategy=0.01, random_state=42)
x_resample, y_resample = ros.fit_resample(x_train, y_train)

In [17]:
Counter(y_resample)

Counter({0: 227430, 1: 2274})

In [18]:
logistic_regression = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True
                                        , intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100
                                        , multi_class='ovr', verbose=2, warm_start=False, n_jobs=6
                        )
logistic_regression.fit(x_resample, y_resample)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 out of   1 | elapsed:    7.7s finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=6, penalty='l2', random_state=None,
                   solver='lbfgs', tol=0.0001, verbose=2, warm_start=False)

In [19]:
y_pred = logistic_regression.predict(x_test)

In [20]:
confusion_matrix_report(y_pred, y_test)

		T	F
	1 [   79,    16]
	0 [   16, 56845]
accuracy : 	0.99944
precision : 	0.83158
recall : 	0.83158
f1score : 	0.83158


# variational oversampling

In [21]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [22]:
Counter(y_train)

Counter({0: 227430, 1: 393})

In [23]:
from vos import VariationalOversampler
variational_oversampler = VariationalOversampler(8)

tensorflow version 2.0.0


In [24]:
variational_oversampler.fit(x_train, y_train, 1)

Train on 314 samples, validate on 79 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/100

In [25]:
n_resample = int(Counter(y_train)[0] * 0.04 - Counter(y_train)[1])

In [26]:
x_gen = variational_oversampler.decoder.predict(np.random.normal(size = (n_resample, variational_oversampler.latent_dim)), verbose=1)



In [27]:
x_resample =  np.concatenate([x_train, x_gen], axis =0)
y_resample = np.concatenate([y_train, np.full((n_resample,) , 1)], axis =0)

In [28]:
x_resample, y_resample = shuffle(x_resample, y_resample)

In [29]:
logistic_regression = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True
                                        , intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100
                                        , multi_class='ovr', verbose=2, warm_start=False, n_jobs=6
                        )
logistic_regression.fit(x_resample, y_resample)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 out of   1 | elapsed:    6.2s finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=6, penalty='l2', random_state=None,
                   solver='lbfgs', tol=0.0001, verbose=2, warm_start=False)

In [30]:
y_pred = logistic_regression.predict(x_test)

In [31]:
confusion_matrix_report(y_pred, y_test)

		T	F
	1 [   80,    47]
	0 [   15, 56814]
accuracy : 	0.99891
precision : 	0.84211
recall : 	0.62992
f1score : 	0.72072
