In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import (ADASYN, SMOTE, BorderlineSMOTE, SVMSMOTE, SMOTENC, KMeansSMOTE, RandomOverSampler)
from imblearn.combine import (SMOTETomek, SMOTEENN)
from imblearn.under_sampling import (CondensedNearestNeighbour, TomekLinks, RandomUnderSampler)
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline


df = pd.read_csv('creditcard.csv')

In [2]:
#No null values, all of them are numeric
#df.info()

In [3]:
#It seems reasonable to drop time variable
X = df.drop(['Class', 'Time'], axis = 1)
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [4]:
#Notice that it will take time because of high number of records in df
#resapmling data, OS-oversapmling, US-undersampling, CS-combined sampling
samp = {'Rand_OS': RandomOverSampler(sampling_strategy = 0.4).fit_resample(X_train, y_train),
             'ADASYN_OS': ADASYN(sampling_strategy = 0.4, n_jobs = -1).fit_resample(X_train, y_train),
             'SMOTE_OS': SMOTE(sampling_strategy = 0.4, n_jobs = -1).fit_resample(X_train, y_train),
             'Random_US': RandomUnderSampler(sampling_strategy = 0.4).fit_resample(X_train, y_train),
             'SMOTEENN_CS' : SMOTEENN(sampling_strategy = 0.4, n_jobs = -1).fit_resample(X_train, y_train),
             'SMOTETomek_CS': SMOTETomek(sampling_strategy = 0.4, n_jobs = -1).fit_resample(X_train, y_train)}

In [9]:
#Storage of f1 scores for different sampling algorithms and models
f_score = {}

In [10]:
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('logistic', LogisticRegression(max_iter = 10000))])
param_grid = {'logistic__C': [0.001, 0.01, 0.1, 1, 10]}


for i in samp:
    search = GridSearchCV(pipe, param_grid, n_jobs=-1)
    search.fit(samp[i][0], samp[i][1])
    y_pred = search.predict(X_test)

    f_score['LogisticRegression %s' %i] = round(f1_score(y_test, y_pred,4))
print(f_score)



{'LogisticRegression Rand_OS': 0.0, 'LogisticRegression ADASYN_OS': 0.0, 'LogisticRegression SMOTE_OS': 0.0, 'LogisticRegression Random_US': 0.0, 'LogisticRegression SMOTEENN_CS': 0.0, 'LogisticRegression SMOTETomek_CS': 0.0}




In [102]:
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('dTree', DecisionTreeClassifier())])

param_grid = {'dTree__min_samples_split': [1000, 2000],
             'dTree__max_depth': [5, 10]}

search = GridSearchCV(pipe, param_grid, n_jobs=-1)

for i in samp:
    search.fit(samp[i][0], samp[i][1])
    y_pred = search.predict(X_test)

    f_score['dTree %s' %i] = round(f1_score(y_test, y_pred,4))



In [None]:
# standaryzacja
# n_neigh 3 5 10 15
# weight 
# zaimplementowac wlasny klasyfikator

for i in samp:
    pipe = Pipeline(steps=[('scaler', StandardScaler()), ('neigh', KNeighborsClassifier(n_neighbors=3, n_jobs=-1))])
    pipe.fit(samp[i][0], samp[i][1])
    y_pred = neigh.predict(X_test)

    f_score['KNeighbors %s' %i] = round(f1_score(y_test, y_pred,4))

In [None]:
print(f_score['KNeighbors'])

In [None]:
# standaryzacja
# weight ‘distance’
# stacking lub esambling

svc = SVC()
svc.fit(samp['Rand_OS'][0], over_samp['Rand_OS'][1])
y_pred = linear_svc.predict(X_test)

f_score['SVC'] = f1_score(y_test, y_pred)