In [61]:
import pandas as pd
from preprocessing.preprocessing import pre_process

data_train_X = pd.read_csv('./data_fraud/X_train.csv')
data_train_Y = pd.read_csv('./data_fraud/Y_train.csv')

data_train_X, data_train_Y = pre_process(data_train_X, data_train_Y)

train_X = data_train_X[:33333]
train_Y = data_train_Y[:33333]

val_X = data_train_X[33333:66666]
val_Y = data_train_Y[33333:66666]

test_X = data_train_X[66666:]
test_Y = data_train_Y[66666:]

904
0.966294298408


In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, \
    f1_score

forest = RandomForestClassifier(n_jobs=-1,
                                # class_weight={0: 1000, 1: 1},
                                n_estimators=100,
                                max_features=None)
forest.fit(train_X, train_Y)
predictions = forest.predict(test_X)
cm = confusion_matrix(test_Y, predictions)
print(cm)
print("Precision: ", precision_score(test_Y, predictions))
print("Recall: ", recall_score(test_Y, predictions))
print("F1-Score: ", f1_score(test_Y, predictions))

  


[[32420   111]
 [  521   282]]
Precision:  0.717557251908
Recall:  0.351183063512
F1-Score:  0.471571906355


In [62]:
# Re-Sampling
from imblearn.over_sampling import SMOTE

sm = SMOTE(ratio=1.0, random_state=42)
X_res, Y_res = sm.fit_sample(pd.concat([train_X, val_X], axis=0),
                             pd.concat([train_Y, val_Y], axis=0))

  y = column_or_1d(y, warn=True)


In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, \
    f1_score

forest = RandomForestClassifier(n_jobs=-1,
                                n_estimators=30,
                                max_features=None)
forest.fit(X_res, Y_res)
predictions = forest.predict(test_X)
cm = confusion_matrix(test_Y, predictions)
print(cm)
print("Precision: ", precision_score(test_Y, predictions))
print("Recall: ", recall_score(test_Y, predictions))
print("F1-Score: ", f1_score(test_Y, predictions))

[[32252   279]
 [  473   330]]
Precision:  0.541871921182
Recall:  0.41095890411
F1-Score:  0.467422096317


In [64]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=1e5)
h = .02  # step size in the mesh
lr.fit(X_res, Y_res)
predictions = lr.predict(test_X)
cm = confusion_matrix(test_Y, predictions)
print(cm)
print("Precision: ", precision_score(test_Y, predictions))
print("Recall: ", recall_score(test_Y, predictions))
print("F1-Score: ", f1_score(test_Y, predictions))

[[24445  8086]
 [  250   553]]
Precision:  0.0640120384304
Recall:  0.688667496887
F1-Score:  0.117136199958


In [65]:
from sklearn.naive_bayes import GaussianNB
naive_bayes_clf = GaussianNB()
naive_bayes_clf.fit(X_res, Y_res)

predictions = naive_bayes_clf.predict(test_X)
cm = confusion_matrix(test_Y, predictions)
print(cm)
print("Precision: ", precision_score(test_Y, predictions))
print("Recall: ", recall_score(test_Y, predictions))
print("F1-Score: ", f1_score(test_Y, predictions))

[[28198  4333]
 [  440   363]]
Precision:  0.0772998296422
Recall:  0.452054794521
F1-Score:  0.132024004364


In [66]:
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_res, Y_res)

predictions = dt_clf.predict(test_X)
cm = confusion_matrix(test_Y, predictions)
print(cm)
print("Precision: ", precision_score(test_Y, predictions))
print("Recall: ", recall_score(test_Y, predictions))
print("F1-Score: ", f1_score(test_Y, predictions))

[[31789   742]
 [  490   313]]
Precision:  0.296682464455
Recall:  0.389788293898
F1-Score:  0.336921420883


In [68]:
from sklearn.neural_network import MLPClassifier

mlp_clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=(5, 2), random_state=1)

mlp_clf.fit(X_res, Y_res)

predictions = dt_clf.predict(test_X)
cm = confusion_matrix(test_Y, predictions)
print(cm)
print("Precision: ", precision_score(test_Y, predictions))
print("Recall: ", recall_score(test_Y, predictions))
print("F1-Score: ", f1_score(test_Y, predictions))

[[31789   742]
 [  490   313]]
Precision:  0.296682464455
Recall:  0.389788293898
F1-Score:  0.336921420883


In [69]:
import numpy as np
from sklearn.ensemble import VotingClassifier

eclf3 = VotingClassifier(estimators=[
    ('nn', mlp_clf), ('dt', dt_clf), ('rf', forest)],
    voting='soft', weights=[1, 1, 2])
eclf3 = eclf3.fit(X_res, Y_res)
predictions = eclf3.predict(test_X)

cm = confusion_matrix(test_Y, predictions)
print(cm)
print("Precision: ", precision_score(test_Y, predictions))
print("Recall: ", recall_score(test_Y, predictions))
print("F1-Score: ", f1_score(test_Y, predictions))

[[32093   438]
 [  473   330]]
Precision:  0.4296875
Recall:  0.41095890411
F1-Score:  0.420114576703
