In [108]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time

In [50]:
df = pd.read_csv('dados/creditcard.csv')

In [51]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


Os passos para preparação dos dados são:

* "Embaralhar os dados"
* Tranformar as classes na notação one-hot
* Normalizar os dados
* Dividir os dados X/y 
* Converter o DataFrame em arrays numpy (foat32)
* Separar os dados de treino e teste


In [52]:
dados_embaralhados = df.sample(frac=1)

In [53]:
dados_embaralhados.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
272700,165223.0,1.913324,-0.250188,-3.129403,0.299169,2.881294,3.321391,0.081487,0.658414,0.038398,...,0.076838,0.194915,-0.020039,0.710326,0.489731,-0.477923,-0.009395,-0.058022,66.05,0
213316,139203.0,1.840882,-0.543399,-1.271718,0.443064,0.065373,-0.186962,0.010568,-0.160279,0.708365,...,0.002228,-0.121989,0.077246,0.272048,-0.13827,0.209529,-0.065061,-0.029481,127.9,0
251942,155577.0,2.045388,0.003926,-1.19856,1.596251,0.50037,0.213221,-0.096227,-0.0753,-0.828483,...,0.195569,0.649778,-0.055347,-0.876479,-0.13911,2.526769,-0.205892,-0.104809,12.74,0
116532,74319.0,1.344846,-0.418145,0.204762,-0.653515,-0.965454,-1.090996,-0.253617,-0.14554,-1.246309,...,-0.749367,-1.837375,0.31131,0.473401,-0.102422,0.596985,-0.085735,-0.001009,11.98,0
209087,137409.0,-0.809497,0.897605,-0.113028,-1.107324,1.153097,0.009334,0.813692,0.082092,0.228997,...,0.092786,0.171141,-0.388818,0.007484,0.141047,0.487137,-0.361147,0.146626,8.43,0


In [54]:
# notação one-hot
# 0 -> [1, 0]
# 1 -> [0, 1]
one_hot_data = pd.get_dummies(dados_embaralhados, columns=['Class'])

In [55]:
one_hot_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V22,V23,V24,V25,V26,V27,V28,Amount,Class_0,Class_1
272700,165223.0,1.913324,-0.250188,-3.129403,0.299169,2.881294,3.321391,0.081487,0.658414,0.038398,...,0.194915,-0.020039,0.710326,0.489731,-0.477923,-0.009395,-0.058022,66.05,1,0
213316,139203.0,1.840882,-0.543399,-1.271718,0.443064,0.065373,-0.186962,0.010568,-0.160279,0.708365,...,-0.121989,0.077246,0.272048,-0.13827,0.209529,-0.065061,-0.029481,127.9,1,0
251942,155577.0,2.045388,0.003926,-1.19856,1.596251,0.50037,0.213221,-0.096227,-0.0753,-0.828483,...,0.649778,-0.055347,-0.876479,-0.13911,2.526769,-0.205892,-0.104809,12.74,1,0
116532,74319.0,1.344846,-0.418145,0.204762,-0.653515,-0.965454,-1.090996,-0.253617,-0.14554,-1.246309,...,-1.837375,0.31131,0.473401,-0.102422,0.596985,-0.085735,-0.001009,11.98,1,0
209087,137409.0,-0.809497,0.897605,-0.113028,-1.107324,1.153097,0.009334,0.813692,0.082092,0.228997,...,0.171141,-0.388818,0.007484,0.141047,0.487137,-0.361147,0.146626,8.43,1,0


In [56]:
# normalizando os dados
# todos os dados estarão entre 0 e 1
dados_normalizados = (one_hot_data - one_hot_data.min()) / (one_hot_data.max() - one_hot_data.min())

In [57]:
dados_normalizados.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V22,V23,V24,V25,V26,V27,V28,Amount,Class_0,Class_1
272700,0.956196,0.990799,0.764619,0.783186,0.265192,0.785113,0.296413,0.265852,0.792448,0.464103,...,0.519124,0.665136,0.47795,0.605396,0.347381,0.416338,0.311946,0.002571,1.0,0.0
213316,0.80561,0.989568,0.761525,0.815377,0.271571,0.766156,0.26114,0.26542,0.783666,0.487182,...,0.50434,0.666581,0.418892,0.570145,0.459674,0.41531,0.312526,0.004978,1.0,0.0
251942,0.900372,0.993042,0.7673,0.816644,0.322691,0.769085,0.265164,0.264769,0.784578,0.43424,...,0.540343,0.664612,0.264129,0.570098,0.838191,0.412711,0.310997,0.000496,1.0,0.0
116532,0.430107,0.981141,0.762846,0.840962,0.22296,0.759217,0.252051,0.26381,0.783824,0.419847,...,0.424317,0.670057,0.446025,0.572157,0.522965,0.414928,0.313103,0.000466,1.0,0.0
209087,0.795228,0.944541,0.776729,0.835455,0.202843,0.773479,0.263114,0.270313,0.786266,0.470668,...,0.518015,0.659659,0.383243,0.585824,0.505021,0.409845,0.316099,0.000328,1.0,0.0


In [58]:
df_X = dados_normalizados.drop(['Class_0', 'Class_1'], axis=1)

In [59]:
df_X.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
272700,0.956196,0.990799,0.764619,0.783186,0.265192,0.785113,0.296413,0.265852,0.792448,0.464103,...,0.579213,0.562718,0.519124,0.665136,0.47795,0.605396,0.347381,0.416338,0.311946,0.002571
213316,0.80561,0.989568,0.761525,0.815377,0.271571,0.766156,0.26114,0.26542,0.783666,0.487182,...,0.581398,0.561515,0.50434,0.666581,0.418892,0.570145,0.459674,0.41531,0.312526,0.004978
251942,0.900372,0.993042,0.7673,0.816644,0.322691,0.769085,0.265164,0.264769,0.784578,0.43424,...,0.579475,0.564632,0.540343,0.664612,0.264129,0.570098,0.838191,0.412711,0.310997,0.000496
116532,0.430107,0.981141,0.762846,0.840962,0.22296,0.759217,0.252051,0.26381,0.783824,0.419847,...,0.574623,0.549399,0.424317,0.670057,0.446025,0.572157,0.522965,0.414928,0.313103,0.000466
209087,0.795228,0.944541,0.776729,0.835455,0.202843,0.773479,0.263114,0.270313,0.786266,0.470668,...,0.577016,0.562975,0.518015,0.659659,0.383243,0.585824,0.505021,0.409845,0.316099,0.000328


In [60]:
df_y = dados_normalizados[['Class_0', 'Class_1']]

In [61]:
df_y.head()

Unnamed: 0,Class_0,Class_1
272700,1.0,0.0
213316,1.0,0.0
251942,1.0,0.0
116532,1.0,0.0
209087,1.0,0.0


In [62]:
# tranformar em arrays
ar_X = np.asarray(df_X.values, 'float32')

In [63]:
ar_X

array([[9.5619589e-01, 9.9079877e-01, 7.6461852e-01, ..., 4.1633755e-01,
        3.1194642e-01, 2.5709232e-03],
       [8.0561024e-01, 9.8956805e-01, 7.6152468e-01, ..., 4.1531008e-01,
        3.1252560e-01, 4.9783662e-03],
       [9.0037155e-01, 9.9304241e-01, 7.6729977e-01, ..., 4.1271064e-01,
        3.1099698e-01, 4.9589039e-04],
       ...,
       [8.7400460e-01, 9.9016303e-01, 7.6490492e-01, ..., 4.1729501e-01,
        3.1244132e-01, 1.3977571e-03],
       [3.5229060e-01, 9.7965604e-01, 7.6423788e-01, ..., 4.1559029e-01,
        3.1309152e-01, 1.0832520e-03],
       [2.3535234e-01, 9.8131490e-01, 7.5961369e-01, ..., 4.1764495e-01,
        3.1381691e-01, 3.8884970e-04]], dtype=float32)

In [64]:
ar_y = np.asarray(df_y.values, 'float32')

In [65]:
ar_y

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [66]:
tamanho_treino = int(0.8*len(ar_X))
tamanho_treino

227845

In [67]:
# dados de treino
(X_treino_cru, y_treino_cru) = (ar_X[:tamanho_treino], ar_y[:tamanho_treino])

In [68]:
# dados de teste
(X_teste_cru, y_teste_cru) = (ar_X[tamanho_treino:], ar_y[tamanho_treino:])

In [69]:
print('treino:', len(X_treino_cru), "/", 'teste:', len(X_teste_cru))

treino: 227845 / teste: 56962


In [70]:
count_legit, count_fraud = np.unique(df['Class'], return_counts=True)[1]

In [71]:
print('não fraude:', count_legit, '/', 'fraude:', count_fraud)

não fraude: 284315 / fraude: 492


In [72]:
razao_fraude = float(count_fraud / (count_fraud + count_legit))
print('Porcentagem de transações fraudulentas ', round(razao_fraude*100, 2), '%')

Porcentagem de transações fraudulentas  0.17 %


In [73]:
# é uma dataset bem desbalanceado 

In [74]:
peso = 1/razao_fraude
peso

578.8760162601626

In [75]:
# técnica para minimizar o viés
# dos dados desbalanceados 
y_treino_cru[:, 1] = y_treino_cru[:, 1] * peso

In [76]:
dimensao_dos_inputs = ar_X.shape[1]
dimensao_dos_outputs = ar_y.shape[1]

In [90]:
num_camadas1_cell = 100
num_camadas2_cell = 150

In [91]:
X_train_node = tf.placeholder(tf.float32, [None, dimensao_dos_inputs], name='X_train')
y_train_node =  tf.placeholder(tf.float32, [None, dimensao_dos_outputs], name='y_train')

In [92]:
X_teste_node = tf.constant(X_teste_cru, name='X_teste')
y_teste_node = tf.constant(y_teste_cru, name='y_teste')

In [93]:
pesos_node1 = tf.Variable(tf.zeros([dimensao_dos_inputs, num_camadas1_cell]), name='peso1')
vies_node1 = tf.Variable(tf.zeros([num_camadas1_cell]), name='vies_1')

In [100]:
pesos_node2 = tf.Variable(tf.zeros([num_camadas1_cell, num_camadas2_cell]), name='peso2')
vies_node2 = tf.Variable(tf.zeros([num_camadas2_cell]), name='vies_2')

In [101]:
pesos_node3 = tf.Variable(tf.zeros([num_camadas2_cell, dimensao_dos_outputs]), name='peso3')
vies_node3 = tf.Variable(tf.zeros([dimensao_dos_outputs]), name='vies_3')

In [102]:
def network(input_tensor):
    layer1 = tf.nn.sigmoid(tf.matmul(input_tensor, pesos_node1) + vies_node1)
    layer2 = tf.nn.dropout(tf.nn.sigmoid(tf.matmul(layer1, pesos_node2) + vies_node2), 0.85)
    layer3 = tf.nn.softmax(tf.matmul(layer2, pesos_node3) + vies_node3)
    return layer3

In [103]:
y_train_previsao = network(X_train_node)

In [104]:
y_test_previsao = network(X_teste_node)

In [105]:
cross_entropy = tf.losses.softmax_cross_entropy(y_train_node, y_train_previsao)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



In [106]:
optimizer = tf.train.AdamOptimizer(0.005).minimize(cross_entropy)

In [113]:
def calculate_accuracy(actual, predicted):
    actual = np.argmax(actual, 1)
    predicted = np.argmax(predicted, 1)
    return (np.sum(np.equal(predicted, actual)) / predicted.shape[0])*100

In [109]:
num_epochs = 100

In [114]:
with tf.Session() as session:
    tf.global_variables_initializer().run()
    for epoch in range(num_epochs):
        start_time = time.time()
        _, cross_entropy_score = session.run([optimizer, cross_entropy],
                                            feed_dict={X_train_node: X_treino_cru,
                                                      y_train_node: y_treino_cru})
        if epoch % 10 == 0:
            timer = time.time() - start_time
            print('Epoch: {}'.format(epoch), 
                  'Corrent loss: {0:.4f}'.format(cross_entropy_score), 
                  'Elapsed time: {0:.2f} seconds'.format(timer))
            
            final_y_test = y_teste_node.eval()
            final_y_test_prediction = y_test_previsao.eval()
            final_accuracy = calculate_accuracy(final_y_test, final_y_test_prediction)
            print('Current accuracy: {0:.2f}%'.format(final_accuracy))
            
    final_y_test = y_teste_node.eval()
    final_y_test_prediction = y_test_previsao.eval()
    final_accuracy = calculate_accuracy(final_y_test, final_y_test_prediction)
    print('Final accuracy: {0:.2f}%'.format(final_accuracy))

Epoch: 0 Corrent loss: 1.4018 Elapsed time: 1.05 seconds
Current accuracy: 0.16%
Epoch: 10 Corrent loss: 1.4022 Elapsed time: 0.99 seconds
Current accuracy: 0.16%
Epoch: 20 Corrent loss: 1.3849 Elapsed time: 0.99 seconds
Current accuracy: 0.16%
Epoch: 30 Corrent loss: 1.3018 Elapsed time: 0.99 seconds
Current accuracy: 61.22%
Epoch: 40 Corrent loss: 1.1394 Elapsed time: 1.01 seconds
Current accuracy: 94.73%
Epoch: 50 Corrent loss: 1.0045 Elapsed time: 1.17 seconds
Current accuracy: 97.50%
Epoch: 60 Corrent loss: 0.9175 Elapsed time: 1.03 seconds
Current accuracy: 99.19%
Epoch: 70 Corrent loss: 0.8758 Elapsed time: 0.99 seconds
Current accuracy: 99.09%
Epoch: 80 Corrent loss: 0.8569 Elapsed time: 1.00 seconds
Current accuracy: 99.36%
Epoch: 90 Corrent loss: 0.8478 Elapsed time: 1.00 seconds
Current accuracy: 99.38%
Final accuracy: 99.40%


In [115]:
final_fraud_y_test = final_y_test[final_y_test[:,1] == 1]
final_y_test_prediction = final_y_test_prediction[final_y_test[:, 1] == 1]
final_fraud_accuracy = calculate_accuracy(final_fraud_y_test, final_y_test_prediction)
print('Final fraud specific accuracy: {0:.2f}'.format(final_fraud_accuracy))

Final fraud specific accuracy: 77.53
