# Credit Card Fraud Detection

## About the data

The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Université Libre de Bruxelles) on big data mining and fraud detection. 

The datasets contains transactions made by credit cards in September 2013 by european cardholders. This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions.

All the features are transformed using PCA due to confidentiality issues. The only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction Amount, this feature can be used for example-dependant cost-senstive learning. Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.

## Importing necessary libraries

In [13]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [17]:
dataset = pd.read_csv('creditcard.csv')
dataset.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [18]:
# Create a new feature for normal (non-fraudulent) transactions.
dataset.loc[dataset.Class == 0, 'Normal'] = 1
dataset.loc[dataset.Class == 1, 'Normal'] = 0

# Rename 'Class' to 'Fraud'.
dataset = dataset.rename(columns={'Class': 'Fraud'})

In [21]:
X = dataset.iloc[:,:-2].values
y = dataset.iloc[:,-2:].values

In [22]:
#data cleaning process
from sklearn.preprocessing import Imputer
imputer=Imputer(missing_values="NaN",strategy="mean",axis=0)
imputer=imputer.fit(X[:,1:3])
X[:,1:3]=imputer.transform(X[:,1:3])

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [24]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [25]:
# Number of input nodes.
input_nodes = 30

# Multiplier maintains a fixed ratio of nodes between each layer.
mulitplier = 1.5 

# Number of nodes in each hidden layer
hidden_nodes1 = 18
hidden_nodes2 = round(hidden_nodes1 * mulitplier)
hidden_nodes3 = round(hidden_nodes2 * mulitplier)

# Percent of nodes to keep during dropout.
pkeep = tf.placeholder(tf.float32)

# input
x = tf.placeholder(tf.float32, [None, input_nodes])

# layer 1
W1 = tf.Variable(tf.truncated_normal([input_nodes, hidden_nodes1], stddev = 0.15))
b1 = tf.Variable(tf.zeros([hidden_nodes1]))
y1 = tf.nn.sigmoid(tf.matmul(x, W1) + b1)

# layer 2
W2 = tf.Variable(tf.truncated_normal([hidden_nodes1, hidden_nodes2], stddev = 0.15))
b2 = tf.Variable(tf.zeros([hidden_nodes2]))
y2 = tf.nn.sigmoid(tf.matmul(y1, W2) + b2)

# layer 3
W3 = tf.Variable(tf.truncated_normal([hidden_nodes2, hidden_nodes3], stddev = 0.15)) 
b3 = tf.Variable(tf.zeros([hidden_nodes3]))
y3 = tf.nn.sigmoid(tf.matmul(y2, W3) + b3)
y3 = tf.nn.dropout(y3, pkeep)

# layer 4
W4 = tf.Variable(tf.truncated_normal([hidden_nodes3, 2], stddev = 0.15)) 
b4 = tf.Variable(tf.zeros([2]))
y4 = tf.nn.softmax(tf.matmul(y3, W4) + b4)

# output
y = y4
y_ = tf.placeholder(tf.float32, [None, 2])

In [26]:
# Parameters
training_epochs = 2000
training_dropout = 0.9
display_step = 10
n_samples = Y_train.shape[0]
batch_size = 2048
learning_rate = 0.005

In [27]:
# Cost function: Cross Entropy
cost = -tf.reduce_sum(y_ * tf.log(y))

# We will optimize our model via AdamOptimizer
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

# Correct prediction if the most likely value (Fraud or Normal) from softmax equals the target value.
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [38]:
accuracy_summary = [] # Record accuracy values for plot
cost_summary = [] # Record cost values for plot
valid_accuracy_summary = [] 
valid_cost_summary = [] 
stop_early = 0 # To keep track of the number of epochs before early stopping

# Save the best weights so that they can be used to make the final predictions
checkpoint = "/Users/setCodesToFire/Documents/MyGithub/fraud-detection/best_model.ckpt"
saver = tf.train.Saver(max_to_keep=1)

# Initialize variables and tensorflow session
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(training_epochs): 
        for batch in range(int(n_samples/batch_size)):
            batch_x = X_train[batch*batch_size : (1+batch)*batch_size]
            batch_y = Y_train[batch*batch_size : (1+batch)*batch_size]

            sess.run([optimizer], feed_dict={x: batch_x, 
                                             y_: batch_y,
                                             pkeep: training_dropout})

        # Display logs after every 10 epochs
        if (epoch) % display_step == 0:
            train_accuracy, newCost = sess.run([accuracy, cost], feed_dict={x: X_train, 
                                                                            y_: Y_train,
                                                                            pkeep: training_dropout})


            print ("Epoch:", epoch,
                   "Acc =", "{:.5f}".format(train_accuracy), 
                   "Cost =", "{:.5f}".format(newCost))
            
            # Save the weights if these conditions are met.
            # if epoch > 0 and train_accuracy > 0.999:

            
            # Record the results of the model
            accuracy_summary.append(train_accuracy)
            cost_summary.append(newCost)
            
            # If the model does not improve after 15 logs, stop the training.
            if epoch > 100:
                stop_early += 1
                if stop_early == 15:
                    saver.save(sess, checkpoint)
                    break
            else:
                stop_early = 0
            
    print()
    print("Optimization Finished!")
    print()

'''with tf.Session() as sess:
    # Load the best weights and show its results
    saver.restore(sess, checkpoint)
    training_accuracy = sess.run(accuracy, feed_dict={x: inputX, y_: inputY, pkeep: training_dropout})
    validation_accuracy = sess.run(accuracy, feed_dict={x: inputX_valid, y_: inputY_valid, pkeep: 1})
    
    print("Results using the best Valid_Acc:")
    print()
    print("Training Accuracy =", training_accuracy)
    print("Validation Accuracy =", validation_accuracy)'''

Epoch: 0 Acc = 0.99827 Cost = 3204.76514
Epoch: 10 Acc = 0.99827 Cost = 1075.40198
Epoch: 20 Acc = 0.99953 Cost = 584.17822
Epoch: 30 Acc = 0.99961 Cost = 497.22852
Epoch: 40 Acc = 0.99964 Cost = 441.80002
Epoch: 50 Acc = 0.99963 Cost = 420.84180
Epoch: 60 Acc = 0.99965 Cost = 383.07428
Epoch: 70 Acc = 0.99967 Cost = 365.00525
Epoch: 80 Acc = 0.99968 Cost = 328.94931
Epoch: 90 Acc = 0.99973 Cost = 313.24545
Epoch: 100 Acc = 0.99972 Cost = 284.08777
Epoch: 110 Acc = 0.99972 Cost = 256.72412
Epoch: 120 Acc = 0.99975 Cost = 220.12538
Epoch: 130 Acc = 0.99977 Cost = 197.62003
Epoch: 140 Acc = 0.99977 Cost = 187.38641
Epoch: 150 Acc = 0.99972 Cost = 201.80078
Epoch: 160 Acc = 0.99979 Cost = 154.98798
Epoch: 170 Acc = 0.99980 Cost = 140.41112
Epoch: 180 Acc = 0.99976 Cost = 151.60507
Epoch: 190 Acc = 0.99980 Cost = 118.44639
Epoch: 200 Acc = 0.99980 Cost = 117.31044
Epoch: 210 Acc = 0.99979 Cost = 120.67056
Epoch: 220 Acc = 0.99980 Cost = 103.00670
Epoch: 230 Acc = 0.99980 Cost = 106.26556
E

'with tf.Session() as sess:\n    # Load the best weights and show its results\n    saver.restore(sess, checkpoint)\n    training_accuracy = sess.run(accuracy, feed_dict={x: inputX, y_: inputY, pkeep: training_dropout})\n    validation_accuracy = sess.run(accuracy, feed_dict={x: inputX_valid, y_: inputY_valid, pkeep: 1})\n    \n    print("Results using the best Valid_Acc:")\n    print()\n    print("Training Accuracy =", training_accuracy)\n    print("Validation Accuracy =", validation_accuracy)'