In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


# DataSet

In [2]:
my_data = pd.read_csv("creditcard.csv")
# So first of all let's get introduced with out dataset
my_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
#Next we will expand the view, by seeing the shape of our data
my_data.shape

(284807, 31)

In [4]:
#From here we can see that we must to deal with 31 columns and 284807 rows

In [5]:
#Then we may want to see how many subclasses we have in our classes 
my_data['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [6]:
#Here as it can be seen, we are facing with some really unbalanced dataset.
#As we have been warned in the problem itself, where it said that only 0.175% of 
#transactions were fradulent, which is very low, and for a machine it is very hard to
#recognise the 0's and 1's in our target(which is 'Class' column)

# Balancing DataSet

In [7]:
NotFraud = my_data[my_data['Class']==0]
IsFraud = my_data[my_data['Class']==1]

In [8]:
NotFraud.shape

(284315, 31)

In [9]:
IsFraud.shape

(492, 31)

In [10]:
NotFraud = NotFraud.sample(IsFraud.shape[0])

In [11]:
NotFraud.shape, IsFraud.shape

((492, 31), (492, 31))

In [12]:
my_balanced_data = IsFraud.append(NotFraud, ignore_index = True)
my_balanced_data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,406.0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.177840,0.261145,-0.143276,0.00,1
1,472.0,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.00,1
2,4462.0,-2.303350,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.562320,-0.399147,-0.238253,...,-0.294166,-0.932391,0.172726,-0.087330,-0.156114,-0.542628,0.039566,-0.153029,239.93,1
3,6986.0,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,...,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,59.00,1
4,7519.0,1.234235,3.019740,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,...,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,1.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
979,163069.0,-1.244138,1.445448,0.194148,-1.533079,1.017902,0.267090,0.416655,-2.528615,0.160290,...,1.961804,-1.552942,0.066769,-1.114818,-0.080105,0.133718,0.085288,-0.035423,3.87,0
980,147735.0,-1.376171,-0.329257,1.130025,-4.312976,1.360511,0.963587,0.025344,0.716028,1.497464,...,-0.033862,-0.095708,-0.211580,-1.002500,0.625930,-1.065037,0.326162,0.109395,21.24,0
981,49815.0,-1.892063,1.518854,0.016176,0.215841,-1.666710,0.211805,-1.518594,1.856502,-1.709308,...,-0.302012,-1.011806,0.122364,-0.451496,-0.233219,-0.486899,-0.732414,-0.128020,7.42,0
982,115099.0,-0.589887,-0.619743,1.612229,-3.026047,-0.711846,-0.461882,0.531636,-0.284462,-2.446143,...,-0.114857,0.013461,-0.234783,0.041286,0.670268,-0.177102,-0.182539,-0.182885,100.00,0


In [13]:
import pandas as pd
import tensorflow as tf
my_balanced_data =my_balanced_data.drop(['Amount'],axis=1)
my_balanced_data = my_balanced_data.drop(['Time'],axis=1)
my_balanced_data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,-2.772272,...,0.126911,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.17784,0.261145,-0.143276,1
1,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,-0.838587,...,2.102339,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,1
2,-2.30335,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.56232,-0.399147,-0.238253,-1.525412,...,-0.430022,-0.294166,-0.932391,0.172726,-0.08733,-0.156114,-0.542628,0.039566,-0.153029,1
3,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,-4.801637,...,-0.171608,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,1
4,1.234235,3.01974,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,-2.447469,...,0.009061,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,1


In [14]:
X = my_balanced_data.iloc[:, my_balanced_data.columns != 'Class']
y = my_balanced_data.iloc[:, my_balanced_data.columns == 'Class']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state=0)

In [15]:
X_train.shape[0]

688

In [16]:
X_test.shape[0]

296

In [17]:
input_shape = (29, 1)

In [18]:
# Initialising the ANN
model = Sequential()
# Adding the input layer and the first hidden layer
model.add(Dense(units =512 , kernel_initializer = 'uniform', activation = tf.nn.relu, input_dim = 28))
# Adding the second hidden layer
model.add(Dense(units =256 , kernel_initializer = 'uniform', activation = tf.nn.softmax))
# Adding the third hidden layer
model.add(Dense(units = 128, kernel_initializer = 'uniform', activation = tf.nn.relu))
# Adding the output layer
model.add(Dense(units = 1, kernel_initializer = 'uniform', activation = tf.nn.sigmoid))

In [19]:
model.compile(optimizer='Adagrad',  loss='binary_crossentropy',  metrics=['accuracy'])
model.fit(x=X_train,y=y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x2aa5a936748>

In [20]:
model.evaluate(X_test, y_test)



[0.2204034598292531, 0.9358108043670654]

In [29]:
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5)
confusion_matrix(y_test, y_pred)

array([[142,   7],
       [ 12, 135]], dtype=int64)

In [24]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.95      0.94       149
           1       0.95      0.92      0.93       147

    accuracy                           0.94       296
   macro avg       0.94      0.94      0.94       296
weighted avg       0.94      0.94      0.94       296

