
Commit 1b06db4 (parent d104731)

Python Tensorflow Keras Fraud Detection Autoencoder.py
Added Keras/TensorFlow code as a Python script

1 file changed: +289 −0
# import packages
# %matplotlib inline (uncomment when running in a Jupyter notebook)
import pandas as pd
import numpy as np
from scipy import stats
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import recall_score, classification_report, auc, roc_curve
from sklearn.metrics import precision_recall_fscore_support, f1_score
from sklearn.preprocessing import StandardScaler
from pylab import rcParams
# import Keras via tf.keras so the Keras version matches the installed TensorFlow
# (the original standalone `keras` imports also work if that package is installed)
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.keras import regularizers


# set random seed and percentage of test data
RANDOM_SEED = 314  # used to help randomly select the data points
TEST_PCT = 0.2  # 20% of the data

# set up graphic style; in this case I am using the color scheme from xkcd.com
rcParams['figure.figsize'] = 14, 8.7  # Golden Mean
LABELS = ["Normal", "Fraud"]
# col_list = ["cerulean", "scarlet"]  # https://xkcd.com/color/rgb/
# sns.set(style='white', font_scale=1.75, palette=sns.xkcd_palette(col_list))


df = pd.read_csv("data/creditcard.csv")  # unzip and read in data downloaded to the local directory
df.head(n=5)  # just to check you imported the dataset properly

df.shape  # secondary check on the size of the dataframe

df.isnull().values.any()  # check to see if any values are null, which there are not

df['Class'].value_counts(sort=True)  # class comparison 0=Normal 1=Fraud (pd.value_counts is deprecated)


# if you don't have an intuitive sense of how imbalanced these two classes are, let's go visual
count_classes = df['Class'].value_counts(sort=True)
count_classes.plot(kind='bar', rot=0)
plt.xticks(range(2), LABELS)
plt.title("Frequency by observation number")
plt.xlabel("Class")
plt.ylabel("Number of Observations")
plt.show()


normal_df = df[df.Class == 0]  # save normal observations into a separate df
fraud_df = df[df.Class == 1]  # do the same for frauds

fraud_df.Amount.describe()


# plot of high value transactions
bins = np.linspace(200, 2500, 100)
# matplotlib removed the deprecated `normed` argument; density=True is the replacement
plt.hist(normal_df.Amount, bins, alpha=1, density=True, label='Normal')
plt.hist(fraud_df.Amount, bins, alpha=0.6, density=True, label='Fraud')
plt.legend(loc='upper right')
plt.title(r"Amount by percentage of transactions (transactions \$200+)")
plt.xlabel("Transaction amount (USD)")
plt.ylabel("Percentage of transactions (%)")
plt.show()


bins = np.linspace(0, 48, 48)  # 48 hours
plt.hist((normal_df.Time / (60 * 60)), bins, alpha=1, density=True, label='Normal')
plt.hist((fraud_df.Time / (60 * 60)), bins, alpha=0.6, density=True, label='Fraud')
plt.legend(loc='upper right')
plt.title("Percentage of transactions by hour")
plt.xlabel("Transaction time as measured from first transaction in the dataset (hours)")
plt.ylabel("Percentage of transactions (%)")
# plt.hist((df.Time/(60*60)), bins)
plt.show()


plt.scatter((normal_df.Time / (60 * 60)), normal_df.Amount, alpha=0.6, label='Normal')
plt.scatter((fraud_df.Time / (60 * 60)), fraud_df.Amount, alpha=0.9, label='Fraud')
plt.title("Amount of transaction by hour")
plt.xlabel("Transaction time as measured from first transaction in the dataset (hours)")
plt.ylabel('Amount (USD)')
plt.legend(loc='upper right')
plt.show()


# data = df.drop(['Time'], axis=1)  # if you think the var is unimportant
df_norm = df.copy()  # copy, so the scaled frame doesn't silently mutate the original df
df_norm['Time'] = StandardScaler().fit_transform(df_norm['Time'].values.reshape(-1, 1))
df_norm['Amount'] = StandardScaler().fit_transform(df_norm['Amount'].values.reshape(-1, 1))

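# Added caveat (not in the original): fitting the scalers on the full frame
# before the train/test split leaks test-set statistics into the transform;
# a stricter pipeline would fit StandardScaler on the training rows only.
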
train_x, test_x = train_test_split(df_norm, test_size=TEST_PCT, random_state=RANDOM_SEED)
train_x = train_x[train_x.Class == 0]  # train only on normal transactions
train_x = train_x.drop(['Class'], axis=1)  # drop the class column

test_y = test_x['Class']  # save the class column for the test set
test_x = test_x.drop(['Class'], axis=1)  # drop the class column

train_x = train_x.values  # transform to ndarray
test_x = test_x.values


train_x.shape


# Reduce number of epochs and batch_size if your Jupyter crashes (due to memory issues)
# nb_epoch = 100
# batch_size = 128
nb_epoch = 5
batch_size = 32

input_dim = train_x.shape[1]  # number of columns, 30
encoding_dim = 14
hidden_dim = int(encoding_dim / 2)  # i.e. 7
learning_rate = 1e-7  # used below as the L1 activity-regularization strength, not an optimizer learning rate

input_layer = Input(shape=(input_dim, ))
encoder = Dense(encoding_dim, activation="tanh",
                activity_regularizer=regularizers.l1(learning_rate))(input_layer)
encoder = Dense(hidden_dim, activation="relu")(encoder)
decoder = Dense(hidden_dim, activation='tanh')(encoder)
decoder = Dense(input_dim, activation='relu')(decoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)

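# Optional sanity check (an added line, not in the original): confirm the
# 30 -> 14 -> 7 -> 7 -> 30 layer shapes before training. Note that the relu
# output layer cannot reproduce negative standardized values; a 'linear'
# output activation is a common alternative, though the original is kept here.
autoencoder.summary()
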
autoencoder.compile(metrics=['accuracy'],
                    loss='mean_squared_error',
                    optimizer='adam')

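# Defensive addition (not in the original): make sure the output directories
# exist before the callbacks below try to write to them.
import os
os.makedirs("models", exist_ok=True)
os.makedirs("logs/keras-fraud", exist_ok=True)
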
cp = ModelCheckpoint(filepath="models/autoencoder_fraud.h5",
                     save_best_only=True,
                     verbose=0)

tb = TensorBoard(log_dir='logs/keras-fraud',
                 histogram_freq=0,
                 write_graph=True,
                 write_images=True)

history = autoencoder.fit(train_x, train_x,
                          epochs=nb_epoch,
                          batch_size=batch_size,
                          shuffle=True,
                          validation_data=(test_x, test_x),
                          verbose=1,
                          callbacks=[cp, tb]).history

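# Optional sketch: persist the training history with the pickle module
# imported above but otherwise unused (the .pkl path is an assumed example).
with open('models/autoencoder_fraud_history.pkl', 'wb') as f:
    pickle.dump(history, f)
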
autoencoder = load_model('models/autoencoder_fraud.h5')  # reload the best checkpoint saved by ModelCheckpoint


plt.plot(history['loss'], linewidth=2, label='Train')
plt.plot(history['val_loss'], linewidth=2, label='Test')
plt.legend(loc='upper right')
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
# plt.ylim(ymin=0.70, ymax=1)
plt.show()


test_x_predictions = autoencoder.predict(test_x)
mse = np.mean(np.power(test_x - test_x_predictions, 2), axis=1)  # per-transaction reconstruction error
error_df = pd.DataFrame({'Reconstruction_error': mse,
                         'True_class': test_y})
error_df.describe()


false_pos_rate, true_pos_rate, thresholds = roc_curve(error_df.True_class, error_df.Reconstruction_error)
roc_auc = auc(false_pos_rate, true_pos_rate)

plt.plot(false_pos_rate, true_pos_rate, linewidth=5, label='AUC = %0.3f' % roc_auc)
plt.plot([0, 1], [0, 1], linewidth=5)

plt.xlim([-0.01, 1])
plt.ylim([0, 1.01])
plt.legend(loc='lower right')
plt.title('Receiver operating characteristic curve (ROC)')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


precision_rt, recall_rt, threshold_rt = precision_recall_curve(error_df.True_class, error_df.Reconstruction_error)
plt.plot(recall_rt, precision_rt, linewidth=5, label='Precision-Recall curve')
plt.title('Recall vs Precision')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()


# precision_rt/recall_rt have one more element than threshold_rt; the first
# n_thresholds entries align with the thresholds, so slice with [:-1]
# (the original [1:] was shifted by one position)
plt.plot(threshold_rt, precision_rt[:-1], label="Precision", linewidth=5)
plt.plot(threshold_rt, recall_rt[:-1], label="Recall", linewidth=5)
plt.title('Precision and recall for different threshold values')
plt.xlabel('Threshold')
plt.ylabel('Precision/Recall')
plt.legend()
plt.show()

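# Illustrative sketch (not in the original): pick the largest threshold that
# still meets a target recall, instead of the fixed threshold used below;
# the 0.80 target is an assumed example value.
target_recall = 0.80
mask = recall_rt[:-1] >= target_recall
suggested_threshold = threshold_rt[mask].max() if mask.any() else threshold_rt[0]
print("threshold for recall >= %.2f: %.3f" % (target_recall, suggested_threshold))
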
threshold_fixed = 5
groups = error_df.groupby('True_class')
fig, ax = plt.subplots()

for name, group in groups:
    ax.plot(group.index, group.Reconstruction_error, marker='o', ms=3.5, linestyle='',
            label="Fraud" if name == 1 else "Normal")
ax.hlines(threshold_fixed, ax.get_xlim()[0], ax.get_xlim()[1], colors="r", zorder=100, label='Threshold')
ax.legend()
plt.title("Reconstruction error for different classes")
plt.ylabel("Reconstruction error")
plt.xlabel("Data point index")
plt.show()


pred_y = [1 if e > threshold_fixed else 0 for e in error_df.Reconstruction_error.values]
conf_matrix = confusion_matrix(error_df.True_class, pred_y)

plt.figure(figsize=(12, 12))
sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d")
plt.title("Confusion matrix")
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.show()
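
# Optional follow-up using metrics already imported above: a numeric summary
# of the same thresholded predictions shown in the confusion matrix.
print(classification_report(error_df.True_class, pred_y, target_names=LABELS))
print("Recall (fraud class): %.3f" % recall_score(error_df.True_class, pred_y))
print("F1 (fraud class): %.3f" % f1_score(error_df.True_class, pred_y))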
