# import packages
# %matplotlib inline   (uncomment this Jupyter magic when running in a notebook)
import os
import pandas as pd
import numpy as np
from scipy import stats
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import recall_score, classification_report, auc, roc_curve
from sklearn.metrics import precision_recall_fscore_support, f1_score
from sklearn.preprocessing import StandardScaler
from pylab import rcParams
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.keras import regularizers


# set random seed and percentage of test data
RANDOM_SEED = 314  # used to help randomly select the data points
TEST_PCT = 0.2     # 20% of the data

# set up the graphic style; this example uses the color scheme from xkcd.com
rcParams['figure.figsize'] = 14, 8.7  # roughly the golden ratio
LABELS = ["Normal", "Fraud"]
# col_list = ["cerulean", "scarlet"]  # https://xkcd.com/color/rgb/
# sns.set(style='white', font_scale=1.75, palette=sns.xkcd_palette(col_list))


df = pd.read_csv("data/creditcard.csv")  # unzip and read in the data downloaded to the local directory
df.head(n=5)  # quick check that the dataset was imported properly


df.shape  # secondary check on the size of the dataframe


df.isnull().values.any()  # check for null values; this dataset has none


df['Class'].value_counts(sort=True)  # class comparison: 0 = Normal, 1 = Fraud
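# Optional sanity check (my addition): the fraud share is tiny, which is why
# plain accuracy is a misleading metric for this problem.
fraud_pct = df['Class'].mean() * 100  # Class is 0/1, so the mean is the fraud fraction
print("Fraud cases: %.3f%% of all transactions" % fraud_pct)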


# if you don't have an intuitive sense of how imbalanced these two classes are, a plot makes it obvious
count_classes = df['Class'].value_counts(sort=True)
count_classes.plot(kind='bar', rot=0)
plt.xticks(range(2), LABELS)
plt.title("Frequency by observation number")
plt.xlabel("Class")
plt.ylabel("Number of observations");


normal_df = df[df.Class == 0]  # save the normal observations into a separate dataframe
fraud_df = df[df.Class == 1]   # do the same for the frauds


fraud_df.Amount.describe()
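
# Optional (my addition): place the two Amount summaries side by side for easier comparison.
pd.concat([normal_df.Amount.describe(), fraud_df.Amount.describe()], axis=1, keys=LABELS)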


# plot of high-value transactions
bins = np.linspace(200, 2500, 100)
plt.hist(normal_df.Amount, bins, alpha=1, density=True, label='Normal')  # `normed` was removed in matplotlib 3.x
plt.hist(fraud_df.Amount, bins, alpha=0.6, density=True, label='Fraud')
plt.legend(loc='upper right')
plt.title(r"Amount distribution of transactions (transactions \$200+)")
plt.xlabel("Transaction amount (USD)")
plt.ylabel("Density of transactions");  # density=True normalizes each histogram's area to 1
plt.show()


bins = np.linspace(0, 48, 48)  # 48 hours
plt.hist((normal_df.Time / (60 * 60)), bins, alpha=1, density=True, label='Normal')
plt.hist((fraud_df.Time / (60 * 60)), bins, alpha=0.6, density=True, label='Fraud')
plt.legend(loc='upper right')
plt.title("Distribution of transactions by hour")
plt.xlabel("Transaction time as measured from first transaction in the dataset (hours)")
plt.ylabel("Density of transactions");
# plt.hist((df.Time/(60*60)), bins)
plt.show()


plt.scatter((normal_df.Time / (60 * 60)), normal_df.Amount, alpha=0.6, label='Normal')
plt.scatter((fraud_df.Time / (60 * 60)), fraud_df.Amount, alpha=0.9, label='Fraud')
plt.title("Amount of transaction by hour")
plt.xlabel("Transaction time as measured from first transaction in the dataset (hours)")
plt.ylabel('Amount (USD)')
plt.legend(loc='upper right')
plt.show()


# data = df.drop(['Time'], axis=1)  # alternative if you think the variable is unimportant
df_norm = df.copy()  # copy so the scaled values don't overwrite the original dataframe
df_norm['Time'] = StandardScaler().fit_transform(df_norm['Time'].values.reshape(-1, 1))
df_norm['Amount'] = StandardScaler().fit_transform(df_norm['Amount'].values.reshape(-1, 1))
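
# Note (my addition): fitting the scalers on the full dataset lets test-set
# statistics leak into preprocessing. A stricter variant fits the scaler on the
# training rows only, then applies the same transform to the test rows -- a
# sketch with hypothetical names:
#
#   time_scaler = StandardScaler().fit(train_rows[['Time']])
#   train_rows['Time'] = time_scaler.transform(train_rows[['Time']])
#   test_rows['Time'] = time_scaler.transform(test_rows[['Time']])
#
# With ~285k rows the practical difference here is small.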


train_x, test_x = train_test_split(df_norm, test_size=TEST_PCT, random_state=RANDOM_SEED)
train_x = train_x[train_x.Class == 0]  # keep only normal transactions: the autoencoder learns what "normal" looks like
train_x = train_x.drop(['Class'], axis=1)  # drop the class column


test_y = test_x['Class']  # save the class column for the test set
test_x = test_x.drop(['Class'], axis=1)  # drop the class column

train_x = train_x.values  # transform to ndarray
test_x = test_x.values


train_x.shape


# Reduce nb_epoch and batch_size if your Jupyter kernel crashes (memory issues)
# nb_epoch = 100
# batch_size = 128
nb_epoch = 5
batch_size = 32

input_dim = train_x.shape[1]  # number of feature columns, 30
encoding_dim = 14
hidden_dim = int(encoding_dim / 2)  # i.e. 7
l1_reg = 1e-7  # L1 activity-regularization strength for the encoder

input_layer = Input(shape=(input_dim, ))
encoder = Dense(encoding_dim, activation="tanh",
                activity_regularizer=regularizers.l1(l1_reg))(input_layer)
encoder = Dense(hidden_dim, activation="relu")(encoder)
decoder = Dense(hidden_dim, activation='tanh')(encoder)
decoder = Dense(input_dim, activation='relu')(decoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)
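
# Optional (my addition): print a layer-by-layer summary to verify the
# 30 -> 14 -> 7 -> 7 -> 30 architecture before training.
autoencoder.summary()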


autoencoder.compile(metrics=['accuracy'],  # accuracy is not very informative for a reconstruction task; loss is what matters
                    loss='mean_squared_error',
                    optimizer='adam')

os.makedirs("models", exist_ok=True)  # ModelCheckpoint won't create a missing directory

cp = ModelCheckpoint(filepath="models/autoencoder_fraud.h5",
                     save_best_only=True,  # keep only the weights with the best validation loss
                     verbose=0)

tb = TensorBoard(log_dir='logs/keras-fraud',
                 histogram_freq=0,
                 write_graph=True,
                 write_images=True)

history = autoencoder.fit(train_x, train_x,  # an autoencoder reconstructs its own input
                          epochs=nb_epoch,
                          batch_size=batch_size,
                          shuffle=True,
                          validation_data=(test_x, test_x),
                          verbose=1,
                          callbacks=[cp, tb]).history


autoencoder = load_model('models/autoencoder_fraud.h5')  # reload the best checkpoint saved above


plt.plot(history['loss'], linewidth=2, label='Train')
plt.plot(history['val_loss'], linewidth=2, label='Test')
plt.legend(loc='upper right')
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
# plt.ylim(ymin=0.70, ymax=1)
plt.show()


test_x_predictions = autoencoder.predict(test_x)
mse = np.mean(np.power(test_x - test_x_predictions, 2), axis=1)  # per-row reconstruction error
error_df = pd.DataFrame({'Reconstruction_error': mse,
                         'True_class': test_y})
error_df.describe()
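
# Optional (my addition): per-class error statistics make the separation (or
# overlap) between normal and fraud reconstruction errors explicit.
error_df.groupby('True_class')['Reconstruction_error'].describe()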


false_pos_rate, true_pos_rate, thresholds = roc_curve(error_df.True_class, error_df.Reconstruction_error)
roc_auc = auc(false_pos_rate, true_pos_rate)

plt.plot(false_pos_rate, true_pos_rate, linewidth=5, label='AUC = %0.3f' % roc_auc)
plt.plot([0, 1], [0, 1], linewidth=5)

plt.xlim([-0.01, 1])
plt.ylim([0, 1.01])
plt.legend(loc='lower right')
plt.title('Receiver operating characteristic (ROC) curve')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


precision_rt, recall_rt, threshold_rt = precision_recall_curve(error_df.True_class, error_df.Reconstruction_error)
plt.plot(recall_rt, precision_rt, linewidth=5, label='Precision-recall curve')
plt.title('Precision vs. recall')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()


plt.plot(threshold_rt, precision_rt[:-1], label="Precision", linewidth=5)  # precision_rt[i] corresponds to threshold_rt[i]
plt.plot(threshold_rt, recall_rt[:-1], label="Recall", linewidth=5)        # the final (precision=1, recall=0) point has no threshold
plt.title('Precision and recall for different threshold values')
plt.xlabel('Threshold')
plt.ylabel('Precision/Recall')
plt.legend()
plt.show()
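
# Optional (my addition): rather than eyeballing the curves, pick the largest
# threshold that still reaches a target recall (the target below is
# illustrative, not from the original analysis).
target_recall = 0.80
candidates = threshold_rt[recall_rt[:-1] >= target_recall]
if len(candidates) > 0:
    print("Largest threshold with recall >= %.0f%%: %.4f" % (target_recall * 100, candidates.max()))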


threshold_fixed = 5  # example cutoff; tune it using the precision/recall trade-off above
groups = error_df.groupby('True_class')
fig, ax = plt.subplots()

for name, group in groups:
    ax.plot(group.index, group.Reconstruction_error, marker='o', ms=3.5, linestyle='',
            label="Fraud" if name == 1 else "Normal")
ax.hlines(threshold_fixed, ax.get_xlim()[0], ax.get_xlim()[1], colors="r", zorder=100, label='Threshold')
ax.legend()
plt.title("Reconstruction error for different classes")
plt.ylabel("Reconstruction error")
plt.xlabel("Data point index")
plt.show()


pred_y = [1 if e > threshold_fixed else 0 for e in error_df.Reconstruction_error.values]
conf_matrix = confusion_matrix(error_df.True_class, pred_y)

plt.figure(figsize=(12, 12))
sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d")
plt.title("Confusion matrix")
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.show()
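
# Optional (my addition): a text summary of precision/recall/F1 at this
# threshold, using the classification_report imported above.
print(classification_report(error_df.True_class, pred_y, target_names=LABELS))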