# Credit Card Fraud Detection

In [None]:
# Toggle for the download cell below: when True, the dataset archive is
# fetched with wget and unzipped into the working directory.
COLAB = True

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [None]:
# When COLAB is set, download the zipped dataset and unpack it next to the notebook.
if COLAB:
  # File id substituted into the docs.google.com download URL.
  id_ = "1uSFC-iy_-_NkK-AEjoPMJI6ximN-jM5F"
  # The URL is wrapped in single quotes so the shell does not interpret '&'.
  aux = "'https://docs.google.com/uc?export=download&id={}&confirm=t'".format(id_)
  !wget $aux -O ./creditcard.csv.zip
  !unzip -qq ./creditcard.csv.zip

In [None]:
# Load the credit card transactions CSV into a pandas DataFrame.
df = pd.read_csv("./creditcard.csv")

In [None]:
# Preview the first rows to check that the file loaded as expected.
df.head()

**Dataset description**

The dataset contains transactions made by credit cards in September 2013 by European cardholders. This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.

It contains only numerical input variables which are the result of a PCA transformation.
Due to confidentiality issues, we cannot provide the original features and more background information about the data. Features V1, V2, ... V28 are the principal components obtained with PCA; the only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction amount; this feature can be used, for example, for example-dependent cost-sensitive learning. Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Report the size of the loaded frame.
n_rows, n_cols = df.shape
print("df dimensions: {} rows by {} columns".format(n_rows, n_cols))

In [None]:
# List the column names of the raw frame.
df.columns

In [None]:
# Extract the target ('Class': 1 = fraud, 0 = otherwise) as a NumPy array
# before the column is dropped from df further down.
y = df['Class'].values
y.shape

In [None]:
# 'Time' is in seconds since the first transaction, so this is the total
# time span of the dataset expressed in hours.
df['Time'].max() / 3600

In [None]:
# Remove the target and the 'Time' column in place so that only the model
# inputs remain ('Amount' and the PCA components are kept), then show the
# resulting shape.
df.drop(columns=['Class', 'Time'], inplace=True)
df.shape

In [None]:
def show_counts(y):
    """Print how many times each distinct value occurs in ``y``.

    One line is printed per distinct value, in the sorted order returned by
    ``np.unique``. Nothing is returned.
    """
    values, frequencies = np.unique(y, return_counts=True)
    for position in range(len(values)):
        print("y={}: {} times".format(values[position], frequencies[position]))

In [None]:
# Distribution of the target variable: bar chart of the class frequencies,
# followed by the exact counts as text.
uns, counts = np.unique(y, return_counts=True)

plt.figure(figsize=(5,4))
plt.bar(uns, counts)
plt.xticks([0,1])
plt.show()

show_counts(y)

In [None]:
# Histogram of every remaining column, three plots per row; the trailing
# ';' suppresses the textual repr of the returned axes.
df.hist(figsize=(12,24), bins=100, layout=(-1,3));

In [None]:
# Summary statistics of the input features (before 'Amount' is rescaled).
df.describe()

In [None]:
# Class counts restricted to transactions whose (still unscaled) Amount is >= 50.
show_counts(y[df.Amount >= 50])

In [None]:
# Display the target array.
y

In [None]:
# Names of the columns that are used as model inputs.
attribute_names = df.columns.tolist()
print(attribute_names)

In [None]:
# Scatter of the first two PCA components, one colour per class
# (class 0 is labelled "Ok", class 1 is labelled "Ko").
plt.figure(figsize=(10,10))
for class_value, class_label in ((0, "Ok"), (1, "Ko")):
    class_rows = df[y == class_value]
    plt.scatter(class_rows["V1"], class_rows["V2"], alpha=0.1, label=class_label)
plt.legend();

In [None]:
# Rescale 'Amount' by a fixed constant.
# NOTE(review): the rationale for 125 is not shown in this notebook — confirm.
# NOTE: this cell is not idempotent; running it again divides the column again.
df["Amount"] = df["Amount"] / 125

In [None]:
# Summary statistics again, now with the rescaled 'Amount' column.
df.describe()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; stratifying on y keeps the fraud
# ratio the same in both parts, and the fixed seed makes the split repeatable.
df_tr, df_te, y_tr, y_te = train_test_split(
    df,
    y,
    test_size=0.3,
    stratify=y,
    random_state=2,
)

In [None]:
# Fraction of positive (fraud) labels in train and test; the stratified split
# should make the two values nearly equal.
y_tr.mean(), y_te.mean()

In [None]:
# Size of the training part right after the train/test split.
df_tr.shape

In [None]:
from tensorflow import keras

In [None]:
# Define the autoencoder as a list of layers: the input is squeezed through a
# 2-unit bottleneck and expanded back to the input width. The last layer has
# no activation so it can reproduce any real-valued feature.

# Take the input/output width from the data instead of hard-coding 29, so the
# network stays consistent if the set of dropped columns changes.
n_features = df_tr.shape[1]

model = keras.Sequential(
    [
     # `shape` must be a tuple; a bare integer is rejected by Keras 3.
     keras.Input(shape=(n_features,)),
     keras.layers.Dense(10, activation="relu"),
     keras.layers.Dense(2, activation="relu"),
     keras.layers.Dense(10, activation="relu"),
     keras.layers.Dense(n_features)
    ]
)

In [None]:
# Print the layer-by-layer structure and parameter counts of the network.
model.summary()

In [None]:
# Build the training and validation sets for the autoencoder.
# NOTE: this cell reassigns df_tr, so it is not idempotent — re-running it
# without first re-running the train/test split applies the y_tr mask to a
# frame whose length no longer matches.
df_tr = df_tr[y_tr==0] # keep only the non-fraud ("ok") examples
df_tr, df_val = train_test_split(df_tr, test_size=0.3, random_state=1) # 30% goes to validation

In [None]:
# Sizes of the non-fraud training and validation sets.
df_tr.shape, df_val.shape

In [None]:
# Optimise the mean squared error between input and reconstruction with RMSprop.
model.compile(optimizer='rmsprop', loss="mse")

In [None]:
from matplotlib.ticker import MaxNLocator

def plot_history(historia):
    """Plot training and validation loss per epoch.

    Reads ``historia.history["loss"]`` and ``historia.history["val_loss"]``,
    draws both curves against 1-based epoch numbers and marks the epoch with
    the lowest validation loss with a red dot. Returns nothing.
    """
    fig = plt.figure(figsize=(4,4))
    curves = historia.history
    train_loss = curves["loss"]
    val_loss = curves["val_loss"]
    epochs = range(1, len(train_loss) + 1)
    best_idx = np.argmin(val_loss)  # 0-based position of the best epoch

    plt.plot(epochs, train_loss, label="entrenamiento")
    plt.plot(epochs, val_loss, label="validación")
    plt.plot(best_idx + 1, val_loss[best_idx], 'or')

    plt.title('Loss', fontsize=18)
    plt.xlabel('Época', fontsize=18)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.legend()
    # Epochs are integers, so force integer tick positions on the x axis.
    fig.gca().xaxis.set_major_locator(MaxNLocator(integer=True))

In [None]:
# Callbacks used during training.
lista_callbacks = [
    # Stop once the validation loss has not improved for 5 consecutive epochs.
    keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=5,
    ),
    # Keep on disk only the model with the lowest validation loss. Without
    # save_best_only the file is overwritten every epoch, so after early
    # stopping it would hold the LAST epoch (several epochs past the best one)
    # rather than the best model that is reloaded later.
    keras.callbacks.ModelCheckpoint(
        filepath="best_model.keras",
        monitor="val_loss",
        save_best_only=True,
    )
]

In [None]:
# Train the autoencoder to reconstruct its own input (inputs and targets are
# the same frame), validating on df_val and applying the callbacks above.
historia = model.fit(df_tr, df_tr, batch_size=128, epochs=50, callbacks=lista_callbacks,
                     validation_data=(df_val, df_val))

In [None]:
# Loss curves for the run above; the red dot marks the lowest validation loss.
plot_history(historia)

In [None]:
# Reload from disk the checkpoint written by ModelCheckpoint during training.
model = keras.models.load_model("best_model.keras")

In [None]:
# Peek at the first three test rows.
df_te[:3]

In [None]:
# Per-row reconstruction error of the autoencoder on the test set: mean
# squared difference between each row and its reconstruction.
reconstruccion_te = model.predict(df_te)
errores_totales = np.square(df_te.values - reconstruccion_te).mean(axis=1)
errores_totales.shape

In [None]:
# Test set size, for comparison with the length of the error vector.
df_te.shape

In [None]:
inds = np.argsort(errores_totales)[::-1] # indices of the errors sorted from largest to smallest

In [None]:
# Tiny illustration of the argsort-then-reverse idiom used above
# (expected result: array([0, 1, 2])).
np.argsort([5,4,3])[::-1]

In [None]:
# True classes of the 20 test rows with the largest reconstruction error.
y_te[inds[:20]]

In [None]:
# Lift pandas' row-display limit (a global setting) so all 100 rows are shown.
pd.set_option("display.max_rows", None)

# Reconstruction error next to the true class; show the 100 largest errors.
results_te = pd.DataFrame({"error_AE": errores_totales, "clase": y_te})
results_te.sort_values(by="error_AE", ascending=False).head(100)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
# ROC curve using the reconstruction error as the anomaly score, with class 1
# (fraud) as the positive class. `fpt` holds the false positive rates.
fpt, tpr, thresholds = roc_curve(y_true=y_te, y_score=errores_totales, pos_label=1)

In [None]:
auc_value = roc_auc_score(y_true=y_te, y_score=errores_totales)

# First figure: ROC curve, with the area under it in the title.
plt.plot(fpt, tpr)
plt.title("AUC={}".format(auc_value))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.grid()
plt.show()

# Second figure: score threshold that yields each false positive rate.
plt.plot(fpt, thresholds)
plt.xlabel("False Positive Rate")
plt.ylabel("Threshold")
plt.grid();

In [None]:
from sklearn.metrics import precision_recall_curve
# Precision-recall curve using the reconstruction error as the score.
# Note: this reassigns `thresholds`, replacing the ROC thresholds.
precision, recall, thresholds = precision_recall_curve(y_te, errores_totales, pos_label=1)

In [None]:
# First figure: precision-recall curve.
plt.plot(recall, precision)
plt.xlabel("Recall (TPR)")
plt.ylabel("Precision")
plt.grid()
plt.show()

# Second figure: score threshold that yields each recall value.
# precision_recall_curve returns one more precision/recall point than
# thresholds: the final point (recall=0, precision=1) has no threshold, so
# thresholds[i] pairs with recall[i]. Drop the LAST recall, not the first.
plt.plot(recall[:-1], thresholds)
plt.xlabel("Recall (TPR)")
plt.ylabel("Threshold")
plt.grid();

### Comparativa con modelo básico (autoencoder lineal, que es equivalente a PCA):

In [None]:
from sklearn.decomposition import PCA

# Fit a 2-component PCA on df_tr, which holds only non-fraud rows
# (it was filtered with y_tr == 0 earlier).
pca = PCA(n_components=2).fit(df_tr)

# Encoder: project the test rows onto the two principal components.
salida_encoder_te = pca.transform(df_te)
salida_encoder_te

In [None]:
# Decoder: map the 2-component codes back to the original feature space.
salida_decoder_te = pca.inverse_transform(salida_encoder_te)
salida_decoder_te.shape

In [None]:
# Per-row reconstruction error of the PCA baseline on the test set.
# Note: this overwrites the autoencoder errors stored under the same name.
diferencia_te = df_te.values - salida_decoder_te
errores_totales = np.square(diferencia_te).mean(axis=1)
errores_totales.shape

In [None]:
# Lift pandas' row-display limit (a global setting) so all 100 rows are shown.
pd.set_option("display.max_rows", None)

# PCA reconstruction error next to the true class; show the 100 largest errors.
results_te = pd.DataFrame({"error_AE": errores_totales, "clase": y_te})
results_te.sort_values(by="error_AE", ascending=False).head(100)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score
# ROC curve for the current contents of errores_totales, with class 1 (fraud)
# as the positive class. `fpt` holds the false positive rates.
fpt, tpr, thresholds = roc_curve(y_true=y_te, y_score=errores_totales, pos_label=1)

In [None]:
auc_value = roc_auc_score(y_true=y_te, y_score=errores_totales)

# First figure: ROC curve, with the area under it in the title.
plt.plot(fpt, tpr)
plt.title("AUC={}".format(auc_value))
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.grid()
plt.show()

# Second figure: score threshold that yields each false positive rate.
plt.plot(fpt, thresholds)
plt.xlabel("False Positive Rate")
plt.ylabel("Threshold")
plt.grid();

In [None]:
from sklearn.metrics import precision_recall_curve
# Precision-recall curve for the current contents of errores_totales.
# Note: this reassigns `thresholds`, replacing the ROC thresholds.
precision, recall, thresholds = precision_recall_curve(y_te, errores_totales, pos_label=1)

In [None]:
# First figure: precision-recall curve.
plt.plot(recall, precision)
plt.xlabel("Recall (TPR)")
plt.ylabel("Precision")
plt.grid()
plt.show()

# Second figure: score threshold that yields each recall value.
# precision_recall_curve returns one more precision/recall point than
# thresholds: the final point (recall=0, precision=1) has no threshold, so
# thresholds[i] pairs with recall[i]. Drop the LAST recall, not the first.
plt.plot(recall[:-1], thresholds)
plt.xlabel("Recall (TPR)")
plt.ylabel("Threshold")
plt.grid();