# Credit Card Fraud Detection

In [None]:
# Switch: when True, a later cell downloads and unzips the dataset archive.
COLAB = True

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [None]:
if COLAB:
    id_ = "1uSFC-iy_-_NkK-AEjoPMJI6ximN-jM5F"
    aux = "'https://docs.google.com/uc?export=download&id={}&confirm=t'".format(id_)
    !wget $aux -O ./creditcard.csv.zip
    !unzip -qq ./creditcard.csv.zip

In [None]:
# List the working directory (e.g. to check which files are present before loading the CSV).
!ls -la

In [None]:
# Load the CSV file into a pandas DataFrame.
df = pd.read_csv("./creditcard.csv")

In [None]:
# Show every column when displaying frames (no column truncation), then peek at the last rows.
pd.set_option("display.max_columns", None)
df.tail()

In [None]:
# Variance of each column.
df.var()

In [None]:
# Smallest and largest value of the 'Time' column.
df["Time"].min(), df["Time"].max()

In [None]:
# Time span covered by the dataset, in days: 'Time' is in seconds and a day has
# 60*60*24 seconds. Computed from the data instead of a hand-copied literal.
df["Time"].max() / (60 * 60 * 24)

**Dataset description**

The dataset contains transactions made with credit cards in September 2013 by European cardholders. It covers transactions that occurred over two days, with 492 frauds out of 284,807 transactions. The dataset is highly unbalanced: the positive class (frauds) accounts for 0.172% of all transactions.

It contains only numerical input variables which are the result of a PCA transformation.
Due to confidentiality issues, we cannot provide the original features or more background information about the data. Features V1, V2, ..., V28 are the principal components obtained with PCA; the only features that have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. Feature 'Amount' is the transaction amount; this feature can be used, for example, for example-dependent cost-sensitive learning. Feature 'Class' is the response variable and takes value 1 in case of fraud and 0 otherwise.

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Report the frame size; df.shape is (rows, columns), unpacked straight into the template.
print("df dimensions: {} rows by {} columns".format(*df.shape))

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# Target vector as a NumPy array ('Class': 1 = fraud, 0 = otherwise), extracted
# before the column is removed from the feature frame.
y = df["Class"].to_numpy()
y.shape

In [None]:
# Largest 'Time' value converted from seconds to hours.
df['Time'].max() / 3600

In [None]:
# Remove the target ('Class') and 'Time' from the feature frame; 'Amount' is kept.
# Reassigning (instead of inplace=True) together with errors="ignore" makes the
# cell idempotent: re-running it no longer raises KeyError for the columns that
# were already removed.
df = df.drop(columns=["Class", "Time"], errors="ignore")
df.shape

In [None]:
def show_counts(y):
    """Print how often each distinct value occurs in ``y``.

    Parameters
    ----------
    y : array-like
        Values to tally (anything ``np.unique`` accepts).

    Prints one line per distinct value, in the sorted order returned by
    ``np.unique``, formatted as ``y=<value>: <count> times``. Returns None.
    """
    values, frequencies = np.unique(y, return_counts=True)
    for position in range(len(values)):
        print("y={}: {} times".format(values[position], frequencies[position]))

In [None]:
# Distribution of the target variable: one bar per class, followed by the exact counts.

uns, counts = np.unique(y, return_counts=True)

fig, ax = plt.subplots(figsize=(5, 4))
ax.bar(uns, counts)
ax.set_xticks([0, 1])
# Labels and title so the figure can be read on its own.
ax.set_xlabel("Class")
ax.set_ylabel("Number of transactions")
ax.set_title("Target distribution")
plt.show()

show_counts(y)

In [None]:
# One 100-bin histogram per column, arranged in 3 columns of subplots (row count inferred);
# the trailing ';' suppresses the text repr of the returned axes.
df.hist(figsize=(12,24), bins=100, layout=(-1,3));

In [None]:
# Summary statistics of every column.
df.describe()

In [None]:
# Class counts restricted to transactions with Amount >= 50.
show_counts(y[df.Amount >= 50])

In [None]:
# Mean of the 0/1 labels, i.e. the fraud rate, among transactions with Amount >= 50.
y[df.Amount >= 50].mean()

In [None]:
# Mean of the 0/1 labels, i.e. the fraud rate, among transactions with Amount >= 500.
y[df.Amount >= 500].mean()

In [None]:
# Mean of the 0/1 labels, i.e. the fraud rate, among transactions with Amount >= 1000.
y[df.Amount >= 1000].mean()

In [None]:
# Summary statistics transposed: one row per column of df.
df.describe().T

In [None]:
# Display the target array.
y

In [None]:
# Names of the columns currently in df, as a plain Python list.
attribute_names = df.columns.tolist()
print(attribute_names)

In [None]:
# V1 vs V2 scatter, one series per class: "Ok" (y == 0) drawn first, then "Ko" (y == 1).
plt.figure(figsize=(10, 10))
for class_value, class_label in ((0, "Ok"), (1, "Ko")):
    class_rows = df[y == class_value]
    plt.scatter(class_rows["V1"], class_rows["V2"], alpha=0.1, label=class_label)
plt.legend();

In [None]:
# Rescale 'Amount' by a fixed constant.
# NOTE(review): the choice of 125 is not explained in this notebook — confirm its origin.
# NOTE: this cell overwrites the column, so it is not idempotent; running it twice divides twice.
df["Amount"] = df["Amount"] / 125

In [None]:
# Summary statistics again, after 'Amount' was rescaled.
df.describe()

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 train/test split, stratified on y and seeded so the partition is repeatable.
df_tr, df_te, y_tr, y_te = train_test_split(
    df,
    y,
    test_size=0.3,
    stratify=y,
    random_state=2,
)

In [None]:
# Mean of the 0/1 labels (fraud rate) in the train and test partitions.
y_tr.mean(), y_te.mean()

In [None]:
# Size of the training partition.
df_tr.shape

In [None]:
# Build the training and validation sets.
# NOTE: y_tr is not filtered here, so after this cell it no longer lines up with df_tr;
# for the same reason the cell cannot be re-run without re-running the train/test split first.
ok_mask = y_tr == 0                      # keep only the non-fraud ("ok") examples
df_tr_ok = df_tr[ok_mask]
df_tr, df_val = train_test_split(        # 30% of them go to validation
    df_tr_ok,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Sizes of the training and validation sets.
df_tr.shape, df_val.shape

In [None]:
# Isolation Forest:

from sklearn.ensemble import IsolationForest

# contamination: expected outlier rate; either "auto" or a float in (0, 0.5].
# random_state is fixed because the forest is built from random splits: without a
# seed, every score, count and AUC below changes from one run to the next.
clf = IsolationForest(n_estimators=100, contamination="auto", random_state=0)

clf.fit(df_tr)
outliers_train = clf.predict(df_tr)  # -1 for outliers, 1 for inliers
outliers_test = clf.predict(df_te)

In [None]:
# Number of test transactions the model predicted as outliers (-1).
len(y_te[outliers_test==-1])

In [None]:
# Sum of the 0/1 labels among predicted outliers, i.e. how many of them are actual frauds.
sum(y_te[outliers_test==-1])

In [None]:
# Continuous anomaly scores. Per the scikit-learn documentation, decision_function is
# lower for more abnormal samples; negating it makes larger values mean "more anomalous".
# (These are not the -1/1 labels returned by predict.)
scores_train = -clf.decision_function(df_tr)
scores_test  = -clf.decision_function(df_te)

In [None]:
# Display the test-set anomaly scores.
scores_test

In [None]:
# Distinct values present in the test predictions.
np.unique(outliers_test)

In [None]:
# Display the test-set anomaly scores (same expression as an earlier cell).
scores_test

In [None]:
# Test-set scores next to the true class, most anomalous first.
results_te = pd.DataFrame({"score_anomalía": scores_test, "clase": y_te})
top_anomalies = results_te.sort_values("score_anomalía", ascending=False).head(500)

# Lift the row limit only for this display. Setting pd.options.display.max_rows = None
# globally would make every later frame display dump all of its rows.
# `display` is the built-in provided by the IPython/Jupyter kernel.
with pd.option_context("display.max_rows", None):
    display(top_anomalies)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# ROC points for the anomaly score, with fraud (label 1) as the positive class.
fpt, tpr, thresholds = roc_curve(y_te, scores_test, pos_label=1)

In [None]:
# Display the test labels.
y_te

In [None]:
# ROC curve, with the AUC shown in the title.
plt.plot(fpt, tpr)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("AUC={}".format(roc_auc_score(y_true=y_te, y_score=scores_test)))
plt.grid()
#plt.xlim([0.01,0.1])  # uncomment to zoom in on the low-FPR region
plt.show()

# Score threshold needed to reach each false positive rate.
# roc_curve's first threshold is an artificial sentinel meaning "predict nothing"
# (np.inf, or max(score) + 1 in older scikit-learn releases). It is not a real score
# and would distort the axis, so the first point is left out.
plt.plot(fpt[1:], thresholds[1:])
plt.xlabel("False Positive Rate")
plt.ylabel("Threshold")
plt.title("Threshold vs. False Positive Rate")
#plt.xlim([0.01,0.1])  # uncomment to zoom in on the low-FPR region
#plt.ylim([0,10])
plt.grid();

In [None]:
# Number of test transactions whose anomaly score exceeds 0.1.
len(y_te[scores_test>0.1])

In [None]:
# Sum of the 0/1 labels among those transactions, i.e. how many of them are actual frauds.
sum(y_te[scores_test>0.1])

In [None]:
# Fraud rate in the test set, as a percentage.
100*y_te.mean()

In [None]:
# Score threshold ("umbral"); the expression counts the non-fraud test transactions scoring
# above it, i.e. the false positives at that threshold.
# NOTE(review): 10 looks out of range for these scores. Nearby cells use 0.1, and
# scikit-learn documents an offset of -0.5 for contamination="auto", which would keep the
# negated decision_function within roughly [-0.5, 0.5]. If so this count is always 0 —
# TODO confirm the intended value.
umbral = 10
(y_te[scores_test>umbral] == 0).sum()

In [None]:
# Total number of non-fraud transactions in the test set.
(y_te == 0).sum()

In [None]:
from sklearn.metrics import precision_recall_curve
# Precision/recall points for the anomaly score, with fraud (label 1) as the positive class.
# NOTE: this rebinds `thresholds`, which until now held the roc_curve thresholds; re-running
# the ROC threshold plot after this cell would pair `fpt` with the wrong array.
precision, recall, thresholds = precision_recall_curve(y_te, scores_test, pos_label=1)

In [None]:
# Precision-recall curve.
plt.plot(recall, precision)
plt.xlabel("Recall (TPR)")
plt.ylabel("Precision")
plt.title("Precision-Recall curve")
plt.grid()
#plt.xlim([0.01,0.1])  # uncomment to zoom in
plt.show()

# Score threshold needed to reach each recall level.
# precision_recall_curve returns one more precision/recall value than thresholds:
# recall[i] belongs to thresholds[i], and the final point (recall=0, precision=1)
# has no threshold. The matching slice is therefore recall[:-1]; recall[1:] would
# shift every threshold onto the neighbouring recall value.
plt.plot(recall[:-1], thresholds)
plt.xlabel("Recall (TPR)")
plt.ylabel("Threshold")
plt.title("Threshold vs. Recall")
#plt.xlim([0.01,0.1])  # uncomment to zoom in
#plt.ylim([0,10])
plt.grid();