In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

# German Credit Card Fraud

In [None]:
# Load the credit dataset from the local CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer="data/german_cc_fraud.csv")

In [None]:
# How many rows fall under each value of the `class` label column.
data["class"].value_counts()

In [None]:
# Peek at the first five rows to see the raw columns.
data.head(n=5)

In [None]:
# Drop the `class` label column and one-hot encode whatever categorical
# feature columns remain.
data_dummies = pd.get_dummies(
    data.drop(columns="class")
)

In [None]:
# Column names of the encoded feature frame (label column already removed).
data_dummies.columns

In [None]:
# Convert the encoded features to a plain float array for the estimators below.
# The builtin `float` is used because the `np.float` alias was removed in
# NumPy 1.24; both resolve to the same float64 dtype.
X = data_dummies.values.astype(float)

In [None]:
# Dimensions of the feature array built from the encoded frame.
X.shape

# Elliptic Envelope

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.covariance import EllipticEnvelope

In [None]:
# Standardize the features, then project them with PCA.
# A fractional n_components asks PCA to keep as many components as are
# needed to explain that share (80%) of the variance.
X_scaled = StandardScaler().fit(X).transform(X)
pca = PCA(n_components=.8)
X_preprocessed = pca.fit_transform(X_scaled)

In [None]:
# Number of components the fitted PCA actually kept.
pca.n_components_

In [None]:
# Fit a robust Gaussian envelope on the PCA-reduced features, with 30% of the
# samples treated as outliers (contamination=.3).
# random_state pins the randomized robust covariance fit so the confusion
# matrix and AUC computed from `ee` are reproducible across kernel restarts.
ee = EllipticEnvelope(contamination=.3, random_state=0).fit(X_preprocessed)

In [None]:
# predict() labels inliers +1 and outliers -1. Shifting by one maps these to
# 2 and 0 so bincount can tally them: [n_outliers, 0, n_inliers].
ee_labels = ee.predict(X_preprocessed)
np.bincount(ee_labels + 1)

In [None]:
from sklearn.metrics import confusion_matrix

# Compare the true "good" label against the envelope's inlier prediction.
confusion_matrix(
    y_true=data["class"] == "good",
    y_pred=ee.predict(X_preprocessed) == 1,
)

In [None]:
from sklearn.metrics import roc_auc_score

# Rank samples by the envelope's decision function and score that ranking
# against the true "good" label.
roc_auc_score(
    y_true=data["class"] == "good",
    y_score=ee.decision_function(X_preprocessed),
)

# PCA

In [None]:
# Fit a PCA that keeps every component and plot the share of variance
# explained by each one.
pca_full = PCA()
pca_full.fit(X_scaled)
plt.figure()
plt.plot(pca_full.explained_variance_ratio_)

In [None]:
# Use the per-sample log-likelihood under the fitted PCA model as the
# "normality" score and evaluate it against the true "good" label.
roc_auc_score(
    y_true=data["class"] == "good",
    y_score=pca.score_samples(X_scaled),
)

# Robust PCA

In [None]:
# RobustPCA comes from a local `robust_pca` module that is not visible here.
# NOTE(review): presumably it follows the scikit-learn estimator API, with
# fit() returning the fitted estimator - confirm against the module.
from robust_pca import RobustPCA
rpca = RobustPCA().fit(X_scaled)

In [None]:
# Evaluate the RobustPCA per-sample scores against the true "good" label.
# NOTE(review): assumes higher score_samples values mean "more normal",
# as with the scikit-learn estimators used elsewhere - confirm.
roc_auc_score(
    y_true=data["class"] == "good",
    y_score=rpca.score_samples(X_scaled),
)

# KDE

In [None]:
from sklearn.neighbors import KernelDensity

In [None]:
# Fit a kernel density estimate (bandwidth 5) on the standardized features.
kde = KernelDensity(bandwidth=5)
kde.fit(X_scaled)


In [None]:
# Histogram of the per-sample log-density under the fitted KDE.
kde_log_density = kde.score_samples(X_scaled)
plt.figure()
plt.hist(kde_log_density, bins=100);

In [None]:
# Use the KDE log-density as the "normality" score and evaluate it against
# the true "good" label.
roc_auc_score(
    y_true=data["class"] == "good",
    y_score=kde.score_samples(X_scaled),
)

# Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest

# Fit an isolation forest on the unscaled one-hot encoded features, with 30%
# of the samples treated as outliers (contamination=.3).
# The forest is built from random splits, so random_state is fixed to keep
# the confusion matrix and AUC computed from `iso` reproducible across runs.
iso = IsolationForest(contamination=.3, random_state=0).fit(data_dummies.values)

In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score

# Compare the true "good" label against the forest's inlier prediction.
confusion_matrix(
    y_true=data["class"] == "good",
    y_pred=iso.predict(data_dummies.values) == 1,
)

In [None]:
# Rank samples by the forest's decision function and score that ranking
# against the true "good" label.
roc_auc_score(
    y_true=data["class"] == "good",
    y_score=iso.decision_function(data_dummies.values),
)

# Exercise

Apply the techniques above to the Adult dataset, treating the rich people as the outliers.