# Unsupervised Anomaly Detection Techniques 1

In [1]:
import numpy as np
import pandas as pd

In [2]:
credit_data = pd.read_csv('creditcard.csv')

In [3]:
credit_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
#the fraud and normal class transaction
fraud_classes = credit_data[credit_data['Class']==1]

normal_classes = credit_data[credit_data['Class']==0]

In [5]:
print(fraud_classes.shape,normal_classes.shape)

(492, 31) (284315, 31)


In [6]:
## Let's consider a small sample of the data, say around 20% of dataset
new_data= credit_data.sample(frac = 0.2,random_state=1)

new_data.shape

(56961, 31)

In [7]:
#original dataset
credit_data.shape

(284807, 31)

In [8]:
#number of fraud and normal transactions in the dataset

fraud_transaction = new_data[new_data['Class']==1]
normal_transaction = new_data[new_data['Class']==0]
outlier_ratio = len(fraud_transaction)/float(len(normal_transaction))

In [9]:
print(outlier_ratio)

print("Fraud Cases : {}".format(len(fraud_transaction)))

print("Normal Cases : {}".format(len(normal_transaction)))

0.0015296972254457222
Fraud Cases : 87
Normal Cases : 56874


In [10]:
columns =  new_data.columns.tolist()

# Filter the columns
columns = [c for c in columns if c not in ["Class"]]

# Predicting variable 
target = "Class"
RANDOM_SEED = 42
state = np.random.RandomState(42)
X = new_data[columns]
Y = new_data[target]
X_outliers = state.uniform(low=0, high=1, size=(X.shape[0], X.shape[1]))

In [11]:
# Print the shapes of X & Y
print(X.shape)
print(Y.shape)

(56961, 30)
(56961,)


Isolation Forest: This is a technique to tetect outliers or anomlies. This alogrithm considers each data pt as anomalies. This method is that's why called Isolation; This is very efficient way of detecting anomalies, regardles of the size of data 

Local Outlier Factr: This is also an unsupervised anomaly detection which uses local density of the data point wrt to its nearest neighbors. the neighbors are usually greater than the number of objects (min)

Here, I am checking if svm is also a good fit or not for financial datasets

In [12]:
import sklearn
from sklearn.metrics import classification_report,accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
#The outlier methods that are being used as below

outlier_classifiers = {
    "Isolation Forest":IsolationForest(n_estimators=100, max_samples=len(X), 
                                       contamination=outlier_ratio,random_state=state, verbose=0),
    "Local Outlier Factor":LocalOutlierFactor(n_neighbors=20, algorithm='auto', 
                                              leaf_size=30, metric='euclidean',
                                              p=2, metric_params=None, contamination=outlier_ratio),
    "Support Vector Machine":OneClassSVM(kernel='rbf', degree=3, gamma=0.1,nu=0.05, 
                                         max_iter=-1, random_state=state)
   
}

In [13]:
n_outliers = len(fraud_transaction)
for i, (clf_name,clf) in enumerate(outlier_classifiers.items()):
    #Fit the data and tag outliers
    if clf_name == "Support Vector Machine":
        clf.fit(X)
        y_predicted = clf.predict(X)
    elif clf_name == "Local Outlier Factor":
        y_predicted = clf.fit_predict(X)
        scores_prediction = clf.negative_outlier_factor_
    else:    
        clf.fit(X)
        scores_prediction = clf.decision_function(X)
        y_predicted = clf.predict(X)
        
    #Normal = 0, Fraud =1 reshaping the transactions
    y_predicted[y_predicted == 1] = 0
    y_predicted[y_predicted == -1] = 1
    number_errors = (y_predicted != Y).sum()
    
    # Classification Metrics
    print("{}: {}".format(clf_name,number_errors))
    print("Accuracy Score:")
    print(accuracy_score(Y,y_predicted))
    print("Classification Report:")
    print(classification_report(Y,y_predicted))



Isolation Forest: 139
Accuracy Score:
0.9975597338529871
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56874
           1       0.20      0.21      0.21        87

    accuracy                           1.00     56961
   macro avg       0.60      0.60      0.60     56961
weighted avg       1.00      1.00      1.00     56961

Local Outlier Factor: 173
Accuracy Score:
0.9969628342199048
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56874
           1       0.01      0.01      0.01        87

    accuracy                           1.00     56961
   macro avg       0.50      0.50      0.50     56961
weighted avg       1.00      1.00      1.00     56961





Support Vector Machine: 29070
Accuracy Score:
0.48965081371464686
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.49      0.66     56874
           1       0.00      0.56      0.00        87

    accuracy                           0.49     56961
   macro avg       0.50      0.53      0.33     56961
weighted avg       1.00      0.49      0.66     56961



Conclusion: This was for a sample amount of data and the results are as above. The inference that we can get from this is:
a) Isolation forest identified 139 erros where as LOF identified 173 and svm identified 29070. 
b) The islolation forest has 99.7% accuracy and lof has 99.6% accuracy, and accuracy of svm is 0.40. From this we can say that svm will not perform well on unsupervised anomaly detection.
c) From the overall model results, we can say that Isolation forest is a better algorithm as compared to LOF.
d) The fraud case detection in isolation forest is around 21%, LOF around 1% and 0% for svm
e) The sample size can be increased to get a better accuracy. In my other notebook i have used the complete dataset to see the accuracy of both the algorithms. The svm algortihm is excluded there.