In [11]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from Resampling import data, upsampled, downsampled, y, outlier_fraction


"""The first unsupervised algorithm I will be trying is the isolation forest 
algorithm with the original data (cut to 10% of its original size for faster
computation). """

# Re-setting X to the data minus the Class feature
X = data.drop('Class', axis=1)

# Fitting the model
a = IsolationForest(max_samples=len(X), contamination=outlier_fraction).fit(X)

# Prediction
y_prediction = a.predict(X)

y_prediction[y_prediction == 1] = 0  # Valid transactions are labelled as 0.
y_prediction[y_prediction == -1] = 1  # Fraudulent transactions are labelled as 1.

errors = (y_prediction != y).sum()  # Total number of errors is calculated.

print(errors)
print(accuracy_score(y_prediction, y))
print(classification_report(y_prediction, y))

64
0.9977528878901724
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28426
           1       0.40      0.35      0.37        55

    accuracy                           1.00     28481
   macro avg       0.70      0.67      0.69     28481
weighted avg       1.00      1.00      1.00     28481



In [13]:
"""Now, let's try the same isolation forest algorithm with a randomly upsampled dataset with 
21326 fraudulent and 21326 valid transactions, where the outlier fraction
is .5"""

X = upsampled.drop('Class', axis=1)
y = upsampled.Class

# New model with 50% contamination because there are equal amounts of fraud and valid
b = IsolationForest(max_samples=len(X), contamination=.50).fit(X)

# Prediction
y_prediction2 = b.predict(X)

y_prediction2[y_prediction2 == 1] = 0  # Valid transactions are labelled as 0.
y_prediction2[y_prediction2 == -1] = 1  # Fraudulent transactions are labelled as 1.

errors2 = (y_prediction2 != y).sum()  # Total number of errors is calculated.

print(errors2)
print(accuracy_score(y_prediction2, y))
print(classification_report(y_prediction2, y))


"""
The upsampled dataset has much better precision and recall for finding fraudulent transactions,
and is overall much more accurate at finding out which data points are fraudulent. However, it
only correctly identifies around 82% of non-fraudulent cases as opposed to with the original data,
where every single non-fraudulent case was identified."""

7152
0.8322623012336413
              precision    recall  f1-score   support

           0       0.83      0.83      0.83     21319
           1       0.83      0.83      0.83     21319

    accuracy                           0.83     42638
   macro avg       0.83      0.83      0.83     42638
weighted avg       0.83      0.83      0.83     42638



'\nThe upsampled dataset has much better precision and recall for finding fraudulent transactions,\nand is overall much more accurate at finding out which data points are fraudulent. However, it\nonly correctly identifies around 82% of non-fraudulent cases as opposed to with the original data,\nwhere every single non-fraudulent case was identified.\n'

In [5]:

""""Next, we will try the local outlier factor algorithm. First, we will use the original
dataset (but 1/10 of the original size)

"""

# Reset the variables for X and y for the original data
X = data.drop('Class', axis=1)
y = data.Class # y is output

# Initialize a model with 20 neighbors
c = LocalOutlierFactor(n_neighbors = 20,contamination = outlier_fraction)
# Fit the model
y_prediction3 = c._fit_predict(X)
y_prediction3[y_prediction3 == 1] = 0 # Valid transactions are labelled as 0.
y_prediction3[y_prediction3 == -1] = 1 # Fraudulent transactions are labelled as 1.

errors3 = (y_prediction3 != y).sum()
print(accuracy_score(y_prediction3,y))
print(classification_report(y_prediction3,y))

"""
Both the precision and recall are pretty low, meaning that this is not a good algorithm to apply
to this dataset. This is partly due to the fact that there are so many more non-fraud data points
than fraud data points, so it is difficult to draw a line between the LOF of fraud and non-fraud cases.
"""

0.9964888873283944
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28426
           1       0.02      0.02      0.02        55

    accuracy                           1.00     28481
   macro avg       0.51      0.51      0.51     28481
weighted avg       1.00      1.00      1.00     28481



In [14]:
"""
Next, I will try the same LOF algorithm, but with the upsampled dataset this time to try to offset
some of the errors in precision and accuracy.
"""

X = upsampled.drop('Class', axis=1)
y = upsampled.Class

# Initialize a model with 20 neighbors
d = LocalOutlierFactor(n_neighbors = 20,contamination = .5)
y_prediction4 = d._fit_predict(X)
y_prediction4[y_prediction4 == 1] = 0 
y_prediction4[y_prediction4 == -1] = 1

errors4 = (y_prediction4 != y).sum()
print(accuracy_score(y_prediction4,y))
print(classification_report(y_prediction4,y))

"""
Unfortunately, it is even worse when there are equal amounts of fraudulent and non-fraudulent cases.
In conclusion, isolation forest on an upsampled dataset is the best option for this dataset.
"""

0.17207655143299405
              precision    recall  f1-score   support

           0       0.34      0.26      0.29     28656
           1       0.00      0.00      0.00     13982

    accuracy                           0.17     42638
   macro avg       0.17      0.13      0.15     42638
weighted avg       0.23      0.17      0.20     42638

