In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn import decomposition
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from scipy.linalg import eigh
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
import datetime
import joblib


<p style='font-size:20px'><b> Modelling approach </b></p>
<p>

- Cascading ensemble has been proven quite effective for solving classification problems with highly imbalanced datasets & the same will be used for this project.
- We will develop 3 layers of model, in each layer certain # of points will be filtered for next level and the rest will be cleared as 'not-fraud' in that level. 
- The # of points passed to the next level will be decided by setting a threshold on the predicted probability of a point being fraud.     
- This will enable us to tune thresholds as per the business requirement of the model as mentioned during EDA.
    
</p>

<p style='font-size:18px'><b> 1) Level 1 Model </b></p>
<p style='font-size:16px'><b> 1.1 Objective </b></p>

<p>

- As visualised in EDA, there is a huge/dense cluster of not-fraud points that can be easily separated from the overlapping region of the two classes.
- The objective of this level will be to classify these easily separable points as 'not-fraud' & pass rest of the points to the next level.     
    
</p>

<p style='font-size:16px'><b> 1.2 Model characteristics </b></p>

<p>

Based on the objective of this level, the ML model should have the following properties:
- Doesn't need to have high complexity, since we are only targeting the easily separable not-fraud points.
- Should have a good probabilistic interpretation, to enable tuning of the probability threshold.
- For practical purposes, a fast run-time would be beneficial to green flag transactions that are very likely to be not-fraud.
    
Some models that can suit these characteristics:
1. Logistic regression    
2. RBF SVM    
3. Decision tree
    
    
</p>


In [2]:
# Read the credit card fraud training split from disk into a DataFrame
train_data = pd.read_csv(filepath_or_buffer='dataset/creditcard_train.csv')

<p style='font-size:18px'><b> 2) Custom function: Probability thresholding</b></p>

<p>

Following are the details of this function (input, output, purpose)
    
A) Inputs:
- y_proba: The prob value predicted by the model for the dataset
- y_actual: The actual class label of corresponding points in the dataset
- num_buckets: The number of equally spaced buckets of probability values

B) OUTPUT:
- The num of positive & negative class labels that fall under probability range buckets defined by num_buckets
- The lowest probability value (threshold) that a positive (fraud) class label can assume across all predictions
- The number of points below 100%, 90%, 50% value of this threshold

C) PURPOSE:
- To decide the threshold for qualifying a point as not-fraud with high confidence based on probability values.
- The points below this threshold will 'clear' level 1 to be classified as 'not-fraud', rest of the points will be passed as training data to the next level.
- We will set this threshold to be 25% lower than the lowest probability value in the positive class label.

</p>

In [3]:
def prob_class(y_proba, y_actual, num_buckets):
    """Report how the class labels are spread over predicted-probability buckets.

    Prints a table with the number of not-fraud (label 0) and fraud (label 1)
    points falling in each of `num_buckets` equal-width probability ranges over
    [0, 1], followed by a short summary built around the lowest probability
    assigned to any fraud point.

    Parameters
    ----------
    y_proba : array-like of float
        Predicted probability of the fraud class, one value per point.
    y_actual : array-like of int
        Actual class labels (0 = not-fraud, 1 = fraud), aligned with `y_proba`.
    num_buckets : int
        Number of equally spaced probability buckets used for the table.

    Returns
    -------
    tuple
        (fraud_min_prob, not_fraud_1, not_fraud_2, not_fraud_3) where
        fraud_min_prob is the lowest probability among fraud points and
        not_fraud_1 / _2 / _3 are the counts of not-fraud points whose
        probability is strictly below 100% / 90% / 50% of fraud_min_prob.

    Raises
    ------
    ValueError
        If `y_proba` and `y_actual` have different shapes, or if `y_actual`
        contains no fraud (label 1) points, in which case no threshold exists.
    """
    # Convert up front so plain lists and pandas Series behave like ndarrays in
    # the element-wise comparisons and boolean indexing below.
    y_proba = np.asarray(y_proba)
    y_actual = np.asarray(y_actual)
    if y_proba.shape != y_actual.shape:
        # Without this check, shapes such as (n,) vs (n, 1) would broadcast silently.
        raise ValueError(
            "y_proba and y_actual must have the same shape, got {} and {}".format(
                y_proba.shape, y_actual.shape))

    # The label masks do not depend on the bucket, so compute them once.
    is_fraud = (y_actual == 1)
    is_not_fraud = (y_actual == 0)

    prob_buckets = np.linspace(0, 1, num_buckets + 1)
    prob_class_report = [['Prob range', 'Not-Fraud', 'Fraud']]

    for i in range(len(prob_buckets) - 1):

        # Col1: Probability range
        prob_range = '(' + str(round(prob_buckets[i], 2)) + ', ' + str(round(prob_buckets[i+1], 2)) + ']'

        # Buckets are half-open (low, high]; the first lower bound is pushed
        # below zero so that points with probability exactly 0 are counted.
        prob_high = prob_buckets[i+1]
        prob_low = -0.01 if i == 0 else prob_buckets[i]
        in_bucket = (y_proba > prob_low) & (y_proba <= prob_high)

        # Col2: #Points that are not fraud in the prob_range
        not_fraud = np.sum(in_bucket & is_not_fraud)

        # Col3: #Points that are fraud in the prob_range
        fraud = np.sum(in_bucket & is_fraud)

        prob_class_report.append([prob_range, not_fraud, fraud])

    # Printing the Prob range vs count of fraud or not-fraud points
    print("\nDistribution of class labels for various prob buckets: \n")
    col_width = 15
    print("".join(word.center(col_width) for word in prob_class_report[0]))
    print("----------------------------------------------------")
    for row in prob_class_report[1:]:
        print(str(row[0]).center(col_width), end = "")
        print(str(row[1]).center(col_width), end = "")
        print(str(row[2]).center(col_width))

    # The threshold is defined by the fraud points, so at least one is required.
    if not np.any(is_fraud):
        raise ValueError("y_actual contains no fraud (label 1) points; cannot derive a threshold")

    # Prob value of the fraud datapoint having lowest prob value
    fraud_min_prob = np.min(y_proba[is_fraud])

    # Number of not fraud datapoints having prob value below the fraud_min_prob
    not_fraud_1 = np.sum(is_not_fraud & (y_proba < fraud_min_prob))

    # Number of not fraud datapoints having prob value below 90% of fraud_min_prob
    not_fraud_2 = np.sum(is_not_fraud & (y_proba < 0.9*fraud_min_prob))

    # Number of not fraud datapoints having prob value below 50% of fraud_min_prob
    not_fraud_3 = np.sum(is_not_fraud & (y_proba < 0.5*fraud_min_prob))

    print("\n1. Lowest prob value in the fraud class (threshold): {}%".format(round(100*fraud_min_prob,3)))
    print("2. # of not fraud pts below threshold: {}".format(not_fraud_1))
    print("3. # of not fraud pts below 90% of threshold: {}".format(not_fraud_2))
    print("4. # of not fraud pts below 50% of threshold: {}".format(not_fraud_3))

    return (fraud_min_prob, not_fraud_1, not_fraud_2, not_fraud_3)
    

In [4]:
# Separate the target vector Y from the feature matrix X ('Time' and 'Class' are not used as features)
train_y = train_data['Class'].to_numpy()
train_x = train_data.drop(columns=['Time', 'Class']).to_numpy()

In [5]:
# Standardize the features: required for Logistic regression & RBF SVM, not necessary for Decision trees.
# The fitted scaler is kept as its own object because it is persisted alongside the L1 model later on.
scaler = StandardScaler().fit(train_x)
train_x_scaled = scaler.transform(train_x)


<p style='font-size:18px'><b> Option 1: Logistic regression </b></p>


In [6]:
# Grid-search a logistic regression over regularisation strength and fraud class weight,
# selecting the candidate with the best 5-fold cross-validated recall.
start_time = datetime.datetime.now()

logreg_ = LogisticRegression(max_iter=1000)
hyper_params = {
    'C': [10**exponent for exponent in range(-2, 3)],
    'class_weight': [{0: 1, 1: fraud_weight} for fraud_weight in (1, 5, 10, 100)],
}
logreg_gcv = GridSearchCV(estimator=logreg_, param_grid=hyper_params, scoring='recall', cv=5)
logreg_gcv.fit(train_x_scaled, train_y)

print("Training time: ", datetime.datetime.now() - start_time)
print("Hyper param values for best estimator: ", logreg_gcv.best_params_)


Training time:  0:01:37.478292
Hyper param values for best estimator:  {'C': 0.1, 'class_weight': {0: 1, 1: 100}}


In [7]:
# Probability vs class labels for the best logistic regression, evaluated on the training set
l1_logreg_clf = logreg_gcv.best_estimator_
y_pred_proba_train = l1_logreg_clf.predict_proba(train_x_scaled)[:, 1]
prob_details = prob_class(y_proba=y_pred_proba_train, y_actual=train_y, num_buckets=20)

# Threshold is 75% of the lowest fraud probability; points at or above it are
# not confidently not-fraud and are kept for L2
prob_thresh = 0.75 * prob_details[0]
print("\nProb threshold selected: {}%".format(round(100*prob_thresh,3)))
temp = train_data.loc[y_pred_proba_train >= prob_thresh]
print("Number of points filtered for L2: ", len(temp))


Distribution of class labels for various prob buckets: 

   Prob range     Not-Fraud        Fraud     
----------------------------------------------------
  (0.0, 0.05]       195010           17      
  (0.05, 0.1]       10151            12      
  (0.1, 0.15]        3772            4       
  (0.15, 0.2]        1600            3       
  (0.2, 0.25]        763             1       
  (0.25, 0.3]        379             1       
  (0.3, 0.35]        251             4       
  (0.35, 0.4]        239             2       
  (0.4, 0.45]        206             1       
  (0.45, 0.5]        160             2       
  (0.5, 0.55]        150             2       
  (0.55, 0.6]        135             2       
  (0.6, 0.65]         85             1       
  (0.65, 0.7]         64             2       
  (0.7, 0.75]         43             2       
  (0.75, 0.8]         46             2       
  (0.8, 0.85]         27             4       
  (0.85, 0.9]         29             1       
  (0.9, 0.95]  



<p style='font-size:18px'><b> Option 2: RBF SVM </b></p>


In [8]:
# Option 2: RBF SVM
# Fit an RBF-kernel SVM with probability estimates enabled, weighting fraud (class 1) 5x.

start_time = datetime.datetime.now()

class_weights = {0:1, 1:5}
# probability=True calibrates the probabilities with an internal, shuffled cross-validation,
# so random_state is pinned: the L1 threshold derived from these probabilities is saved
# to disk and must be reproducible across kernel restarts.
# NOTE(review): max_iter=5000 caps the solver iterations; confirm no ConvergenceWarning is raised.
l1_rbfsvm_clf = SVC(C=1.0, max_iter=5000, kernel='rbf', probability=True,
                    class_weight=class_weights, random_state=42)
l1_rbfsvm_clf.fit(train_x_scaled, train_y)

print("Training time: ", datetime.datetime.now() - start_time)

Training time:  0:03:44.426800




In [19]:
# Probability vs class labels for the RBF SVM, evaluated on the training set
y_pred_proba_train = l1_rbfsvm_clf.predict_proba(train_x_scaled)[:, 1]
prob_details = prob_class(y_proba=y_pred_proba_train, y_actual=train_y, num_buckets=20)

# Threshold is 75% of the lowest fraud probability; points at or above it are
# not confidently not-fraud and are kept for L2
prob_thresh = 0.75 * prob_details[0]
print("\nProb threshold selected: {}%".format(round(100*prob_thresh,3)))
temp = train_data.loc[y_pred_proba_train >= prob_thresh]
print("Number of points filtered for L2: ", len(temp))

# Keep the SVM threshold under its own name so later cells can still use it
# after prob_thresh is reassigned
l1_prob_thresh = prob_thresh


Distribution of class labels for various prob buckets: 

   Prob range     Not-Fraud        Fraud     
----------------------------------------------------
  (0.0, 0.05]       213209           30      
  (0.05, 0.1]         4              4       
  (0.1, 0.15]         5              2       
  (0.15, 0.2]         0              1       
  (0.2, 0.25]         1              1       
  (0.25, 0.3]         1              0       
  (0.3, 0.35]         0              2       
  (0.35, 0.4]         0              0       
  (0.4, 0.45]         0              0       
  (0.45, 0.5]         2              0       
  (0.5, 0.55]         0              0       
  (0.55, 0.6]         1              0       
  (0.6, 0.65]         0              0       
  (0.65, 0.7]         1              0       
  (0.7, 0.75]         0              2       
  (0.75, 0.8]         1              0       
  (0.8, 0.85]         0              0       
  (0.85, 0.9]         1              1       
  (0.9, 0.95]  



<p style='font-size:18px'><b> Option 3: Decision tree </b></p>


<p>

- Decision trees can easily overfit the data, so we need to constrain the model to a shallow decision tree.
- The leaf nodes having all not-fraud datapoints will be cleared from this level, and the nodes having both class labels will be filtered for next level.    
- Therefore, we will experiment with various depths to choose the best model with the right bias-variance tradeoff.


</p>

In [10]:
# Decision tree 1: Max depth = 5

start_time = datetime.datetime.now()

# random_state is pinned because scikit-learn permutes the features at every split,
# so equally good splits could otherwise be resolved differently from run to run.
l1_dectree_clf1 = DecisionTreeClassifier(max_depth = 5, random_state = 42)
l1_dectree_clf1.fit(train_x_scaled, train_y)

print("Training time: ", datetime.datetime.now() - start_time)

# Some attributes of the decision tree (depth actually reached and total node count)
print("\n1. Max depth: ", l1_dectree_clf1.tree_.max_depth)
print("2. # of nodes: ", l1_dectree_clf1.tree_.node_count)

Training time:  0:00:03.822064

1. Max depth:  5
2. # of nodes:  41


In [11]:
# Probability vs class labels for decision tree 1, evaluated on the training set
y_pred_proba_train = l1_dectree_clf1.predict_proba(train_x_scaled)[:, 1]
prob_details = prob_class(y_proba=y_pred_proba_train, y_actual=train_y, num_buckets=20)

# Threshold is 75% of the lowest fraud probability; points at or above it are
# not confidently not-fraud and are kept for L2
prob_thresh = 0.75 * prob_details[0]
print("\nProb threshold selected: {}%".format(round(100*prob_thresh,3)))
temp = train_data.loc[y_pred_proba_train >= prob_thresh]
print("Number of points filtered for L2: ", len(temp))


Distribution of class labels for various prob buckets: 

   Prob range     Not-Fraud        Fraud     
----------------------------------------------------
  (0.0, 0.05]       213180           58      
  (0.05, 0.1]         26             2       
  (0.1, 0.15]         0              0       
  (0.15, 0.2]         8              2       
  (0.2, 0.25]         0              0       
  (0.25, 0.3]         0              0       
  (0.3, 0.35]         0              0       
  (0.35, 0.4]         0              0       
  (0.4, 0.45]         0              0       
  (0.45, 0.5]         0              0       
  (0.5, 0.55]         0              0       
  (0.55, 0.6]         6              9       
  (0.6, 0.65]         0              0       
  (0.65, 0.7]         0              0       
  (0.7, 0.75]         0              0       
  (0.75, 0.8]         0              0       
  (0.8, 0.85]         1              5       
  (0.85, 0.9]         0              0       
  (0.9, 0.95]  

In [12]:
# Decision tree 2: Max depth = 15

start_time = datetime.datetime.now()

# random_state is pinned because scikit-learn permutes the features at every split,
# so equally good splits could otherwise be resolved differently from run to run.
l1_dectree_clf2 = DecisionTreeClassifier(max_depth = 15, random_state = 42)
l1_dectree_clf2.fit(train_x_scaled, train_y)

print("Training time: ", datetime.datetime.now() - start_time)

# Some attributes of the decision tree (depth actually reached and total node count)
print("\n1. Max depth: ", l1_dectree_clf2.tree_.max_depth)
print("2. # of nodes: ", l1_dectree_clf2.tree_.node_count)

Training time:  0:00:09.893523

1. Max depth:  15
2. # of nodes:  225


In [13]:
# Probability vs class labels for decision tree 2, evaluated on the training set
y_pred_proba_train = l1_dectree_clf2.predict_proba(train_x_scaled)[:, 1]
prob_details = prob_class(y_proba=y_pred_proba_train, y_actual=train_y, num_buckets=20)

# Threshold is 75% of the lowest fraud probability; points at or above it are
# not confidently not-fraud and are kept for L2
prob_thresh = 0.75 * prob_details[0]
print("\nProb threshold selected: {}%".format(round(100*prob_thresh,3)))
temp = train_data.loc[y_pred_proba_train >= prob_thresh]
print("Number of points filtered for L2: ", len(temp))


Distribution of class labels for various prob buckets: 

   Prob range     Not-Fraud        Fraud     
----------------------------------------------------
  (0.0, 0.05]       213213           15      
  (0.05, 0.1]         9              1       
  (0.1, 0.15]         0              0       
  (0.15, 0.2]         0              0       
  (0.2, 0.25]         10             3       
  (0.25, 0.3]         0              0       
  (0.3, 0.35]         0              0       
  (0.35, 0.4]         0              0       
  (0.4, 0.45]         0              0       
  (0.45, 0.5]         0              0       
  (0.5, 0.55]         0              0       
  (0.55, 0.6]         0              0       
  (0.6, 0.65]         0              0       
  (0.65, 0.7]         0              0       
  (0.7, 0.75]         0              0       
  (0.75, 0.8]         0              0       
  (0.8, 0.85]         0              0       
  (0.85, 0.9]         0              0       
  (0.9, 0.95]  

In [16]:
# Decision tree 3: Max depth limit = 21
# max_depth is only an upper bound; the depth the fitted tree actually reaches is printed below.

start_time = datetime.datetime.now()

# random_state is pinned because scikit-learn permutes the features at every split,
# so equally good splits could otherwise be resolved differently from run to run.
l1_dectree_clf3 = DecisionTreeClassifier(max_depth = 21, random_state = 42)
l1_dectree_clf3.fit(train_x_scaled, train_y)

print("Training time: ", datetime.datetime.now() - start_time)

# Some attributes of the decision tree (depth actually reached and total node count)
print("\n1. Max depth: ", l1_dectree_clf3.tree_.max_depth)
print("2. # of nodes: ", l1_dectree_clf3.tree_.node_count)

Training time:  0:00:10.234272

1. Max depth:  19
2. # of nodes:  287


In [17]:
# Probability vs class labels for decision tree 3, evaluated on the training set
y_pred_proba_train = l1_dectree_clf3.predict_proba(train_x_scaled)[:, 1]
prob_details = prob_class(y_proba=y_pred_proba_train, y_actual=train_y, num_buckets=20)

# Threshold is 75% of the lowest fraud probability; points at or above it are
# not confidently not-fraud and are kept for L2
prob_thresh = 0.75 * prob_details[0]
print("\nProb threshold selected: {}%".format(round(100*prob_thresh,3)))
temp = train_data.loc[y_pred_proba_train >= prob_thresh]
print("Number of points filtered for L2: ", len(temp))


Distribution of class labels for various prob buckets: 

   Prob range     Not-Fraud        Fraud     
----------------------------------------------------
  (0.0, 0.05]       213232           0       
  (0.05, 0.1]         0              0       
  (0.1, 0.15]         0              0       
  (0.15, 0.2]         0              0       
  (0.2, 0.25]         0              0       
  (0.25, 0.3]         0              0       
  (0.3, 0.35]         0              0       
  (0.35, 0.4]         0              0       
  (0.4, 0.45]         0              0       
  (0.45, 0.5]         0              0       
  (0.5, 0.55]         0              0       
  (0.55, 0.6]         0              0       
  (0.6, 0.65]         0              0       
  (0.65, 0.7]         0              0       
  (0.7, 0.75]         0              0       
  (0.75, 0.8]         0              0       
  (0.8, 0.85]         0              0       
  (0.85, 0.9]         0              0       
  (0.9, 0.95]  

<p style='font-size:18px'><b> Comparison </b></p>

<p>

1. Logistic regression
- Has the fastest run-time, but is very simple & underfitting making it unable to clear a sufficient number of 'not-fraud' points from this level.


2. RBF SVM
- Has the worst train & run time, but has the best probabilistic interpretation.
- It is also able to clear a huge number of 'not-fraud' points in level 1 without overfitting the data.


3. Decision tree: After experimenting with the depth of the decision tree, following observations were made:
- For depth <= 20: The model was heavily underfitting, allowing only ~374 points to clear level 1 as not-fraud.
- For depth = 27: The model completely overfits the data, with 100% accuracy, precision & recall.
- For depth IN [21, 24]: A balanced bias-variance tradeoff is visible, with the model filtering ~15k points for level 2, similar to RBF SVM, but with a much faster train & run time.

</p>


<p style='font-size:16px'><b> Conclusion </b></p>
    
<p>
    
- We will choose RBF SVM as the final model for L1 due to its better probabilistic interpretation.
- The prob values on decision tree are simply the fraction of positive points in a leaf node, this may not work well enough for future unseen datapoints.
- However, for faster run-times, we can experiment with decision tree separately.
                     
</p>
    

<p style='font-size:18px'><b> Comparison </b></p>

<p>


A) Logistic regression
- Has the fastest run-time, but is very simple & underfitting making it unable to clear a sufficient number of 'not-fraud' points from this level.
    
B) RBF SVM
- Has the worst train & run time, but has the best probabilistic interpretation.
- It is also able to clear a huge number of 'not-fraud' points in level 1 without overfitting the data.
    
C) Decision tree: After experimenting with various depths, following observations were made:
- Below a certain depth value, the model was heavily underfitting allowing < 2000 points to clear the level 1 as not-fraud.    
- Beyond another depth value, the model completely overfits train data with 100% accuracy, precision & recall.
- Only when the depth is in a certain range, the model filters a sufficient number of datapoints & also doesn't overfit the dataset.                                                                                 
 
</p>



<p style='font-size:18px'><b> Conclusion </b></p>

<p>

- We will choose RBF SVM as the final model for L1 due to its better probabilistic interpretation.
- The prob values on decision tree are simply the fraction of positive points in a leaf node, this may not work well enough for future unseen datapoints
- However, for faster run-times, we can experiment with decision tree separately.

 
</p>


In [20]:
# Filtering points that didn't pass the L1 model as 'not-fraud' to be used as training data for L2 model.
# The SVM probabilities are recomputed here because y_pred_proba_train is reassigned by the
# decision-tree cells above.
y_pred_proba_train = l1_rbfsvm_clf.predict_proba(train_x_scaled)[:,1]
train_l2 = train_data[(y_pred_proba_train>=l1_prob_thresh)]

# mode='x' refuses to overwrite an existing file. Only that case is treated as benign;
# any other failure (missing directory, permissions, ...) is left to propagate instead
# of being silently reported as "already saved".
# NOTE(review): if the CSV already exists it is NOT refreshed, while the model and threshold
# below are always re-dumped - confirm the saved CSV still matches the saved model.
try:
    train_l2.to_csv('dataset/creditcard_train_l2.csv', mode = 'x', index = False)
except FileExistsError:
    print("File already saved?")

# Persist everything needed to apply L1 at inference time: scaler, classifier and threshold
joblib.dump(scaler, 'models/l1_scaler.pkl')
joblib.dump(l1_rbfsvm_clf, 'models/l1_rbfsvm_clf.pkl')
joblib.dump(l1_prob_thresh, 'models/l1_prob_thresh.pkl')


['models/l1_prob_thresh.pkl']