In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from statistics import mean
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import PowerTransformer
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.utils import resample

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve,confusion_matrix


from sklearn import metrics 
from sklearn.model_selection import  train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler,RobustScaler
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
# Load the raw transaction table from Fraud.csv (path is relative to the working directory).
df = pd.read_csv("Fraud.csv")

In [29]:
# Separate the target (isFraud) from the predictors; the nameOrig / nameDest
# columns are excluded from the feature frame as well.
y = df['isFraud']
X = df.drop(columns=['isFraud', 'nameOrig', 'nameDest'])

In [31]:
# Learn an integer code for every distinct label in the `type` column.
# NOTE(review): the encoder is fitted on the full X, i.e. before the train/test split.
label_encoder = LabelEncoder()
label_encoder.fit(X["type"])

In [32]:
# Fitted labels; each label's position in this list is the integer code it is mapped to.
list(label_encoder.classes_)

['CASH_IN', 'CASH_OUT', 'DEBIT', 'PAYMENT', 'TRANSFER']

In [33]:
# Replace the string labels in `type` with their integer codes.
# NOTE: the column is overwritten, so this cell cannot be re-run without rebuilding X first.
X["type"] = label_encoder.transform(X["type"])

In [34]:
# Preview the feature frame after encoding `type`.
X.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest
0,1,3,9839.64,170136.0,160296.36,0.0,0.0
1,1,3,1864.28,21249.0,19384.72,0.0,0.0
2,1,4,181.0,181.0,0.0,0.0,0.0
3,1,1,181.0,181.0,0.0,21182.0,0.0
4,1,3,11668.14,41554.0,29885.86,0.0,0.0


In [35]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the fraud ratio
# the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [36]:
# There are no null values, so we move straight on to feature selection.

### Feature selection

- pearson correlation

In [37]:
# If two features are highly correlated with each other (not with the target), they carry
# largely the same information, so one of them can be dropped.
# "Highly correlated" is decided by a threshold value, which would normally be given by a domain expert.
# For the time being we use threshold = 0.95.

In [38]:
# Find features that are highly correlated with an earlier feature (candidates for removal).
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    For every pair of columns, the absolute Pearson correlation is compared with
    ``threshold``; when it is strictly greater, the column that appears *later*
    in ``dataset`` is reported, so the first column of each correlated group is
    always kept.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature frame. Only numeric and boolean columns take part in the
        correlation; other columns (e.g. strings) are ignored instead of making
        ``DataFrame.corr`` raise on recent pandas versions.
    threshold : float
        Cut-off applied to the absolute correlation coefficient (strict ``>``).

    Returns
    -------
    set
        Names of the columns whose absolute correlation with at least one
        preceding column exceeds ``threshold``.
    """
    # Restrict to columns a correlation can be computed for, and take the
    # absolute value once because only the strength of the relation matters.
    numeric = dataset.select_dtypes(include=["number", "bool"])
    abs_corr = numeric.corr().abs()
    names = abs_corr.columns

    col_corr = set()
    for i in range(1, len(names)):
        # Lower triangle only: compare column i with columns 0..i-1.
        # NaN coefficients (e.g. constant columns) compare False and are skipped.
        if (abs_corr.iloc[i, :i] > threshold).any():
            col_corr.add(names[i])
    return col_corr

In [39]:
# Absolute-correlation cut-off of 95%, used for the time being.
threshold = 0.95

In [40]:
# Columns of the training features that exceed the correlation threshold.
correlation(X_train, threshold)

{'newbalanceDest', 'newbalanceOrig'}

In [41]:
# Drop the two highly correlated columns from the training features.
# (`columns=` already selects the axis, so no separate `axis` argument is needed.)
X_train = X_train.drop(columns=['newbalanceDest', 'newbalanceOrig'])

In [42]:
# Drop the same two columns from the test features so both splits keep identical columns.
X_test = X_test.drop(columns=['newbalanceDest', 'newbalanceOrig'])

**Drop Constant Features Using Variance Threshold**

In [43]:
# Detect zero-variance (constant) features.
from sklearn.feature_selection import VarianceThreshold

var_thres = VarianceThreshold(threshold=0)
var_thres.fit(X_train)  # fit on the training split only

In [44]:
# Boolean mask over X_train's columns: True means the feature passed the variance threshold.
var_thres.get_support()

array([ True,  True,  True,  True,  True])

- All five features are kept, so there are no constant features.

## Handling Imbalanced Data

In [45]:
from imblearn.combine import SMOTETomek

# Resampler for the minority class; `sampling_strategy` can be changed as required.
smt = SMOTETomek(
    sampling_strategy='minority',
    random_state=42,
    n_jobs=-1,
)

In [46]:
# Fit the resampler and generate the rebalanced training data (the test split is not touched).
# NOTE: X_train / y_train are overwritten, so re-running this cell resamples already-resampled data.
X_train, y_train = smt.fit_resample(X_train, y_train)

In [47]:
# Shapes of the training features and target after resampling.
X_train.shape, y_train.shape

((8894168, 5), (8894168,))

# standardize the datasets

**Why Robust scaler and not Standard scaler?**

- Scaling the data using Robust scaler
- Most of the independent variables are not normally distributed, so StandardScaler is not a good fit.

**Why Robust Scaler and not Minmax?**

- Because most of the features have outliers, MinMaxScaler would scale the data according to the min/max values, which are themselves outliers.
- RobustScaler instead removes the median and scales the data according to the quantile range (defaults to the IQR: interquartile range). The IQR is the range between the 1st quartile (25th percentile) and the 3rd quartile (75th percentile).

In [48]:
# Fit a RobustScaler on the training features and apply the same fitted scaler
# to the test features (original note: used for the KNN best-K selection experiment).
robustscaler = RobustScaler()
robustscaler.fit(X_train)
s_x_train = robustscaler.transform(X_train)
s_x_test = robustscaler.transform(X_test)

## grid search for XGBClassifier

In [49]:
def evaluate_clf(true, predicted, predicted_proba=None):
    '''
    Compute the standard binary-classification metrics for one set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth class labels.
    predicted : array-like
        Hard class labels predicted by the model.
    predicted_proba : array-like, optional
        Scores or probabilities for the positive class
        (e.g. ``model.predict_proba(X)[:, 1]``). When supplied, ROC AUC is
        computed from these scores. When omitted, ROC AUC is computed from the
        hard labels exactly as before, which reduces the ROC curve to a single
        operating point.

    Returns
    -------
    tuple
        ``(accuracy, f1, precision, recall, roc_auc)``
    '''
    acc = accuracy_score(true, predicted)  # fraction of correct predictions
    f1 = f1_score(true, predicted)  # harmonic mean of precision and recall
    precision = precision_score(true, predicted)
    recall = recall_score(true, predicted)
    # Prefer continuous scores for AUC when the caller has them; fall back to
    # the hard labels to keep the original behaviour for existing callers.
    auc_input = predicted if predicted_proba is None else predicted_proba
    roc_auc = roc_auc_score(true, auc_input)
    return acc, f1, precision, recall, roc_auc

In [50]:
# Hyper-parameter grid for XGBoost; only these parameters are tuned
# (3 * 5 * 3 * 3 * 3 = 405 combinations).
params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)

In [51]:
# Exhaustive grid search over `params` with shuffled, stratified 6-fold CV.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the estimator's default score.
# NOTE(review): s_x_train comes from the SMOTETomek-resampled training set, so the validation
# folds contain synthetic rows and the CV scores may be optimistic - consider resampling per fold.
# NOTE(review): 'gpu_hist' / 'gpu_predictor' presumably need a CUDA-enabled XGBoost build - confirm.
cv = StratifiedKFold(n_splits=6, shuffle=True, random_state=42)
xgb_clf_ = XGBClassifier(tree_method='gpu_hist', predictor='gpu_predictor')
grid_cv = GridSearchCV(
    estimator=xgb_clf_,
    param_grid=params,
    cv=cv,
)
grid_cv.fit(s_x_train, y_train)

In [52]:
# Best hyper-parameter combination found by the grid search.
grid_cv.best_params_

{'colsample_bytree': 1.0,
 'gamma': 0.5,
 'max_depth': 5,
 'min_child_weight': 1,
 'subsample': 0.8}

In [53]:
# Train a fresh XGBoost model with the best grid-search parameters on all CPU cores.
# Only the tuned parameters are passed on; the GPU settings of the search estimator are not.
xgb_clf_grid = XGBClassifier(**grid_cv.best_params_, n_jobs=-1)
xgb_clf_grid.fit(s_x_train, y_train)

In [54]:
# Predict hard class labels for both splits with the tuned model.
y_train_pred = xgb_clf_grid.predict(s_x_train)
y_test_pred = xgb_clf_grid.predict(s_x_test)

# evaluate_clf returns (accuracy, f1, precision, recall, roc_auc) for one split.
train_scores = evaluate_clf(y_train, y_train_pred)
(model_train_accuracy, model_train_f1, model_train_precision,
 model_train_recall, model_train_rocauc_score) = train_scores

test_scores = evaluate_clf(y_test, y_test_pred)
(model_test_accuracy, model_test_f1, model_test_precision,
 model_test_recall, model_test_rocauc_score) = test_scores

# Report labels, in the same order as the tuple returned by evaluate_clf.
metric_labels = ('Accuracy', 'F1 score', 'Precision', 'Recall', 'Roc Auc Score')

print('Model performance for Training set')
for metric_label, metric_value in zip(metric_labels, train_scores):
    print('- {}: {:.4f}'.format(metric_label, metric_value))

print('----------------------------------')

print('Model performance for Test set')
for metric_label, metric_value in zip(metric_labels, test_scores):
    print('- {}: {:.4f}'.format(metric_label, metric_value))

Model performance for Training set
- Accuracy: 0.9940
- F1 score: 0.9940
- Precision: 0.9908
- Recall: 0.9972
- Roc Auc Score: 0.9940
----------------------------------
Model performance for Test set
- Accuracy: 0.9906
- F1 score: 0.2141
- Precision: 0.1200
- Recall: 0.9927
- Roc Auc Score: 0.9916
