Importing required libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,ConfusionMatrixDisplay,roc_auc_score

Reading dataset

In [None]:
df = pd.read_csv('fraud_dataset.csv')

Checking null values

In [None]:
df.isnull().sum()

Checking number of unique values

In [None]:
df.nunique(axis=0)

Checking for duplicate values

In [None]:
df[df.duplicated()]

checking statistical distribution

In [None]:
df.describe()

checking target columns

In [None]:
sns.countplot(x = 'isFraud' ,data=df )
plt.title('Target Variable')

In [None]:
categorical_columns = df.select_dtypes(include=["object"]).columns


def categorical_bar(categorical_columns):
    """Plot a labelled bar chart of value counts for each given column.

    Reads the notebook-level ``df``. One figure is created per column; every
    bar is annotated with its share of the summed bar heights.

    Parameters
    ----------
    categorical_columns : iterable of str
        Names of the columns of ``df`` to plot.
    """
    for name in categorical_columns:
        plt.figure(figsize=(20, 4))
        axes = plt.subplot(121)
        counts = df[name].value_counts()
        counts.plot(kind="bar")
        plt.xlabel(name)
        plt.ylabel("count of customers")
        plt.title(name)

        # Annotate each bar with its percentage of the summed bar heights.
        bars = axes.patches
        total = sum(bar.get_height() for bar in bars)
        for bar in bars:
            left, bottom = bar.get_xy()
            bar_height = bar.get_height()
            label_position = (left + bar.get_width()/2, bottom + bar_height*1.02)
            axes.annotate(f'{bar_height/total:.1%}', label_position, ha='center')

In [None]:
categorical_bar(['type'])

In [None]:
categorical_columns= df.select_dtypes(include= ["object"]).columns
print(categorical_columns)


Bivariate analysis of categorical columns against the target column

In [None]:

# Grouped bar chart: number of transactions per `type`, split by `isFraud`.
# Drawn with seaborn/matplotlib (already imported at the top of the notebook)
# because `px` (plotly.express) is never imported, so the previous
# `px.histogram` call raised NameError on a fresh kernel.
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(data=df, x="type", hue="isFraud", ax=ax)
# Write the count on top of every bar.
for container in ax.containers:
    ax.bar_label(container)
ax.set_title("Transaction type by isFraud")
plt.show()

Univariate analysis of numerical columns

In [None]:
numerical_variable = df.select_dtypes(include=[np.number])

In [None]:
def numerical_hist(columns):
    """Draw a 50-bin histogram with a KDE overlay for each given column.

    Reads the notebook-level ``df`` and switches matplotlib to the
    ``ggplot`` style. One figure is created per column.

    Parameters
    ----------
    columns : iterable of str
        Column names of ``df``; iterating a DataFrame yields its column names,
        so a DataFrame can be passed directly.
    """
    plt.style.use("ggplot")
    for name in columns:
        plt.figure(figsize=(20, 4))
        plt.subplot(121)
        sns.histplot(data=df, x=name, kde=True, bins=50)
        plt.title(name)

In [None]:
numerical_hist(numerical_variable)

In [None]:
df.info()

outlier detection

In [None]:
def numerical_boxplot(columns):
    """Draw a horizontal box plot for each given column.

    Reads the notebook-level ``df`` and switches matplotlib to the
    ``ggplot`` style. One figure is created per column.

    Parameters
    ----------
    columns : iterable of str
        Column names of ``df``; iterating a DataFrame yields its column names,
        so a DataFrame can be passed directly.
    """
    plt.style.use("ggplot")
    for name in columns:
        plt.figure(figsize=(20, 4))
        plt.subplot(121)
        sns.boxplot(data=df, x=name)
        plt.title(name)

In [None]:
numerical_boxplot(numerical_variable)

Z score outlier detection

In [None]:
# Z score
from scipy import stats
import numpy as np

# Absolute z-score of every value in the `amount` column.
z = np.abs(stats.zscore(df['amount']))

# Positions whose |z| exceeds 3; np.where returns a one-element tuple, so the
# wrapped array has shape (1, k) and `.size` equals the number of such rows.
outlier = np.array(np.where(z > 3))
print(outlier.size)

IQR outlier detection

In [None]:
# IQR fences for every numeric column: Q1 - 1.5*IQR and Q3 + 1.5*IQR.
# The fences are printed per column and collected in `lower` / `upper`,
# in the same order as `numerical_variable.columns`.
lower = []
upper = []
for i in numerical_variable.columns:
    q1 = df[i].quantile(0.25)
    q3 = df[i].quantile(0.75)
    IQR = q3 - q1
    lower_bound = q1 - (1.5*IQR)
    upper_bound = q3 + (1.5*IQR)

    print(i, ":", lower_bound, ",", upper_bound)

    lower.append(lower_bound)
    upper.append(upper_bound)

Correlation detection

In [None]:
# Pairwise correlation of the numeric columns, shown as an annotated heatmap.
# The frame still holds object columns (e.g. `type`, `nameOrig`, `nameDest`),
# and `DataFrame.corr()` raises on those in pandas >= 2.0, so restrict the
# input to numeric dtypes first.
corr = df.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, cmap=sns.diverging_palette(220, 20, as_cmap=True))

In [None]:
df.head()

`oldbalanceOrg` and `oldbalanceDest` are highly correlated with `newbalanceOrig` and `newbalanceDest`, respectively.

removing highly correlated

In [None]:
# Remove the two old-balance columns from the working frame.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
df = df.drop(columns=['oldbalanceOrg', 'oldbalanceDest'], errors='ignore')
df

In [None]:
# Correlation heatmap of the numeric columns of the current `df`.
# Object columns must be excluded first: `DataFrame.corr()` raises on
# non-numeric data in pandas >= 2.0.
corr = df.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, cmap=sns.diverging_palette(220, 20, as_cmap=True))

In [None]:
# Instantiate the AutoViz driver (third-party `autoviz` package).
from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()

%matplotlib inline 
# Generate visualizations
# NOTE: AutoViz is handed the CSV path rather than `df`, so it works from the
# file on disk, not from the in-memory frame modified in the cells above.
AV.AutoViz("fraud_dataset.csv")

In [None]:
df.head()

In [None]:
def encode_nominal(df, column):
    """One-hot encode the listed columns and return the resulting frame.

    For each name in ``column`` the indicator columns produced by
    ``pd.get_dummies`` (prefixed with that name) are appended on the right and
    the source column is removed. The frame passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    column : iterable of str
        Names of the columns to replace with indicator columns.

    Returns
    -------
    pandas.DataFrame
        Frame with each listed column replaced by its indicator columns.
    """
    encoded = df
    for name in column:
        indicators = pd.get_dummies(data=encoded[name], prefix=name)
        # concat builds a new frame, so dropping from it leaves the input intact.
        encoded = pd.concat([encoded, indicators], axis=1).drop(columns=[name])
    return encoded

In [None]:
# One-hot encode the transaction `type` column.
# The guard makes the cell re-runnable: once `type` has been replaced by its
# indicator columns, calling encode_nominal again would raise KeyError.
if 'type' in df.columns:
    df = encode_nominal(df, ['type'])
df

model creation

In [None]:
X = df.drop(['isFraud','nameOrig','nameDest'], axis=1)
y = df['isFraud']

Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation. `stratify=y` keeps the isFraud
# class ratio the same in both splits, so the rare positive class is not
# under- or over-represented in the test set by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=101, stratify=y
)

Balancing the data

In [None]:
from imblearn.combine import SMOTEENN

# Resample the training split only; X_test / y_test are not touched here.
# NOTE(review): the resampler is fitted on the unscaled X_train (scaling
# happens in a later cell). Presumably its neighbour search is then dominated
# by the largest-magnitude columns — consider scaling first; TODO confirm.
smote_enn = SMOTEENN(sampling_strategy=0.75, random_state=101)
X_train_enn, y_train_enn = smote_enn.fit_resample(X_train, y_train)

# Class counts after resampling.
y_train_enn.value_counts()

Feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_enn)
X_test_scaled = scaler.transform(X_test)

Creating a function to report model performance

In [None]:
def report_model(model):
    """Print test-set metrics for a fitted classifier and plot its confusion matrix.

    Evaluates on the notebook-level ``X_test_scaled`` / ``y_test``. Prints the
    classification report, the accuracy (as a percentage) and the ROC AUC
    computed from column 1 of ``predict_proba``, then shows the confusion
    matrix.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict``, ``predict_proba`` and ``classes_``.
    """
    y_pred = model.predict(X_test_scaled)
    # Column 1 of predict_proba is used as the score for the ROC AUC.
    y_score = model.predict_proba(X_test_scaled)[:, 1]

    print(classification_report(y_test,y_pred))
    print ("The accuracy of is : ", accuracy_score(y_test, y_pred)*100, "%")
    print ("The aurroc_auc_score of is : ", roc_auc_score(y_test, y_score))

    # Pin the matrix layout to model.classes_ so its rows/columns always line
    # up with the display labels, even if a class is absent from y_test/y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
    cm_plot = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    cm_plot.plot()
    plt.show()

Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

model_lg = LogisticRegression(random_state=101)

In [None]:
model_lg.fit(X_train_scaled,y_train_enn)

In [None]:
report_model(model_lg)

Hyperparameter tuning for logistic regression

In [None]:
# Penalty Type
penalty = ['l1', 'l2']

# Use logarithmically spaced C values (recommended in official docs)
C = np.logspace(0, 4, 10)

In [None]:
# Grid-search C and the penalty type for the logistic regression.
# The estimator's default solver (lbfgs) does not support the 'l1' penalty, so
# every 'l1' candidate failed to fit; 'liblinear' supports both 'l1' and 'l2'.
grid_model_lg = GridSearchCV(
    model_lg,
    param_grid={'C': C, 'penalty': penalty, 'solver': ['liblinear']},
)
grid_model_lg.fit(X_train_scaled,y_train_enn)

In [None]:
import pickle

In [None]:
#pickle.dump(grid_model_lg, open('grid_model_lg.pkl', 'wb'))

In [None]:
# Load the previously saved grid-search result. The context manager closes the
# file handle, which the bare open(...) call left open.
# NOTE: pickle.load executes arbitrary code — only load files you created.
with open('grid_model_lg.pkl', 'rb') as model_file:
    pickled_lg = pickle.load(model_file)

In [None]:
pickled_lg.best_params_

In [None]:
report_model(pickled_lg)

Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest. max_features='auto' was deprecated in scikit-learn
# 1.1 and removed in 1.3; for classifiers it meant 'sqrt', so this keeps the
# same behaviour while running on current releases.
model_rf = RandomForestClassifier(n_estimators=10, max_features='sqrt', random_state=101)

In [None]:
model_rf.fit(X_train_scaled,y_train_enn)

In [None]:
report_model(model_rf)

In [None]:
n_estimators=[64,100,128,150]
max_features= [2,3,4,8]
bootstrap = [True,False]
oob_score = [True,False]

In [None]:
# Random-forest search space, split into two sub-grids.
# Out-of-bag scoring is only available when bootstrap=True, so a single dict
# crossing bootstrap with oob_score made every (bootstrap=False,
# oob_score=True) candidate fail to fit. GridSearchCV accepts a list of dicts
# and searches each one separately.
param_grid = [
    {'n_estimators': n_estimators,
     'max_features': max_features,
     'bootstrap': [True],
     'oob_score': oob_score},
    {'n_estimators': n_estimators,
     'max_features': max_features,
     'bootstrap': [False],
     'oob_score': [False]},
]

In [None]:
grid_model_rf = GridSearchCV(model_rf,param_grid)
#grid_model_rf.fit(X_train_scaled,y_train_enn)

In [None]:
#pickle.dump(grid_model_rf, open('grid_model_rf.pkl', 'wb'))

In [None]:
# Load the previously saved random-forest grid search. The context manager
# closes the file handle, which the bare open(...) call left open.
# NOTE: pickle.load executes arbitrary code — only load files you created.
with open('grid_model_rf.pkl', 'rb') as model_file:
    pickled_rf = pickle.load(model_file)

In [None]:
pickled_rf.best_params_

In [None]:
report_model(pickled_rf)

XGboost

In [None]:
from xgboost import XGBClassifier

# 'binary:logistic' makes the model output probabilities. With
# 'binary:logitraw' the outputs are raw scores before the logistic
# transformation, so predict_proba does not return probabilities and the
# 0.5 cut-off used for class labels is applied to a margin instead.
# NOTE(review): tree_method="gpu_hist" needs a CUDA device and is deprecated
# in XGBoost 2.x in favour of tree_method="hist" with device="cuda".
model_xg = XGBClassifier(objective='binary:logistic', random_state=101, tree_method="gpu_hist")

In [None]:
model_xg.fit(X_train_scaled,y_train_enn)

In [None]:
report_model(model_xg)

Hyperparameter tuning

In [None]:
# XGBoost search space: 4 depths x 19 learning rates x 4 ensemble sizes x
# 2 column-subsampling ratios = 608 candidate settings.
param_grid = dict(
    max_depth=[3, 5, 7, 10],
    learning_rate=[
        0.01, 0.02, 0.03, 0.04, 0.05,
        0.06, 0.07, 0.08, 0.09, 0.1,
        0.11, 0.12, 0.13, 0.14, 0.15,
        0.2, 0.5, 0.7, 1,
    ],
    n_estimators=[100, 64, 500, 1000],
    colsample_bytree=[0.3, 0.7],
)

In [None]:
grid_model_xgb = GridSearchCV(model_xg,param_grid)
#grid_model_xgb.fit(X_train_scaled,y_train_enn)

In [None]:
#pickle.dump(grid_model_xgb, open('grid_model_xgb.pkl', 'wb'))

In [None]:
# Load the previously saved XGBoost grid search. The context manager closes
# the file handle, which the bare open(...) call left open.
# NOTE: pickle.load executes arbitrary code — only load files you created.
with open('grid_model_xgb.pkl', 'rb') as model_file:
    pickled_xgb = pickle.load(model_file)

In [None]:
pickled_xgb.best_params_

In [None]:
report_model(pickled_xgb)

In [None]:
# Feature importances of `model_xg`, one row per column of X, sorted from
# most to least important.
# NOTE(review): this reads the baseline `model_xg`, not the grid-searched
# `pickled_xgb` — confirm that is the intended model.
feat_xg = (
    pd.DataFrame(
        data=model_xg.feature_importances_,
        index=X.columns,
        columns=["importance"],
    )
    .sort_values(by='importance', ascending=False)
)

plt.figure(figsize=(14, 6), dpi=200)
sns.barplot(data=feat_xg, x=feat_xg.index, y='importance')

# Rotate the feature names so they do not overlap.
plt.xticks(rotation=90);

For this project, the XGBoost model performs best, with a ROC AUC score of 0.9864.
