In [1]:
# importing the necessary libraries
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler

#Visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)
%matplotlib inline

#preprocessing libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#ML libraries
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
#Metrics Libraries
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
#Misc libraries
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Location of the PaySim CSV. The default below is the Kaggle input mount;
# when running elsewhere, point DATA_PATH at a local copy of the file.
DATA_PATH = "/kaggle/input/paysim1/PS_20174392719_1491204439457_log.csv"

# Load the whole file into a DataFrame
df = pd.read_csv(DATA_PATH)

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/paysim1/PS_20174392719_1491204439457_log.csv'

In [None]:
# Preview the first five rows of the loaded data
df.head(n=5)

In [None]:
# Dimensions of the loaded data as a (rows, columns) tuple
df.shape

In [None]:
# Per-column overview: names, non-null counts and dtypes
df.info()

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Summary statistics of the data, rounded to two decimals for readability
summary_stats = df.describe()
summary_stats.round(decimals=2)

In [None]:
# Distinct categories present in the `type` column
df.loc[:, "type"].unique()

In [None]:
# storing the column ech categorie count 
type = df["type"].value_counts()

In [None]:
# getting the categories in type column
transaction = type.index

In [None]:
quantity = type.values
quantity

In [None]:
# Donut chart of how the transaction types are distributed
px.pie(
    data_frame=df,
    names=transaction,
    values=quantity,
    hole=0.4,
    title="distribution of transaction type",
)

In [None]:
# Rows per value of the `isFraud` target (label 1 = fraud, 0 = non-fraud)
fraud_counts = df['isFraud'].value_counts()
total_rows = len(df)

# Share of each class as a percentage of all rows
fraud_percentage = (fraud_counts.loc[1] / total_rows) * 100
non_fraud_percentage = (fraud_counts.loc[0] / total_rows) * 100

# Report both percentages
print("Fraud percentage:", fraud_percentage)
print("Non-fraud percentage:", non_fraud_percentage)

In [None]:
# Boolean mask selecting the fraudulent rows
is_fraud = df["isFraud"] == 1

# Transaction types that occur among the fraudulent rows
fraud_types = list(df.loc[is_fraud, "type"].drop_duplicates().values)
print('\n The types of fraudulent transactions are {}'.format(fraud_types))

# Fraudulent rows for each of the two transaction types inspected here
dfFraudTransfer = df.loc[is_fraud & (df["type"] == 'TRANSFER')]
dfFraudCashout = df.loc[is_fraud & (df["type"] == 'CASH_OUT')]

print('\n The number of fraudulent TRANSFERs = {}'.format(len(dfFraudTransfer)))

print('\n The number of fraudulent CASH_OUTs = {}'.format(len(dfFraudCashout)))

In [None]:
# Checking for balance in the target.
# value_counts() orders its result by frequency, not by class value, so the
# counts are re-indexed to the explicit class order [0, 1]. This guarantees
# each count is paired with the right label regardless of which class is larger.
class_labels = ['Not Fraud', 'Fraud']
class_counts = df['isFraud'].value_counts().reindex([0, 1], fill_value=0)
fig = go.Figure(data=[go.Pie(labels=class_labels, values=class_counts)])
fig.show()

In [None]:
# Replace the transaction-type strings with integer codes.
# The replacement is restricted to the `type` column: a frame-wide replace
# would scan every other column too and could rewrite matching values there.
# NOTE(review): the codes are the original author's, described as based on how
# often each category occurs in the data — confirm against the value counts.
TYPE_CODES = {'PAYMENT': 2, 'TRANSFER': 4, 'CASH_OUT': 1, 'DEBIT': 5, 'CASH_IN': 3}
df['type'] = df['type'].replace(TYPE_CODES)

In [None]:
# Columns used as model inputs and as the prediction target
feature_columns = ['type', 'amount', 'oldbalanceOrg', 'newbalanceOrig']
target_columns = ['isFraud']

x = df[feature_columns]
# Selected with a list, so `y` is a one-column DataFrame rather than a Series
y = df[target_columns]

In [None]:
# Random under-sampling of the selected features/target, seeded for repeatability.
# NOTE(review): resampling is applied before the train/test split below, so the
# held-out rows come from the resampled data rather than the original class mix.
RESAMPLE_SEED = 42
undersampler = RandomUnderSampler(random_state=RESAMPLE_SEED)
X_res, y_res = undersampler.fit_resample(x, y)

In [None]:
# Checking for balance in the resampled target.
# value_counts() orders its result by frequency, which says nothing about
# class order once the classes are (near-)equal in size. Re-index to the
# explicit class order [0, 1] so each count is paired with the right label.
resampled_labels = ['Not Fraud', 'Fraud']
resampled_counts = y_res['isFraud'].value_counts().reindex([0, 1], fill_value=0)
fig = go.Figure(data=[go.Pie(labels=resampled_labels, values=resampled_counts)])
fig.show()

In [None]:
# Split the resampled data: 70% for training, the remainder for testing.
# The seed is fixed so the partition is the same on every run.
SPLIT_SEED = 111
X_train, X_test, y_train, y_test = train_test_split(
    X_res,
    y_res,
    train_size=0.7,
    random_state=SPLIT_SEED,
)

In [None]:
# Standardizing the numerical columns (the encoded `type` column is left as is)
col_names = ['amount', 'oldbalanceOrg', 'newbalanceOrig']

# Work on explicit copies: the frames returned by train_test_split are
# selections of X_res, and assigning columns onto them can raise
# SettingWithCopyWarning (hidden here by the global warnings filter).
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler().fit(X_train[col_names].values)
features_train = scaler.transform(X_train[col_names].values)
features_test = scaler.transform(X_test[col_names].values)

# Write the scaled values back into the feature frames
X_train[col_names] = features_train
X_test[col_names] = features_test

In [None]:
# Candidate classifiers to compare (seeded where the estimator accepts a seed)
logreg_cv = LogisticRegression(solver='liblinear', random_state=123)
dt_cv = DecisionTreeClassifier(random_state=123)
knn_cv = KNeighborsClassifier()
svc_cv = SVC(kernel='linear', random_state=123)
nb_cv = GaussianNB()
rf_cv = RandomForestClassifier(random_state=123)

# Display names, keyed by each model's position in `cv_models`
cv_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'KNN', 3: 'SVC', 4: 'Naive Bayes', 5: 'Random Forest'}
cv_models = [logreg_cv, dt_cv, knn_cv, svc_cv, nb_cv, rf_cv]

# y_train is a one-column DataFrame; flatten it to the 1-D target that
# scikit-learn estimators expect.
y_train_1d = np.ravel(y_train)

# Mean 10-fold cross-validated accuracy on the training split.
# Labelled "CV Accuracy" because the held-out test split is not used here.
for i, model in enumerate(cv_models):
    cv_scores = cross_val_score(model, X_train, y_train_1d, cv=10, scoring='accuracy')
    print("{} CV Accuracy: {}".format(cv_dict[i], cv_scores.mean()))

In [None]:
# Hyperparameter tuning: candidate values tried for the random forest
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [2, 4, 8],
    'min_samples_split': [2, 5, 10],
}
# Create the Random Forest classifier.
# Seeded (same seed as the other models in this notebook) so the search and
# the reported scores are reproducible from run to run.
rfc = RandomForestClassifier(random_state=123)

# Create the GridSearchCV object; candidates are ranked by accuracy
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, scoring='accuracy')

# Fit the GridSearchCV object to the training data.
# y_train is a one-column DataFrame, so it is flattened to a 1-D target first.
grid_search.fit(X_train, np.ravel(y_train))

# Get the best estimator and its parameters
best_estimator = grid_search.best_estimator_
best_params = grid_search.best_params_

# Print the best parameters
print("Best parameters:", best_params)

# Evaluate the best model on the held-out test data
y_pred = best_estimator.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test accuracy:", test_accuracy)

In [None]:
# Per-class metrics for the test-set predictions
class_names = ['Not Fraud', 'Fraud']
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)