In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler as scale
from sklearn.preprocessing import OrdinalEncoder as oe

In [14]:
data = pd.read_csv("Fraud_Detection.csv")
data = data.dropna(axis=0)
data.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [15]:
x_data = data.drop(["isFraud"], axis=1)
y_data = data["isFraud"]

scale_list = ["amount", "oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest"]
encode_list = ["type", "nameOrig", "nameDest"]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data,train_size=0.8)

for i in x_data.columns:

  if i in scale_list:
    scaling = scale()
    x_train[i] = scaling.fit_transform(np.array(x_train[i]).reshape(-1, 1))
    x_test[i] = scaling.transform(np.array(x_test[i]).reshape(-1, 1))

  elif i in encode_list:
    encoding = oe(handle_unknown='use_encoded_value', unknown_value=-1)
    x_train[i] = encoding.fit_transform(np.array(x_train[i]).reshape(-1, 1))
    x_test[i] = encoding.transform(np.array(x_test[i]).reshape(-1, 1))

In [17]:
print(x_train.head())

        step  type    amount  nameOrig  oldbalanceOrg  newbalanceOrig  \
9452       7   3.0 -0.566352  378263.0      -0.270371       -0.276359   
565777    23   3.0 -0.568906  299619.0      -0.247694       -0.253732   
595270    33   3.0 -0.530639  463263.0      -0.300539       -0.303432   
491596    19   1.0 -0.361787  353643.0      -0.180636       -0.206327   
325799    16   3.0 -0.514904   20642.0      -0.162585       -0.174588   

        nameDest  oldbalanceDest  newbalanceDest  isFlaggedFraud  
9452     98785.0       -0.418875       -0.458300             0.0  
565777  136469.0       -0.418875       -0.458300             0.0  
595270   58955.0       -0.418875       -0.458300             0.0  
491596    8298.0        9.588381        8.982745             0.0  
325799  110334.0       -0.418875       -0.458300             0.0  


In [51]:
import xgboost as xgb
model = xgb.XGBClassifier(n_estimators=100, max_depth=100, learning_rate=0.1, objective='binary:logistic',  nthread=4, seed=20)
model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)])

[0]	validation_0-logloss:0.59836	validation_1-logloss:0.59835
[1]	validation_0-logloss:0.52074	validation_1-logloss:0.52071
[2]	validation_0-logloss:0.45601	validation_1-logloss:0.45597
[3]	validation_0-logloss:0.40130	validation_1-logloss:0.40126
[4]	validation_0-logloss:0.35457	validation_1-logloss:0.35452
[5]	validation_0-logloss:0.31431	validation_1-logloss:0.31426
[6]	validation_0-logloss:0.27938	validation_1-logloss:0.27935
[7]	validation_0-logloss:0.24891	validation_1-logloss:0.24889
[8]	validation_0-logloss:0.22222	validation_1-logloss:0.22237
[9]	validation_0-logloss:0.19864	validation_1-logloss:0.19879
[10]	validation_0-logloss:0.17783	validation_1-logloss:0.17798
[11]	validation_0-logloss:0.15943	validation_1-logloss:0.15958
[12]	validation_0-logloss:0.14314	validation_1-logloss:0.14341
[13]	validation_0-logloss:0.12861	validation_1-logloss:0.12887
[14]	validation_0-logloss:0.11565	validation_1-logloss:0.11592
[15]	validation_0-logloss:0.10409	validation_1-logloss:0.10435
[1

In [52]:
y_pred = model.predict(x_test)

from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, y_pred)
print(score)

0.999584846067434


In [57]:
print("The train data class distribution:")
print(y_train.value_counts())

print("\nThe test data class distributio:")
print(y_test.value_counts())

print("\nThe prediction distribution:")
print(pd.DataFrame(y_pred).value_counts())

print("\nThe number of samples predicted wrongly:")
print(len(y_test) - (score * len(y_test)))

print("\nThere is a huge unbalance in the fraud class in the dataset")

The train data class distribution:
0.0    491077
1.0       306
Name: isFraud, dtype: int64

The test data class distributio:
0.0    122781
1.0        65
Name: isFraud, dtype: int64

The prediction distribution:
0    122786
1        60
dtype: int64

The number of samples predicted wrongly:
51.0

There is a huge unbalance in the fraud class in the dataset


In [54]:
from sklearn.metrics import f1_score
print(f1_score(y_test, y_pred))

print("The fraud samples available for the model are very less, so the f1 has droped.")


0.5920000000000001
