# **Payment Fraud detection model (Accredian Assignment) :**

## **1.) Importing Libraries and Dataset :**

In [30]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import pickle
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

In [64]:
data = pd.read_csv(r'Fraud.csv')

In [65]:
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


## **2.) Data preprocessing and understanding statistics of the data :**

In [66]:
data.isnull().sum()

Unnamed: 0,0
step,0
type,0
amount,0
nameOrig,0
oldbalanceOrg,0
newbalanceOrig,0
nameDest,0
oldbalanceDest,0
newbalanceDest,0
isFraud,0


In [67]:
data.duplicated().sum()

0

In [68]:
data.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


In [69]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [70]:
# Exploring Transaction Type :
print(data.type.value_counts())

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64


### Data visualization of distrubution of transaction type :

In [71]:
Type = data['type'].value_counts()
Transaction = Type.index
Quantity = Type.values

In [72]:
fig = px.pie(data,names = Transaction, values = Quantity,hole=0.5,title = "Distribution of Transaction Type",)
fig.update_layout(title_x=0.5)
fig.show()

### Understanding the Correlation of dependent variable :

In [73]:
data.select_dtypes(include=['float','int'])

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,9839.64,170136.00,160296.36,0.00,0.00,0,0
1,1,1864.28,21249.00,19384.72,0.00,0.00,0,0
2,1,181.00,181.00,0.00,0.00,0.00,1,0
3,1,181.00,181.00,0.00,21182.00,0.00,1,0
4,1,11668.14,41554.00,29885.86,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...
6362615,743,339682.13,339682.13,0.00,0.00,339682.13,1,0
6362616,743,6311409.28,6311409.28,0.00,0.00,0.00,1,0
6362617,743,6311409.28,6311409.28,0.00,68488.84,6379898.11,1,0
6362618,743,850002.52,850002.52,0.00,0.00,0.00,1,0


In [74]:
data.select_dtypes(include='object')

Unnamed: 0,type,nameOrig,nameDest
0,PAYMENT,C1231006815,M1979787155
1,PAYMENT,C1666544295,M2044282225
2,TRANSFER,C1305486145,C553264065
3,CASH_OUT,C840083671,C38997010
4,PAYMENT,C2048537720,M1230701703
...,...,...,...
6362615,CASH_OUT,C786484425,C776919290
6362616,TRANSFER,C1529008245,C1881841831
6362617,CASH_OUT,C1162922333,C1365125890
6362618,TRANSFER,C1685995037,C2080388513


In [75]:
corr = data.select_dtypes(include=['float','int']).corr()
corr['isFraud'].sort_values(ascending = False)

Unnamed: 0,isFraud
isFraud,1.0
amount,0.076688
isFlaggedFraud,0.044109
step,0.031578
oldbalanceOrg,0.010154
newbalanceDest,0.000535
oldbalanceDest,-0.005885
newbalanceOrig,-0.008148


## Encoding Object Format :

In [76]:
oe = OrdinalEncoder()

In [77]:
data['encoded_type'] = oe.fit_transform(data[['type']])

In [78]:
data['encoded_type'].unique()

array([3., 4., 1., 2., 0.])

In [79]:
data['type'].unique()

array(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'],
      dtype=object)

**0 --> Represents CASH_IN**

**1 --> Represents CASH_OUT**

**2 --> Represents DEBIT**

**3 --> Represents PAYMENT**

**4 --> Represents TRANSFER**

In [80]:
corr = data.select_dtypes(include=['float','int']).corr()
corr['isFraud'].sort_values(ascending = False)

Unnamed: 0,isFraud
isFraud,1.0
amount,0.076688
isFlaggedFraud,0.044109
step,0.031578
encoded_type,0.020833
oldbalanceOrg,0.010154
newbalanceDest,0.000535
oldbalanceDest,-0.005885
newbalanceOrig,-0.008148


# **Model Building :**

In [81]:
# Splitting data in input and output variable :
x = np.array(data[['encoded_type','amount','oldbalanceOrg','newbalanceOrig']])
y = np.array(data[['isFraud']])

In [82]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=42)

In [83]:
x_train

array([[0.00000000e+00, 2.67719800e+04, 2.75210000e+04, 5.42929800e+04],
       [1.00000000e+00, 3.44879650e+05, 0.00000000e+00, 0.00000000e+00],
       [4.00000000e+00, 1.86260784e+06, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.00000000e+00, 1.05791600e+04, 5.92790000e+04, 4.86998400e+04],
       [1.00000000e+00, 7.30207600e+04, 2.02890000e+04, 0.00000000e+00],
       [1.00000000e+00, 8.96988500e+04, 1.96071000e+05, 1.06372150e+05]])

In [84]:
sc = StandardScaler()

In [85]:
x_train = sc.fit_transform(x_train)
x_train

array([[-1.26974966, -0.25406699, -0.27925143, -0.27391502],
       [-0.52907109,  0.27402537, -0.28878594, -0.29249455],
       [ 1.69296463,  2.79361459, -0.28878594, -0.29249455],
       ...,
       [-0.52907109, -0.28094878, -0.26824902, -0.27582904],
       [-0.52907109, -0.17728912, -0.28175692, -0.29249455],
       [-0.52907109, -0.14960173, -0.2208581 , -0.25609307]])

In [86]:
x_test = sc.transform(x_test)
x_test

array([[-1.26974966,  0.24968618, -0.28155702, -0.17235038],
       [ 0.95228605, -0.27917592, -0.2782644 , -0.2860874 ],
       [-1.26974966, -0.04573662, -0.2518587 , -0.20391271],
       ...,
       [ 0.95228605, -0.2965212 , -0.28083069, -0.28504681],
       [ 0.95228605, -0.24294355, -0.28878594, -0.29249455],
       [-1.26974966,  0.15006798,  0.08907606,  0.17321563]])

# **Hyper-Parameter Tuning :**
* Tuning or getting the best parameters for our model.

In [87]:
model = DecisionTreeClassifier()

In [88]:
para = {"criterion" : ['log_loss', 'entropy', 'gini'],"splitter" : ["best", "random"],"max_depth" : [i for i in range (2,20)],"min_samples_split" : [i for i in range (2,20)],"min_samples_leaf" : [i for i in range (1,20)]}

In [89]:
rd = RandomizedSearchCV(model,param_distributions=para,n_iter = 20)

In [90]:
rd.fit(x_train,y_train)

In [91]:
rd.best_params_

{'splitter': 'best',
 'min_samples_split': 3,
 'min_samples_leaf': 5,
 'max_depth': 18,
 'criterion': 'log_loss'}

In [92]:
rd.best_score_*100

99.9587097320059

In [93]:
model = DecisionTreeClassifier(splitter='best',min_samples_split=10,min_samples_leaf=1,max_depth=16,criterion='entropy')

In [94]:
model.fit(x_train,y_train)

# **Model Score :**

In [95]:
print("Testing Score :",model.score(x_test,y_test)*100,"\nTraining Score :",model.score(x_train,y_train)*100)

Testing Score : 99.95924110927051 
Training Score : 99.97826591651148


# **Pridiction :**

In [96]:
# Features : 'encoded_type','amount','oldbalanceOrg','newbalanceOrig'

In [101]:
def prediction():
    while True:
        try:
            Types = int(input("Enter the Type only in interger as: \n0 --> Represents CASH_IN \n1 --> Represents CASH_OUT \n2 --> Represents DEBIT \n3 --> Represents PAYMENT \n4 --> Represents TRANSFER \n"))
            break
        except ValueError:
            print("Error: Invalid input. Please enter an integer.")

    while True:
        try:
            amount = float(input("Enter amount of the transaction :"))
            break
        except ValueError:
            print("Error: Invalid input. Please enter a float.")

    while True:
        try:
            oldbalanceOrg = float(input("Enter initial balance before the transaction : "))
            break
        except ValueError:
            print("Error: Invalid input. Please enter a float.")

    while True:
        try:
            newbalanceOrig = float(input("Enter new balance after the transaction : "))
            break
        except ValueError:
            print("Error: Invalid input. Please enter a float.")
    inputs = [Types,amount,oldbalanceOrg,newbalanceOrig]
    output = model.predict(sc.transform(np.array(inputs).reshape(-1, 4)))
    if output == 0:
          print("It is not Fraud")
    else:
          print("It is Fraud !!")


In [102]:
prediction()

Enter the Type only in interger as: 
0 --> Represents CASH_IN 
1 --> Represents CASH_OUT 
2 --> Represents DEBIT 
3 --> Represents PAYMENT 
4 --> Represents TRANSFER 
1
Enter amount of the transaction :1
Enter initial balance before the transaction : 1
Enter new balance after the transaction : 1
It is not Fraud


# **Pickling :**

In [104]:
pickle.dump(model,open('fraud_detection_model.pkl','wb'))

In [105]:
pickle_model = pickle.load(open('fraud_detection_model.pkl','rb'))

In [107]:
output = model.predict(sc.transform(np.array(inputs).reshape(-1, 4)))
if output == 0:
      print("It is not Fraud")
else:
      print("It is Fraud !!")

It is not Fraud
