### EDA 

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Importing the dataset
dataset = pd.read_csv('AIML Dataset.csv')
dataset.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [3]:
dataset.shape

(6362620, 11)

In [4]:
dataset.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [5]:
# Used to find if dataset has any missing values
dataset.isna().sum().any()

False

In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


##### Now, column **type** is Categorical object which we convert into Numerical Data to apply operations on it

In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
label = le.fit_transform(dataset['type'])
label

array([3, 3, 4, ..., 1, 4, 1])

In [8]:
# Inserting converted type data into our dataset
dataset.insert(2,'type_num', label)

In [9]:
# Converting nameOrig column into only ids by removing C from its front
dataset['nameOrig'] = dataset['nameOrig'].replace({'C': ''}, regex=True)
dataset

Unnamed: 0,step,type,type_num,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,3,9839.64,1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,3,1864.28,1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,4,181.00,1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,1,181.00,840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,3,11668.14,2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,1,339682.13,786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0
6362616,743,TRANSFER,4,6311409.28,1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0
6362617,743,CASH_OUT,1,6311409.28,1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,4,850002.52,1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0


In [10]:
dataset['nameOrig'] = pd.to_numeric(dataset['nameOrig'])
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   type_num        int32  
 3   amount          float64
 4   nameOrig        int64  
 5   oldbalanceOrg   float64
 6   newbalanceOrig  float64
 7   nameDest        object 
 8   oldbalanceDest  float64
 9   newbalanceDest  float64
 10  isFraud         int64  
 11  isFlaggedFraud  int64  
dtypes: float64(5), int32(1), int64(4), object(2)
memory usage: 558.2+ MB


### Feature Selection

Columns **step, nameDest, type and isFlaggedFraud** are not taken into consideration for training our prediction models because
column **nameDest** is string which are not providing any significance to our data, **type** column is dropped because we already converted it into numerical data and **isFlaggedFraud** is removed because we believe that it is the pre determined output which needs to be found out by the model. Also **step** is just hour out of 30 days of simulation

In [11]:
dataset = dataset.drop(['step','nameDest', 'type','isFlaggedFraud'], axis=1)

In [12]:
dataset

Unnamed: 0,type_num,amount,nameOrig,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,3,9839.64,1231006815,170136.00,160296.36,0.00,0.00,0
1,3,1864.28,1666544295,21249.00,19384.72,0.00,0.00,0
2,4,181.00,1305486145,181.00,0.00,0.00,0.00,1
3,1,181.00,840083671,181.00,0.00,21182.00,0.00,1
4,3,11668.14,2048537720,41554.00,29885.86,0.00,0.00,0
...,...,...,...,...,...,...,...,...
6362615,1,339682.13,786484425,339682.13,0.00,0.00,339682.13,1
6362616,4,6311409.28,1529008245,6311409.28,0.00,0.00,0.00,1
6362617,1,6311409.28,1162922333,6311409.28,0.00,68488.84,6379898.11,1
6362618,4,850002.52,1685995037,850002.52,0.00,0.00,0.00,1


In [13]:
# Assigning values
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [14]:
X

array([[3.00000000e+00, 9.83964000e+03, 1.23100682e+09, ...,
        1.60296360e+05, 0.00000000e+00, 0.00000000e+00],
       [3.00000000e+00, 1.86428000e+03, 1.66654430e+09, ...,
        1.93847200e+04, 0.00000000e+00, 0.00000000e+00],
       [4.00000000e+00, 1.81000000e+02, 1.30548614e+09, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.00000000e+00, 6.31140928e+06, 1.16292233e+09, ...,
        0.00000000e+00, 6.84888400e+04, 6.37989811e+06],
       [4.00000000e+00, 8.50002520e+05, 1.68599504e+09, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 8.50002520e+05, 1.28032381e+09, ...,
        0.00000000e+00, 6.51009911e+06, 7.36010163e+06]])

In [15]:
y

array([0, 0, 1, ..., 1, 1, 1], dtype=int64)

### Splitting data into train and test set and also apply Feature Scaling

In [16]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [17]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Training the Logistic Regression Model on training set

In [18]:
# Training the Logistic Regression model on the Training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

LogisticRegression(random_state=0)

### Predicting Test Set Results

In [19]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)), 1))

[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


### Making the Confusion Matrix

In [20]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
print(f"Accuracy of model: {accuracy_score(y_test, y_pred)}")

[[1588511     117]
 [   1203     824]]
Accuracy of model: 0.999170153175893


### Predicting a single input

In [21]:
dataset.sample()

Unnamed: 0,type_num,amount,nameOrig,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
1505026,1,238127.6,641109996,108606.0,0.0,237312.94,475440.53,0


In [26]:
# type_num	amount	nameOrig	oldbalanceOrg	newbalanceOrig	oldbalanceDest	newbalanceDest
classifier.predict(sc.transform([[1, 238127.6, 641109996, 108606.0, 0.0, 237312.94, 475440.53]]))

array([0], dtype=int64)

We can see from above prediction that it came up with correct prediction for the above query

### Saving our model

In [23]:
import pickle

# Dumping our model into a file
with open('fraud_model.bin', 'wb') as f_out:
    pickle.dump(classifier, f_out)

### Trying our pickle model

In [24]:
with open('fraud_model.bin', 'rb') as f_in:
    model2 = pickle.load(f_in)

In [25]:
# predicting the single value which we predicted before
# predict_mpg(sc.transform([[3, 14650.1, 908817173, 19373.0, 4722.9, 0.0, 0.0,]]), model)

model2.predict(sc.transform([[3, 14650.1, 908817173, 19373.0, 4722.9, 0.0, 0.0,]]))

array([0], dtype=int64)

It gives the same result from before, it means our model is working fine