In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

DATA COLLECTING AND PROCESSING


In [5]:
#loading the csv data to Pandas DataFrame
fraud_data = pd.read_csv('/content/Fraud.csv')

In [6]:
dask_df = dd.from_pandas(fraud_data, npartitions=4)

In [7]:
## print first 5 rows
fraud_data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0.0,0.0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0.0,0.0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1.0,0.0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1.0,0.0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0.0,0.0


In [8]:
## print last 5 rows
fraud_data.tail()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
83556,10,PAYMENT,7705.7,C1834114901,96490.0,88784.3,M1214836727,0.0,0.0,0.0,0.0
83557,10,CASH_OUT,319045.01,C1964329082,56471.0,0.0,C699133054,0.0,319045.01,0.0,0.0
83558,10,CASH_IN,249169.96,C1421944154,3481.0,252650.96,C790672270,38177.07,0.0,0.0,0.0
83559,10,CASH_OUT,244279.64,C722886752,29968.0,0.0,C1492538502,25680.0,269959.64,0.0,0.0
83560,10,CASH_OUT,145014.63,C60491101,,,,,,,


In [9]:
## print the no. of rows and columns
fraud_data.shape

(83561, 11)

In [10]:
## getting some info about the data
fraud_data.info

<bound method DataFrame.info of        step      type     amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0         1   PAYMENT    9839.64  C1231006815       170136.0       160296.36   
1         1   PAYMENT    1864.28  C1666544295        21249.0        19384.72   
2         1  TRANSFER     181.00  C1305486145          181.0            0.00   
3         1  CASH_OUT     181.00   C840083671          181.0            0.00   
4         1   PAYMENT   11668.14  C2048537720        41554.0        29885.86   
...     ...       ...        ...          ...            ...             ...   
83556    10   PAYMENT    7705.70  C1834114901        96490.0        88784.30   
83557    10  CASH_OUT  319045.01  C1964329082        56471.0            0.00   
83558    10   CASH_IN  249169.96  C1421944154         3481.0       252650.96   
83559    10  CASH_OUT  244279.64   C722886752        29968.0            0.00   
83560    10  CASH_OUT  145014.63    C60491101            NaN             NaN   

       

In [11]:
## checking for missing values
fraud_data.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     1
newbalanceOrig    1
nameDest          1
oldbalanceDest    1
newbalanceDest    1
isFraud           1
isFlaggedFraud    1
dtype: int64

In [12]:
# Drop rows with NaN values
fraud_data.dropna(inplace=True)

In [13]:
## statistical measure about the data
fraud_data.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,83560.0,83560.0,83560.0,83560.0,83560.0,83560.0,83560.0,83560.0
mean,8.204452,168488.6,888817.0,905501.6,866140.7,1180913.0,0.00134,0.0
std,1.859641,336358.3,2731026.0,2769537.0,2397540.0,2826509.0,0.036587,0.0
min,1.0,0.32,0.0,0.0,0.0,0.0,0.0,0.0
25%,8.0,9550.573,0.0,0.0,0.0,0.0,0.0,0.0
50%,9.0,48733.65,19873.0,0.0,19389.16,36855.93,0.0,0.0
75%,9.0,205677.2,179591.2,201969.8,556126.7,1019467.0,0.0,0.0
max,10.0,10000000.0,33797390.0,34008740.0,31306920.0,31976990.0,1.0,0.0


In [14]:
# Checking the Distribution of Target Variable isFraud
fraud_data['isFraud'].value_counts()

0.0    83448
1.0      112
Name: isFraud, dtype: int64

In [15]:
# label encoding for column name 'type' which contains string value
label_encoder = LabelEncoder()
fraud_data['type_encoded'] = label_encoder.fit_transform(fraud_data['type'])
fraud_data.drop(columns=['type'], inplace=True)

In [16]:
## label encoding
fraud_data['FirstCharacter'] = fraud_data['nameDest'].str[0].map({'C': 0, 'M': 1})
fraud_data['FirstChar'] = fraud_data['nameOrig'].str[0].map({'C': 2})
fraud_data.drop(columns=['nameDest'], inplace=True)
fraud_data.drop(columns=['nameOrig'], inplace=True)

Splitting the features and target

In [17]:
X = fraud_data.drop(columns = 'isFraud',axis = 1)
Y = fraud_data['isFraud']

In [18]:
print(X)

       step     amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
0         1    9839.64       170136.0       160296.36            0.00   
1         1    1864.28        21249.0        19384.72            0.00   
2         1     181.00          181.0            0.00            0.00   
3         1     181.00          181.0            0.00        21182.00   
4         1   11668.14        41554.0        29885.86            0.00   
...     ...        ...            ...             ...             ...   
83555    10   14895.17        51759.0        36863.83       979963.09   
83556    10    7705.70        96490.0        88784.30            0.00   
83557    10  319045.01        56471.0            0.00            0.00   
83558    10  249169.96         3481.0       252650.96        38177.07   
83559    10  244279.64        29968.0            0.00        25680.00   

       newbalanceDest  isFlaggedFraud  type_encoded  FirstCharacter  FirstChar  
0                0.00             0.0     

SPLITTING THE DATA INTO TRAINING AND TESTING

In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=2)

In [20]:
print(X.shape, X_train.shape, X_test.shape)

(83560, 10) (62670, 10) (20890, 10)


In [21]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the feature matrix 'X'
X_scaled = scaler.fit_transform(X)

MODEL TRAINING

In [22]:
model = LogisticRegression()

In [23]:
# Training the LogisticRegression model with Training data
model.fit(X_train, Y_train)

MODEL EVALUATION

ACCURACY SCORE

In [24]:
# accuracy of training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [25]:
print('accuracy on training data', training_data_accuracy)

accuracy on training data 0.9987394287537897


In [26]:
# accuracy of testing data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [27]:
print('accuracy on test data',test_data_accuracy)

accuracy on test data 0.9990426041168023
