In [8]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the transaction records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='credcard.csv')

In [3]:
## count missing values per column (the recorded output shows none)
df.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [4]:
## how many transactions of each type are there?

df['type'].value_counts()

CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: type, dtype: int64

In [5]:
# Donut chart of how the transactions are spread across the transaction types.
type1 = df['type'].value_counts()
transactions, quantity = type1.index, type1.values

# The slice labels and sizes are supplied as arrays (names / values), not as
# column names looked up in df.
figure = px.pie(
    df,
    names=transactions,
    values=quantity,
    hole=0.5,
    title='Distribution of transaction types',
)

# Save the chart as a standalone HTML page; auto_open=True also opens the
# saved file in the browser.
figure.write_html('Distribution of Transactions Piechart.html', auto_open=True)

In [6]:
## correlation of every numeric column with the fraud flag, sorted ascending
## NOTE: run this before the cell that relabels isFraud as strings; once
## relabelled, isFraud is no longer numeric and is excluded by numeric_only.
correlation = df.corr(numeric_only=True)  # Pearson is the default method
correlation.loc[:, 'isFraud'].sort_values()  # ascending is the default order

newbalanceOrig   -0.008148
oldbalanceDest   -0.005885
newbalanceDest    0.000535
oldbalanceOrg     0.010154
step              0.031578
isFlaggedFraud    0.044109
amount            0.076688
isFraud           1.000000
Name: isFraud, dtype: float64

In [7]:
### transforming categorical data into numerical data

# Integer code assigned to each transaction type.
TYPE_CODES = {'CASH_OUT': 1, 'PAYMENT': 2,
              'CASH_IN': 3, 'TRANSFER': 4,
              'DEBIT': 5}

# Human-readable class label for each value of the raw 0/1 fraud flag.
FRAUD_LABELS = {0: 'No Fraud', 1: 'Fraud'}

# Series.map turns every value that is not a key of the mapping into NaN, so
# running this cell a second time on already-converted columns would wipe them
# out. Only convert while raw values are still present, which makes the cell
# safe to re-run.
if df['type'].isin(list(TYPE_CODES)).any():
    df['type'] = df['type'].map(TYPE_CODES)

if df['isFraud'].isin(list(FRAUD_LABELS)).any():
    df['isFraud'] = df['isFraud'].map(FRAUD_LABELS)

df.head(10)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,2,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,No Fraud,0
1,1,2,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,No Fraud,0
2,1,4,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,Fraud,0
3,1,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,Fraud,0
4,1,2,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,No Fraud,0
5,1,2,7817.71,C90045638,53860.0,46042.29,M573487274,0.0,0.0,No Fraud,0
6,1,2,7107.77,C154988899,183195.0,176087.23,M408069119,0.0,0.0,No Fraud,0
7,1,2,7861.64,C1912850431,176087.23,168225.59,M633326333,0.0,0.0,No Fraud,0
8,1,2,4024.36,C1265012928,2671.0,0.0,M1176932104,0.0,0.0,No Fraud,0
9,1,5,5337.77,C712410124,41720.0,36382.23,C195600860,41898.0,40348.79,No Fraud,0


In [9]:
### build the feature matrix and the target vector (the split happens in the training cell)

# Columns used as model inputs, in the order the model will expect them.
FEATURE_COLUMNS = ["type", "amount", "oldbalanceOrg", "newbalanceOrig"]

x = np.array(df[FEATURE_COLUMNS])

# Single brackets select a Series, so y is 1-D with shape (n_samples,) instead
# of the (n_samples, 1) column vector that df[['isFraud']] produces. 1-D is the
# shape scikit-learn expects for a single-output target.
y = np.array(df['isFraud'])

In [13]:
## training the model

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 42)

# Seed the tree as well: scikit-learn permutes the candidate features at every
# split, so an unseeded tree (and the score below) can differ from run to run
# even when the train/test split is fixed.
model = DecisionTreeClassifier(random_state = 42)
model.fit(x_train, y_train)

# Mean accuracy on the held-out rows.
# NOTE(review): fraud data is usually heavily imbalanced, in which case
# accuracy alone says little -- check df['isFraud'].value_counts() and consider
# reporting precision/recall for the 'Fraud' class as well.
print(model.score(x_test, y_test))

0.9996906934564692


In [14]:
### score a single hand-built transaction with the trained model

# One row, columns in training order: type, amount, oldbalanceOrg, newbalanceOrig.
# Type codes: 1 = CASH_OUT, 2 = PAYMENT, 3 = CASH_IN, 4 = TRANSFER, 5 = DEBIT.
features = [
    [
        4,      # type: TRANSFER
        10000,  # amount
        10000,  # sender balance before the transaction
        0,      # sender balance after the transaction
    ]
]
model.predict(features)

array(['Fraud'], dtype=object)