In [None]:
import pandas as pd  # Importing pandas for data manipulation and analysis
import numpy as np  # Importing numpy for numerical operations and array handling
import seaborn as sns  # Importing seaborn for statistical data visualization
import matplotlib.pyplot as plt  # Importing pyplot from matplotlib for plotting graphs
from sklearn.model_selection import train_test_split  # Importing train_test_split for splitting datasets into training and testing sets


In [None]:
# Load the raw transaction log (stored under Colab's /content directory) into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/content/PS_20174392719_1491204439457_log.csv',
)

In [None]:
# Preview the top 13 rows to get a first look at the columns and their values.
df.head(n=13)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0
5,1,PAYMENT,7817.71,C90045638,53860.0,46042.29,M573487274,0.0,0.0,0,0
6,1,PAYMENT,7107.77,C154988899,183195.0,176087.23,M408069119,0.0,0.0,0,0
7,1,PAYMENT,7861.64,C1912850431,176087.23,168225.59,M633326333,0.0,0.0,0,0
8,1,PAYMENT,4024.36,C1265012928,2671.0,0.0,M1176932104,0.0,0.0,0,0
9,1,DEBIT,5337.77,C712410124,41720.0,36382.23,C195600860,41898.0,40348.79,0,0


In [None]:
df.columns  # Listing the column labels of the DataFrame to see which fields are available

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [None]:
df.info()  # Printing a summary of the DataFrame: index range, dtype of each column, and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [None]:
# Distinct values of the 'step' column, in order of first appearance.
pd.unique(df['step'])

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 18

In [None]:
# Per-column count of missing values, as a quick data-quality check.
df.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [None]:
df.shape  # (number of rows, number of columns) of the DataFrame

(6362620, 11)

In [None]:
# Distinct transaction types present in the 'type' column.
pd.unique(df['type'])

array(['PAYMENT', 'TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'],
      dtype=object)

In [None]:
# NOTE(review): the variable name 'type' shadows Python's built-in type() for the
# rest of the session. Renaming it requires updating every later cell that reads it.
type = df['type'].value_counts()  # Counting the occurrences of each transaction type in the 'type' column and storing the result in the variable 'type'

In [None]:
type  # Displaying the per-type transaction counts, sorted from most to least frequent

type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64

In [None]:
transactions = type.index  # Index of the value counts, i.e. the transaction type labels, in the same order as the counts

In [None]:
transactions  # Displaying the transaction type labels

Index(['CASH_OUT', 'PAYMENT', 'CASH_IN', 'TRANSFER', 'DEBIT'], dtype='object', name='type')

In [None]:
# Counts per transaction type as a NumPy array, aligned with 'transactions'.
quantity = type.to_numpy()

In [None]:
quantity  # Displaying the count of transactions for each type

array([2237500, 2151495, 1399284,  532909,   41432])

In [None]:
import plotly.express as px  # Importing plotly express for interactive and expressive data visualization

In [None]:
# Donut chart showing how the transactions split across the transaction types.
# The labels and counts are passed directly as array-likes, so no data_frame
# argument is given: the full `df` is not the source of these values and has a
# different length than the per-type arrays.
px.pie(values=quantity, names=transactions, hole=0.2, title="Distribution of transactions type")

In [None]:
# Keep only the rows that have no missing value in any column.
df = df.dropna(axis=0, how='any')

In [None]:
# Re-check the per-column missing-value counts after dropping incomplete rows.
df.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [None]:
df  # Displaying the DataFrame after dropping rows with missing values

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0


In [None]:
# Encode each transaction type as an integer code so the model can consume it.
# A dict keeps every label next to its code (instead of two parallel lists that
# must stay aligned), and the explicit cast pins the column to int64 rather than
# relying on replace() implicitly downcasting the object column.
TYPE_CODES = {'CASH_OUT': 1, 'PAYMENT': 2, 'CASH_IN': 3, 'TRANSFER': 4, 'DEBIT': 5}
df['type'] = df['type'].replace(TYPE_CODES).astype('int64')


In [None]:
df  # Displaying the DataFrame with the 'type' column now holding integer codes

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,2,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,2,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,4,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,1,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,2,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,1,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0
6362616,743,4,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0
6362617,743,1,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0
6362618,743,4,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0


In [None]:
# Turn the numeric fraud flag into readable class labels: 0 -> 'no fraud', 1 -> 'fraud'.
# replace() leaves values outside the mapping untouched, so re-running this cell
# keeps the labels; map() would turn every already-converted label into NaN.
df['isFraud'] = df['isFraud'].replace({0: 'no fraud', 1: 'fraud'})


In [None]:
# Feature matrix: transaction type code, amount, and the originator's balance
# before and after the transaction.
x = df.loc[:, ['type', 'amount', 'oldbalanceOrg', 'newbalanceOrig']]


In [None]:
x  # Displaying the selected feature columns

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig
0,2,9839.64,170136.00,160296.36
1,2,1864.28,21249.00,19384.72
2,4,181.00,181.00,0.00
3,1,181.00,181.00,0.00
4,2,11668.14,41554.00,29885.86
...,...,...,...,...
6362615,1,339682.13,339682.13,0.00
6362616,4,6311409.28,6311409.28,0.00
6362617,1,6311409.28,6311409.28,0.00
6362618,4,850002.52,850002.52,0.00


In [None]:
# Target vector: the fraud label of each transaction.
# Selected by column name rather than by position, so the target stays correct
# even if columns are added, dropped or reordered upstream.
y = df['isFraud']

In [None]:
y  # Displaying the target labels

0          no fraud
1          no fraud
2             fraud
3             fraud
4          no fraud
             ...   
6362615       fraud
6362616       fraud
6362617       fraud
6362618       fraud
6362619       fraud
Name: isFraud, Length: 6362620, dtype: object

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Importing the DecisionTreeClassifier class from the sklearn.tree module to use for building a decision tree model


In [None]:
# Decision tree classifier with default hyperparameters.
# random_state is fixed so that fitting is deterministic: scikit-learn randomly
# permutes the features at each split, so an unseeded tree can differ between runs.
model = DecisionTreeClassifier(random_state=42)

In [None]:
# Hold out 20% of the rows for testing; random_state makes the split reproducible.
# stratify=y keeps the fraud / no-fraud proportions the same in both splits,
# which matters if one class is rare (NOTE(review): confirm the class balance
# with y.value_counts()).
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Fit the decision tree on the training features and their labels.
model.fit(X=xtrain, y=ytrain)

In [None]:
# Mean accuracy of the fitted model on the held-out test split.
# NOTE(review): accuracy alone can look very high when one class dominates;
# confirm the class balance before relying on this number.
model.score(X=xtest, y=ytest)

0.9997053100766665

In [None]:
# Predict the class of a single hand-written transaction:
# type code 4, amount 181.00, oldbalanceOrg 181.00, newbalanceOrig 0.00.
# The values are wrapped in a DataFrame that reuses the training columns, so the
# model receives the feature names it was fitted with and scikit-learn does not
# warn that "X does not have valid feature names".
sample = pd.DataFrame([[4, 181.00, 181.00, 0.00]], columns=x.columns)
model.predict(sample)


X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names



array(['fraud'], dtype=object)

In [None]:
x  # Displaying the feature columns again (same content as the earlier display of x)

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig
0,2,9839.64,170136.00,160296.36
1,2,1864.28,21249.00,19384.72
2,4,181.00,181.00,0.00
3,1,181.00,181.00,0.00
4,2,11668.14,41554.00,29885.86
...,...,...,...,...
6362615,1,339682.13,339682.13,0.00
6362616,4,6311409.28,6311409.28,0.00
6362617,1,6311409.28,6311409.28,0.00
6362618,4,850002.52,850002.52,0.00


In [None]:
y  # Displaying the target labels again (same content as the earlier display of y)

0          no fraud
1          no fraud
2             fraud
3             fraud
4          no fraud
             ...   
6362615       fraud
6362616       fraud
6362617       fraud
6362618       fraud
6362619       fraud
Name: isFraud, Length: 6362620, dtype: object