In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

## Dictionary of variables
1. **Step:** Maps a unit of time in the real world. In this case 1 step is 1 hour of time.
2. **Type:** Type of transaction: CASH_IN, CASH_OUT, DEBIT, PAYMENT or TRANSFER
3. **Amount:** Amount of the transaction in local currency
4. **NameOrig:** Customer who started the transaction
5. **OldbalanceOrg:** Customer's initial balance before the transaction.
6. **NewbalanceOrig:** Customer's balance after the transaction.
7. **NameDest:** Recipient ID of the transaction.
8. **OldbalanceDest:** Initial recipient balance before the transaction.
9. **NewbalanceDest:** Recipient's balance after the transaction.
10. **IsFraud:** Flags whether the transaction is fraudulent (1) or non-fraudulent (0).

In [2]:
# Load the payments dataset from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer="Data_payments.csv")

# Preview the first five rows of the loaded frame
data.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [3]:
# Count the missing values in every column.
# isna() yields a frame of booleans (True where a cell is missing); summing that
# frame down each column gives the number of missing entries per column.
data.isna().sum(axis=0)

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [4]:
# value_counts() tallies how often each distinct value occurs in a column.
# The result is a Series whose index holds the distinct values and whose
# values hold the corresponding counts.

# Tally the 'type' column and the 'isFraud' column, in that order
type_transaction, fraud_transactions = (
    data[column].value_counts() for column in ("type", "isFraud")
)

In [5]:
# Donut chart: share of each transaction type.
# The slice sizes and labels come directly from the value_counts() Series, so
# no data frame argument is passed to px.pie().
fig1 = px.pie(values=type_transaction,
              names=type_transaction.index,
              hole=0.9,
              title="Percentage of Transaction Type")

# Pie chart: share of each 'isFraud' value.
# This chart previously reused the transaction-type title by mistake.
fig2 = px.pie(values=fraud_transactions,
              names=fraud_transactions.index,
              title="Percentage of Fraudulent Transactions")

# Give both charts the same size
fig1.update_layout(width=800, height=400)
fig2.update_layout(width=800, height=400)

# Render the transaction-type chart first, then the fraud chart
fig1.show()
fig2.show()


In [6]:
# corr() returns a DataFrame with the pairwise correlation between columns.
# Correlation is only defined for numeric data, and pandas 2.0+ raises an error
# on text columns (here 'type', 'nameOrig' and 'nameDest') instead of silently
# skipping them, so the frame is restricted to its numeric columns first.
correlation = data.select_dtypes(include="number").corr()

# Correlation of every numeric column with 'isFraud', sorted in descending
# order so the columns most positively correlated with fraud appear first
correlation["isFraud"].sort_values(ascending=False)

isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64

In [7]:
# Series.map() looks every value up in a dictionary; values that are missing
# from the dictionary become NaN.
# This cell overwrites columns of `data`, so each mapping is guarded by a dtype
# check: without the guards, running the cell a second time would map the
# already-converted values again and turn both columns into NaN.

# Integer code for each transaction type
type_codes = {"CASH_OUT": 1,
              "PAYMENT": 2,
              "CASH_IN": 3,
              "TRANSFER": 4,
              "DEBIT": 5}
if not pd.api.types.is_numeric_dtype(data["type"]):
    data["type"] = data["type"].map(type_codes)

# Human-readable label for each value of the target column
fraud_labels = {0: "No Fraud", 1: "Fraud"}
if pd.api.types.is_numeric_dtype(data["isFraud"]):
    data["isFraud"] = data["isFraud"].map(fraud_labels)

# A NaN here means a raw value had no entry in the dictionaries above
assert data["type"].notna().all(), "unmapped value in 'type'"
assert data["isFraud"].notna().all(), "unmapped value in 'isFraud'"

# Display the first five rows of the modified dataframe
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,2,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,No Fraud,0
1,1,2,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,No Fraud,0
2,1,4,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,Fraud,0
3,1,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,Fraud,0
4,1,2,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,No Fraud,0


In [8]:
# np.array() converts a pandas DataFrame or Series into a NumPy array, the
# input format used for the model below.

# Feature matrix: one row per transaction, with the columns in the order
# [type, amount, oldbalanceOrg, newbalanceOrig]
x = np.array(data[["type", "amount", "oldbalanceOrg", "newbalanceOrig"]])

# Target vector: the column is selected with single brackets (a Series) so that
# y is 1-D with shape (n_samples,). Double brackets would produce a 2-D
# (n_samples, 1) column, which scikit-learn treats as a multi-output target
# and which triggers a DataConversionWarning in many estimators.
y = np.array(data["isFraud"])

In [9]:
# train_test_split() shuffles the rows and splits them into a training set and
# a testing set; test_size is the proportion held out for testing (20% here).
# fit() trains the model on the training set and score() returns its mean
# accuracy on the testing set.
# NOTE(review): if fraud cases are rare in this dataset, accuracy alone can look
# excellent while missing most of them — consider checking precision/recall.

# One seed for every random step so a fresh run reproduces the same numbers
SEED = 42

# Split the data into training and testing sets
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.20, random_state=SEED)

# Create a Decision Tree Classifier model.
# The tree is seeded too: scikit-learn randomly permutes the features at each
# split, so an unseeded tree can differ from one run to the next.
model = DecisionTreeClassifier(random_state=SEED)

# Fit the model to the training data
model.fit(xtrain, ytrain)

# Print the model's score on the testing data
print(model.score(xtest, ytest))

0.9997076675960532


In [10]:
# predict() takes one row per sample and returns an array holding the
# predicted label for each of those rows.

# A single transaction, laid out as [type, amount, oldbalanceOrg, newbalanceOrig]:
# type code 4, amount equal to the old balance, new balance of zero.
# reshape(1, -1) turns the flat list into a one-row, four-column array.
features = np.array([4, 9000.60, 9000.60, 0.0]).reshape(1, -1)

# Ask the trained model to classify that transaction
model.predict(features)

array(['Fraud'], dtype=object)

In [11]:
# Batch of hand-written transactions to experiment with; edit or extend freely.
# Column order: [type, amount, oldbalanceOrg, newbalanceOrig]
features = np.array(
    [
        [2, 1000.00, 1000.00, 0.0],  # type 2 = PAYMENT
        [4, 5000.00, 5000.00, 0.0],  # type 4 = TRANSFER
        [1, 1000.00, 1000.00, 0.0],  # type 1 = CASH_OUT
        [2, 1000.00, 1000.00, 0.0],  # type 2 = PAYMENT (same row as the first)
        [3, 1000.00, 1000.00, 0.0],  # type 3 = CASH_IN
        [5, 5000.00, 5000.00, 0.0],  # type 5 = DEBIT
        # NOTE(review): 8 is not one of the codes used when encoding 'type'
        # (1-5) — confirm this row is an intentional out-of-range probe
        [8, 1000.00, 1000.00, 0.0],
    ],
    dtype=float,
)

# Predict a label for every row of the batch
model.predict(features)

array(['No Fraud', 'Fraud', 'No Fraud', 'No Fraud', 'No Fraud',
       'No Fraud', 'No Fraud'], dtype=object)