Fraud Detection System: Develop a Bayesian Belief Network to detect fraudulent transactions based on transaction features, such as amount, location, and time. Use the model to identify suspicious transactions and reduce financial losses due to fraud.

# Implementation of Bayesian Belief Network for Fraud Detection.

#### Import libraries

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from datetime import datetime as dt
import numpy as np
from sklearn.preprocessing import LabelEncoder
from pgmpy.estimators import MaximumLikelihoodEstimator,BayesianEstimator
from pgmpy.models import BayesianModel
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination

#### Load Dataset

In [2]:
file = pd.read_csv("fraudTest_.csv")
df = pd.DataFrame(file)
df.head(5)

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


#### Data Cleansing

In [3]:
# selecting the required features
trans_data = df[['amt', 'state','trans_date_trans_time','is_fraud']] 

#Rename the columns Amount & Location
trans_data.rename(columns= {'amt':'Amount','state':'Location'},inplace =True) 

 #Convert to datetime format
trans_data["trans_date_trans_time"] = pd.to_datetime(trans_data["trans_date_trans_time"])

# fetched Time
trans_data["trans_time"] = trans_data["trans_date_trans_time"].dt.hour 

#Dropping the unnecessary column
trans_data = trans_data.drop(['trans_date_trans_time'],axis=1) 
print("Dimension of data:",trans_data.shape)

#checking if any null values present
trans_data.isnull().sum()

Dimension of data: (555719, 4)


Amount        0
Location      0
is_fraud      0
trans_time    0
dtype: int64

In [4]:
le = LabelEncoder()
trans_data['Location']  = le.fit_transform(trans_data['Location']) # convert the object to numeric
trans_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Amount      555719 non-null  float64
 1   Location    555719 non-null  int32  
 2   is_fraud    555719 non-null  int64  
 3   trans_time  555719 non-null  int64  
dtypes: float64(1), int32(1), int64(2)
memory usage: 14.8 MB


In [5]:
print("Clean Dataset : \n")
print(trans_data.head(5))

Clean Dataset : 

   Amount  Location  is_fraud  trans_time
0    2.86        39         0          12
1   29.84        43         0          12
2   41.28        33         0          12
3   60.05         8         0          12
4    3.19        21         0          12


#### Building the network(DAG)

In [6]:
model = BayesianModel([('Amount', 'is_fraud'), ('Location', 'is_fraud'),('trans_time', 'is_fraud')])

#### Training the model

In [7]:
model.fit(trans_data,estimator = MaximumLikelihoodEstimator )

#### Conditional Probability Table of Amount,Location,time

In [8]:
#print("\nCPD:Amount\n",model.get_cpds('Amount'))
#print("\nCPD:Location\n",model.get_cpds('Location'))

print("\nCPD:Time\n",model.get_cpds('trans_time'))


CPD:Time
 +----------------+-----------+
| trans_time(0)  | 0.0326658 |
+----------------+-----------+
| trans_time(1)  | 0.03322   |
+----------------+-----------+
| trans_time(2)  | 0.0326424 |
+----------------+-----------+
| trans_time(3)  | 0.0327486 |
+----------------+-----------+
| trans_time(4)  | 0.0325254 |
+----------------+-----------+
| trans_time(5)  | 0.0322411 |
+----------------+-----------+
| trans_time(6)  | 0.0325812 |
+----------------+-----------+
| trans_time(7)  | 0.0325668 |
+----------------+-----------+
| trans_time(8)  | 0.0323779 |
+----------------+-----------+
| trans_time(9)  | 0.0324732 |
+----------------+-----------+
| trans_time(10) | 0.0324786 |
+----------------+-----------+
| trans_time(11) | 0.0325488 |
+----------------+-----------+
| trans_time(12) | 0.0504518 |
+----------------+-----------+
| trans_time(13) | 0.0507055 |
+----------------+-----------+
| trans_time(14) | 0.0507523 |
+----------------+-----------+
| trans_time(15) | 0.0504716

#### Inference from the Bayesian Belief Network

In [9]:
infer = VariableElimination(model)

In [10]:
q=infer.query(variables=['is_fraud'],evidence = {"Location": 8}) # a local test data to find the inference 
print(q)

+-------------+-----------------+
| is_fraud    |   phi(is_fraud) |
| is_fraud(0) |          0.5422 |
+-------------+-----------------+
| is_fraud(1) |          0.4578 |
+-------------+-----------------+


In [11]:
q=infer.query(variables=['is_fraud'],evidence = {"Amount": 41.28})
print(q)

+-------------+-----------------+
| is_fraud    |   phi(is_fraud) |
| is_fraud(0) |          0.5237 |
+-------------+-----------------+
| is_fraud(1) |          0.4763 |
+-------------+-----------------+


In [12]:
q=infer.query(variables=['is_fraud'],evidence = {"trans_time": 12})
print(q)

+-------------+-----------------+
| is_fraud    |   phi(is_fraud) |
| is_fraud(0) |          0.5480 |
+-------------+-----------------+
| is_fraud(1) |          0.4520 |
+-------------+-----------------+


#### Detect Fraud transactions

In [13]:
suspicious_transactions = []
for index, row in trans_data.iterrows():
    evidence = {'Amount': row['Amount'], 'Location': row['Location'], 'trans_time': row['trans_time'] % 24}
    query = 'is_fraud'
    
    posterior = infer.query(variables = [query],evidence=evidence,joint = False)
    p = posterior['is_fraud'].values
    print(p)
    if p[1] >= 0.5:
        suspicious_transactions.append(index)


print("Suspicious transactions: \n") # Print the suspicious transactions
print(suspicious_transactions)


[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]
[1. 0.]


KeyboardInterrupt: 