# PAYMENT FRAUD DETECTION

**------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------**

## <font color=dodgerblue> Importing Libraries</font>

* **Pandas**

In [3]:
import pandas as pd

## <font color=dodgerblue> Reading the Dataset </font>

In [4]:
# Load the transaction log CSV (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer="PS_20174392719_1491204439457_log.csv")

## <font color=dodgerblue> Finding Information </font>

 - **Total Entries - <font color=deeppink> 6,362,620 </font>**
 - **Total Columns - <font color=deeppink> 11 </font>**
 - **Float Datatype - <font color=deeppink> 5 Columns </font>**
 - **Object Datatype - <font color=deeppink> 3 Columns </font>**
 - **Integer Datatype - <font color=deeppink> 3 Columns </font>**

In [6]:
# Print the frame's index range, column names, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


## <font color=dodgerblue> Checking Top 5 Records </font>

In [7]:
# Preview the first five rows (head() defaults to n=5).
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


## <font color=dodgerblue> Dropping all the unnecessary columns </font>

 - **step**

In [9]:
# Remove the "step" column. Reassigning (instead of inplace=True) and ignoring
# a missing column keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError because "step" is already gone.
data = data.drop(columns="step", errors="ignore")

## <font color=dodgerblue> Detailed Statistics on the Dataset </font>

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


## <font color=DodgerBlue> Finding the Percentage of Null Values in each Column </font>

 - **So we don't have any null values present in the Dataset**

In [12]:
# Percentage of missing values per column. The mean of the boolean null mask
# is the null fraction, so the mask over the full frame is built only once
# (the previous sum()/count() form computed isnull() twice).
percentage_of_null_values = data.isnull().mean() * 100
percentage_of_null_values.round(2)

type              0.0
amount            0.0
nameOrig          0.0
oldbalanceOrg     0.0
newbalanceOrig    0.0
nameDest          0.0
oldbalanceDest    0.0
newbalanceDest    0.0
isFraud           0.0
isFlaggedFraud    0.0
dtype: float64

## <font color=dodgerblue>Mapping the Values in the isFraud and isFlaggedFraud Columns</font>

 - **Mapping 0 as Genuine and 1 as Fraud in isFraud Column**
 - **Mapping 0 as Flagged Genuine and 1 as Flagged Fraud in isFlaggedFraud Column**

In [13]:
# Label the isFraud codes: 0 -> "Genuine", 1 -> "Fraud".
# replace() leaves values that are not 0/1 untouched, so re-running this cell
# after the column is already labelled is a no-op; map() would silently turn
# every already-labelled value into NaN.
data["isFraud"] = data["isFraud"].replace({0: "Genuine", 1: "Fraud"})

In [14]:
# Label the isFlaggedFraud codes: 0 -> "Flagged Genuine", 1 -> "Flagged Fraud".
# replace() leaves values that are not 0/1 untouched, so re-running this cell
# after the column is already labelled is a no-op; map() would silently turn
# every already-labelled value into NaN.
data["isFlaggedFraud"] = data["isFlaggedFraud"].replace(
    {0: "Flagged Genuine", 1: "Flagged Fraud"}
)

## <font color=dodgerblue>Counting the Genuine and Fraud Data in isFraud Column</font>

In [16]:
# Number of rows per isFraud label.
data["isFraud"].value_counts()

Genuine    6354407
Fraud         8213
Name: isFraud, dtype: int64

## <font color=dodgerblue>Counting the Flagged Genuine and Flagged Fraud Data in isFlaggedFraud Column</font>

In [17]:
# Number of rows per isFlaggedFraud label.
data["isFlaggedFraud"].value_counts()

Flagged Genuine    6362604
Flagged Fraud           16
Name: isFlaggedFraud, dtype: int64

## <font color=dodgerblue>Creating separate dataframes for the Genuine and the Fraud Data</font>

 - **We are creating separate dataframes because analysing the genuine and fraud transactions mixed together won't give us the right insights**

- **<font color=deeppink>genuine_data</font> dataframe for the genuine data**
- **<font color=deeppink>fraud_data</font> dataframe for the fraud data**

In [19]:
# Keep only the rows whose isFraud label is "Genuine".
genuine_data = data.loc[data["isFraud"].eq("Genuine")]

In [20]:
# Keep only the rows whose isFraud label is "Fraud".
fraud_data = data.loc[data["isFraud"].eq("Fraud")]

## <font color=dodgerblue>Checking the Dataframes</font>

In [22]:
# Preview the first five genuine transactions.
genuine_data.head(n=5)

Unnamed: 0,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,Genuine,Flagged Genuine
1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,Genuine,Flagged Genuine
4,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,Genuine,Flagged Genuine
5,PAYMENT,7817.71,C90045638,53860.0,46042.29,M573487274,0.0,0.0,Genuine,Flagged Genuine
6,PAYMENT,7107.77,C154988899,183195.0,176087.23,M408069119,0.0,0.0,Genuine,Flagged Genuine


In [23]:
# Preview the first five fraudulent transactions.
fraud_data.head(n=5)

Unnamed: 0,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
2,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,Fraud,Flagged Genuine
3,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,Fraud,Flagged Genuine
251,TRANSFER,2806.0,C1420196421,2806.0,0.0,C972765878,0.0,0.0,Fraud,Flagged Genuine
252,CASH_OUT,2806.0,C2101527076,2806.0,0.0,C1007251739,26202.0,0.0,Fraud,Flagged Genuine
680,TRANSFER,20128.0,C137533655,20128.0,0.0,C1848415041,0.0,0.0,Fraud,Flagged Genuine


## <font color=dodgerblue>Analysis on the Fraud Data</font>

 - **Total Count of the Fraud Data**

In [26]:
# Row count of the fraud subset.
print(f"Total Count of Fraud Data - {fraud_data.shape[0]}")

Total Count of Fraud Data - 8213


## <font color=dodgerblue>Checking Flagged Correctly or Wrongly</font>

 - **Flagged Correctly means the transaction was fraud and it was flagged as fraud**

In [27]:
# Fraud transactions that were also flagged as fraud.
# Count once from a boolean mask instead of filtering the frame twice.
caught_fraud_count = int((fraud_data["isFlaggedFraud"] == "Flagged Fraud").sum())
print(f"Flagged Fraud Count - {caught_fraud_count}")
# Scale to percent before rounding: round(ratio, 4) * 100 can reintroduce
# floating-point noise (e.g. 56.779999999999994) in the printed value.
print(f"Flagged Fraud Percentage - {round(caught_fraud_count / len(fraud_data) * 100, 2)} %")

Flagged Fraud Count - 16
Flagged Fraud Percentage - 0.19 %


 - **Flagged Wrongly means the transaction was fraud but it was flagged as genuine**

In [28]:
# Fraud transactions that were flagged as genuine (missed by the flag).
# Count once from a boolean mask instead of filtering the frame twice.
missed_fraud_count = int((fraud_data["isFlaggedFraud"] == "Flagged Genuine").sum())
print(f"Flagged Genuine Count - {missed_fraud_count}")
# Scale to percent before rounding: round(ratio, 4) * 100 can reintroduce
# floating-point noise (e.g. 56.779999999999994) in the printed value.
print(f"Flagged Genuine Percentage - {round(missed_fraud_count / len(fraud_data) * 100, 2)} %")

Flagged Genuine Count - 8197
Flagged Genuine Percentage - 99.81 %


## <font color=dodgerblue>Analysis on the Genuine Data</font>

 - **Total Count of the Genuine Data**

In [29]:
# Row count of the genuine subset.
print(f" Total Count of Genuine Data - {genuine_data.shape[0]}")

 Total Count of Genuine Data - 6354407


## <font color=dodgerblue>Checking Flagged Correctly or Wrongly</font>

 - **Flagged Correctly means the transaction was genuine and it was flagged as genuine**

In [30]:
# Genuine transactions that were flagged as genuine.
# Count once from a boolean mask instead of filtering the frame twice.
cleared_genuine_count = int((genuine_data["isFlaggedFraud"] == "Flagged Genuine").sum())
print(f"Flagged Genuine Count - {cleared_genuine_count}")
# Scale to percent before rounding: round(ratio, 4) * 100 can reintroduce
# floating-point noise (e.g. 56.779999999999994) in the printed value.
print(f"Flagged Genuine Percentage - {round(cleared_genuine_count / len(genuine_data) * 100, 2)} %")

Flagged Genuine Count - 6354407
Flagged Genuine Percentage - 100.0 %


 - **Flagged Wrongly means the transaction was genuine but it was flagged as fraud**

In [31]:
# Genuine transactions that were flagged as fraud (false alarms).
# Count once from a boolean mask instead of filtering the frame twice.
false_alarm_count = int((genuine_data["isFlaggedFraud"] == "Flagged Fraud").sum())
print(f"Flagged Fraud Count - {false_alarm_count}")
# Scale to percent before rounding: round(ratio, 4) * 100 can reintroduce
# floating-point noise (e.g. 56.779999999999994) in the printed value.
print(f"Flagged Fraud Percentage - {round(false_alarm_count / len(genuine_data) * 100, 2)} %")

Flagged Fraud Count - 0
Flagged Fraud Percentage - 0.0 %
