# **Statistical analysis**

In [None]:
import pandas as pd

In [None]:
# Location of the raw transactions CSV. The /content/drive/MyDrive prefix is
# presumably a Google Drive mount in Colab -- confirm the drive is mounted
# before running this cell.
DATASET_CSV = "/content/drive/MyDrive/ML and DL DataSets/End_to_End_Financial_Fraud_Anomaly_Detection/Financial_Fraud_Dataset.csv"

# Read the whole file into memory; every later cell works on this frame.
df = pd.read_csv(DATASET_CSV)

In [None]:
# 1. Display the dataset's dimensions as a (rows, columns) tuple.
#    Nothing is printed explicitly: the notebook renders the value of the
#    cell's last expression.
df.shape

(6362620, 11)

In [None]:
# 2. Print a structural summary of the frame: index range, column names,
#    per-column dtypes, and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


**The columns include:**
1. **step:** An integer representing the time step of the transaction.
2. **type:** Categorical variable indicating the type of transaction (CASH_IN, CASH_OUT, DEBIT, PAYMENT, or TRANSFER).
3. **amount:** Float value representing the amount of the transaction.
4. **nameOrig:** Object type representing the name of the origin account.
5. **oldbalanceOrg:** Float value indicating the old balance of the origin account before the transaction.
6. **newbalanceOrig:** Float value indicating the new balance of the origin account after the transaction.
7. **nameDest:** Object type representing the name of the destination account.
8. **oldbalanceDest:** Float value indicating the old balance of the destination account before the transaction.
9. **newbalanceDest:** Float value indicating the new balance of the destination account after the transaction.
10. **isFraud:** Binary integer indicating whether the transaction is fraudulent (1) or not (0).
11. **isFlaggedFraud:** Binary integer indicating whether the transaction was flagged as fraudulent (1) or not (0).

In [None]:
# 3. Summary statistics (count, mean, std, min, quartiles, max) for the
#    numeric columns; the object columns (type, nameOrig, nameDest) are not
#    part of this table.
df.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


In [None]:
# 4. Count the missing entries in every column (0 means the column is complete).
df.isna().sum()

Unnamed: 0,0
step,0
type,0
amount,0
nameOrig,0
oldbalanceOrg,0
newbalanceOrig,0
nameDest,0
oldbalanceDest,0
newbalanceDest,0
isFraud,0


In [None]:
# 5. Frequency table for the transaction type and for each of the two
#    binary label columns, printed one after another.
print("\nValue Counts for Categorical Variables:")
for label in ('type', 'isFraud', 'isFlaggedFraud'):
    print(df[label].value_counts())


Value Counts for Categorical Variables:
type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64
isFraud
0    6354407
1       8213
Name: count, dtype: int64
isFlaggedFraud
0    6362604
1         16
Name: count, dtype: int64


In [None]:
# 6. Flagged-fraud analysis: how many transactions carry isFlaggedFraud = 0 vs 1.
#    The counts are kept in `flagged_fraud_counts` so later cells can reuse them.
flagged_fraud_counts = df['isFlaggedFraud'].value_counts()
print("\nFlagged Fraud Counts:", flagged_fraud_counts, sep="\n")


Flagged Fraud Counts:
isFlaggedFraud
0    6362604
1         16
Name: count, dtype: int64


In [None]:
# 7. For every column, print how often each distinct value occurs,
#    followed by a separator line so the per-column sections are easy to tell apart.
for column_name, column_values in df.items():
    print(
        f"Unique values for column {column_name}",
        column_values.value_counts(),
        "======================================================",
        sep="\n",
    )

Unique values for column step
step
19     51352
18     49579
187    49083
235    47491
307    46968
       ...  
706        4
721        4
693        4
112        2
662        2
Name: count, Length: 743, dtype: int64
Unique values for column type
type
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: count, dtype: int64
Unique values for column amount
amount
10000000.00    3207
10000.00         88
5000.00          79
15000.00         68
500.00           65
               ... 
151849.59         1
341252.72         1
431409.04         1
344148.22         1
258347.61         1
Name: count, Length: 5316900, dtype: int64
Unique values for column nameOrig
nameOrig
C1530544995    3
C545315117     3
C724452879     3
C1784010646    3
C1677795071    3
              ..
C1567523029    1
C644777639     1
C1256645416    1
C1231536757    1
C1971151096    1
Name: count, Length: 6353307, dtype: int64
Unique values for column oldbalanceOrg
oldbalan

In [None]:
# 7. Share of transactions labelled isFraud and isFlaggedFraud.
#    Both label columns hold only 0/1, so summing a column counts its positive rows.
total_transactions = df.shape[0]
fraud_count, flagged_fraud_count = (
    df[label].sum() for label in ('isFraud', 'isFlaggedFraud')
)

# Convert the raw counts into percentages of all transactions.
fraud_percentage = (fraud_count / total_transactions) * 100
flagged_fraud_percentage = (flagged_fraud_count / total_transactions) * 100

# Flagged fraud is far rarer, so it is reported with more decimal places.
print(
    f"Percentage of isFraud: {fraud_percentage:.2f}%\n"
    f"Percentage of isFlaggedFraud: {flagged_fraud_percentage:.5f}%"
)

Percentage of isFraud: 0.13%
Percentage of isFlaggedFraud: 0.00025%


In [None]:
#8. For each transaction type (like PAYMENT, TRANSFER, CASH_OUT, etc.), how many transactions were fraudulent (isFraud = 1) and how many were not fraudulent (isFraud = 0)
# size() tallies the rows of each (type, isFraud) group exactly once, whereas
# count() would repeat the same non-null tally for every remaining column.
# unstack(fill_value=0) pivots isFraud into columns (0 and 1) so that
# transaction types with no fraudulent rows show an explicit 0 instead of
# being silently absent from the table.
df.groupby(['type', 'isFraud']).size().unstack(fill_value=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFlaggedFraud
type,isFraud,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
CASH_IN,0,1399284,1399284,1399284,1399284,1399284,1399284,1399284,1399284,1399284
CASH_OUT,0,2233384,2233384,2233384,2233384,2233384,2233384,2233384,2233384,2233384
CASH_OUT,1,4116,4116,4116,4116,4116,4116,4116,4116,4116
DEBIT,0,41432,41432,41432,41432,41432,41432,41432,41432,41432
PAYMENT,0,2151495,2151495,2151495,2151495,2151495,2151495,2151495,2151495,2151495
TRANSFER,0,528812,528812,528812,528812,528812,528812,528812,528812,528812
TRANSFER,1,4097,4097,4097,4097,4097,4097,4097,4097,4097


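Fraudulent transactions occur **only** in the **CASH_OUT** (4,116) and **TRANSFER** (4,097) types; **CASH_IN**, **DEBIT**, and **PAYMENT** contain no fraudulent transactions at all. Although the two fraud counts are similar, TRANSFER has far fewer transactions overall, so its fraud rate (about 0.77%) is roughly four times that of CASH_OUT (about 0.18%). This indicates that these two types are the ones vulnerable to fraudulent activity.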