In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

import warnings
# Suppress all warnings from this point on (no category or module filter is given).
# NOTE(review): this presumably also hides deprecation notices from the libraries
# imported above — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Mount Google Drive at /content/drive; the CSV paths used by this notebook
# all point below that mount point.
# Requires a Google Colab runtime (the `google.colab` package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# All three CSV files live in the same Drive folder, so the folder is written
# once and each file name is appended to it.
DATA_DIR = "/content/drive/MyDrive/GNCIPL_Internship_Projects/Finance/Data"

original_data_path = f"{DATA_DIR}/Bank_Transaction.csv"
synthetic_data_path = f"{DATA_DIR}/Synthetic_Bank_Data.csv"
augmented_data_path = f"{DATA_DIR}/Augmented_data.csv"

In [4]:
# Load the original and the synthetic transaction tables from their CSV files
# (the *_path variables are defined in the path-configuration cell).
original_data = pd.read_csv(original_data_path)
synthetic_data = pd.read_csv(synthetic_data_path)

# Analysing the Synthetic Data

In [6]:
# ---- Step 1: Basic Characteristics ----
# Prints a banner, the dimensions of `synthetic_data`, and the dtype of each
# of its columns.
heavy_rule = "=" * 50
light_rule = "-" * 50

print(f"\n{heavy_rule}")
print("## Step 1: Basic Characteristics of the Data")
print(f"{heavy_rule}\n")

# Dimensions: the full shape tuple first, then rows and columns separately.
n_rows, n_cols = synthetic_data.shape
print(f"🔹 Shape of dataset: {synthetic_data.shape} ")
print(f"🔹 Number of observations: {n_rows}")
print(f"🔹 Number of variables: {n_cols}")
print(f"\n{light_rule}")

# One "column  dtype" line per column.
print("🔹 Data Types:\n")
column_dtypes = synthetic_data.dtypes.to_string()
print(column_dtypes)
print(f"{heavy_rule}\n")



## Step 1: Basic Characteristics of the Data

🔹 Shape of dataset: (20000, 24) 
🔹 Number of observations: 20000
🔹 Number of variables: 24

--------------------------------------------------
🔹 Data Types:

Customer_ID                 object
Customer_Name               object
Gender                      object
Age                          int64
State                       object
City                        object
Bank_Branch                 object
Account_Type                object
Transaction_ID              object
Transaction_Date            object
Transaction_Time            object
Transaction_Amount         float64
Merchant_ID                 object
Transaction_Type            object
Merchant_Category           object
Account_Balance            float64
Transaction_Device          object
Transaction_Location        object
Device_Type                 object
Is_Fraud                     int64
Transaction_Currency        object
Customer_Contact            object
Transaction_Description  

In [7]:
# ---- Step 2: Overall Structure ----
# Prints a banner, the total number of missing cells in `synthetic_data`, and
# the Is_Fraud class balance as counts and as percentages.
section_rule = "=" * 50
print(f"\n{section_rule}")
print("## Step 2: Overall Structure of the Dataset")
print(f"{section_rule}\n")

# Missing values: per-column null counts, collapsed into one grand total.
missing = synthetic_data.isnull().sum()
total_missing = missing.sum()
print(f"🔹 Total Missing Values: {total_missing}\n")

# Target variable: absolute counts first ...
print("🔹 Target Variable: Class Distribution")
class_counts = synthetic_data['Is_Fraud'].value_counts()
print(class_counts.to_string())

# ... then the same distribution as percentages rounded to 3 decimals.
print("\nPercentage Distribution:")
class_percentages = synthetic_data['Is_Fraud'].value_counts(normalize=True).mul(100).round(3)
print(class_percentages.to_string())
print("\n" + "-" * 50)


## Step 2: Overall Structure of the Dataset

🔹 Total Missing Values: 0

🔹 Target Variable: Class Distribution
Is_Fraud
1    20000

Percentage Distribution:
Is_Fraud
1    100.0

--------------------------------------------------


### Class Distribution

In [8]:
# Pie chart of the Is_Fraud class balance in the synthetic data.
#
# Slice labels are derived from the class codes in the value_counts() index
# instead of being hard-coded as a single 'Fraud' entry, so names and values
# always have the same length and each count is paired with its own class.
# Code 1 -> 'Fraud' follows the label this cell used originally;
# code 0 -> 'No Fraud' is assumed to be the complementary class — TODO confirm.
class_labels = {0: 'No Fraud', 1: 'Fraud'}

value_count = synthetic_data['Is_Fraud'].value_counts()
# Any unexpected code falls back to its string form rather than raising.
slice_names = [class_labels.get(code, str(code)) for code in value_count.index]

fig = px.pie(
    names=slice_names,
    values=value_count,
    title="Class Distribution",
    color=slice_names,
    color_discrete_map={'No Fraud': 'skyblue', 'Fraud': 'salmon'}
)

# Show both the percentage and the class label inside each slice.
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

# Analysing the Augmented Data

In [11]:
# Load the augmented dataset from the CSV at augmented_data_path.
augmented_data = pd.read_csv(augmented_data_path)

# Bare last expression: the cell displays the (rows, columns) tuple.
augmented_data.shape

(235000, 24)

In [10]:
# Pie chart of the Is_Fraud class balance in the augmented data.
#
# value_counts() orders its result by frequency, not by class code, so pairing
# it positionally with a fixed ['No Fraud', 'Fraud'] list mislabels the slices
# whenever class 1 is the larger class. The labels are therefore looked up from
# the class codes in the value_counts() index.
# assumes 0 = 'No Fraud' and 1 = 'Fraud' — TODO confirm against the data dictionary
class_labels = {0: 'No Fraud', 1: 'Fraud'}

value_count = augmented_data['Is_Fraud'].value_counts()
# Any unexpected code falls back to its string form rather than raising.
slice_names = [class_labels.get(code, str(code)) for code in value_count.index]

fig = px.pie(
    names=slice_names,
    values=value_count,
    title="Class Distribution",
    color=slice_names,
    color_discrete_map={'No Fraud': 'skyblue', 'Fraud': 'salmon'}
)

# Show both the percentage and the class label inside each slice.
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()