In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE  # if using oversampling

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping


In [11]:
# 1. Load data: read the transaction log CSV (path relative to the
#    notebook's working directory) into the DataFrame `df`.
df = pd.read_csv(
    filepath_or_buffer='Synthetic_Financial_datasets_log.csv',
)

In [None]:

# Exploratory overview of `df` (the DataFrame loaded by an earlier cell):
# shape, column names, dtypes, sample rows, summary statistics, missing
# values, categorical columns, and the distribution of the fraud label.

# Upper bound on the value-count rows printed per categorical column.
# Identifier-like columns can have close to one distinct value per row, so
# their frequency table is capped explicitly instead of relying on whatever
# pandas display options happen to be active.
MAX_CATEGORIES_SHOWN = 20


def _print_section(title, leading_blank=True):
    """Print `title` framed above and below by a 50-character '=' rule.

    Args:
        title: Heading text placed between the two rules.
        leading_blank: When True, emit an empty line before the first rule
            so the section is separated from the preceding output.
    """
    rule = "=" * 50
    print(("\n" if leading_blank else "") + rule)
    print(title)
    print(rule)


_print_section("DATASET OVERVIEW", leading_blank=False)

# Basic info
print(f"\nDataset shape: {df.shape}")
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")

_print_section("COLUMN NAMES")
print(df.columns.tolist())

_print_section("DATA TYPES")
print(df.dtypes)

_print_section("FIRST 5 ROWS")
print(df.head())

_print_section("BASIC STATISTICS")
print(df.describe())

_print_section("MISSING VALUES")
print(df.isnull().sum())

_print_section("CATEGORICAL COLUMNS (object type)")
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print(categorical_cols)

for col in categorical_cols:
    # One pass per column; the length of the result is the number of
    # distinct non-null values.
    counts = df[col].value_counts()
    if len(counts) > MAX_CATEGORIES_SHOWN:
        # High-cardinality column: report how many distinct values exist and
        # show only the most frequent ones.
        print(f"\n{col} - {len(counts)} unique values (top {MAX_CATEGORIES_SHOWN} shown):")
        print(counts.head(MAX_CATEGORIES_SHOWN))
    else:
        print(f"\n{col} - Unique values:")
        print(counts)

_print_section("TARGET VARIABLE DISTRIBUTION")
# Candidate names for the fraud label; the first one present in `df` is used.
fraud_col_names = ['isFraud', 'is_fraud', 'fraud', 'Class', 'label']
for col_name in fraud_col_names:
    if col_name in df.columns:
        print(f"\n{col_name} distribution:")
        print(df[col_name].value_counts())
        if pd.api.types.is_numeric_dtype(df[col_name]):
            # NOTE(review): the percentage is only meaningful if fraud rows
            # are coded 1 and all other rows 0 — confirm for labels other
            # than 'isFraud'.
            print(f"\nFraud percentage: {(df[col_name].sum() / len(df)) * 100:.2f}%")
        else:
            # Summing a non-numeric label would raise or be meaningless.
            print(f"\nFraud percentage: not computed ({col_name} is not numeric)")
        break
else:
    # Reached only when the loop never hit `break`, i.e. no candidate matched.
    print(f"\nNo target column found; looked for: {fraud_col_names}")

DATASET OVERVIEW

Dataset shape: (6362620, 11)
Number of rows: 6362620
Number of columns: 11

COLUMN NAMES
['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud']

DATA TYPES
step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object

FIRST 5 ROWS
   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720 