## **Feature_Engineering**

In [None]:
import pandas as pd

In [None]:
# Read the fraud dataset CSV into a DataFrame (the path sits under /content/drive/MyDrive).
df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/ML and DL DataSets/End_to_End_Financial_Fraud_Anomaly_Detection/Financial_Fraud_Dataset.csv"
)

### **Create new features**

In [None]:
# Amount that left the originating account: old balance minus new balance.
df['balanceDeltaOrg'] = df['oldbalanceOrg'].sub(df['newbalanceOrig'])
# Amount that arrived at the destination account: new balance minus old balance.
df['balanceDeltaDest'] = df['newbalanceDest'].sub(df['oldbalanceDest'])

In [None]:
# Preview the first five values of the destination-side balance delta.
print(df['balanceDeltaDest'].head(n=5))

0        0.0
1        0.0
2        0.0
3   -21182.0
4        0.0
Name: balanceDeltaDest, dtype: float64


In [None]:
# Preview the first five values of the origin-side balance delta.
print(df['balanceDeltaOrg'].head(n=5))

0     9839.64
1     1864.28
2      181.00
3      181.00
4    11668.14
Name: balanceDeltaOrg, dtype: float64


In [None]:
# Show the first five rows, now including the two balance-delta columns.
df.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,balanceDeltaOrg,balanceDeltaDest
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,9839.64,0.0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,1864.28,0.0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,181.0,0.0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,181.0,-21182.0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,11668.14,0.0


### **Dropping features**

In [None]:
# Remove three columns that are not used as model inputs below:
# the `step` counter and the two account-name columns (`nameOrig`, `nameDest`).
df = df.drop(columns=['step', 'nameOrig', 'nameDest'])
df.head(n=5)

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,balanceDeltaOrg,balanceDeltaDest
0,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0,0,9839.64,0.0
1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0,0,1864.28,0.0
2,TRANSFER,181.0,181.0,0.0,0.0,0.0,1,0,181.0,0.0
3,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1,0,181.0,-21182.0
4,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0,0,11668.14,0.0


## **Preprocessing**

### **Implement StandardScaler and OneHotEncoder**

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Columns handled as numeric inputs (raw amounts/balances plus the engineered deltas)
numeric_features = [
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
    'balanceDeltaOrg',
    'balanceDeltaDest',
]
# Columns handled as categorical inputs
categorical_features = ['type']

In [None]:
# Numeric branch: standardize every numeric column with StandardScaler.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Categorical branch: one-hot encode the categorical columns.
# handle_unknown='ignore' makes transform() encode a category that was not
# present at fit time as an all-zero row instead of raising, so validation,
# test, or future data with a new `type` value does not crash the pipeline.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its branch. Columns not listed in either group
# are dropped (ColumnTransformer's default `remainder='drop'`).
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

## **Train-Test Split**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target (y).
# `isFraud` is the label; `isFlaggedFraud` is removed from the inputs as well.
X = df.drop(columns=['isFraud', 'isFlaggedFraud'])
y = df['isFraud']

# 1. Hold out 15% of all rows as the test set.
#    stratify=y keeps the fraud / non-fraud ratio the same in both parts,
#    which matters because the positive class is rare.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

# 2. Split the remaining 85% into training and validation sets.
#    0.15 / 0.85 (~17.65% of the remainder) makes validation ~15% of the total.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.15 / 0.85, random_state=42,
    stratify=y_train_val
)

# Sanity check: the three parts must account for every row exactly once.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

# Print the shapes of the resulting sets
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)
print("y_test shape:", y_test.shape)

X_train shape: (4453833, 8)
X_val shape: (954394, 8)
X_test shape: (954393, 8)
y_train shape: (4453833,)
y_val shape: (954394,)
y_test shape: (954393,)


In [None]:
# First five rows of the training features
X_train.head(n=5)

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,balanceDeltaOrg,balanceDeltaDest
3572030,PAYMENT,6585.21,232407.98,225822.77,0.0,0.0,6585.21,0.0
1919224,PAYMENT,2025.88,375421.38,373395.5,0.0,0.0,2025.88,0.0
5272205,PAYMENT,9969.13,0.0,0.0,0.0,0.0,0.0,0.0
84405,PAYMENT,3465.7,50939.0,47473.3,0.0,0.0,3465.7,0.0
770006,CASH_OUT,147562.88,0.0,0.0,3322010.27,3469573.15,0.0,147562.88


In [None]:
# First five rows of the test features
X_test.head(n=5)

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,balanceDeltaOrg,balanceDeltaDest
3737323,CASH_IN,330218.42,20866.0,351084.42,452419.57,122201.15,-330218.42,-330218.42
264914,PAYMENT,11647.08,30370.0,18722.92,0.0,0.0,11647.08,0.0
85647,CASH_IN,152264.21,106589.0,258853.21,201303.01,49038.8,-152264.21,-152264.21
5899326,TRANSFER,1551760.63,0.0,0.0,3198359.45,4750120.08,0.0,1551760.63
2544263,CASH_IN,78172.3,2921331.58,2999503.88,415821.9,337649.6,-78172.3,-78172.3


In [None]:
# First five labels of the training target
y_train.head(n=5)

Unnamed: 0,isFraud
3572030,0
1919224,0
5272205,0
84405,0
770006,0


In [None]:
# First five labels of the test target (y_test)
y_test.head(n=5)

Unnamed: 0,isFraud
3737323,0
264914,0
85647,0
5899326,0
2544263,0


In [None]:
# Fit the preprocessor on the training split only, and transform that split.
X_train_transformed = preprocessor.fit_transform(X_train)

# Re-use the already fitted preprocessor (no refitting) for the validation
# split first and then the test split.
X_val_transformed, X_test_transformed = (
    preprocessor.transform(held_out) for held_out in (X_val, X_test)
)

In [None]:
# Show the first 5 rows of the transformed training data
print(X_train_transformed[0:5])

[[-0.28962896 -0.208166   -0.21512383 -0.32304018 -0.33262238  0.18819303
  -0.15308879  0.          0.          0.          1.          0.        ]
 [-0.29726023 -0.15867157 -0.16467294 -0.32304018 -0.33262238  0.15728551
  -0.15308879  0.          0.          0.          1.          0.        ]
 [-0.28396505 -0.28859832 -0.29232615 -0.32304018 -0.33262238  0.14355214
  -0.15308879  0.          0.          0.          1.          0.        ]
 [-0.2948503  -0.27096923 -0.27609639 -0.32304018 -0.33262238  0.16704599
  -0.15308879  0.          0.          0.          1.          0.        ]
 [-0.05366467 -0.28859832 -0.29232615  0.65252943  0.61018118  0.14355214
   0.02901186  0.          1.          0.          0.          0.        ]]


In [None]:
# Show the first 5 rows of the transformed validation data
print(X_val_transformed[0:5])

[[ 0.15466307 -0.17629678 -0.27439001  0.61922223  0.61318342  1.98762648
   0.18260984  0.          1.          0.          0.          0.        ]
 [ 0.34719656 -0.26949734 -0.29232615 -0.31097387 -0.21627996  0.51769663
   0.32456288  0.          0.          0.          0.          1.        ]
 [ 0.0170275   3.70156564  3.71417184 -0.01144693 -0.09587637 -1.14308243
  -0.38731007  1.          0.          0.          0.          0.        ]
 [-0.15042267 -0.28859832 -0.29232615  0.4303727   0.31639797  0.14355214
  -0.37162573  0.          1.          0.          0.          0.        ]
 [-0.05116221 -0.27873324 -0.29232615 -0.32304018 -0.29211815  0.33678646
   0.03085689  0.          1.          0.          0.          0.        ]]


In [None]:
# Show the first 5 rows of the transformed test data
print(X_test_transformed[0:5])

[[ 0.25205882 -0.28137696 -0.17230046 -0.19017883 -0.29941608 -2.09498608
  -0.56059633  1.          0.          0.          0.          0.        ]
 [-0.28115655 -0.2780878  -0.28592532 -0.32304018 -0.33262238  0.22250726
  -0.15308879  0.          0.          0.          1.          0.        ]
 [-0.04579572 -0.25170974 -0.20383167 -0.26392384 -0.31929683 -0.8886412
  -0.34099113  1.          0.          0.          0.          0.        ]
 [ 2.29664036 -0.28859832 -0.29232615  0.61621709  0.95815026  0.14355214
   1.7618685   0.          0.          0.          0.          1.        ]
 [-0.16980859  0.72242327  0.73311815 -0.20092641 -0.24087125 -0.38637493
  -0.24955767  1.          0.          0.          0.          0.        ]]


In [None]:
# View all output feature names after preprocessing.
# Each name is prefixed with the ColumnTransformer branch it came from
# ('num__' or 'cat__'), as the printed result shows.
feature_names = preprocessor.get_feature_names_out()
print(feature_names)

['num__amount' 'num__oldbalanceOrg' 'num__newbalanceOrig'
 'num__oldbalanceDest' 'num__newbalanceDest' 'num__balanceDeltaOrg'
 'num__balanceDeltaDest' 'cat__type_CASH_IN' 'cat__type_CASH_OUT'
 'cat__type_DEBIT' 'cat__type_PAYMENT' 'cat__type_TRANSFER']


## **SMOTE_Technique**

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the minority class of the *training* split only; the validation
# and test splits are left untouched.
# sampling_strategy=0.4 -> after resampling, minority count / majority count = 0.4.
# k_neighbors=3         -> number of nearest neighbours used to build synthetic samples.
# NOTE(review): the input already contains the one-hot `type` columns, and plain
# SMOTE interpolates every column, so synthetic rows may carry fractional one-hot
# values. Consider whether SMOTENC on the un-encoded data is more appropriate.
smote = SMOTE(sampling_strategy=0.4, k_neighbors=3, random_state=42)

X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_transformed,
    y_train,
)

In [None]:
# Report the sizes of the oversampled training features and labels
print(f"X_train_resampled shape: {X_train_resampled.shape}")
print(f"y_train_resampled shape: {y_train_resampled.shape}")

X_train_resampled shape: (6227267, 12)
y_train_resampled shape: (6227267,)


In [None]:
# Show the first 5 rows of the resampled training features
print(X_train_resampled[0:5])

[[-0.28962896 -0.208166   -0.21512383 -0.32304018 -0.33262238  0.18819303
  -0.15308879  0.          0.          0.          1.          0.        ]
 [-0.29726023 -0.15867157 -0.16467294 -0.32304018 -0.33262238  0.15728551
  -0.15308879  0.          0.          0.          1.          0.        ]
 [-0.28396505 -0.28859832 -0.29232615 -0.32304018 -0.33262238  0.14355214
  -0.15308879  0.          0.          0.          1.          0.        ]
 [-0.2948503  -0.27096923 -0.27609639 -0.32304018 -0.33262238  0.16704599
  -0.15308879  0.          0.          0.          1.          0.        ]
 [-0.05366467 -0.28859832 -0.29232615  0.65252943  0.61018118  0.14355214
   0.02901186  0.          1.          0.          0.          0.        ]]


In [None]:
# Show the first 5 labels of the resampled training target
print(y_train_resampled[0:5])

0    0
1    0
2    0
3    0
4    0
Name: isFraud, dtype: int64


In [None]:
# Class balance after SMOTE: number of rows per `isFraud` value in the
# resampled training labels.
print(y_train_resampled.value_counts())

isFraud
0    4448048
1    1779219
Name: count, dtype: int64
