In [40]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import numpy as np

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [41]:
df = pd.read_csv("D:\MY WORKSPACE\Project Competition\save_N_spend\Bank Transaction Data\dataset.csv")

In [42]:
df.head()

Unnamed: 0,Date,Balance Amount,Transaction_Amount,Type,Recipient,Category
0,2024-12-16,16822.55,242.54,DR,Zomato,Food & Dining
1,2024-12-16,16649.55,173.0,DR,ZOMATO,Food & Dining
2,2024-12-16,16619.55,30.0,DR,SHRI G,Friend/Family
3,2024-12-16,16639.55,20.0,CR,Master,Banking & Transfers
4,2024-12-17,16374.55,265.0,DR,SHREE,Friend/Family


### 1) null values

In [43]:
df.isnull().sum()

Date                  0
Balance Amount        0
Transaction_Amount    0
Type                  0
Recipient             0
Category              0
dtype: int64

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                272 non-null    object 
 1   Balance Amount      272 non-null    float64
 2   Transaction_Amount  272 non-null    float64
 3   Type                272 non-null    object 
 4   Recipient           272 non-null    object 
 5   Category            272 non-null    object 
dtypes: float64(2), object(4)
memory usage: 12.9+ KB


In [45]:
df.shape

(272, 6)

In [46]:
df.duplicated().sum()

np.int64(0)

In [47]:
#unique value of each column
df.nunique()

Date                   69
Balance Amount        264
Transaction_Amount    132
Type                    2
Recipient              99
Category               10
dtype: int64

In [48]:
df.describe()

Unnamed: 0,Balance Amount,Transaction_Amount
count,272.0,272.0
mean,15203.927426,430.150478
std,6802.467001,1986.068199
min,596.67,1.0
25%,10141.54,30.0
50%,15788.04,70.0
75%,19176.175,172.25
max,33088.66,25000.0


COnvert Date to DateTime

In [49]:
df['Date'] = pd.to_datetime(df['Date'])

In [50]:
df.head(2)

Unnamed: 0,Date,Balance Amount,Transaction_Amount,Type,Recipient,Category
0,2024-12-16,16822.55,242.54,DR,Zomato,Food & Dining
1,2024-12-16,16649.55,173.0,DR,ZOMATO,Food & Dining


# Feature Engineering

Extract features like Day of Week, Day of Month, Month, etc., to capture temporal patterns.

Monday=0
.
.
. 
Sunday=6

In [51]:
df['Day_of_Week'] = df['Date'].dt.dayofweek 
df['Day_of_Month'] = df['Date'].dt.day
df['Month'] = df['Date'].dt.month

In [52]:
df

Unnamed: 0,Date,Balance Amount,Transaction_Amount,Type,Recipient,Category,Day_of_Week,Day_of_Month,Month
0,2024-12-16,16822.55,242.54,DR,Zomato,Food & Dining,0,16,12
1,2024-12-16,16649.55,173.00,DR,ZOMATO,Food & Dining,0,16,12
2,2024-12-16,16619.55,30.00,DR,SHRI G,Friend/Family,0,16,12
3,2024-12-16,16639.55,20.00,CR,Master,Banking & Transfers,0,16,12
4,2024-12-17,16374.55,265.00,DR,SHREE,Friend/Family,1,17,12
...,...,...,...,...,...,...,...,...,...
267,2025-03-15,9104.04,1000.00,DR,Master,Friend/Family,5,15,3
268,2025-03-15,9046.16,57.88,DR,Unknown,Other,5,15,3
269,2025-03-15,8996.16,50.00,DR,Mr SAV,Other,5,15,3
270,2025-03-15,8926.16,70.00,DR,Kailas,Other,5,15,3


In [53]:
# df = df.drop(columns=['Date'])

2. 'Transaction Frequency' column

Transaction frequency refers to how often a user makes transactions. You can calculate this as the number of transactions per day 

In [54]:
df['Transaction_Count_Per_Day'] = df.groupby(['Recipient', 'Date'])['Transaction_Amount'].transform('count')

In [55]:
df

Unnamed: 0,Date,Balance Amount,Transaction_Amount,Type,Recipient,Category,Day_of_Week,Day_of_Month,Month,Transaction_Count_Per_Day
0,2024-12-16,16822.55,242.54,DR,Zomato,Food & Dining,0,16,12,1
1,2024-12-16,16649.55,173.00,DR,ZOMATO,Food & Dining,0,16,12,1
2,2024-12-16,16619.55,30.00,DR,SHRI G,Friend/Family,0,16,12,1
3,2024-12-16,16639.55,20.00,CR,Master,Banking & Transfers,0,16,12,1
4,2024-12-17,16374.55,265.00,DR,SHREE,Friend/Family,1,17,12,1
...,...,...,...,...,...,...,...,...,...,...
267,2025-03-15,9104.04,1000.00,DR,Master,Friend/Family,5,15,3,1
268,2025-03-15,9046.16,57.88,DR,Unknown,Other,5,15,3,1
269,2025-03-15,8996.16,50.00,DR,Mr SAV,Other,5,15,3,1
270,2025-03-15,8926.16,70.00,DR,Kailas,Other,5,15,3,1


In [56]:
df.shape

(272, 10)

3. Z-Score of Transaction_Amount

The Z-score helps measure how far a transaction deviates from the user's typical spending behavior.

In [57]:
# Compute rolling mean and standard deviation without storing them in the dataframe
rolling_mean = df['Transaction_Amount'].rolling(window=30, min_periods=1).mean()
rolling_std = df['Transaction_Amount'].rolling(window=30, min_periods=1).std()

# Compute Z-score and assign it directly to the dataframe
df['Z_Score_Transaction'] = (df['Transaction_Amount'] - rolling_mean) / (rolling_std + 1e-9)


In [58]:
df

Unnamed: 0,Date,Balance Amount,Transaction_Amount,Type,Recipient,Category,Day_of_Week,Day_of_Month,Month,Transaction_Count_Per_Day,Z_Score_Transaction
0,2024-12-16,16822.55,242.54,DR,Zomato,Food & Dining,0,16,12,1,
1,2024-12-16,16649.55,173.00,DR,ZOMATO,Food & Dining,0,16,12,1,-0.707107
2,2024-12-16,16619.55,30.00,DR,SHRI G,Friend/Family,0,16,12,1,-1.093648
3,2024-12-16,16639.55,20.00,CR,Master,Banking & Transfers,0,16,12,1,-0.881429
4,2024-12-17,16374.55,265.00,DR,SHREE,Friend/Family,1,17,12,1,1.027627
...,...,...,...,...,...,...,...,...,...,...,...
267,2025-03-15,9104.04,1000.00,DR,Master,Friend/Family,5,15,3,1,0.988596
268,2025-03-15,9046.16,57.88,DR,Unknown,Other,5,15,3,1,-0.377263
269,2025-03-15,8996.16,50.00,DR,Mr SAV,Other,5,15,3,1,-0.387616
270,2025-03-15,8926.16,70.00,DR,Kailas,Other,5,15,3,1,-0.360180


In [60]:
df.drop(columns=['Date'], inplace=True)

In [61]:
df.isnull().sum()

Balance Amount               0
Transaction_Amount           0
Type                         0
Recipient                    0
Category                     0
Day_of_Week                  0
Day_of_Month                 0
Month                        0
Transaction_Count_Per_Day    0
Z_Score_Transaction          1
dtype: int64

In [64]:
df.to_csv('training_dataset.csv', index=False)

In [None]:
[[10282.04 57.88 'DR' 'RELIANCE SMART BAZAAR' 'Food & Dining' 5 15 3 12
  1.7286338588186547]]

In [65]:
df.head(2)

Unnamed: 0,Balance Amount,Transaction_Amount,Type,Recipient,Category,Day_of_Week,Day_of_Month,Month,Transaction_Count_Per_Day,Z_Score_Transaction
0,16822.55,242.54,DR,Zomato,Food & Dining,0,16,12,1,
1,16649.55,173.0,DR,ZOMATO,Food & Dining,0,16,12,1,-0.707107


there is 1 missing value in Z_Score_Transaction, fill it with median

In [66]:
df['Z_Score_Transaction'].fillna(df['Z_Score_Transaction'].median(), inplace=True)

# label Encoding

In [67]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [68]:
label_encoders = {}
for col in ['Type', 'Recipient', 'Category']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [69]:
df

Unnamed: 0,Balance Amount,Transaction_Amount,Type,Recipient,Category,Day_of_Week,Day_of_Month,Month,Transaction_Count_Per_Day,Z_Score_Transaction
0,16822.55,242.54,1,98,3,0,16,12,1,-0.253590
1,16649.55,173.00,1,96,3,0,16,12,1,-0.707107
2,16619.55,30.00,1,79,4,0,16,12,1,-1.093648
3,16639.55,20.00,0,38,1,0,16,12,1,-0.881429
4,16374.55,265.00,1,78,4,1,17,12,1,1.027627
...,...,...,...,...,...,...,...,...,...,...
267,9104.04,1000.00,1,38,4,5,15,3,1,0.988596
268,9046.16,57.88,1,85,0,5,15,3,1,-0.377263
269,8996.16,50.00,1,47,0,5,15,3,1,-0.387616
270,8926.16,70.00,1,33,0,5,15,3,1,-0.360180


# Standardize numerical features

In [70]:
scaler = StandardScaler()
num_cols = ['Balance Amount', 'Transaction_Amount', 'Day_of_Week', 'Day_of_Month',
            'Month', 'Transaction_Count_Per_Day', 'Z_Score_Transaction']
df[num_cols] = scaler.fit_transform(df[num_cols])

# Confirm preprocessing
df.head()

Unnamed: 0,Balance Amount,Transaction_Amount,Type,Recipient,Category,Day_of_Week,Day_of_Month,Month,Transaction_Count_Per_Day,Z_Score_Transaction
0,0.238385,-0.094637,1,98,3,-1.553932,0.136532,2.339549,-0.432472,-0.26239
1,0.212906,-0.129716,1,96,3,-1.553932,0.136532,2.339549,-0.432472,-0.678847
2,0.208488,-0.20185,1,79,4,-1.553932,0.136532,2.339549,-0.432472,-1.033801
3,0.211433,-0.206894,0,38,1,-1.553932,0.136532,2.339549,-0.432472,-0.838925
4,0.172405,-0.083308,1,78,4,-1.040983,0.254802,2.339549,-0.432472,0.91413


# Train isolation forest

In [71]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import GridSearchCV

In [72]:
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_samples": ["auto", 0.5, 0.75],
    "contamination": ["auto", 0.05, 0.1],
    "bootstrap": [True, False],
}

In [73]:
isolation_forest = IsolationForest(random_state=42)

In [74]:
grid_search = GridSearchCV(isolation_forest, param_grid, scoring="accuracy", cv=5, n_jobs=-1)
grid_search.fit(df)

In [75]:
# Best parameters and score
grid_search.best_params_, grid_search.best_score_

({'bootstrap': True,
  'contamination': 'auto',
  'max_samples': 'auto',
  'n_estimators': 50},
 np.float64(nan))

# Training model on best params

In [76]:
best_params = {'bootstrap': True, 'contamination': 'auto', 'max_samples': 'auto', 'n_estimators': 50}
isolation_forest = IsolationForest(**best_params, random_state=42)
isolation_forest.fit(df)

In [81]:
df2 = isolation_forest.predict(df)

In [84]:
# Define default category
default_category = 'Unknown'

# Ensure the default category exists in LabelEncoders
for col in ['Type', 'Recipient', 'Category']:
    if default_category not in label_encoders[col].classes_:
        label_encoders[col].classes_ = np.append(label_encoders[col].classes_, default_category)

# Define user input
user_input = np.array([[10282.04, 57.88, 'DR', 'RELIANCE SMART BAZAAR', 'Food & Dining', 5, 15, 3, 12, 1.7286338588186547]], dtype=object)

# Encode categorical values, replacing unseen labels with the default category
for i, col in enumerate(['Type', 'Recipient', 'Category']):
    value = user_input[0, i + 2]
    if value not in label_encoders[col].classes_:
        value = default_category  # Assign the default category
    user_input[0, i + 2] = label_encoders[col].transform([value])[0]

# Convert to DataFrame and standardize numerical features
user_df = pd.DataFrame(user_input, columns=['Balance Amount', 'Transaction_Amount', 'Type', 'Recipient', 'Category',
                                            'Day_of_Week', 'Day_of_Month', 'Month', 'Transaction_Count_Per_Day', 'Z_Score_Transaction'])

user_df[num_cols] = scaler.transform(user_df[num_cols])

# Predict anomaly status
prediction = isolation_forest.predict(user_df)
prediction[0]


np.int64(-1)