In [1]:
import pandas as pd

# Read the processed synthetic transactions into a DataFrame
file_path = "/home/nazar/Muhammad Raees Azam/jazzcash/notebooks/SyntheticProcessed.csv"
df = pd.read_csv(file_path)

# Print a labelled overview: column names, dtypes, first rows, summary statistics
overview = (
    ("Columns in Dataset:\n", df.columns.tolist()),
    ("\n Data Types:\n", df.dtypes),
    ("\n First 5 Rows:\n", df.head()),
    ("\n Summary Statistics:\n", df.describe(include='all')),
)
for heading, content in overview:
    print(heading, content)


Columns in Dataset:
 ['Type', 'Amount', 'ID Source', 'Old Balance', 'New Balance', 'State Source', 'City Source', 'Device Name', 'IMEI', 'KYC Status', 'Service Charges', 'Channel', 'ID Dest', 'State Dest', 'City Dest', 'Is Fraud', 'year', 'month', 'day', 'hour', 'minute', 'second', 'weekday', 'day_of_year']

 Data Types:
 Type                 int64
Amount             float64
ID Source            int64
Old Balance        float64
New Balance        float64
State Source         int64
City Source          int64
Device Name          int64
IMEI                 int64
KYC Status           int64
Service Charges    float64
Channel              int64
ID Dest              int64
State Dest           int64
City Dest            int64
Is Fraud             int64
year                 int64
month                int64
day                  int64
hour                 int64
minute               int64
second               int64
weekday              int64
day_of_year          int64
dtype: object

 First 5 Rows

In [2]:
import pandas as pd

# Re-read the CSV so this cell starts from the file contents
df = pd.read_csv("/home/nazar/Muhammad Raees Azam/jazzcash/notebooks/SyntheticProcessed.csv")

# Assemble one datetime per row from the split date/time columns
datetime_parts = ['year', 'month', 'day', 'hour', 'minute', 'second']
df['timestamp'] = pd.to_datetime(df[datetime_parts])

# Order rows by sender, then chronologically, for the per-user rules that follow
df = df.sort_values(by=['ID Source', 'timestamp'])


In [3]:
# Rule 1: n transactions within m minutes (e.g., 3 txns in 5 minutes)
#
# A transaction is flagged (1) when it is at least the n-th transaction of the
# same 'ID Source' inside a trailing window of m minutes (boundary inclusive).
# With each user's rows in chronological order, that is true exactly when the
# row n-1 positions earlier for the same user is at most m minutes older, so a
# grouped shift replaces a per-user Python loop.

n, m = 3, 5  # example: 3 transactions in 5 minutes

# Sort here rather than trusting df's current physical row order. The result is
# written back by index label, so df itself is not reordered and a differently
# ordered df cannot receive another row's flag. Requires unique index labels.
ordered = df.sort_values(by=['ID Source', 'timestamp'])
nth_previous = ordered.groupby('ID Source')['timestamp'].shift(n - 1)

# Rows without n-1 predecessors get NaT; the comparison is then False.
within_window = (ordered['timestamp'] - nth_previous) <= pd.Timedelta(minutes=m)

df['flag_rapid_txns'] = within_window.astype('int64')


In [4]:
# Inspect the frame after Rule 1; the rendered table now ends with flag_rapid_txns.
df

Unnamed: 0,Type,Amount,ID Source,Old Balance,New Balance,State Source,City Source,Device Name,IMEI,KYC Status,...,year,month,day,hour,minute,second,weekday,day_of_year,timestamp,flag_rapid_txns
3196942,1,244486.46,0,8946.00,0.00,3,18,0,927866134346668,0,...,2019,9,14,17,14,8,5,257,2019-09-14 17:14:08,0
2798983,3,3170.28,1,58089.00,54918.72,3,18,0,523023208809408,2,...,2021,10,12,12,54,18,1,285,2021-10-12 12:54:18,0
1013585,3,8424.74,2,783.00,0.00,1,50,4,826769516434938,2,...,2020,2,19,3,18,31,2,50,2020-02-19 03:18:31,0
2980283,0,261877.19,3,7596.00,269473.19,1,37,3,808816955619213,1,...,2020,12,3,4,11,27,3,338,2020-12-03 04:11:27,0
1920204,0,20528.65,4,2302074.12,2322602.77,2,13,3,46281521129530,3,...,2024,8,17,1,53,24,5,230,2024-08-17 01:53:24,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5467102,3,26585.43,6353302,0.00,0.00,6,30,1,358722602404200,0,...,2025,4,9,16,42,41,2,99,2025-04-09 16:42:41,0
323336,0,37516.21,6353303,18820782.28,18858298.49,5,25,4,955144871951531,3,...,2021,6,4,17,18,1,4,155,2021-06-04 17:18:01,0
768108,1,244962.21,6353304,22027.00,0.00,6,30,1,270348759515170,3,...,2020,2,19,17,43,10,2,50,2020-02-19 17:43:10,0
2310367,3,15714.59,6353305,0.00,0.00,6,48,3,376257479668904,3,...,2025,12,7,10,18,22,6,341,2025-12-07 10:18:22,0


In [5]:
# Rule 2: High-risk cities
# A transaction is flagged when its source city or its destination city is on the list.

# Example: city codes 101, 202 are high-risk
high_risk_cities = [101, 202]

source_is_risky = df['City Source'].isin(high_risk_cities)
dest_is_risky = df['City Dest'].isin(high_risk_cities)
df['flag_high_risk_city'] = source_is_risky | dest_is_risky


In [6]:
# Inspect the frame after Rule 2; flag_high_risk_city is the newest column.
df

Unnamed: 0,Type,Amount,ID Source,Old Balance,New Balance,State Source,City Source,Device Name,IMEI,KYC Status,...,month,day,hour,minute,second,weekday,day_of_year,timestamp,flag_rapid_txns,flag_high_risk_city
3196942,1,244486.46,0,8946.00,0.00,3,18,0,927866134346668,0,...,9,14,17,14,8,5,257,2019-09-14 17:14:08,0,False
2798983,3,3170.28,1,58089.00,54918.72,3,18,0,523023208809408,2,...,10,12,12,54,18,1,285,2021-10-12 12:54:18,0,False
1013585,3,8424.74,2,783.00,0.00,1,50,4,826769516434938,2,...,2,19,3,18,31,2,50,2020-02-19 03:18:31,0,False
2980283,0,261877.19,3,7596.00,269473.19,1,37,3,808816955619213,1,...,12,3,4,11,27,3,338,2020-12-03 04:11:27,0,False
1920204,0,20528.65,4,2302074.12,2322602.77,2,13,3,46281521129530,3,...,8,17,1,53,24,5,230,2024-08-17 01:53:24,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5467102,3,26585.43,6353302,0.00,0.00,6,30,1,358722602404200,0,...,4,9,16,42,41,2,99,2025-04-09 16:42:41,0,False
323336,0,37516.21,6353303,18820782.28,18858298.49,5,25,4,955144871951531,3,...,6,4,17,18,1,4,155,2021-06-04 17:18:01,0,False
768108,1,244962.21,6353304,22027.00,0.00,6,30,1,270348759515170,3,...,2,19,17,43,10,2,50,2020-02-19 17:43:10,0,False
2310367,3,15714.59,6353305,0.00,0.00,6,48,3,376257479668904,3,...,12,7,10,18,22,6,341,2025-12-07 10:18:22,0,False


In [7]:
# Rule 3: High-risk region and odd time (e.g., 12 AM - 5 AM)
# Flag transactions whose source state is on the high-risk list AND whose hour
# falls inside the odd-hours window.

high_risk_states = [10, 12]  # Example state codes

# Half-open window [ODD_HOUR_START, ODD_HOUR_END). 'hour' is the whole-hour
# component, so hour == 5 covers 05:00-05:59, which is past 5 AM; an inclusive
# upper bound would therefore stretch the window to almost 6 AM.
ODD_HOUR_START, ODD_HOUR_END = 0, 5
in_odd_hours = (df['hour'] >= ODD_HOUR_START) & (df['hour'] < ODD_HOUR_END)

df['flag_risky_time_region'] = df['State Source'].isin(high_risk_states) & in_odd_hours


In [None]:
# Inspect the frame after Rule 3; flag_risky_time_region is the newest column.
df

Unnamed: 0,Type,Amount,ID Source,Old Balance,New Balance,State Source,City Source,Device Name,IMEI,KYC Status,...,day,hour,minute,second,weekday,day_of_year,timestamp,flag_rapid_txns,flag_high_risk_city,flag_risky_time_region
3196942,1,244486.46,0,8946.00,0.00,3,18,0,927866134346668,0,...,14,17,14,8,5,257,2019-09-14 17:14:08,0,False,False
2798983,3,3170.28,1,58089.00,54918.72,3,18,0,523023208809408,2,...,12,12,54,18,1,285,2021-10-12 12:54:18,0,False,False
1013585,3,8424.74,2,783.00,0.00,1,50,4,826769516434938,2,...,19,3,18,31,2,50,2020-02-19 03:18:31,0,False,False
2980283,0,261877.19,3,7596.00,269473.19,1,37,3,808816955619213,1,...,3,4,11,27,3,338,2020-12-03 04:11:27,0,False,False
1920204,0,20528.65,4,2302074.12,2322602.77,2,13,3,46281521129530,3,...,17,1,53,24,5,230,2024-08-17 01:53:24,0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5467102,3,26585.43,6353302,0.00,0.00,6,30,1,358722602404200,0,...,9,16,42,41,2,99,2025-04-09 16:42:41,0,False,False
323336,0,37516.21,6353303,18820782.28,18858298.49,5,25,4,955144871951531,3,...,4,17,18,1,4,155,2021-06-04 17:18:01,0,False,False
768108,1,244962.21,6353304,22027.00,0.00,6,30,1,270348759515170,3,...,19,17,43,10,2,50,2020-02-19 17:43:10,0,False,False
2310367,3,15714.59,6353305,0.00,0.00,6,48,3,376257479668904,3,...,7,10,18,22,6,341,2025-12-07 10:18:22,0,False,False


In [9]:
# Rule 4: Newly created receiving account and large amount
# Working assumption: a destination ID above the cutoff (e.g., recent IDs > 10000)
# stands in for "newly created account".

new_account_id_cutoff = 10000  # IDs strictly above this are treated as new
amount_threshold = 100000  # Example threshold

dest_looks_new = df['ID Dest'] > new_account_id_cutoff
amount_is_large = df['Amount'] > amount_threshold
df['flag_new_acc_large_amt'] = dest_looks_new & amount_is_large


In [10]:
# Inspect the frame after Rule 4; flag_new_acc_large_amt is the newest column.
df

Unnamed: 0,Type,Amount,ID Source,Old Balance,New Balance,State Source,City Source,Device Name,IMEI,KYC Status,...,hour,minute,second,weekday,day_of_year,timestamp,flag_rapid_txns,flag_high_risk_city,flag_risky_time_region,flag_new_acc_large_amt
3196942,1,244486.46,0,8946.00,0.00,3,18,0,927866134346668,0,...,17,14,8,5,257,2019-09-14 17:14:08,0,False,False,True
2798983,3,3170.28,1,58089.00,54918.72,3,18,0,523023208809408,2,...,12,54,18,1,285,2021-10-12 12:54:18,0,False,False,False
1013585,3,8424.74,2,783.00,0.00,1,50,4,826769516434938,2,...,3,18,31,2,50,2020-02-19 03:18:31,0,False,False,False
2980283,0,261877.19,3,7596.00,269473.19,1,37,3,808816955619213,1,...,4,11,27,3,338,2020-12-03 04:11:27,0,False,False,True
1920204,0,20528.65,4,2302074.12,2322602.77,2,13,3,46281521129530,3,...,1,53,24,5,230,2024-08-17 01:53:24,0,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5467102,3,26585.43,6353302,0.00,0.00,6,30,1,358722602404200,0,...,16,42,41,2,99,2025-04-09 16:42:41,0,False,False,False
323336,0,37516.21,6353303,18820782.28,18858298.49,5,25,4,955144871951531,3,...,17,18,1,4,155,2021-06-04 17:18:01,0,False,False,False
768108,1,244962.21,6353304,22027.00,0.00,6,30,1,270348759515170,3,...,17,43,10,2,50,2020-02-19 17:43:10,0,False,False,True
2310367,3,15714.59,6353305,0.00,0.00,6,48,3,376257479668904,3,...,10,18,22,6,341,2025-12-07 10:18:22,0,False,False,False


In [11]:
# Rule 5: Amount > mean + 3 * std for the user
#
# The per-sender mean and sample standard deviation of 'Amount' (computed over
# all of that sender's rows, the current one included) are broadcast to each row
# with groupby().transform(). Plain column assignment keeps this cell safe to
# re-run: a merge would add suffixed duplicates ('mean_x', 'mean_y', ...) the
# second time and the df['mean'] lookup would then fail.

# Renumber rows 0..N-1 in their current order, as a column merge would.
df = df.reset_index(drop=True)

amount_by_user = df.groupby('ID Source')['Amount']
df['mean'] = amount_by_user.transform('mean')
df['std'] = amount_by_user.transform('std')  # NaN for a user with a single transaction

# A comparison against NaN is False, so single-transaction users are never flagged.
df['flag_high_amount'] = df['Amount'] > (df['mean'] + 3 * df['std'])



In [12]:
# Inspect the frame after Rule 5; mean, std and flag_high_amount are the newest columns.
df

Unnamed: 0,Type,Amount,ID Source,Old Balance,New Balance,State Source,City Source,Device Name,IMEI,KYC Status,...,weekday,day_of_year,timestamp,flag_rapid_txns,flag_high_risk_city,flag_risky_time_region,flag_new_acc_large_amt,mean,std,flag_high_amount
0,1,244486.46,0,8946.00,0.00,3,18,0,927866134346668,0,...,5,257,2019-09-14 17:14:08,0,False,False,True,244486.46,,False
1,3,3170.28,1,58089.00,54918.72,3,18,0,523023208809408,2,...,1,285,2021-10-12 12:54:18,0,False,False,False,3170.28,,False
2,3,8424.74,2,783.00,0.00,1,50,4,826769516434938,2,...,2,50,2020-02-19 03:18:31,0,False,False,False,8424.74,,False
3,0,261877.19,3,7596.00,269473.19,1,37,3,808816955619213,1,...,3,338,2020-12-03 04:11:27,0,False,False,True,261877.19,,False
4,0,20528.65,4,2302074.12,2322602.77,2,13,3,46281521129530,3,...,5,230,2024-08-17 01:53:24,0,False,False,False,20528.65,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,3,26585.43,6353302,0.00,0.00,6,30,1,358722602404200,0,...,2,99,2025-04-09 16:42:41,0,False,False,False,26585.43,,False
6362616,0,37516.21,6353303,18820782.28,18858298.49,5,25,4,955144871951531,3,...,4,155,2021-06-04 17:18:01,0,False,False,False,37516.21,,False
6362617,1,244962.21,6353304,22027.00,0.00,6,30,1,270348759515170,3,...,2,50,2020-02-19 17:43:10,0,False,False,True,244962.21,,False
6362618,3,15714.59,6353305,0.00,0.00,6,48,3,376257479668904,3,...,6,341,2025-12-07 10:18:22,0,False,False,False,15714.59,,False


In [13]:
# Rule 6: Time outside user's 99% activity window
# The 0.5th and 99.5th percentiles of 'hour' per 'ID Source' bound each user's
# usual activity range; a transaction outside [lower_bound, upper_bound] is flagged.

# One grouped quantile call yields both bounds; unstack turns them into two columns
quantiles = df.groupby('ID Source')['hour'].quantile([0.005, 0.995]).unstack()
quantiles.columns = ['lower_bound', 'upper_bound']

# Look the bounds up per row by 'ID Source'. Unlike a merge, plain column
# assignment overwrites on re-run instead of creating suffixed duplicates
# ('lower_bound_x', ...) that would break the lookups below.
df['lower_bound'] = df['ID Source'].map(quantiles['lower_bound'])
df['upper_bound'] = df['ID Source'].map(quantiles['upper_bound'])

# between() is inclusive on both ends, so a user with a single transaction
# (lower_bound == upper_bound == that hour) is never flagged.
df['flag_time_outlier'] = ~df['hour'].between(df['lower_bound'], df['upper_bound'])


In [14]:
# Inspect the frame after Rule 6; lower_bound, upper_bound and flag_time_outlier are the newest columns.
df

Unnamed: 0,Type,Amount,ID Source,Old Balance,New Balance,State Source,City Source,Device Name,IMEI,KYC Status,...,flag_rapid_txns,flag_high_risk_city,flag_risky_time_region,flag_new_acc_large_amt,mean,std,flag_high_amount,lower_bound,upper_bound,flag_time_outlier
0,1,244486.46,0,8946.00,0.00,3,18,0,927866134346668,0,...,0,False,False,True,244486.46,,False,17.0,17.0,False
1,3,3170.28,1,58089.00,54918.72,3,18,0,523023208809408,2,...,0,False,False,False,3170.28,,False,12.0,12.0,False
2,3,8424.74,2,783.00,0.00,1,50,4,826769516434938,2,...,0,False,False,False,8424.74,,False,3.0,3.0,False
3,0,261877.19,3,7596.00,269473.19,1,37,3,808816955619213,1,...,0,False,False,True,261877.19,,False,4.0,4.0,False
4,0,20528.65,4,2302074.12,2322602.77,2,13,3,46281521129530,3,...,0,False,False,False,20528.65,,False,1.0,1.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,3,26585.43,6353302,0.00,0.00,6,30,1,358722602404200,0,...,0,False,False,False,26585.43,,False,16.0,16.0,False
6362616,0,37516.21,6353303,18820782.28,18858298.49,5,25,4,955144871951531,3,...,0,False,False,False,37516.21,,False,17.0,17.0,False
6362617,1,244962.21,6353304,22027.00,0.00,6,30,1,270348759515170,3,...,0,False,False,True,244962.21,,False,17.0,17.0,False
6362618,3,15714.59,6353305,0.00,0.00,6,48,3,376257479668904,3,...,0,False,False,False,15714.59,,False,10.0,10.0,False


In [3]:
# ---------------------------------------------
# Rule 7: Flag Based on Sudden Distant Activity
# ---------------------------------------------
# 🚩 Rule 7:
# Flag a transaction if:
# - The distance between the user's last two transactions exceeds a defined threshold
# - AND the time difference between those transactions is less than `m` minutes

# ✅ What We Need:
# - Latitude and longitude of each transaction (currently not in dataset)
# - OR city codes mapped to coordinates (can simulate with a dictionary)
# - Timestamp for each transaction (can create from 'year', 'month', 'day', 'hour', 'minute', 'second' columns)

# 🎯 Thresholds (to define):
# - `MAX_DISTANCE_KM`: Maximum allowed distance between two transactions
# - `MIN_TIME_MINUTES`: Minimum allowed time between two transactions

# 🧠 Since We Don't Have lat/lon Right Now:
# - We can simulate distances based on city codes or names
# - OR skip the distance check and only apply time-based session logic

# 🧪 Simulated Strategy:
# - Use a city-to-coordinates dictionary (`city_coords`) if available
# - Calculate distance between coordinates (using haversine formula)
# - Calculate time difference between transactions
# - Flag if: distance > MAX_DISTANCE_KM and time_diff < MIN_TIME_MINUTES

# 🔧 You can later plug in actual lat/lon data (from city mappings or external sources) if needed

# Example coordinates (for simulation):
# city_coords = {
#     18: (31.5497, 74.3436),   # Lahore
#     25: (33.6844, 73.0479),   # Islamabad
#     ...
# }

# 🧭 Final Step:
# Loop over transactions grouped by user (`ID Source`),
# calculate distance and time difference between consecutive transactions,
# and flag those that break the rule as `flag_distance_time = True`

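# Note: The implementation in the following cells checks both the distance and the time
# condition, but only for transactions whose current and previous city both have an entry
# in `city_coords`; every other transaction is left unflagged.
# Extend `city_coords` (or supply lat/lon per transaction) to widen the coverage.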


In [15]:
# City name -> coordinate pair used by the Rule 7 distance check.
# NOTE(review): the dtypes listing shows 'City Source' as int64, while these
# keys are city names, so lookups by the column's values will not match until a
# code-to-name mapping is applied. Confirm the intended key type.
city_coords = dict(
    Lahore=(31.5497, 74.3436),
    Karachi=(24.8607, 67.0011),
    Islamabad=(33.6844, 73.0479),
    Peshawar=(34.0151, 71.5249),
    Quetta=(30.1798, 66.9750),
    Multan=(30.1575, 71.5249),
    Faisalabad=(31.4504, 73.1350),
    Rawalpindi=(33.5651, 73.0169),
    Hyderabad=(25.3960, 68.3578),
    Sukkur=(27.7052, 68.8574),
    # Add remaining cities...
)


In [17]:
pip install geopy

Collecting geopy
  Downloading geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Downloading geographiclib-2.0-py3-none-any.whl.metadata (1.4 kB)
Downloading geopy-2.4.1-py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.4/125.4 kB[0m [31m592.7 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading geographiclib-2.0-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.4.1
Note: you may need to restart the kernel to use updated packages.


In [18]:
import warnings

import pandas as pd
from geopy.distance import geodesic

# Rule 7: flag a transaction when the same user's previous transaction came from
# a city more than DIST_THRESHOLD km away less than TIME_THRESHOLD minutes earlier.
# Expects 'df' and the 'city_coords' dictionary to be defined already.

# Define thresholds
DIST_THRESHOLD = 200  # in kilometers
TIME_THRESHOLD = 10   # in minutes

# (Re)build the timestamp column from its date/time components
df['timestamp'] = pd.to_datetime(df[['year', 'month', 'day', 'hour', 'minute', 'second']])

# Sort by user and timestamp so "previous" means chronologically previous
df.sort_values(by=['ID Source', 'timestamp'], inplace=True)

# Only cities with usable (non-empty) coordinates can take part in the check
known_cities = [city for city, coords in city_coords.items() if coords]
curr_known = df['City Source'].isin(known_cities)
if not curr_known.any():
    # Without this, a key mismatch (e.g. names vs. numeric codes) passes silently.
    warnings.warn(
        "No 'City Source' value has an entry in city_coords; "
        "flag_distance_time will be False for every row. "
        "Check that the dictionary keys use the same encoding as the column."
    )

# Previous city / elapsed minutes per user, computed column-wise
by_user = df.groupby('ID Source')
prev_city = by_user['City Source'].shift()
minutes_since_prev = (df['timestamp'] - by_user['timestamp'].shift()).dt.total_seconds() / 60.0

# Rows that can possibly be flagged: both cities known and the gap short enough.
# A user's first row has no predecessor (NaN), so it drops out here.
candidates = curr_known & prev_city.isin(known_cities) & (minutes_since_prev < TIME_THRESHOLD)
prev_of_candidates = prev_city[candidates]
curr_of_candidates = df.loc[candidates, 'City Source']

# Compute each distinct city-pair distance once instead of once per row
pair_km = {
    pair: geodesic(city_coords[pair[0]], city_coords[pair[1]]).kilometers
    for pair in set(zip(prev_of_candidates, curr_of_candidates))
}

flagged_labels = [
    label
    for label, prev, curr in zip(prev_of_candidates.index, prev_of_candidates, curr_of_candidates)
    if pair_km[(prev, curr)] > DIST_THRESHOLD
]

# Boolean column, True only for the rows identified above
df['flag_distance_time'] = df.index.isin(flagged_labels)


In [19]:
# Inspect the frame after Rule 7; flag_distance_time is the newest column.
df

Unnamed: 0,Type,Amount,ID Source,Old Balance,New Balance,State Source,City Source,Device Name,IMEI,KYC Status,...,flag_high_risk_city,flag_risky_time_region,flag_new_acc_large_amt,mean,std,flag_high_amount,lower_bound,upper_bound,flag_time_outlier,flag_distance_time
0,1,244486.46,0,8946.00,0.00,3,18,0,927866134346668,0,...,False,False,True,244486.46,,False,17.0,17.0,False,False
1,3,3170.28,1,58089.00,54918.72,3,18,0,523023208809408,2,...,False,False,False,3170.28,,False,12.0,12.0,False,False
2,3,8424.74,2,783.00,0.00,1,50,4,826769516434938,2,...,False,False,False,8424.74,,False,3.0,3.0,False,False
3,0,261877.19,3,7596.00,269473.19,1,37,3,808816955619213,1,...,False,False,True,261877.19,,False,4.0,4.0,False,False
4,0,20528.65,4,2302074.12,2322602.77,2,13,3,46281521129530,3,...,False,False,False,20528.65,,False,1.0,1.0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,3,26585.43,6353302,0.00,0.00,6,30,1,358722602404200,0,...,False,False,False,26585.43,,False,16.0,16.0,False,False
6362616,0,37516.21,6353303,18820782.28,18858298.49,5,25,4,955144871951531,3,...,False,False,False,37516.21,,False,17.0,17.0,False,False
6362617,1,244962.21,6353304,22027.00,0.00,6,30,1,270348759515170,3,...,False,False,True,244962.21,,False,17.0,17.0,False,False
6362618,3,15714.59,6353305,0.00,0.00,6,48,3,376257479668904,3,...,False,False,False,15714.59,,False,10.0,10.0,False,False


In [20]:
# Rule 8: Transactions per session vs. user median
# Approximate sessions by grouping transactions less than 15 mins apart.
# A transaction is flagged when its session holds more transactions than the
# median session of the same user.
# The shift below takes the physically previous row of each user as the previous
# transaction, so rows must already be in per-user chronological order.

# Renumber rows 0..N-1 in their current order, as a column merge would.
df = df.reset_index(drop=True)

df['prev_time'] = df.groupby('ID Source')['timestamp'].shift()
df['time_diff'] = (df['timestamp'] - df['prev_time']).dt.total_seconds() / 60.0

# A session starts on a user's first row (no previous time) or after a gap > 15 minutes
df['new_session'] = (df['time_diff'] > 15) | (df['time_diff'].isna())
df['session_id'] = df.groupby('ID Source')['new_session'].cumsum()

session_counts = df.groupby(['ID Source', 'session_id']).size().reset_index(name='session_txn_count')
median_txn_per_session = session_counts.groupby('ID Source')['session_txn_count'].median().reset_index(name='median_count')

# Attach both statistics by assignment rather than merge, so re-running the cell
# overwrites the columns instead of creating '_x' / '_y' duplicates.
df['session_txn_count'] = df.groupby(['ID Source', 'session_id'])['session_id'].transform('size')
df['median_count'] = df['ID Source'].map(median_txn_per_session.set_index('ID Source')['median_count'])

df['flag_heavy_session'] = df['session_txn_count'] > df['median_count']


In [21]:
# Inspect the frame after Rule 8; the session columns and flag_heavy_session are the newest columns.
df

Unnamed: 0,Type,Amount,ID Source,Old Balance,New Balance,State Source,City Source,Device Name,IMEI,KYC Status,...,upper_bound,flag_time_outlier,flag_distance_time,prev_time,time_diff,new_session,session_id,session_txn_count,median_count,flag_heavy_session
0,1,244486.46,0,8946.00,0.00,3,18,0,927866134346668,0,...,17.0,False,False,NaT,,True,1,1,1.0,False
1,3,3170.28,1,58089.00,54918.72,3,18,0,523023208809408,2,...,12.0,False,False,NaT,,True,1,1,1.0,False
2,3,8424.74,2,783.00,0.00,1,50,4,826769516434938,2,...,3.0,False,False,NaT,,True,1,1,1.0,False
3,0,261877.19,3,7596.00,269473.19,1,37,3,808816955619213,1,...,4.0,False,False,NaT,,True,1,1,1.0,False
4,0,20528.65,4,2302074.12,2322602.77,2,13,3,46281521129530,3,...,1.0,False,False,NaT,,True,1,1,1.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,3,26585.43,6353302,0.00,0.00,6,30,1,358722602404200,0,...,16.0,False,False,NaT,,True,1,1,1.0,False
6362616,0,37516.21,6353303,18820782.28,18858298.49,5,25,4,955144871951531,3,...,17.0,False,False,NaT,,True,1,1,1.0,False
6362617,1,244962.21,6353304,22027.00,0.00,6,30,1,270348759515170,3,...,17.0,False,False,NaT,,True,1,1,1.0,False
6362618,3,15714.59,6353305,0.00,0.00,6,48,3,376257479668904,3,...,10.0,False,False,NaT,,True,1,1,1.0,False


In [22]:
# Final Step: Combine all flags
# fraud_flag_combined is 1 when at least one rule flag is set on the row, else 0.

# 'flag_sum' carries the 'flag_' prefix but is a count derived from the rule
# flags, not a rule itself. Excluding it keeps the selection stable when these
# cells are re-run after that column has been added.
DERIVED_FLAG_COLS = {'flag_sum'}
flag_cols = [
    col for col in df.columns
    if col.startswith('flag_') and col not in DERIVED_FLAG_COLS
]
df['fraud_flag_combined'] = df[flag_cols].any(axis=1).astype(int)

print(f"Total flagged transactions: {df['fraud_flag_combined'].sum()}")


Total flagged transactions: 2797361


In [23]:
# How many rows each rule flagged, largest first
per_rule_totals = df.loc[:, flag_cols].sum()
per_rule_totals.sort_values(ascending=False)


flag_new_acc_large_amt    2787291
flag_time_outlier           17812
flag_high_risk_city             0
flag_rapid_txns                 0
flag_risky_time_region          0
flag_high_amount                0
flag_distance_time              0
flag_heavy_session              0
dtype: int64

In [24]:
# Number of rule flags set on each transaction, then the distribution of that count
df['flag_sum'] = df.loc[:, flag_cols].sum(axis='columns')
df.loc[:, 'flag_sum'].value_counts()


flag_sum
0    3565259
1    2789619
2       7742
Name: count, dtype: int64

In [25]:
# Write only the rows where at least one rule fired; the row index is not written
is_flagged = df['fraud_flag_combined'].eq(1)
df.loc[is_flagged].to_csv('flagged_transactions.csv', index=False)


In [26]:
# Inspect the final frame; fraud_flag_combined and flag_sum are the newest columns.
df

Unnamed: 0,Type,Amount,ID Source,Old Balance,New Balance,State Source,City Source,Device Name,IMEI,KYC Status,...,flag_distance_time,prev_time,time_diff,new_session,session_id,session_txn_count,median_count,flag_heavy_session,fraud_flag_combined,flag_sum
0,1,244486.46,0,8946.00,0.00,3,18,0,927866134346668,0,...,False,NaT,,True,1,1,1.0,False,1,1
1,3,3170.28,1,58089.00,54918.72,3,18,0,523023208809408,2,...,False,NaT,,True,1,1,1.0,False,0,0
2,3,8424.74,2,783.00,0.00,1,50,4,826769516434938,2,...,False,NaT,,True,1,1,1.0,False,0,0
3,0,261877.19,3,7596.00,269473.19,1,37,3,808816955619213,1,...,False,NaT,,True,1,1,1.0,False,1,1
4,0,20528.65,4,2302074.12,2322602.77,2,13,3,46281521129530,3,...,False,NaT,,True,1,1,1.0,False,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,3,26585.43,6353302,0.00,0.00,6,30,1,358722602404200,0,...,False,NaT,,True,1,1,1.0,False,0,0
6362616,0,37516.21,6353303,18820782.28,18858298.49,5,25,4,955144871951531,3,...,False,NaT,,True,1,1,1.0,False,0,0
6362617,1,244962.21,6353304,22027.00,0.00,6,30,1,270348759515170,3,...,False,NaT,,True,1,1,1.0,False,1,1
6362618,3,15714.59,6353305,0.00,0.00,6,48,3,376257479668904,3,...,False,NaT,,True,1,1,1.0,False,0,0
