In [66]:
from kafka import KafkaConsumer
import json
import pandas as pd

In [67]:

consumer = KafkaConsumer(
    'bank-transactions-ml',
    bootstrap_servers='localhost:9092',
    value_deserializer=lambda m: json.loads(m.decode('utf-8')),
    auto_offset_reset='earliest',
    enable_auto_commit=True,
    consumer_timeout_ms=5000  # <-- stop after 5s if no messages
)

data = []

print("📥 Collecting training data from Kafka...")
for message in consumer:
    txn = message.value
    data.append(txn)
    
    if len(data) >= 1000:
        break

consumer.close()

df = pd.DataFrame(data)
print(df.head())

📥 Collecting training data from Kafka...
                          transactionId accountId    type        amount  \
0  72255aab-2756-4ee6-9778-0d645b08a196   ACC4964  CREDIT  33711.676386   
1  147025d8-11a2-4f5a-bbac-3805b0f89343   ACC9706  CREDIT   4257.083251   
2  afe944fe-c8c6-40e9-989f-bbf6cf1ebafb   ACC2635  CREDIT  43616.042779   
3  ea0f4a14-fab6-4210-bfff-675c46e23081   ACC6301  CREDIT  14224.955741   
4  6da4b388-59ec-4d7f-87c5-9e3536e47ee5   ACC3343  CREDIT  21679.047254   

      timestamp     location                      email  
0  1.756754e+09       London  arohirajput3850@gmail.com  
1  1.755947e+09     New York  vivekrajput8244@gmail.com  
2  1.755673e+09       Mumbai  arohirajput3850@gmail.com  
3  1.756121e+09        Paris     vrrajput1720@gmail.com  
4  1.756883e+09  Pune, India  vivekrajput8244@gmail.com  


In [68]:
df

Unnamed: 0,transactionId,accountId,type,amount,timestamp,location,email
0,72255aab-2756-4ee6-9778-0d645b08a196,ACC4964,CREDIT,33711.676386,1.756754e+09,London,arohirajput3850@gmail.com
1,147025d8-11a2-4f5a-bbac-3805b0f89343,ACC9706,CREDIT,4257.083251,1.755947e+09,New York,vivekrajput8244@gmail.com
2,afe944fe-c8c6-40e9-989f-bbf6cf1ebafb,ACC2635,CREDIT,43616.042779,1.755673e+09,Mumbai,arohirajput3850@gmail.com
3,ea0f4a14-fab6-4210-bfff-675c46e23081,ACC6301,CREDIT,14224.955741,1.756121e+09,Paris,vrrajput1720@gmail.com
4,6da4b388-59ec-4d7f-87c5-9e3536e47ee5,ACC3343,CREDIT,21679.047254,1.756883e+09,"Pune, India",vivekrajput8244@gmail.com
...,...,...,...,...,...,...,...
95,3ce1518e-f7ba-4251-ac31-05ef4fc13f19,ACC7546,CREDIT,27916.158125,1.755528e+09,Paris,arohirajput3850@gmail.com
96,64fcc934-1999-4e6a-b5af-e123c8439c12,ACC6563,CREDIT,33779.105552,1.755290e+09,London,vrrajput1720@gmail.com
97,be22c705-36c4-40ab-83d0-2f7403f15767,ACC5443,DEBIT,46016.237302,1.756489e+09,Delhi,vivekrajput8244@gmail.com
98,a78c2869-e6db-42cb-a132-ad636a01d92e,ACC4764,DEBIT,17818.435835,1.757721e+09,Mumbai,arohirajput3850@gmail.com


In [69]:
df = df.drop_duplicates(subset='transactionId')


In [70]:
df

Unnamed: 0,transactionId,accountId,type,amount,timestamp,location,email
0,72255aab-2756-4ee6-9778-0d645b08a196,ACC4964,CREDIT,33711.676386,1.756754e+09,London,arohirajput3850@gmail.com
1,147025d8-11a2-4f5a-bbac-3805b0f89343,ACC9706,CREDIT,4257.083251,1.755947e+09,New York,vivekrajput8244@gmail.com
2,afe944fe-c8c6-40e9-989f-bbf6cf1ebafb,ACC2635,CREDIT,43616.042779,1.755673e+09,Mumbai,arohirajput3850@gmail.com
3,ea0f4a14-fab6-4210-bfff-675c46e23081,ACC6301,CREDIT,14224.955741,1.756121e+09,Paris,vrrajput1720@gmail.com
4,6da4b388-59ec-4d7f-87c5-9e3536e47ee5,ACC3343,CREDIT,21679.047254,1.756883e+09,"Pune, India",vivekrajput8244@gmail.com
...,...,...,...,...,...,...,...
95,3ce1518e-f7ba-4251-ac31-05ef4fc13f19,ACC7546,CREDIT,27916.158125,1.755528e+09,Paris,arohirajput3850@gmail.com
96,64fcc934-1999-4e6a-b5af-e123c8439c12,ACC6563,CREDIT,33779.105552,1.755290e+09,London,vrrajput1720@gmail.com
97,be22c705-36c4-40ab-83d0-2f7403f15767,ACC5443,DEBIT,46016.237302,1.756489e+09,Delhi,vivekrajput8244@gmail.com
98,a78c2869-e6db-42cb-a132-ad636a01d92e,ACC4764,DEBIT,17818.435835,1.757721e+09,Mumbai,arohirajput3850@gmail.com


In [71]:
df.loc[:, 'timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
df.loc[:, 'hour'] = df['timestamp'].dt.hour
df.loc[:, 'day'] = df['timestamp'].dt.dayofweek


['2025-09-01 19:07:51.050647736', '2025-08-23 11:07:22.051649332',
 '2025-08-20 06:54:48.051649332', '2025-08-25 11:18:27.051649332',
 '2025-09-03 07:04:22.051649332', '2025-09-10 09:30:02.051649332',
 '2025-08-23 21:55:27.051649332', '2025-08-30 23:41:57.051649332',
 '2025-09-08 14:53:40.051649332', '2025-09-05 08:17:11.051649332',
 '2025-09-01 18:17:12.051649332', '2025-08-20 09:14:53.051649332',
 '2025-08-31 10:23:38.051649332', '2025-08-29 19:35:34.051649332',
 '2025-08-31 23:53:52.051649332', '2025-08-21 14:11:04.051649332',
 '2025-08-19 03:18:53.051649332', '2025-09-05 23:35:44.051649332',
 '2025-09-13 00:35:41.051649332', '2025-09-01 11:13:42.051649332',
 '2025-09-12 12:10:36.051649332', '2025-08-23 03:07:04.051649332',
 '2025-08-20 05:46:21.051649332', '2025-09-05 22:27:17.051649332',
 '2025-08-28 19:09:08.051649332', '2025-08-16 10:16:39.051649332',
 '2025-08-25 00:57:45.051649332', '2025-09-12 06:14:14.051649332',
 '2025-08-18 12:45:22.051649332', '2025-09-06 18:54:12.0516493

In [72]:
# Handle missing values
df.loc[:,'email'] = df['email'].fillna("unknown")

In [73]:
df

Unnamed: 0,transactionId,accountId,type,amount,timestamp,location,email,hour,day
0,72255aab-2756-4ee6-9778-0d645b08a196,ACC4964,CREDIT,33711.676386,2025-09-01 19:07:51.050647736,London,arohirajput3850@gmail.com,19,0
1,147025d8-11a2-4f5a-bbac-3805b0f89343,ACC9706,CREDIT,4257.083251,2025-08-23 11:07:22.051649332,New York,vivekrajput8244@gmail.com,11,5
2,afe944fe-c8c6-40e9-989f-bbf6cf1ebafb,ACC2635,CREDIT,43616.042779,2025-08-20 06:54:48.051649332,Mumbai,arohirajput3850@gmail.com,6,2
3,ea0f4a14-fab6-4210-bfff-675c46e23081,ACC6301,CREDIT,14224.955741,2025-08-25 11:18:27.051649332,Paris,vrrajput1720@gmail.com,11,0
4,6da4b388-59ec-4d7f-87c5-9e3536e47ee5,ACC3343,CREDIT,21679.047254,2025-09-03 07:04:22.051649332,"Pune, India",vivekrajput8244@gmail.com,7,2
...,...,...,...,...,...,...,...,...,...
95,3ce1518e-f7ba-4251-ac31-05ef4fc13f19,ACC7546,CREDIT,27916.158125,2025-08-18 14:45:33.053672075,Paris,arohirajput3850@gmail.com,14,0
96,64fcc934-1999-4e6a-b5af-e123c8439c12,ACC6563,CREDIT,33779.105552,2025-08-15 20:25:12.053672075,London,vrrajput1720@gmail.com,20,4
97,be22c705-36c4-40ab-83d0-2f7403f15767,ACC5443,DEBIT,46016.237302,2025-08-29 17:35:28.053672075,Delhi,vivekrajput8244@gmail.com,17,4
98,a78c2869-e6db-42cb-a132-ad636a01d92e,ACC4764,DEBIT,17818.435835,2025-09-12 23:46:06.053672075,Mumbai,arohirajput3850@gmail.com,23,4


In [74]:

# Encode categorical fields
df = pd.get_dummies(df, columns=['type', 'location'], drop_first=True)

In [75]:
df

Unnamed: 0,transactionId,accountId,amount,timestamp,email,hour,day,type_DEBIT,location_London,location_Mumbai,location_New York,location_Paris,"location_Pune, India"
0,72255aab-2756-4ee6-9778-0d645b08a196,ACC4964,33711.676386,2025-09-01 19:07:51.050647736,arohirajput3850@gmail.com,19,0,False,True,False,False,False,False
1,147025d8-11a2-4f5a-bbac-3805b0f89343,ACC9706,4257.083251,2025-08-23 11:07:22.051649332,vivekrajput8244@gmail.com,11,5,False,False,False,True,False,False
2,afe944fe-c8c6-40e9-989f-bbf6cf1ebafb,ACC2635,43616.042779,2025-08-20 06:54:48.051649332,arohirajput3850@gmail.com,6,2,False,False,True,False,False,False
3,ea0f4a14-fab6-4210-bfff-675c46e23081,ACC6301,14224.955741,2025-08-25 11:18:27.051649332,vrrajput1720@gmail.com,11,0,False,False,False,False,True,False
4,6da4b388-59ec-4d7f-87c5-9e3536e47ee5,ACC3343,21679.047254,2025-09-03 07:04:22.051649332,vivekrajput8244@gmail.com,7,2,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3ce1518e-f7ba-4251-ac31-05ef4fc13f19,ACC7546,27916.158125,2025-08-18 14:45:33.053672075,arohirajput3850@gmail.com,14,0,False,False,False,False,True,False
96,64fcc934-1999-4e6a-b5af-e123c8439c12,ACC6563,33779.105552,2025-08-15 20:25:12.053672075,vrrajput1720@gmail.com,20,4,False,True,False,False,False,False
97,be22c705-36c4-40ab-83d0-2f7403f15767,ACC5443,46016.237302,2025-08-29 17:35:28.053672075,vivekrajput8244@gmail.com,17,4,True,False,False,False,False,False
98,a78c2869-e6db-42cb-a132-ad636a01d92e,ACC4764,17818.435835,2025-09-12 23:46:06.053672075,arohirajput3850@gmail.com,23,4,True,False,True,False,False,False


In [76]:
# Drop fields not useful for ML (IDs are unique identifiers, not features)
df = df.drop(columns=['transactionId', 'accountId', 'timestamp', 'email'])


In [77]:
df

Unnamed: 0,amount,hour,day,type_DEBIT,location_London,location_Mumbai,location_New York,location_Paris,"location_Pune, India"
0,33711.676386,19,0,False,True,False,False,False,False
1,4257.083251,11,5,False,False,False,True,False,False
2,43616.042779,6,2,False,False,True,False,False,False
3,14224.955741,11,0,False,False,False,False,True,False
4,21679.047254,7,2,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...
95,27916.158125,14,0,False,False,False,False,True,False
96,33779.105552,20,4,False,True,False,False,False,False
97,46016.237302,17,4,True,False,False,False,False,False
98,17818.435835,23,4,True,False,True,False,False,False


In [78]:
X = df.drop(columns=['type_DEBIT'])   # features
y = df['type_DEBIT']                  # target (1=DEBIT, 0=CREDIT)


In [79]:
y

0     False
1     False
2     False
3     False
4     False
      ...  
95    False
96    False
97     True
98     True
99    False
Name: type_DEBIT, Length: 100, dtype: bool

In [80]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [81]:
# from sklearn.ensemble import RandomForestClassifier

# model = RandomForestClassifier(n_estimators=100, random_state=42)
# model.fit(X_train, y_train)


In [82]:
# from sklearn.metrics import classification_report, accuracy_score

# y_pred = model.predict(X_test)
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))


In [83]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.65
              precision    recall  f1-score   support

       False       0.75      0.69      0.72        13
        True       0.50      0.57      0.53         7

    accuracy                           0.65        20
   macro avg       0.62      0.63      0.63        20
weighted avg       0.66      0.65      0.65        20



In [84]:
import joblib
joblib.dump(model, "fraud_model_1.pkl")

['fraud_model_1.pkl']

In [85]:

model = joblib.load("fraud_model_1.pkl")