In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from drop_columns import DropColumns
from feature_interactions import FeatureInteractions
import joblib

In [2]:
fraud_data = pd.read_csv("cleaned_fraud_data.csv").drop(columns=["Unnamed: 0"])
fraud_data.head()

Unnamed: 0,transaction_id,user_id,transaction_type,amount,location,device_type,network_provider,user_type,"time_of_day(morning, afternoon, evening, night)",is_foreign_number,is_sim_recently_swapped,has_multiple_accounts,datetime,date,hour
0,TX100000,user_8270,Withdraw Cash,2646.35,Nakuru,Feature Phone,Telkom Kenya,Individual,Evening,0,0,0,2024-06-16 21:45:13,2024-06-16,21
1,TX100001,user_1860,Send Money,2844.69,Garissa,Ios,Safaricom,Agent,Night,0,0,0,2024-06-05 00:49:25,2024-06-05,0
2,TX100002,user_6390,Deposit Cash,2384.46,Nyeri,Feature Phone,Telkom Kenya,Agent,Afternoon,0,0,1,2024-06-13 15:54:02,2024-06-13,15
3,TX100003,user_6191,Withdraw Cash,1846.01,Nairobi,Ios,Safaricom,Individual,Night,0,0,1,2024-06-10 01:05:49,2024-06-10,1
4,TX100004,user_6734,Send Money,1017.6,Machakos,Feature Phone,Telkom Kenya,Individual,Night,0,0,0,2024-06-27 02:28:53,2024-06-27,2


In [3]:
# rename column
fraud_data = fraud_data.rename(columns={"time_of_day(morning, afternoon, evening, night)": "time_of_day"})

In [4]:
# Numerical columns to scale
cols_to_scale = ["amount_log", "transaction_frequency", "avg_amount_per_user", "fraud_risk_score"]
# Categorical Columns
categorical_cols = ['transaction_type', 'location', 'device_type', 'user_type']
# Binary Columns
binary_cols = ['is_foreign_number', 'is_sim_recently_swapped', 'has_multiple_accounts', 'night_transaction']

preprocessor = ColumnTransformer(
    transformers=[
        ("num", RobustScaler(), cols_to_scale),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop="first"), categorical_cols),
        ('bin', 'passthrough', binary_cols)
    ]
)

In [5]:
X = fraud_data.copy()

In [6]:
model = Pipeline(steps=[
    ("feature_interactions", FeatureInteractions()),
    ("drop_columns", DropColumns(columns_to_drop=["transaction_id", "user_id", "amount", "network_provider",
                                            "time_of_day", "datetime", "date", "hour",
                                            "weighted_foreign", "weighted_sim_swap"])),
    ('preprocessor', preprocessor),
    ("classifier", IsolationForest(contamination=0.02, random_state=42))
])

model.fit(X)

0,1,2
,steps,"[('feature_interactions', ...), ('drop_columns', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,weight_foreign,2.0
,weight_sim_swap,2.5

0,1,2
,columns_to_drop,"['transaction_id', 'user_id', ...]"

0,1,2
,transformers,"[('num', ...), ('cat', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,with_centering,True
,with_scaling,True
,quantile_range,"(25.0, ...)"
,copy,True
,unit_variance,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,max_samples,'auto'
,contamination,0.02
,max_features,1.0
,bootstrap,False
,n_jobs,
,random_state,42
,verbose,0
,warm_start,False


In [7]:
# save model
joblib.dump(model, "fraud_detection_model.pkl")

['fraud_detection_model.pkl']

In [8]:
# Predict anomalies
fraud_data["anomaly_score"] = model.score_samples(X)
fraud_data["is_anomalous"] = model.predict(X)

# Display results
print("Anomaly Distribution:")
print(fraud_data["is_anomalous"].value_counts(normalize=True))

print("\nTop 10 Anomalous Transactions:")
display(fraud_data[fraud_data["is_anomalous"] == -1].sort_values("anomaly_score")[:-10])

Anomaly Distribution:
is_anomalous
 1    0.98
-1    0.02
Name: proportion, dtype: float64

Top 10 Anomalous Transactions:


Unnamed: 0,transaction_id,user_id,transaction_type,amount,location,device_type,network_provider,user_type,time_of_day,is_foreign_number,is_sim_recently_swapped,has_multiple_accounts,datetime,date,hour,anomaly_score,is_anomalous
4292,TX104292,user_9970,Deposit Cash,25058.95,Thika,Feature Phone,Telkom Kenya,Individual,Night,1,1,0,2024-06-22 04:02:23,2024-06-22,4,-0.650268,-1
284,TX100284,user_1980,Send Money,12767.11,Nyeri,Ios,Safaricom,Agent,Night,0,1,0,2024-06-11 03:37:11,2024-06-11,3,-0.619297,-1
1780,TX101780,user_6051,Lipa Na M-Pesa,2060.12,Nyeri,Ios,Airtel,Individual,Night,1,1,0,2024-06-14 01:47:33,2024-06-14,1,-0.618297,-1
6150,TX106150,user_5305,Withdraw Cash,2208.80,Nairobi,Ios,Telkom Kenya,Individual,Night,1,0,1,2024-06-20 04:28:01,2024-06-20,4,-0.612914,-1
662,TX100662,user_6115,Send Money,3000.92,Garissa,Feature Phone,Telkom Kenya,Individual,Night,1,1,0,2024-06-06 03:49:46,2024-06-06,3,-0.611327,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,TX100076,user_6276,Withdraw Cash,1228.73,Nakuru,Ios,Airtel,Individual,Morning,0,1,1,2024-06-17 08:56:21,2024-06-17,8,-0.558935,-1
6545,TX106545,user_5601,Withdraw Cash,6794.54,Nyeri,Ios,Safaricom,Individual,Afternoon,1,0,0,2024-06-25 13:10:44,2024-06-25,13,-0.558934,-1
2838,TX102838,user_1182,Send Money,193.94,Garissa,Ios,Airtel,Agent,Morning,1,0,0,2024-06-08 06:49:32,2024-06-08,6,-0.558921,-1
7544,TX107544,user_5922,Deposit Cash,1857.52,Nyeri,Android,Safaricom,Individual,Night,0,1,0,2024-06-08 04:14:51,2024-06-08,4,-0.558850,-1


In [9]:
import pandas as pd
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)


# Anomaly Score Distribution
fig1 = px.histogram(fraud_data, x="anomaly_score", nbins=50, title="Anomaly Score Distribution",
                    labels={"anomaly_score": "Anomaly Score", "count": "Count"},
                    color="is_anomalous", color_discrete_map={0: "blue", 1: "red"})
fig1.update_layout(bargap=0.1)
fig1.show()
fig1.write_html("anomaly_score_distribution.html")