In [None]:
import os

import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


In [None]:
# Load the transaction data.
# The CSV location can be overridden through the LI_SMALL_TRANS_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "LI_SMALL_TRANS_CSV",
    "/Users/parthgajera/Documents/Thesis_Data/LI-Small_Trans.csv",
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the loaded frame.
print(df.iloc[:5])

In [None]:
# Parse the raw 'Timestamp' column into pandas datetime values.
df['Timestamp'] = df['Timestamp'].pipe(pd.to_datetime)

In [None]:
# Core Feature Engineering
# Calendar features taken from the parsed timestamp.
df['Hour'] = df['Timestamp'].dt.hour
df['DayOfWeek'] = df['Timestamp'].dt.dayofweek  # Monday=0 ... Sunday=6

# 1 when the sending and receiving bank are the same, else 0.
df['SameBank'] = (df['From Bank'] == df['To Bank']).astype(int)

# 1 when the sending and receiving account identifiers are the same, else 0.
# This previously compared 'Account' with 'To Bank' (an account id against a
# bank id), which does not test what the feature name says.
# NOTE(review): assumes the CSV header contains two 'Account' columns and that
# pandas renamed the second (receiving) one to 'Account.1' on load - confirm
# with df.columns.
df['SameAccount'] = (df['Account'] == df['Account.1']).astype(int)

# 1 when the receiving and payment currencies differ, else 0.
df['CurrencyMismatch'] = (df['Receiving Currency'] != df['Payment Currency']).astype(int)

In [None]:

# Number of rows (with a non-null timestamp) per sending account, broadcast
# back onto every transaction of that account.
per_account = df.groupby('Account')
df['Txn Count From Account'] = per_account['Timestamp'].transform('count')


def _trailing_mean(amounts):
    """Mean of each value together with up to two values preceding it."""
    return amounts.rolling(window=3, min_periods=1).mean()


# Rolling average of 'Amount Paid' per account, computed in timestamp order.
# The result keeps the original row labels, so the assignment aligns each
# value back to its own row even though the frame was sorted first.
chronological = df.sort_values(by='Timestamp')
df['Rolling Avg From Account'] = (
    chronological.groupby('Account')['Amount Paid'].transform(_trailing_mean)
)

# Final feature set for modelling.
# 'Hour' and 'DayOfWeek' are deliberately left out of the list for now.
features = [
    'Amount Received', 'Amount Paid',
    'SameBank', 'SameAccount', 'CurrencyMismatch',
    'Txn Count From Account', 'Rolling Avg From Account',
    'Payment Format',
]

X = df.loc[:, features]

In [None]:

# (Removed: an earlier draft of the Isolation Forest pipeline that was kept in
# this cell as a triple-quoted string. The following cells build, fit and score
# the same pipeline, so the disabled copy only added noise to the output.)

In [None]:
# --------------------------------------------
# 2. Build the Isolation Forest pipeline
# --------------------------------------------
# 'Payment Format' is one-hot encoded (categories unseen at fit time are
# ignored); every other feature column is passed through unchanged.
payment_format_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', payment_format_encoder, ['Payment Format'])],
    remainder='passthrough',
)

# Chain the preprocessing step with the Isolation Forest detector.
isolation_forest = IsolationForest(contamination='auto', random_state=42)
iso_pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('iso_forest', isolation_forest),
])

# Fit preprocessing and detector together on the full feature frame.
iso_pipeline.fit(X)

In [None]:
# --------------------------------------------
# 3. Anomaly scores and labels from Isolation Forest
# --------------------------------------------
fitted_preprocess = iso_pipeline.named_steps['preprocess']
fitted_forest = iso_pipeline.named_steps['iso_forest']

# Apply the fitted preprocessing once and reuse the matrix for both calls below.
X_transformed = fitted_preprocess.transform(X)

# Continuous score plus the forest's own {1, -1} inlier/outlier decision.
df['anomaly_score'] = fitted_forest.decision_function(X_transformed)
raw_labels = fitted_forest.predict(X_transformed)

# Store the decision as readable strings: 1 -> 'Normal', -1 -> 'Anomaly'.
df['anomaly_label'] = pd.Series(raw_labels, index=df.index).map(
    {1: 'Normal', -1: 'Anomaly'}
)

# Show the rows the Isolation Forest flagged.
is_anomaly = df['anomaly_label'] == 'Anomaly'
anomalies = df[is_anomaly]
print("\nIsolation Forest flagged anomalies:")
print(anomalies[['Timestamp', 'Account', 'Amount Paid', 'anomaly_score']])

In [None]:
# --------------------------------------------
# 4. Set up Hybrid Model with XGBoost
# --------------------------------------------
# The Isolation Forest label is used as a pseudo target for the XGBoost
# classifier. Binary target: 1 for Anomaly, 0 for Normal.
label_map = {'Anomaly': 1, 'Normal': 0}
df['hybrid_label'] = df['anomaly_label'].map(label_map)

# NOTE(review): the target above and the 'anomaly_score' feature below both
# come from the same fitted Isolation Forest, so the classification report
# printed by this cell measures how well XGBoost reproduces the Isolation
# Forest's decision, not how well it detects laundering. Consider also
# evaluating against the 'Is Laundering' column.

# Feature set for the supervised step.
# 'Payment Format' is not included: it is one-hot encoded only inside
# iso_pipeline, and no encoding is applied to it for this model.
hybrid_features = [
    'Amount Received',
    'Amount Paid',
    'SameBank',
    'SameAccount',
    'CurrencyMismatch',
    'Txn Count From Account',
    'Rolling Avg From Account',
    'anomaly_score'  # additional feature from Isolation Forest
]

X_hybrid = df[hybrid_features]
y_hybrid = df['hybrid_label']

# Split into training and testing sets, stratified so both keep the same
# Anomaly/Normal ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X_hybrid, y_hybrid, test_size=0.2, random_state=42, stratify=y_hybrid
)

# Initialize the XGBoost classifier.
# `use_label_encoder` is no longer passed: it is deprecated in recent XGBoost
# releases (where it only triggers a warning) and the target is already 0/1.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    eval_metric='logloss',
)

# Train the hybrid XGBoost model
xgb_model.fit(X_train, y_train)

# Evaluate the model on the held-out split (against the pseudo labels).
y_pred = xgb_model.predict(X_test)
print("\nClassification Report for the Hybrid Model (XGBoost):")
print(classification_report(y_test, y_pred))


In [None]:
# --------------------------------------------
# 5. Hybrid Model Predictions
# --------------------------------------------
# Predict for every row of the hybrid feature frame and store the result as
# readable strings: 1 -> 'Anomaly', 0 -> 'Normal'.
hybrid_classes = xgb_model.predict(X_hybrid)
df['hybrid_prediction'] = pd.Series(hybrid_classes, index=df.index).map(
    {1: 'Anomaly', 0: 'Normal'}
)

report_columns = [
    'Timestamp', 'Account', 'Amount Paid',
    'anomaly_score', 'Is Laundering', 'hybrid_prediction',
]
print("\nSample of Hybrid Model Predictions:")
print(df[report_columns])




In [None]:
# Rows whose 'Is Laundering' flag equals 1.
laundering_mask = df['Is Laundering'].eq(1)
print(df.loc[laundering_mask])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Anomaly score against time, one line per Isolation Forest label.
fig, ax = plt.subplots(figsize=(14, 6))
sns.lineplot(
    data=df,
    x='Timestamp',
    y='anomaly_score',
    hue='anomaly_label',
    palette={'Normal': 'blue', 'Anomaly': 'red'},
    ax=ax,
)
# Dashed reference line at a score of zero.
ax.axhline(y=0, color='gray', linestyle='--', linewidth=1)
ax.set_title('Anomaly Score Over Time')
ax.set_xlabel('Timestamp')
ax.set_ylabel('Anomaly Score')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()


In [None]:
# Histogram (with KDE overlay) of the Isolation Forest anomaly scores.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['anomaly_score'], bins=50, kde=True, color='purple', ax=ax)
ax.set_title('Distribution of Anomaly Scores')
ax.set_xlabel('Anomaly Score')
ax.set_ylabel('Frequency')
# Dashed marker at a score of zero, labelled in the legend.
ax.axvline(x=0, color='red', linestyle='--', label='Anomaly Threshold')
ax.legend()
plt.show()


In [None]:
from sklearn.decomposition import PCA

# Reduce the preprocessed feature matrix to two principal components.
pca = PCA(n_components=2)
X_2d = pca.fit_transform(X_transformed)
pc1, pc2 = X_2d[:, 0], X_2d[:, 1]

# Scatter of the two components, coloured by the Isolation Forest label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=pc1,
    y=pc2,
    hue=df['anomaly_label'],
    palette={'Normal': 'gray', 'Anomaly': 'red'},
    ax=ax,
)
ax.set_title('PCA Visualization of Anomalies')
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
plt.show()
