In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import mahalanobis
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Step 1: Generate a synthetic transaction dataset with injected anomalies.
#
# NOTE: every random column is drawn from NumPy's global RNG in the order the
# keys appear in the dict literal below; reordering the keys (or hoisting a
# draw out of the dict) changes all subsequently sampled values.
np.random.seed(42)
n_samples = 1000
n_anomalies = 10  # injected high-amount rows, placed at the end of the frame
n_normal = n_samples - n_anomalies

data = {
    "Transaction_ID": np.arange(1, n_samples + 1),
    "User_ID": np.random.randint(1000, 2000, n_samples),
    # Regular amounts ~ N(500, 100), followed by the injected anomalies ~ N(2000, 200).
    "Amount": np.append(
        np.random.normal(500, 100, n_normal),
        np.random.normal(2000, 200, n_anomalies),
    ),
    "Time": np.random.randint(0, 24, n_samples),  # integers in [0, 24)
    "Location": np.random.choice(["NY", "LA", "TX", "FL", "CA"], n_samples),
    "Transaction_Type": np.random.choice(["Online", "In-store", "ATM"], n_samples),
    # Ground truth aligned with "Amount": 1 marks an injected anomaly. Built
    # from the same two counts so the label and amount lengths cannot drift apart.
    "Fraud_Label": [0] * n_normal + [1] * n_anomalies,
}

df = pd.DataFrame(data)



In [2]:
# Step 2: Encode categorical features
# Fit one encoder per column and keep each of them in `label_encoders`.
# Re-fitting a single LabelEncoder replaces its classes_, so after the second
# fit the "Location" codes could no longer be mapped back to their labels.
label_encoders = {}
for col in ["Location", "Transaction_Type"]:
    label_enc = LabelEncoder()
    df[col] = label_enc.fit_transform(df[col])
    label_encoders[col] = label_enc
# `label_enc` stays bound to the encoder fitted on "Transaction_Type", which is
# the state this name had before encoders were tracked per column.

# Step 3: Feature Engineering
# Mean of "Amount" over the current row and up to 4 preceding rows, in frame
# order (the window is not restricted to a single User_ID).
df["Amount_MA_5"] = df["Amount"].rolling(window=5, min_periods=1).mean()
# Count of non-null Transaction_ID values among rows sharing this row's User_ID.
df["Transaction_Freq"] = df.groupby("User_ID")["Transaction_ID"].transform("count")

# Step 4: Compute Mahalanobis Distance for anomaly detection
def mahalanobis_distance(x, mean, inv_cov):
    """Return the Mahalanobis distance of ``x`` from ``mean``.

    Evaluates ``sqrt((x - mean) @ inv_cov @ (x - mean))`` for one observation,
    or row by row for a batch of observations.

    Parameters
    ----------
    x : numpy.ndarray
        A single observation of shape ``(d,)`` or a batch of shape ``(n, d)``.
    mean : numpy.ndarray
        Centre the distance is measured from, shape ``(d,)``.
    inv_cov : numpy.ndarray
        Inverse covariance matrix, shape ``(d, d)``.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        A scalar for 1-D ``x``; an array of shape ``(n,)`` for 2-D ``x``.
    """
    diff = x - mean
    # Row-wise quadratic form. Reducing over the last axis covers both a single
    # vector and a batch, where the chained `diff.T @ inv_cov @ diff` would
    # fail or return a matrix.
    quad = ((diff @ inv_cov) * diff).sum(axis=-1)
    # Round-off can push a near-zero quadratic form slightly below zero, which
    # sqrt would turn into NaN (and NaN then poisons any percentile threshold).
    return np.sqrt(np.maximum(quad, 0.0))




In [3]:
# Columns used by both detectors.
features = ["Amount", "Time", "Transaction_Freq"]

# Standardise the selected columns, then estimate the centre and the inverse
# covariance of the scaled data for the Mahalanobis distance.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[features])

mean_vec = df_scaled.mean(axis=0)
cov_matrix = np.cov(df_scaled, rowvar=False)
inv_cov_matrix = np.linalg.inv(cov_matrix)

# Distance of each scaled row from the centre; rows whose distance exceeds the
# 97th percentile of all distances are flagged with 1.
df["Mahalanobis_Dist"] = [
    mahalanobis_distance(row, mean_vec, inv_cov_matrix) for row in df_scaled
]
threshold = np.percentile(df["Mahalanobis_Dist"], 97)
df["Anomaly_Mahalanobis"] = df["Mahalanobis_Dist"].gt(threshold).astype(int)

# Step 5: Isolation Forest for anomaly detection
# The forest is fitted on the unscaled feature columns.
iso_forest = IsolationForest(contamination=0.01, random_state=42)
iso_labels = iso_forest.fit_predict(df[features])
# Recode the estimator's output: -1 becomes 1 (flagged), anything else 0.
df["Anomaly_IsoForest"] = (iso_labels == -1).astype(np.int64)



In [6]:
# Step 6: Analyze detected anomalies
# A row is reported when at least one of the two detectors flagged it.
flagged_by_any = df["Anomaly_Mahalanobis"].eq(1) | df["Anomaly_IsoForest"].eq(1)
anomalies = df.loc[flagged_by_any]
print("Detected Anomalies:\n", anomalies)

# Step 7: Compare results
# Each flag column holds 0/1, so its sum is the number of flagged rows.
mahalanobis_count = df["Anomaly_Mahalanobis"].sum()
iso_forest_count = df["Anomaly_IsoForest"].sum()
print("\nMahalanobis Anomalies:", mahalanobis_count)
print("Isolation Forest Anomalies:", iso_forest_count)

# Step 8: Evaluate the Performance


Detected Anomalies:
      Transaction_ID  User_ID       Amount  Time  Location  Transaction_Type  \
47               48     1189   496.730525    22         4                 2   
48               49     1957   293.255790    17         2                 0   
50               51     1957   369.553050    20         4                 1   
103             104     1957   480.966132    21         4                 0   
166             167     1098   528.977486    19         0                 0   
229             230     1957   634.542005    16         2                 0   
241             242     1957   334.514333     5         3                 2   
343             344     1928   598.269098    23         4                 1   
377             378     1098   443.753322     1         1                 2   
401             402     1098   605.315285     5         2                 1   
472             473     1098   528.586539    10         3                 0   
482             483     1098   

In [7]:
# Score the combined detector (a row is flagged when either method marks it)
# against the Fraud_Label column.
is_fraud = df["Fraud_Label"] == 1
is_legit = df["Fraud_Label"] == 0
is_flagged = (df["Anomaly_Mahalanobis"] == 1) | (df["Anomaly_IsoForest"] == 1)

true_positive = int((is_fraud & is_flagged).sum())
false_positive = int((is_legit & is_flagged).sum())

n_flagged = true_positive + false_positive
n_fraud = int(df["Fraud_Label"].sum())

# Guard both denominators: with no flagged rows the precision ratio would raise
# ZeroDivisionError, and with no fraud rows recall would come out as NaN.
# Report 0.0 in those undefined cases instead.
precision = true_positive / n_flagged if n_flagged else 0.0
recall = true_positive / n_fraud if n_fraud else 0.0

print(f"Precision: {precision:.2f}, Recall: {recall:.2f}")

Precision: 0.33, Recall: 1.00
