# **Simulating the Database**

In [1]:
!pip install faker pandas numpy tqdm



In [2]:
import pandas as pd
import numpy as np
import random
import uuid
from faker import Faker
from tqdm import tqdm

In [3]:
# Initialize Faker
fake = Faker()

# Define Regions
REGIONS = ["North America", "Europe", "Asia", "South America", "Africa"]
BROWSERS = ["Chrome", "Firefox", "Safari", "Edge", "Brave"]
DEVICE_TYPES = ["Mobile", "Desktop", "Tablet"]

# Define Merchant Categories
MERCHANTS = {
    "Groceries": ["Walmart", "Whole Foods", "Trader Joe's", "Costco"],
    "Electronics": ["Best Buy", "Apple Store", "Newegg"],
    "Travel": ["Uber", "Lyft", "Airbnb", "Delta Airlines"],
    "Clothing": ["Nike", "Adidas", "H&M", "Zara"],
    "Entertainment": ["Netflix", "Spotify", "AMC Theatres"],
    "Dining": ["McDonald's", "Starbucks", "Chipotle"],
    "Gas": ["Shell", "Exxon", "Chevron"],
    "Health": ["CVS Pharmacy", "Walgreens", "Rite Aid"]
}

In [4]:
# Function to generate a synthetic transaction
def generate_transaction(user_id, is_fraud=False):
    amount = round(random.uniform(5, 5000), 2)
    category = random.choice(list(MERCHANTS.keys()))
    merchant = random.choice(MERCHANTS[category])
    location = fake.city()
    timestamp = fake.date_time_between(start_date="-90d", end_date="now")
    ip_address = fake.ipv4()
    browser = random.choice(BROWSERS)
    device_type = random.choice(DEVICE_TYPES)
    session_metadata = f"Latency: {random.randint(10, 500)}ms | Device: {device_type} | Browser: {browser}"

    # Fraudulent transaction modifications
    fraud_flag = 0  # Normal
    if is_fraud:
        amount *= random.uniform(2, 10)  # Inflate amount
        location = fake.city()  # Different location
        ip_address = fake.ipv4_private()  # Private IP (hidden)
        fraud_flag = 1  # Fraudulent

    return {
        "transaction_id": str(uuid.uuid4()),
        "user_id": user_id,
        "amount": amount,
        "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
        "merchant": merchant,
        "category": category,
        "location": location,
        "ip_address": ip_address,
        "browser": browser,
        "device_type": device_type,
        "session_metadata": session_metadata,
        "fraud_flag": fraud_flag
    }

In [5]:
# Generate Users
num_users = 10000  # 10K Users
users = [{"user_id": i, "age": random.randint(18, 75), "region": random.choice(REGIONS), "credit_score": random.randint(300, 850)} for i in range(num_users)]

# Generate Transactions
num_transactions = 1000000  # 1M Transactions
fraud_ratio = 0.1  # 10% fraud transactions

transactions = []
for _ in tqdm(range(num_transactions), desc="Generating Transactions"):
    user_id = random.randint(0, num_users - 1)
    is_fraud = random.random() < fraud_ratio
    transactions.append(generate_transaction(user_id, is_fraud))

# Convert to DataFrame
df = pd.DataFrame(transactions)

# Save to CSV
df.to_csv("synthetic_transactions.csv", index=False)

print("Dataset generated and saved as 'synthetic_transactions.csv'")

Generating Transactions: 100%|██████████| 1000000/1000000 [04:54<00:00, 3390.33it/s]


Dataset generated and saved as 'synthetic_transactions.csv'


In [6]:
pd.options.display.max_columns = None

In [7]:
df.head()

Unnamed: 0,transaction_id,user_id,amount,timestamp,merchant,category,location,ip_address,browser,device_type,session_metadata,fraud_flag
0,fc12e211-cd56-4394-a4c6-dc1c360b4ff4,1956,4105.52,2025-02-15 15:38:38,Uber,Travel,Jenniferchester,183.141.185.242,Chrome,Desktop,Latency: 71ms | Device: Desktop | Browser: Chrome,0
1,13f84663-5bc5-4b5f-9146-eb42c2e63876,8666,4384.74,2025-01-08 21:15:15,CVS Pharmacy,Health,East Katherine,155.102.247.225,Firefox,Tablet,Latency: 95ms | Device: Tablet | Browser: Firefox,0
2,072d7ad4-7aa7-4e12-b4f0-e6f942bb0709,588,4696.77,2025-01-12 05:20:56,Costco,Groceries,Davidstad,1.208.9.139,Brave,Mobile,Latency: 303ms | Device: Mobile | Browser: Brave,0
3,243e6d3b-3268-4662-a494-b76100ef8a33,658,3598.41,2025-03-14 01:00:39,Uber,Travel,South Brandon,135.111.80.184,Firefox,Desktop,Latency: 177ms | Device: Desktop | Browser: Fi...,0
4,bc283fe1-69fa-4055-a2fb-b1286c2e932e,6581,2067.84,2025-03-11 19:24:14,Chipotle,Dining,Austinhaven,122.148.188.96,Safari,Tablet,Latency: 325ms | Device: Tablet | Browser: Safari,0


**Preprocessing**

In [8]:
!pip install nltk pandas numpy



In [9]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [10]:
# Download NLTK resources
nltk.download("stopwords")
nltk.download("punkt_tab")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# Load dataset
df = pd.read_csv("synthetic_transactions.csv")

# Initialize NLP tools
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [12]:
# Define preprocessing function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    words = word_tokenize(text)  # Tokenize text
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatize & remove stopwords
    return " ".join(words)

In [13]:
# Apply preprocessing to transaction descriptions
df["cleaned_description"] = df["merchant"] + " " + df["location"] + " " + df["category"] + " $" + df["amount"].astype(str)
df["cleaned_description"] = df["cleaned_description"].astype(str).apply(preprocess_text)

# Save cleaned dataset
df.to_csv("cleaned_transactions.csv", index=False)

**Tokenization**

In [14]:
!pip install sentencepiece



In [15]:
import sentencepiece as spm

In [16]:
# Save descriptions to a text file (required for SentencePiece training)
with open("descriptions.txt", "w", encoding="utf-8") as f:
    for desc in df["cleaned_description"]:
        f.write(desc + "\n")

# Train SentencePiece tokenizer
spm.SentencePieceTrainer.train(input="descriptions.txt", model_prefix="tokenizer", vocab_size=5000)

In [17]:
# Load trained tokenizer
sp = spm.SentencePieceProcessor()
sp.load("tokenizer.model")

True

In [18]:
# Tokenize all transactions
df["tokenized_description"] = df["cleaned_description"].apply(lambda x: sp.encode_as_pieces(str(x)))

# Save dataset with tokenized descriptions
df.to_csv("tokenized_transactions.csv", index=False)

**Word2Vec embedding**

In [19]:
!pip install gensim




In [20]:
from gensim.models import Word2Vec
import ast  # To safely convert string tokens back into lists

# Load tokenized dataset
df = pd.read_csv("tokenized_transactions.csv")

# Convert tokenized text from string to list
df["tokenized_description"] = df["tokenized_description"].apply(ast.literal_eval)

# Prepare tokenized sentences for training
sentences = df["tokenized_description"].tolist()

# Train Word2Vec Model
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
w2v_model.save("word2vec.model")

In [21]:
# Load trained Word2Vec model
w2v_model = Word2Vec.load("word2vec.model")

In [22]:
# Function to get embedding for a transaction
def get_embedding(tokens, model):
    embeddings = [model.wv[word] for word in tokens if word in model.wv]
    if len(embeddings) == 0:
        return np.zeros(model.vector_size)  # Return zero vector if no valid words
    return np.mean(embeddings, axis=0)  # Take the mean of word embeddings

In [23]:
# Apply embedding extraction
df["word2vec_embedding"] = df["tokenized_description"].apply(lambda x: get_embedding(x, w2v_model).tolist())

# Save dataset with embeddings
df.to_csv("word2vec_transactions.csv", index=False)

**Autoencoder**

In [24]:
!pip install tensorflow



In [25]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense

In [26]:
# Load dataset
df = pd.read_csv("word2vec_transactions.csv")

# Print available columns
print("Available Columns:", df.columns.tolist())

Available Columns: ['transaction_id', 'user_id', 'amount', 'timestamp', 'merchant', 'category', 'location', 'ip_address', 'browser', 'device_type', 'session_metadata', 'fraud_flag', 'cleaned_description', 'tokenized_description', 'word2vec_embedding']


In [27]:
# Load dataset
df = pd.read_csv("word2vec_transactions.csv")

# Select numerical features
numerical_features = ["amount"]  # Modify as needed
X = df[numerical_features].values.astype("float32")

# Normalize data
X = (X - X.mean(axis=0)) / X.std(axis=0)

# Define AutoEncoder model
encoding_dim = 5  # Compressed feature size
input_dim = X.shape[1]

input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation="relu")(input_layer)
decoded = Dense(input_dim, activation="sigmoid")(encoded)

autoencoder = keras.Model(input_layer, decoded)
autoencoder.compile(optimizer="adam", loss="mse")

# Train AutoEncoder
autoencoder.fit(X, X, epochs=20, batch_size=256, shuffle=True, verbose=1)

# Extract encoded (low-dimensional) features
encoder = keras.Model(input_layer, encoded)
structured_embeddings = encoder.predict(X)

# Save embeddings
np.save("structured_embeddings.npy", structured_embeddings)

Epoch 1/20
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 0.8404
Epoch 2/20
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - loss: 0.6443
Epoch 3/20
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - loss: 0.6390
Epoch 4/20
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 0.6377
Epoch 5/20
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 3ms/step - loss: 0.6381
Epoch 6/20
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - loss: 0.6322
Epoch 7/20
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 0.6412
Epoch 8/20
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3ms/step - loss: 0.6288
Epoch 9/20
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 0.6323
Epoch 10/20
[1m3907/3907[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

In [28]:
!pip install scikit-learn



In [29]:
from sklearn.decomposition import PCA

In [30]:
# Load structured embeddings
structured_embeddings = np.load("structured_embeddings.npy")

# Apply PCA
pca = PCA(n_components=3)  # Reduce to 3D for visualization
pca_features = pca.fit_transform(structured_embeddings)

# Save PCA-transformed data
np.save("pca_embeddings.npy", pca_features)

In [31]:
import torch

# If using PyTorch, clear CUDA memory
torch.cuda.empty_cache()

# If using TensorFlow, reset session
import tensorflow as tf
tf.keras.backend.clear_session()

In [36]:
# Delete large variables
del  structured_embeddings

# Manually trigger garbage collection
import gc
gc.collect()


19

In [37]:
pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [38]:
import faiss

In [39]:
# Load structured embeddings
structured_embeddings = np.load("pca_embeddings.npy").astype("float32")

# Ensure embeddings are C-contiguous
structured_embeddings = np.ascontiguousarray(structured_embeddings)

# Normalize embeddings for cosine similarity
faiss.normalize_L2(structured_embeddings)

# Initialize FAISS index for Cosine Similarity
d = structured_embeddings.shape[1]  # Number of dimensions
index = faiss.IndexFlatIP(d)  # Inner Product (Cosine Similarity)

# Add embeddings to FAISS index
index.add(structured_embeddings)

# Save FAISS index
faiss.write_index(index, "faiss_fraud_index.idx")

In [41]:
# Load FAISS index
index = faiss.read_index("faiss_fraud_index.idx")

# Load dataset
df = pd.read_csv("word2vec_transactions.csv")

# Load structured embeddings
structured_embeddings = np.load("pca_embeddings.npy").astype("float32")

In [43]:
#  Define anomaly detection function
def detect_anomalies(embeddings, threshold=0.3, k=5):
    """
    Detect transactions that are anomalous based on similarity search.
    - threshold: Lower cosine similarity means more anomalous.
    - k: Number of nearest neighbors to consider.
    """
    anomalies = []

    for i, emb in enumerate(embeddings):
        query = emb.reshape(1, -1)
        distances, _ = index.search(query, k)

        # Get the average similarity score
        avg_similarity = np.mean(distances)  # Closer to 1 = similar, closer to 0 = anomalous

        # Flag as fraud if similarity is too low
        if avg_similarity < threshold:
            anomalies.append((i, avg_similarity))

    return anomalies

In [None]:
# Detect anomalies with a lower similarity threshold
anomalies = detect_anomalies(structured_embeddings, threshold=0.3, k=5)

# Save suspicious transactions
anomalous_indices = [a[0] for a in anomalies]
anomalous_transactions = df.iloc[anomalous_indices]
anomalous_transactions.to_csv("flagged_anomalies.csv", index=False)