<a href="https://colab.research.google.com/github/Rohan7767/XNL-21BCE11181-LLM-3/blob/main/Simulate_Transaction_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas numpy faker torch torchvision torchaudio tqdm sentencepiece tiktoken spacy gensim scikit-learn matplotlib seaborn
!python -m spacy download en_core_web_sm

Collecting faker
  Downloading faker-37.0.0-py3-none-any.whl.metadata (15 kB)
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downl

In [None]:
import os
os._exit(00)

In [1]:
!pip install --upgrade --force-reinstall gensim scipy

Collecting gensim
  Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting scipy
  Downloading scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy
  Using cached scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Downloading wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.m

In [2]:
import pandas as pd
import numpy as np
import random
from faker import Faker
import sentencepiece as spm
import tiktoken
import spacy
from gensim.models import Word2Vec, FastText
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

fake = Faker()

In [13]:
import multiprocessing as mp

# ---------------------- 1. SIMULATE TRANSACTION DATASET ----------------------

def generate_transaction(_):
    return {
        "user_id": fake.uuid4(),
        "age": random.randint(18, 75),
        "region": fake.state(),
        "credit_score": random.randint(300, 850),
        "behavior_history": random.choice(["Good", "Average", "Poor"]),
        "timestamp": fake.date_time_this_year(),
        "amount": round(random.uniform(1, 5000), 2),
        "merchant": fake.company(),
        "ip_address": fake.ipv4(),
        "device_fingerprint": fake.sha256(),
        "location": fake.city(),
        "velocity": random.randint(1, 10),
        "browser_info": fake.user_agent(),
        "geolocation": fake.coordinate(),
        "network_latency": round(random.uniform(10, 500), 2),
        "session_metadata": fake.word(),
        "label": random.choice(["normal", "suspicious", "fraudulent"])
    }

# ---------------------- 2. MULTIPROCESSING FOR FASTER GENERATION ----------------------

def generate_data_parallel(num_samples, num_workers=mp.cpu_count()):
    with mp.Pool(num_workers) as pool:
        transactions = list(tqdm(pool.imap(generate_transaction, range(num_samples)), total=num_samples, desc="Generating Transactions"))
    return transactions

# ---------------------- 3. RUN & SAVE ----------------------

num_samples = 10000  # Adjust batch size for performance
transactions = generate_data_parallel(num_samples)

df = pd.DataFrame(transactions)
df.to_parquet("simulated_transactions.parquet")

print(f"✅ {num_samples} Transactions Generated & Saved Successfully!")

# ---------------------- 2. TOKENIZATION ----------------------
merchant_texts = df["merchant"].astype(str).tolist()
with open("merchant_texts.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(merchant_texts))

# Train SentencePiece tokenizer
spm.SentencePieceTrainer.train(input="merchant_texts.txt", model_prefix="tokenizer", vocab_size=1700)
sp = spm.SentencePieceProcessor(model_file="tokenizer.model")

# Tokenize merchant names
df["merchant_tokens"] = df["merchant"].apply(lambda x: sp.encode(x, out_type=str))
print("✅ Tokenization completed!")

Generating Transactions: 100%|██████████| 10000/10000 [00:09<00:00, 1102.39it/s]


✅ 10000 Transactions Generated & Saved Successfully!
✅ Tokenization completed!


In [14]:
import spacy
import pandas as pd
import multiprocessing as mp
from tqdm import tqdm

# Load SpaCy model (Optimized for CPU) - Disable unnecessary components
nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger", "lemmatizer"])  # Fully disable lemmatization

# Function to extract named entities
def extract_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

# Multiprocessing function
def process_batch(text_list):
    return [extract_entities(text) for text in text_list]

# Load dataset
df = pd.read_parquet("simulated_transactions.parquet")

# ----------------------  MULTIPROCESSING FOR FASTER EXECUTION  ----------------------

def parallel_ner(df, column="merchant", num_workers=mp.cpu_count()):
    texts = df[column].tolist()
    batch_size = len(texts) // num_workers  # Distribute workload

    with mp.Pool(num_workers) as pool:
        results = list(tqdm(pool.imap(process_batch, [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]),
                            total=num_workers, desc="Processing NER in Parallel"))

    return [ent for batch in results for ent in batch]  # Flatten list

# Run parallel NER
df["merchant_entities"] = parallel_ner(df, column="merchant")

# Save results
df.to_parquet("ner_processed_transactions.parquet")
print("✅ Named Entity Recognition Completed!")
print(df.head())  # Show first few rows

Processing NER in Parallel: 100%|██████████| 2/2 [00:46<00:00, 23.20s/it]


✅ Named Entity Recognition Completed!
                                user_id  age      region  credit_score  \
0  ad21af12-d388-43e8-bb5d-7357ca5cc149   40  California           788   
1  ad21af12-d388-43e8-bb5d-7357ca5cc149   38  California           546   
2  d01026b3-64b3-4744-829e-6ec2abb22c57   40  Washington           453   
3  26d6b725-26e8-4842-92a2-439846f5139e   67   Louisiana           490   
4  542ab2b0-2407-4146-827a-107b433ca9c3   50    Oklahoma           635   

  behavior_history                  timestamp   amount  \
0             Poor 2025-01-11 15:18:53.203135   247.77   
1             Poor 2025-01-11 15:18:53.203135   560.26   
2             Good 2025-01-13 22:34:46.292795  3778.96   
3             Poor 2025-02-26 21:12:36.109180   983.13   
4             Poor 2025-02-01 01:31:58.184352   834.21   

                      merchant      ip_address  \
0    Martinez, Lopez and Smith  117.13.148.155   
1    Martinez, Lopez and Smith  117.13.148.155   
2               Ri

In [35]:
pip install gensim



In [None]:
import pandas as pd
from gensim.models import Word2Vec, FastText

# ---------------------- 1. LOAD DATA ----------------------
df = pd.read_parquet("ner_processed_transactions.parquet")

# Ensure 'merchant_entities' column exists
if "merchant_entities" not in df.columns:
    raise KeyError("❌ 'merchant_entities' column is missing. Ensure NER processing was successful.")

# Convert merchant_entities into sentences (lists of words)
sentences = df["merchant_entities"].apply(lambda entities: [ent[0] for ent in entities if isinstance(ent, (tuple, list))]).tolist()

# Remove empty sentences
sentences = [sentence for sentence in sentences if sentence]

# ---------------------- 2. TRAIN WORD2VEC ----------------------
word2vec_model = Word2Vec(vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.build_vocab(sentences)
word2vec_model.train(sentences, total_examples=word2vec_model.corpus_count, epochs=10)

# ---------------------- 3. TRAIN FASTTEXT ----------------------
fasttext_model = FastText(vector_size=100, window=5, min_count=1, workers=4)
fasttext_model.build_vocab(sentences)
fasttext_model.train(sentences, total_examples=fasttext_model.corpus_count, epochs=10)

# ---------------------- 4. GENERATE EMBEDDINGS ----------------------
def get_embedding(words, model):
    """Get word embeddings and handle missing words gracefully."""
    return [model.wv[word].tolist() if word in model.wv else [0] * 100 for word in words]

# Apply embeddings
df["word2vec_embedding"] = df["merchant_entities"].apply(lambda x: get_embedding([ent[0] for ent in x if isinstance(ent, (tuple, list))], word2vec_model))
df["fasttext_embedding"] = df["merchant_entities"].apply(lambda x: get_embedding([ent[0] for ent in x if isinstance(ent, (tuple, list))], fasttext_model))

# ---------------------- 5. SAVE PROCESSED DATA ----------------------
df.to_parquet("embedded_transactions.parquet")
print("✅ Word2Vec & FastText embeddings generated and saved!")


In [None]:
# ---------------------- 5. AUTOENCODER FOR NUMERICAL EMBEDDINGS ----------------------
scaler = StandardScaler()
structured_data = df.drop(columns=["merchant", "merchant_tokens", "merchant_entities", "word2vec_embedding", "fasttext_embedding"])
structured_data = scaler.fit_transform(structured_data)

# Define AutoEncoder
class AutoEncoder(nn.Module):
    def __init__(self, input_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 16),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(16, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

# Train AutoEncoder
input_dim = structured_data.shape[1]
model = AutoEncoder(input_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

structured_data_tensor = torch.tensor(structured_data, dtype=torch.float32)
for epoch in range(100):
    optimizer.zero_grad()
    encoded, decoded = model(structured_data_tensor)
    loss = criterion(decoded, structured_data_tensor)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")

df["autoencoder_embedding"] = encoded.detach().numpy().tolist()
print("✅ AutoEncoder embeddings generated!")

In [None]:
# ---------------------- 6. PCA DIMENSIONALITY REDUCTION ----------------------
pca = PCA(n_components=10)
pca_embeddings = pca.fit_transform(structured_data)
df["pca_embedding"] = pca_embeddings.tolist()
print("✅ PCA dimensionality reduction completed!")

In [None]:
# ---------------------- 7. t-SNE FRAUD PATTERN CLUSTERING ----------------------
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
tsne_embeddings = tsne.fit_transform(structured_data)

df["tsne_x"] = tsne_embeddings[:, 0]
df["tsne_y"] = tsne_embeddings[:, 1]

# Visualize fraud clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x="tsne_x", y="tsne_y", hue=df["label"], palette="coolwarm", data=df)
plt.title("t-SNE Fraud Pattern Clustering")
plt.show()
print("✅ t-SNE visualization completed!")

# Save final dataset
df.to_parquet("processed_transactions.parquet")
print("🎯 Full Pipeline Execution Completed!")

In [None]:
pip install "pinecone[grpc]"

In [None]:
pip install annoy

In [None]:
pip install scann

In [None]:
pip install faiss

In [None]:
import faiss
from annoy import AnnoyIndex
import pinecone
import scann

dimension = 100
embeddings = np.array(df["word2vec_embedding"].tolist())

# FAISS Index
faiss_index = faiss.IndexHNSWFlat(dimension, 32)
faiss_index.add(embeddings)
faiss.write_index(faiss_index, "faiss_index.faiss")

# Annoy Index
annoy_index = AnnoyIndex(dimension, 'euclidean')
for i, vector in enumerate(embeddings):
    annoy_index.add_item(i, vector)
annoy_index.build(100)
annoy_index.save("annoy_index.ann")

# ScaNN Index
scann_index = scann.scann_ops_pybind.builder(embeddings, 50, "dot_product").tree(
    num_leaves=2000, num_leaves_to_search=100).score_ah(2).reorder(100).build()
scann_index.serialize_to_file("scann_index.scann")

# Pinecone Index
pinecone.init(api_key="pcsk_5UUHmi_PBEaZFe9GUZgaHZL3K6h3x5L43pCg5XnRcEXtaHXt4ZF4YDYAFHeExh8gD2bCMt", environment="us-west1-gcp")
pinecone.create_index("fraud-transactions", dimension=dimension, metric="cosine")
pinecone_index = pinecone.Index("fraud-transactions")
pinecone_index.upsert([{"id": str(i), "values": v.tolist()} for i, v in enumerate(embeddings)])

print("Vector indices created!")

In [None]:
def find_similar_faiss(vector, k=5):
    vector = np.expand_dims(vector, axis=0)
    distances, indices = faiss_index.search(vector, k)
    return indices[0], distances[0]

test_vector = np.array(df["word2vec_embedding"].iloc[0])
neighbors, scores = find_similar_faiss(test_vector)
print(f"Similar transactions (FAISS): {neighbors}, Scores: {scores}")

In [None]:
from kafka import KafkaProducer, KafkaConsumer
import json

# Kafka Producer
producer = KafkaProducer(bootstrap_servers="localhost:9092",
                         value_serializer=lambda v: json.dumps(v).encode("utf-8"))
for _, row in df.iterrows():
    producer.send("fraud_transactions", row.to_dict())

# Kafka Consumer
consumer = KafkaConsumer("fraud_transactions", bootstrap_servers="localhost:9092",
                         auto_offset_reset="earliest",
                         value_deserializer=lambda v: json.loads(v.decode("utf-8")))

for message in consumer:
    transaction = message.value
    print("Real-time transaction received:", transaction)