In [32]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/finance_data/Transactions Data.csv')

In [4]:
import pandas as pd
import numpy as np
import re


df = pd.read_csv("/content/drive/MyDrive/finance_data/Transactions Data.csv")

df = df.sample(n=5000, random_state=42).reset_index(drop=True)

print("Before preprocessing:")
print(df.info())


df.dropna(inplace=True)

def generate_description(row):
    if row['type'] == 'PAYMENT':
        return f"Payment of {row['amount']} made."
    elif row['type'] == 'TRANSFER':
        return f"Transfer of {row['amount']} to account {row['nameDest']}."
    elif row['type'] == 'CASH_OUT':
        return f"Cash withdrawal of {row['amount']}."
    elif row['type'] == 'DEBIT':
        return f"Debit transaction of {row['amount']}."
    else:
        return f"Transaction of {row['amount']}."

df['Description'] = df.apply(generate_description, axis=1)

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text.strip()

df['Cleaned_Description'] = df['Description'].apply(clean_text)

Q1 = df['amount'].quantile(0.25)
Q3 = df['amount'].quantile(0.75)
IQR = Q3 - Q1


lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df = df[(df['amount'] >= lower_bound) & (df['amount'] <= upper_bound)]


df.to_csv("cleaned_dataset.csv", index=False)


print("\nAfter preprocessing:")
print(df.info())


Before preprocessing:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            5000 non-null   int64  
 1   type            5000 non-null   object 
 2   amount          5000 non-null   float64
 3   nameOrig        5000 non-null   object 
 4   oldbalanceOrg   5000 non-null   float64
 5   newbalanceOrig  5000 non-null   float64
 6   nameDest        5000 non-null   object 
 7   oldbalanceDest  5000 non-null   float64
 8   newbalanceDest  5000 non-null   float64
 9   isFraud         5000 non-null   int64  
 10  isFlaggedFraud  5000 non-null   int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 429.8+ KB
None

After preprocessing:
<class 'pandas.core.frame.DataFrame'>
Index: 4733 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -

In [34]:
!pip install sentence-transformers





In [35]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np


df = pd.read_csv("/content/cleaned_dataset.csv")


model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

df['embeddings'] = df['Cleaned_Description'].astype(str).apply(lambda x: model.encode(x))

embeddings = np.vstack(df['embeddings'].values)

np.save("transaction_embeddings.npy", embeddings)

df.drop(columns=['embeddings']).to_csv("dataset_with_embeddings.csv", index=False)

print("Embeddings generated and saved!")


Embeddings generated and saved!


In [37]:
!pip install faiss-cpu



In [38]:
import faiss
import numpy as np

embeddings = np.load("/content/transaction_embeddings.npy")

d = embeddings.shape[1]
faiss.normalize_L2(embeddings)

index = faiss.IndexFlatL2(d)
index.add(embeddings)

faiss.write_index(index, "faiss_index.bin")

print("FAISS index created and saved successfully!")


FAISS index created and saved successfully!


In [39]:

index = faiss.read_index("faiss_index.bin")

query_vector = embeddings[0].reshape(1, -1)
faiss.normalize_L2(query_vector)

k = 5
distances, indices = index.search(query_vector, k)

print(f" Top {k} Similar Transactions Found:")
print(indices)
print(distances)


 Top 5 Similar Transactions Found:
[[ 0  2  3 19 23]]
[[0. 0. 0. 0. 0.]]


In [40]:
import numpy as np
import faiss
import pandas as pd

df = pd.read_csv("/content/cleaned_dataset.csv")
embeddings = np.load("/content/transaction_embeddings.npy")
index = faiss.read_index("/content/faiss_index.bin")

def detect_fraud(query_embedding, threshold=0.5, k=5):

    faiss.normalize_L2(query_embedding)
    distances, indices = index.search(query_embedding, k)

    avg_distance = np.mean(distances)
    is_anomalous = avg_distance > threshold

    return is_anomalous, avg_distance, indices


query_idx = 10
query_embedding = embeddings[query_idx].reshape(1, -1)

is_fraud, avg_distance, similar_txns = detect_fraud(query_embedding)

print(f" Fraud Detected: {is_fraud} | Avg Distance: {avg_distance}")
print(f" Similar Transactions (Indices): {similar_txns}")


 Fraud Detected: False | Avg Distance: 0.0
 Similar Transactions (Indices): [[ 5  6  8  9 10]]


In [41]:
import logging

logging.basicConfig(filename="fraud_alerts.log", level=logging.INFO, format="%(asctime)s - %(message)s")

def generate_alert(txn_index, is_fraud, avg_distance):

    if is_fraud:
        txn_details = df.iloc[txn_index].to_dict()
        logging.info(f" Fraud Alert! Transaction Index: {txn_index}, Avg Distance: {avg_distance}")
        print(f" FRAUD DETECTED! Transaction {txn_index} flagged for review.")
        print(" Transaction Details:", txn_details)
    else:
        print(f" Transaction {txn_index} is normal.")

txn_index = 10
query_embedding = embeddings[txn_index].reshape(1, -1)

is_fraud, avg_distance, _ = detect_fraud(query_embedding)
generate_alert(txn_index, is_fraud, avg_distance)


 Transaction 10 is normal.


In [42]:
import time

def test_fraud_detection(num_tests=100):
    fraud_count = 0
    total_time = 0

    for _ in range(num_tests):
        txn_index = np.random.randint(0, len(df))
        query_embedding = embeddings[txn_index].reshape(1, -1)

        start_time = time.time()
        is_fraud, avg_distance, _ = detect_fraud(query_embedding)
        total_time += (time.time() - start_time)

        if is_fraud:
            fraud_count += 1

    avg_latency = total_time / num_tests
    fraud_rate = fraud_count / num_tests

    print(f" Test Completed: {num_tests} Transactions")
    print(f" Average Latency: {avg_latency:.4f} sec")
    print(f" Fraud Detection Rate: {fraud_rate:.2%}")

test_fraud_detection()


 Test Completed: 100 Transactions
 Average Latency: 0.0011 sec
 Fraud Detection Rate: 0.00%


In [43]:
def benchmark_faiss_search(k=5):

    txn_index = np.random.randint(0, len(df))
    query_embedding = embeddings[txn_index].reshape(1, -1)

    start_time = time.time()
    distances, indices = index.search(query_embedding, k)
    search_time = time.time() - start_time

    print(f" FAISS Search Time: {search_time:.4f} sec")
    print(f" Top {k} Similar Transactions: {indices}")

benchmark_faiss_search()


 FAISS Search Time: 0.0063 sec
 Top 5 Similar Transactions: [[ 0  2  3 19 23]]


In [44]:
import numpy as np

threshold = np.percentile(df["amount"], 99)

df["is_anomaly_baseline"] = df["amount"] > threshold

baseline_fraud_rate = df["is_anomaly_baseline"].mean()

print(f" Baseline Fraud Rate: {baseline_fraud_rate:.2%}")


 Baseline Fraud Rate: 1.01%


In [45]:
def compare_methods():

    num_tests = 500
    llm_correct = 0
    baseline_correct = 0

    for _ in range(num_tests):
        txn_index = np.random.randint(0, len(df))
        query_embedding = embeddings[txn_index].reshape(1, -1)

        is_fraud_llm, _, _ = detect_fraud(query_embedding)
        is_fraud_baseline = df.iloc[txn_index]["is_anomaly_baseline"]

        if is_fraud_llm == df.iloc[txn_index]["isFraud"]:
            llm_correct += 1
        if is_fraud_baseline == df.iloc[txn_index]["isFraud"]:
            baseline_correct += 1

    llm_accuracy = llm_correct / num_tests
    baseline_accuracy = baseline_correct / num_tests

    print(" Performance Comparison:")
    print(f" LLM-Based Accuracy: {llm_accuracy:.2%}")
    print(f" Baseline Accuracy: {baseline_accuracy:.2%}")

compare_methods()


 Performance Comparison:
 LLM-Based Accuracy: 100.00%
 Baseline Accuracy: 99.00%


In [46]:
!pip install streamlit



In [47]:
!pip install gradio




In [48]:
import pickle

with open("fraud_model.pkl", "wb") as f:
    pickle.dump(model, f)


In [31]:
import pickle
import os

if not os.path.exists("fraud_model.pkl"):
    print("⚠️ Model file is missing! Train the model first.")
else:
    with open("fraud_model.pkl", "rb") as f:
        model = pickle.load(f)
    print("✅ Model loaded successfully.")


✅ Model loaded successfully.


In [51]:
import pandas as pd
import numpy as np
import faiss
import logging
import time
from sentence_transformers import SentenceTransformer


df = pd.read_csv("/content/cleaned_dataset.csv")

df.fillna(0, inplace=True)

valid_types = ["CASH_OUT", "TRANSFER", "PAYMENT"]
df = df[df["type"].isin(valid_types)]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
df['embeddings'] = df['Cleaned_Description'].astype(str).apply(lambda x: model.encode(x))


embeddings = np.vstack(df['embeddings'].values)
np.save("transaction_embeddings.npy", embeddings)
df.drop(columns=['embeddings']).to_csv("dataset_with_embeddings.csv", index=False)
print("Embeddings generated and saved!")

d = embeddings.shape[1]
faiss.normalize_L2(embeddings)
index = faiss.IndexFlatL2(d)
index.add(embeddings)
faiss.write_index(index, "faiss_index.bin")
print("FAISS index created and saved successfully!")

index = faiss.read_index("faiss_index.bin")

def detect_fraud(query_embedding, threshold=0.5, k=5):
    faiss.normalize_L2(query_embedding)
    distances, indices = index.search(query_embedding, k)
    avg_distance = np.mean(distances)
    is_anomalous = avg_distance > threshold
    return is_anomalous, avg_distance, indices

# Logging setup
logging.basicConfig(filename="fraud_alerts.log", level=logging.INFO, format="%(asctime)s - %(message)s")

def generate_alert(txn_index, is_fraud, avg_distance):
    if is_fraud:
        txn_details = df.iloc[txn_index].to_dict()
        logging.info(f"Fraud Alert! Transaction Index: {txn_index}, Avg Distance: {avg_distance}")
        print(f"FRAUD DETECTED! Transaction {txn_index} flagged for review.")
        print("Transaction Details:", txn_details)
    else:
        print(f"Transaction {txn_index} is normal.")

def test_fraud_detection(num_tests=100):
    fraud_count = 0
    total_time = 0
    for _ in range(num_tests):
        txn_index = np.random.randint(0, len(df))
        query_embedding = embeddings[txn_index].reshape(1, -1)
        start_time = time.time()
        is_fraud, avg_distance, _ = detect_fraud(query_embedding)
        total_time += (time.time() - start_time)
        if is_fraud:
            fraud_count += 1
    avg_latency = total_time / num_tests
    fraud_rate = fraud_count / num_tests
    print(f"Test Completed: {num_tests} Transactions")
    print(f"Average Latency: {avg_latency:.4f} sec")
    print(f"Fraud Detection Rate: {fraud_rate:.2%}")

def benchmark_faiss_search(k=5):
    txn_index = np.random.randint(0, len(df))
    query_embedding = embeddings[txn_index].reshape(1, -1)
    start_time = time.time()
    distances, indices = index.search(query_embedding, k)
    search_time = time.time() - start_time
    print(f"FAISS Search Time: {search_time:.4f} sec")
    print(f"Top {k} Similar Transactions: {indices}")

threshold = np.percentile(df["amount"], 99)
df["is_anomaly_baseline"] = df["amount"] > threshold
baseline_fraud_rate = df["is_anomaly_baseline"].mean()
print(f"Baseline Fraud Rate: {baseline_fraud_rate:.2%}")

def compare_methods(num_tests=500):
    llm_correct = 0
    baseline_correct = 0
    for _ in range(num_tests):
        txn_index = np.random.randint(0, len(df))
        query_embedding = embeddings[txn_index].reshape(1, -1)
        is_fraud_llm, _, _ = detect_fraud(query_embedding)
        is_fraud_baseline = df.iloc[txn_index]["is_anomaly_baseline"]
        if is_fraud_llm == df.iloc[txn_index]["isFraud"]:
            llm_correct += 1
        if is_fraud_baseline == df.iloc[txn_index]["isFraud"]:
            baseline_correct += 1
    llm_accuracy = llm_correct / num_tests
    baseline_accuracy = baseline_correct / num_tests
    print("Performance Comparison:")
    print(f"LLM-Based Accuracy: {llm_accuracy:.2%}")
    print(f"Baseline Accuracy: {baseline_accuracy:.2%}")

test_fraud_detection()
benchmark_faiss_search()
compare_methods()

Embeddings generated and saved!
FAISS index created and saved successfully!
Baseline Fraud Rate: 1.02%
Test Completed: 100 Transactions
Average Latency: 0.0006 sec
Fraud Detection Rate: 0.00%
FAISS Search Time: 0.0008 sec
Top 5 Similar Transactions: [[ 0  1  4 10 11]]
Performance Comparison:
LLM-Based Accuracy: 100.00%
Baseline Accuracy: 97.80%


In [52]:
import gradio as gr
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')


index = faiss.read_index("faiss_index.bin")
stored_embeddings = np.load("transaction_embeddings.npy")


ANOMALY_THRESHOLD = 0.3

def detect_fraud(transaction_text):


    transaction_embedding = model.encode([transaction_text])


    D, I = index.search(transaction_embedding, 5)
    avg_distance = np.mean(D)

    is_fraud = avg_distance > ANOMALY_THRESHOLD
    fraud_status = "🚨 Fraudulent" if is_fraud else "✅ Normal"

    return fraud_status, f"Top 5 Similar Transactions (Indexes): {I.tolist()}"

demo = gr.Interface(
    fn=detect_fraud,
    inputs=gr.Textbox(placeholder="Enter transaction description..."),
    outputs=[gr.Text(label="Fraud Status"), gr.Text(label="Similar Transactions")],
    title="LLM-Based Fraud Detection",
    description="Enter a transaction description to check for fraud using LLM embeddings and FAISS similarity search."
)

demo.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0dcb42cb3f8271a4d9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


