
# RAG Example for Analyzing Financial Transactions

## Imports

In [2]:
import pandas as pd
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration, pipeline, AutoTokenizer, AutoModel, AutoModelForQuestionAnswering
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.cluster import KMeans
import spacy
import openai

  from .autonotebook import tqdm as notebook_tqdm


## Step 1: Loading Documents

In [45]:
# Read the transaction records from the CSV file into a DataFrame.
fraud_file = 'data/fraud.csv'
df = pd.read_csv(filepath_or_buffer=fraud_file)

# Preview the first five rows to confirm the columns loaded as expected.
print(df.head(n=5))

   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  


## Step 2: Data Preparation and Splitting Documents

In [81]:
def describe_transaction(row):
    """Render one transaction record as a single-line text description.

    Args:
        row: Mapping-like record (for example a pandas Series) providing the
            keys 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
            'oldbalanceDest', 'newbalanceDest' and 'isFraud'.

    Returns:
        str: The description; it ends in "fraud: yes" when ``row['isFraud']``
        equals 1 and in "fraud: no" otherwise.
    """
    fraud_flag = 'yes' if row['isFraud'] == 1 else 'no'
    return (
        f"Transaction type {row['type']}, amount {row['amount']}, "
        f"origin balance {row['oldbalanceOrg']} to {row['newbalanceOrig']}, "
        f"destination balance {row['oldbalanceDest']} to {row['newbalanceDest']}, "
        f"fraud: {fraud_flag}"
    )


# Split the DataFrame into fraud and non-fraud transactions
fraud_df = df[df['isFraud'] == 1]
non_fraud_df = df[df['isFraud'] == 0]

# Determine the number of samples to take
n_samples = 25
# Take up to half from the fraud group; the non-fraud group fills the rest.
fraud_samples = min(len(fraud_df), n_samples // 2)
# Cap at the rows actually available so .sample() cannot raise on a small frame.
non_fraud_samples = min(len(non_fraud_df), n_samples - fraud_samples)

# Take random samples from both groups (fixed seed for reproducibility)
fraud_sample_df = fraud_df.sample(n=fraud_samples, random_state=42)
non_fraud_sample_df = non_fraud_df.sample(n=non_fraud_samples, random_state=42)

# Concatenate the samples
sample_df = pd.concat([fraud_sample_df, non_fraud_sample_df])

# Shuffle the merged data to randomize the order
sample_df = sample_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Generate one description per row of the shuffled sample
descriptions = sample_df.apply(describe_transaction, axis=1).tolist()

# Print the generated descriptions with 1-based numbering
for i, description in enumerate(descriptions, start=1):
    print(f"Description {i}: {description}")


Description 1: Transaction type TRANSFER, amount 244814.12, origin balance 244814.12 to 0.0, destination balance 0.0 to 0.0, fraud: yes
Description 2: Transaction type CASH_IN, amount 253129.93, origin balance 1328499.49 to 1581629.42, destination balance 2713220.48 to 2460090.55, fraud: no
Description 3: Transaction type TRANSFER, amount 56510.5, origin balance 56510.5 to 0.0, destination balance 0.0 to 0.0, fraud: yes
Description 4: Transaction type CASH_OUT, amount 195227.91, origin balance 0.0 to 0.0, destination balance 1256726.0 to 1451953.91, fraud: no
Description 5: Transaction type CASH_OUT, amount 5721871.91, origin balance 5721871.91 to 0.0, destination balance 146633.42 to 5868505.33, fraud: yes
Description 6: Transaction type CASH_OUT, amount 285382.07, origin balance 285382.07 to 0.0, destination balance 0.0 to 285382.07, fraud: yes
Description 7: Transaction type PAYMENT, amount 521.37, origin balance 0.0 to 0.0, destination balance 0.0 to 0.0, fraud: no
Description 8: T

In [86]:
# Order the sample so fraud rows come first and, within each group,
# larger amounts come first (both sort keys descending).
sorted_sample_df = (
    sample_df
    .sort_values(by=['isFraud', 'amount'], ascending=False)
    .reset_index(drop=True)
)

# Build one text description per row of the sorted sample.
sorted_descriptions = sorted_sample_df.apply(
    lambda row: (
        f"Transaction type {row['type']}, amount {row['amount']}, "
        f"origin balance {row['oldbalanceOrg']} to {row['newbalanceOrig']}, "
        f"destination balance {row['oldbalanceDest']} to {row['newbalanceDest']}, "
        f"fraud: {'yes' if row['isFraud'] == 1 else 'no'}"
    ),
    axis=1,
).tolist()

# Show the descriptions with 1-based numbering.
for position, text in enumerate(sorted_descriptions, start=1):
    print(f"Sorted Description {position}: {text}")


Sorted Description 1: Transaction type TRANSFER, amount 10000000.0, origin balance 10390763.24 to 390763.24, destination balance 0.0 to 0.0, fraud: yes
Sorted Description 2: Transaction type CASH_OUT, amount 5721871.91, origin balance 5721871.91 to 0.0, destination balance 146633.42 to 5868505.33, fraud: yes
Sorted Description 3: Transaction type CASH_OUT, amount 1576531.53, origin balance 1576531.53 to 0.0, destination balance 0.0 to 1576531.53, fraud: yes
Sorted Description 4: Transaction type CASH_OUT, amount 1390332.39, origin balance 1390332.39 to 0.0, destination balance 0.0 to 1390332.39, fraud: yes
Sorted Description 5: Transaction type TRANSFER, amount 488243.65, origin balance 488243.65 to 0.0, destination balance 0.0 to 0.0, fraud: yes
Sorted Description 6: Transaction type CASH_OUT, amount 285382.07, origin balance 285382.07 to 0.0, destination balance 0.0 to 285382.07, fraud: yes
Sorted Description 7: Transaction type TRANSFER, amount 244814.12, origin balance 244814.12 to

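## Step 3: Storage

No separate vector store is used in this example: the description embeddings are kept in an in-memory NumPy array that is built in Step 4.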

## Step 4: Retrieval

In [82]:
# Load the pretrained tokenizer and encoder used to turn text into embeddings.
# Both are resolved from the same checkpoint name so they stay in sync.
embedding_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(embedding_checkpoint)
model = AutoModel.from_pretrained(embedding_checkpoint)

In [126]:
def get_embedding(text, tokenizer, model):
    """Embed text as the mean of the model's last-layer token vectors.

    Each token is weighted by the tokenizer's attention mask, so padding
    positions added when several texts are embedded together do not
    contribute to the average. For a single text (no padding) this equals
    the plain mean over all tokens.

    Args:
        text: A string, or a list of strings to embed as one batch.
        tokenizer: Callable invoked with ``return_tensors="pt"``,
            ``padding=True``, ``truncation=True`` and ``max_length=512``;
            must return a mapping of tensors that is unpacked into ``model``.
            If the mapping has no ``"attention_mask"`` entry, every token is
            averaged with equal weight.
        model: Object with an ``eval()`` method that, when called with the
            tokenizer output, returns an object exposing
            ``last_hidden_state`` (tokens on dimension 1).

    Returns:
        numpy.ndarray: The pooled embedding with size-1 dimensions squeezed
        out, i.e. a 1-D vector for a single text.
    """
    # Encode the input text into PyTorch tensors
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: switch off training-time behaviour and gradient tracking
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    mask = inputs.get("attention_mask")
    if mask is None:
        pooled = hidden.mean(dim=1)
    else:
        # Broadcast the mask over the hidden dimension and average real tokens only
        weights = mask.unsqueeze(-1).to(hidden.dtype)
        # clamp avoids a division by zero for a row that is entirely masked
        token_counts = weights.sum(dim=1).clamp(min=1.0)
        pooled = (hidden * weights).sum(dim=1) / token_counts
    return pooled.squeeze().numpy()

# Embed every description and collect the vectors into one array
# (one row per description).
embedding_rows = []
for desc in descriptions:
    embedding_rows.append(get_embedding(desc, tokenizer, model))
description_embeddings = np.array(embedding_rows)

# Report the array dimensions as a sanity check.
print("Shape of the embeddings:", description_embeddings.shape)


Shape of the embeddings: (25, 768)


In [131]:
# Natural-language query to match against the transaction descriptions.
search_query = "High amount transactions flagged as fraud: yes"

# Embed the query with the same tokenizer and model used for the descriptions.
query_vector = get_embedding(search_query, tokenizer, model)

# Cosine similarity of the query against every description (a single row of scores).
query_row = query_vector.reshape(1, -1)
similarities = cosine_similarity(query_row, description_embeddings)

# argsort ranks ascending: keep the last `top_n` positions, then reverse them
# so the best match comes first.
top_n = 5
ascending_order = similarities.argsort()[0]
retrieved_indices = ascending_order[-top_n:][::-1]
retrieved_descriptions = [descriptions[idx] for idx in retrieved_indices]

print("Retrieved transactions based on the query:")
for text in retrieved_descriptions:
    print(text)

Retrieved transactions based on the query:
Transaction type PAYMENT, amount 3478.18, origin balance 19853.0 to 16374.82, destination balance 0.0 to 0.0, fraud: no
Transaction type PAYMENT, amount 24768.57, origin balance 21573.0 to 0.0, destination balance 0.0 to 0.0, fraud: no
Transaction type PAYMENT, amount 1716.05, origin balance 5769.17 to 4053.13, destination balance 0.0 to 0.0, fraud: no
Transaction type TRANSFER, amount 21580.37, origin balance 21580.37 to 0.0, destination balance 0.0 to 0.0, fraud: yes
Transaction type TRANSFER, amount 177680.54, origin balance 177680.54 to 0.0, destination balance 0.0 to 0.0, fraud: yes


## Step 5: Generating Answer

In [128]:
def generate_answer(question, descriptions, embeddings, tokenizer, model):
    """Return the description whose embedding is most similar to the question.

    Despite the name, no text is generated: the question is embedded with
    ``get_embedding`` and compared to ``embeddings`` by cosine similarity.

    Args:
        question: Question text to embed.
        descriptions: Sequence of description strings, indexed in the same
            order as the rows of ``embeddings``.
        embeddings: 2-D array of description embeddings.
        tokenizer: Tokenizer forwarded to ``get_embedding``.
        model: Model forwarded to ``get_embedding``.

    Returns:
        The element of ``descriptions`` with the highest similarity score.
    """
    # cosine_similarity expects 2-D inputs, so present the query as one row
    query = get_embedding(question, tokenizer, model)
    scores = cosine_similarity(query.reshape(1, -1), embeddings)
    # argsort ranks ascending, so the last position is the best match
    ranking = scores.argsort()[0]
    best_idx = ranking[-1]
    return descriptions[best_idx]

question = "Which transaction marked as fraud has the highest amount?"

# Stack the embeddings row-wise so the similarity comparison receives a 2-D array.
description_embeddings = np.vstack(description_embeddings)

answer = generate_answer(
    question,
    descriptions,
    description_embeddings,
    tokenizer,
    model,
)

# Heading (preceded by a blank line) and the answer on separate lines.
print("\nAnswer to the question:", answer, sep="\n")


Answer to the question:
Transaction type PAYMENT, amount 1716.05, origin balance 5769.17 to 4053.13, destination balance 0.0 to 0.0, fraud: no


In [132]:
# Keep only the descriptions whose text marks them as fraud.
fraud_descriptions = list(filter(lambda text: "fraud: yes" in text, descriptions))
fraud_description_embeddings = np.array(
    [get_embedding(text, tokenizer, model) for text in fraud_descriptions]
)

# Embed the query as a single row so it can be compared against the fraud embeddings.
high_amount_query = "high amount transactions"
high_amount_vector = get_embedding(high_amount_query, tokenizer, model).reshape(1, -1)

# Similarity is computed against the fraud descriptions only.
high_amount_similarities = cosine_similarity(high_amount_vector, fraud_description_embeddings)

# Pick the fraud description with the highest similarity score.
most_similar_fraud_index = high_amount_similarities.argmax()
most_similar_fraud_description = fraud_descriptions[most_similar_fraud_index]

print(
    f"Question: {high_amount_query}",
    f"Most relevant high amount fraud transaction: {most_similar_fraud_description}",
    sep="\n",
)

Question: high amount transactions
Most relevant high amount fraud transaction: Transaction type TRANSFER, amount 21580.37, origin balance 21580.37 to 0.0, destination balance 0.0 to 0.0, fraud: yes


In [133]:
# Rank every fraud row of the full dataset by amount, largest first.
sorted_df = df.loc[df['isFraud'].eq(1)].sort_values(by='amount', ascending=False)

# The first row after sorting is the fraud transaction with the largest amount.
highest_fraud_transaction = sorted_df.iloc[0]

# Describe that row in the same text format used for the sampled descriptions.
top_row = highest_fraud_transaction
highest_fraud_description = (
    f"Transaction type {top_row['type']}, amount {top_row['amount']}, "
    f"origin balance {top_row['oldbalanceOrg']} to {top_row['newbalanceOrig']}, "
    f"destination balance {top_row['oldbalanceDest']} to {top_row['newbalanceDest']}, "
    f"fraud: {'yes' if top_row['isFraud'] == 1 else 'no'}"
)

print("Question: Which transaction marked as fraud has the highest amount?")
print(f"Most relevant high amount fraud transaction: {highest_fraud_description}")


Question: Which transaction marked as fraud has the highest amount?
Most relevant high amount fraud transaction: Transaction type CASH_OUT, amount 10000000.0, origin balance 10000000.0 to 0.0, destination balance 0.0 to 10000000.0, fraud: yes


## Clustering Transactions

In [108]:
num_clusters = 5  # The number of clusters can be adjusted based on your data analysis
# n_init is set explicitly: it pins the number of k-means restarts to the value
# that was the library default when this notebook was run (10) and silences the
# warning about that default changing.
clustering_model = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
cluster_labels = clustering_model.fit_predict(description_embeddings)

# Show the transactions assigned to each cluster
for i in range(num_clusters):
    print(f"Transaktionen im Cluster {i}:")
    for j, label in enumerate(cluster_labels):
        if label == i:
            # Descriptions are numbered from 1, matching the earlier listing
            print(f"Description {j+1}: {descriptions[j]}")
    print("\n")


Transaktionen im Cluster 0:
Description 13: Transaction type PAYMENT, amount 1716.05, origin balance 5769.17 to 4053.13, destination balance 0.0 to 0.0, fraud: no
Description 16: Transaction type TRANSFER, amount 353783.72, origin balance 0.0 to 0.0, destination balance 1639764.23 to 1993547.95, fraud: no
Description 17: Transaction type PAYMENT, amount 24768.57, origin balance 21573.0 to 0.0, destination balance 0.0 to 0.0, fraud: no
Description 23: Transaction type PAYMENT, amount 3478.18, origin balance 19853.0 to 16374.82, destination balance 0.0 to 0.0, fraud: no
Description 24: Transaction type PAYMENT, amount 1464.13, origin balance 60483.0 to 59018.87, destination balance 0.0 to 0.0, fraud: no


Transaktionen im Cluster 1:
Description 2: Transaction type CASH_IN, amount 253129.93, origin balance 1328499.49 to 1581629.42, destination balance 2713220.48 to 2460090.55, fraud: no
Description 5: Transaction type CASH_OUT, amount 5721871.91, origin balance 5721871.91 to 0.0, destinat

  super()._check_params_vs_input(X, default_n_init=10)


## Finding the most relevant description to a question

In [118]:
# NOTE(review): this cell does not define `query_vector` or `search_query`;
# it relies on whichever cell assigned them last — confirm the execution order.
query_row = query_vector.reshape(1, -1)
similarities = cosine_similarity(query_row, description_embeddings)

# Position of the highest similarity score and its description.
most_similar_index = np.argmax(similarities)
most_similar_description = descriptions[most_similar_index]

print(
    f"Question: {search_query}",
    f"Most relevant description: {most_similar_description}",
    sep="\n",
)

Question: High transfer transactions that are flagged as fraud: yes
Most relevant description: Transaction type TRANSFER, amount 21580.37, origin balance 21580.37 to 0.0, destination balance 0.0 to 0.0, fraud: yes


In [124]:
# Embed the search query with the get_embedding helper.
search_query = "High transfer transactions that are flagged as fraud: yes"
query_vector = get_embedding(search_query, tokenizer, model)

# Cosine similarity between the query and every description embedding.
query_row = query_vector.reshape(1, -1)
similarities = cosine_similarity(query_row, description_embeddings)

# argsort ranks ascending: take the last `top_n` positions and reverse them
# so the most similar description comes first.
top_n = 5
ascending_order = similarities.argsort()[0]
top_n_indices = ascending_order[-top_n:][::-1]

print("Top similar transactions to the search query:")
for index in top_n_indices:
    print(descriptions[index])

Top similar transactions to the search query:
Transaction type TRANSFER, amount 21580.37, origin balance 21580.37 to 0.0, destination balance 0.0 to 0.0, fraud: yes
Transaction type TRANSFER, amount 488243.65, origin balance 488243.65 to 0.0, destination balance 0.0 to 0.0, fraud: yes
Transaction type PAYMENT, amount 3478.18, origin balance 19853.0 to 16374.82, destination balance 0.0 to 0.0, fraud: no
Transaction type TRANSFER, amount 244814.12, origin balance 244814.12 to 0.0, destination balance 0.0 to 0.0, fraud: yes
Transaction type PAYMENT, amount 24768.57, origin balance 21573.0 to 0.0, destination balance 0.0 to 0.0, fraud: no


In [125]:
# From the top-N matches, keep (in ranked order) only those whose description
# text marks the transaction as fraud.
filtered_indices = []
for candidate in top_n_indices:
    if "fraud: yes" in descriptions[candidate]:
        filtered_indices.append(candidate)

print("Top similar transactions to the search query, filtered by fraud:")
for index in filtered_indices:
    print(descriptions[index])

Top similar transactions to the search query, filtered by fraud:
Transaction type TRANSFER, amount 21580.37, origin balance 21580.37 to 0.0, destination balance 0.0 to 0.0, fraud: yes
Transaction type TRANSFER, amount 488243.65, origin balance 488243.65 to 0.0, destination balance 0.0 to 0.0, fraud: yes
Transaction type TRANSFER, amount 244814.12, origin balance 244814.12 to 0.0, destination balance 0.0 to 0.0, fraud: yes


## Using a Question-Answering Model

In [120]:
# Question-answering pipeline (not GPT-3). The checkpoint and revision are
# pinned to the ones the unpinned call reported resolving to by default, so the
# notebook keeps using the same model if the library default changes and the
# "No model was supplied" warning no longer appears.
question_answering_pipeline = pipeline(
    "question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
    revision="626af31",
)

def generate_answer(description, question):
    """Answer a question using one transaction description as the context.

    Note: this notebook also defines a five-parameter ``generate_answer`` in
    the "Generating Answer" section; whichever definition ran last is the one
    bound to the name.

    Args:
        description: Text passed to the pipeline as ``context``.
        question: Question passed to the pipeline as ``question``.

    Returns:
        Whatever ``question_answering_pipeline`` returns for this input; the
        calling cell reads its ``'answer'`` entry.
    """
    return question_answering_pipeline(question=question, context=description)

# Use the first generated description as the context for the question.
description = descriptions[0]
question = "How high is the amount of the first transaction?"
answer = generate_answer(description, question)

# Question and extracted answer on separate lines.
print(f"Question: {question}", f"Answer: {answer['answer']}", sep="\n")

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Question: How high is the amount of the first transaction?
Answer: 244814.12
