In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import random


In [2]:
# -----------------------------------
# STEP 1: Load and prepare FAQ dataset
# -----------------------------------
faq_df = pd.read_csv('/Users/tshmacm1172/Desktop/DimowKay_FinBot/test_data.csv')  # your main Q&A file
faq_df.dropna(subset=['question', 'answer'], inplace=True)
faq_df['question'] = faq_df['question'].str.strip()
faq_df['answer'] = faq_df['answer'].str.strip()
faq_df.drop_duplicates(subset='question', inplace=True)
faq_df.reset_index(drop=True, inplace=True)

faq_questions = faq_df['question'].tolist()
faq_answers = faq_df['answer'].tolist()

In [3]:
# -----------------------------------
# STEP 2: Encode FAQ questions using BERT
# -----------------------------------
print("🔄 Encoding FAQ questions...")
model = SentenceTransformer('all-MiniLM-L6-v2')
question_embeddings = model.encode(faq_questions, show_progress_bar=True)

🔄 Encoding FAQ questions...


Batches:   0%|          | 0/66 [00:00<?, ?it/s]

In [4]:
# -----------------------------------
# STEP 3: Define response function
# -----------------------------------
def get_response_with_score(user_query):
    query_embedding = model.encode([user_query])
    similarities = cosine_similarity(query_embedding, question_embeddings)
    best_idx = np.argmax(similarities)
    best_score = similarities[0][best_idx]
    return faq_answers[best_idx], faq_questions[best_idx], best_score

In [5]:

# -----------------------------------
# STEP 4: test on validation and test datasets
# -----------------------------------
validation_df = pd.read_csv('/Users/tshmacm1172/Desktop/DimowKay_FinBot/val_data.csv')  # your validation file with user queries
test_df = pd.read_csv('/Users/tshmacm1172/Desktop/DimowKay_FinBot/test_data.csv')  # your test file with user queries

print("\n📝 Validation Dataset Columns:", validation_df.columns.tolist())
print("📝 Test Dataset Columns:", test_df.columns.tolist())

# Adjust these if your columns are named differently
input_col = 'question'
expected_col = 'answer'

validation_df.dropna(subset=[input_col, expected_col], inplace=True)
test_df.dropna(subset=[input_col, expected_col], inplace=True)


📝 Validation Dataset Columns: ['question', 'answer']
📝 Test Dataset Columns: ['question', 'answer']


In [6]:

# -----------------------------------
# STEP 5: Run tests on validation and test datasets (first 5 rows of each)
# -----------------------------------
def evaluate_on_dataset(dataset, dataset_name):
    print(f"\n🔍 Running evaluation on {dataset_name} dataset...")
    correct = 0
    total = len(dataset)
    for i in range(min(5, total)):
        user_query = dataset.loc[i, input_col]
        expected = dataset.loc[i, expected_col]

        predicted_answer, matched_question, score = get_response_with_score(user_query)

        print(f"\n🟢 Query: {user_query}")
        print(f"✅ Expected: {expected}")
        print(f"🤖 Predicted: {predicted_answer}")
        print(f"🧠 Matched FAQ Question: {matched_question}")
        print(f"📊 Similarity Score: {score:.2f}")
        print("-" * 60)

        if predicted_answer.strip().lower() == expected.strip().lower():
            correct += 1
    
    accuracy = correct / total * 100
    print(f"\n✅ {dataset_name} Accuracy: {accuracy:.2f}% ({correct}/{total})")

# Evaluate on validation dataset
evaluate_on_dataset(validation_df, 'Validation')

# Evaluate on test dataset
evaluate_on_dataset(test_df, 'Test')



🔍 Running evaluation on Validation dataset...

🟢 Query: Stability of a Broker What if your broker goes bankrupt? Could you lose equity in your account?
✅ Expected: The Securities Investor Protection Corporation is roughly analogous to the FDIC for investments. There are some important differences like a lack of 100 guarantee you get all of your funds back. The SIPC understands you invested knowing there was some risk, and therefore you take that same risk in getting your money from a failed brokerage. However there is still a level of commitment and trust that lessen the risk of investing in the wrong place. Also, do not typo the acronym at your work computer. In the US and perhaps elsewhere it is a racist term, and you are likely to get some bad search results. httpwww.sipc.orghowbrochure.cfm
🤖 Predicted: Ill give the credit to Quid in the comments section of the question. You put out 10k, you got back 20k, thats a cash gain of 10k, how the asset was valued between your purchase and 

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')


train_examples = []


for i in range(len(faq_df)):
    question = faq_df.loc[i, 'question']
    answer = faq_df.loc[i, 'answer']
    train_examples.append(InputExample(texts=[question, answer], label=1.0))


for _ in range(len(faq_df)):
    q_idx = random.randint(0, len(faq_df)-1)
    a_idx = random.randint(0, len(faq_df)-1)
    if q_idx != a_idx:
        q = faq_df.loc[q_idx, 'question']
        a = faq_df.loc[a_idx, 'answer']
        train_examples.append(InputExample(texts=[q, a], label=0.0))


train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)


train_loss = losses.CosineSimilarityLoss(model=model)


model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=2,
    warmup_steps=10,
    show_progress_bar=True
)


model.save("fine_tuned_faq_model")


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0944


In [None]:
from sentence_transformers import SentenceTransformer, util



model = SentenceTransformer("fine_tuned_faq_model")


test_examples = []

# Positive pairs
for i in range(100):  # Choose smaller number if needed
    question = faq_df.loc[i, 'question']
    answer = faq_df.loc[i, 'answer']
    test_examples.append((question, answer, 1.0))

# Negative pairs
for i in range(100):
    q_idx = random.randint(0, len(faq_df) - 1)
    a_idx = random.randint(0, len(faq_df) - 1)
    if q_idx != a_idx:
        q = faq_df.loc[q_idx, 'question']
        a = faq_df.loc[a_idx, 'answer']
        test_examples.append((q, a, 0.0))


random.shuffle(test_examples)


y_true = []
y_pred = []

for q, a, label in test_examples:
    embedding_q = model.encode(q, convert_to_tensor=True)
    embedding_a = model.encode(a, convert_to_tensor=True)
    cosine_score = util.cos_sim(embedding_q, embedding_a).item()

    y_true.append(label)
    y_pred.append(cosine_score)

# Convert predictions to binary: similarity > 0.5 = match
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

binary_pred = [1 if score > 0.5 else 0 for score in y_pred]

# Print metrics
print("Evaluation Results:")
print(f"Accuracy: {accuracy_score(y_true, binary_pred):.4f}")
print(f"F1 Score: {f1_score(y_true, binary_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_true, y_pred):.4f}")

Evaluation Results:
Accuracy: 0.9300
F1 Score: 0.9278
ROC AUC: 0.9775
