# Sentence BERT

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer


# Load the FAQ question/answer pairs that serve as the retrieval knowledge base.
# NOTE(review): the path is machine-specific; adjust it when running elsewhere.
faq_df = pd.read_csv('/Users/tshmacm1172/Desktop/DimowKay_FinBot/train_data.csv')
faq_df.dropna(subset=['question', 'answer'], inplace=True)
faq_df['question'] = faq_df['question'].str.strip()
faq_df['answer'] = faq_df['answer'].str.strip()

# Whitespace-only cells are not NaN, so dropna above keeps them; after
# stripping they are empty strings. Discard them so that an empty question is
# never embedded and an empty answer is never returned as a prediction.
has_text = (faq_df['question'] != '') & (faq_df['answer'] != '')
faq_df = (
    faq_df[has_text]
    .drop_duplicates(subset='question')  # keep the first answer per question
    .reset_index(drop=True)
)

# Parallel lists: faq_answers[i] is the answer to faq_questions[i].
faq_questions = faq_df['question'].tolist()
faq_answers = faq_df['answer'].tolist()


# Model: load the 'all-MiniLM-L6-v2' sentence-embedding model and encode every
# FAQ question once up front. The resulting embeddings are reused for each
# query in get_response_with_score, so only the query is encoded per lookup.
print("Encoding FAQ questions...")
model = SentenceTransformer('all-MiniLM-L6-v2')
question_embeddings = model.encode(faq_questions, show_progress_bar=True)

# Response
def get_response_with_score(user_query):
    """Find the FAQ entry whose question is most similar to the query.

    The query is embedded with the module-level ``model`` and compared by
    cosine similarity against the pre-computed ``question_embeddings``.

    Args:
        user_query: The query to look up (presumably a single string; it is
            wrapped in a one-element list before encoding).

    Returns:
        A tuple ``(answer, matched_question, score)`` holding the answer and
        question text of the best-matching FAQ entry and its cosine
        similarity to the query.
    """
    encoded_query = model.encode([user_query])
    # A single query is compared against all FAQ embeddings, so only the
    # first (and only) row of the similarity matrix is of interest.
    scores = cosine_similarity(encoded_query, question_embeddings)[0]
    top = np.argmax(scores)
    return faq_answers[top], faq_questions[top], scores[top]


# Load the held-out question/answer pairs used to spot-check the retriever.
test_df = pd.read_csv('/Users/tshmacm1172/Desktop/DimowKay_FinBot/test_data.csv')
print("\n📝 Test Columns:", test_df.columns.tolist())

# Column holding the query text and column holding the reference answer.
input_col = 'question'
expected_col = 'answer'

test_df.dropna(subset=[input_col, expected_col], inplace=True)

print("\n Running test queries...")
# dropna leaves gaps in the index labels, so label-based access such as
# test_df.loc[i, ...] with i = 0..4 can raise KeyError (or skip rows) when an
# early row was dropped. Take the first five remaining rows by position.
preview_rows = test_df[[input_col, expected_col]].head(5)
for user_query, expected in preview_rows.itertuples(index=False, name=None):
    predicted_answer, matched_question, score = get_response_with_score(user_query)

    print(f"\n Query: {user_query}")
    print(f" Expected: {expected}")
    print(f" Predicted: {predicted_answer}")
    print(f" Matched FAQ Question: {matched_question}")
    print(f" Similarity Score: {score:.2f}")
    print("-" * 60)
