In [1]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn

# Define DualEncoder class again
class DualEncoder(nn.Module):
    def __init__(self, model_name, output_dim=128):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(model_name)
        self.projection = nn.Linear(self.base_model.config.hidden_size, output_dim)

    def forward(self, input_ids, attention_mask):
        output = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        cls_token = output.last_hidden_state[:, 0]
        return self.projection(cls_token)

# Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reload models
question_encoder = DualEncoder(model_name).to(device)
answer_encoder = DualEncoder(model_name).to(device)
question_encoder.load_state_dict(torch.load("q_encoder_finetuned.pth", map_location=device))
answer_encoder.load_state_dict(torch.load("a_encoder_finetuned.pth", map_location=device))
question_encoder.eval()
answer_encoder.eval()

# Load data
df = pd.read_csv("single_qna.csv")[['Question', 'Answer']].dropna()
df = df.sample(n=1000, random_state=42).reset_index(drop=True)  # use a subset for testing

# Helper function
def get_embeddings(model, texts):
    tokens = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=64)
    if 'token_type_ids' in tokens:
        del tokens['token_type_ids']
    tokens = {k: v.to(device) for k, v in tokens.items()}
    with torch.no_grad():
        return model(**tokens)

# Encode all answers
answer_texts = df['Answer'].tolist()
answer_embeddings = get_embeddings(answer_encoder, answer_texts)

# Try a few random questions
test_indices = [5, 100, 250]  # or use any other indices
for idx in test_indices:
    question = df.loc[idx, 'Question']
    true_answer = df.loc[idx, 'Answer']
    
    # Get question embedding
    q_emb = get_embeddings(question_encoder, [question])  # shape [1, 128]
    
    # Compute similarities
    sims = torch.matmul(q_emb, answer_embeddings.T).squeeze()  # shape [N]
    top_idx = sims.argmax().item()
    
    print(f"\n🔍 Question: {question}")
    print(f"✅ Predicted Answer: {df.loc[top_idx, 'Answer']}")
    print(f"🎯 Ground Truth: {true_answer}")
    print(f"📊 Similarity Score: {sims[top_idx]:.4f}")


  question_encoder.load_state_dict(torch.load("q_encoder_finetuned.pth", map_location=device))
  answer_encoder.load_state_dict(torch.load("a_encoder_finetuned.pth", map_location=device))



🔍 Question: How secure is it from bugs?
✅ Predicted Answer: I had no problems. Even if you leave a gap with the elastic, at worst I had a fly or two. Most was when I left the openings unzipped to go in and out. On damp ground at one camp ground I had a couple dozen crickets UNDER the tent when I took it down. They crunched when I walked on the tent floor, yuk! But unless I let them in, I did not wake up to any additional boarders.
🎯 Ground Truth: I had no problems. Even if you leave a gap with the elastic, at worst I had a fly or two. Most was when I left the openings unzipped to go in and out. On damp ground at one camp ground I had a couple dozen crickets UNDER the tent when I took it down. They crunched when I walked on the tent floor, yuk! But unless I let them in, I did not wake up to any additional boarders.
📊 Similarity Score: 11.9449

🔍 Question: does anyone use this blow dryer to diffuse their hair? if so, how are the results?
✅ Predicted Answer: I use it to diffuse. Works fi

In [3]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
from sklearn.metrics.pairwise import cosine_similarity

# Define DualEncoder class again
class DualEncoder(nn.Module):
    def __init__(self, model_name, output_dim=128):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(model_name)
        self.projection = nn.Linear(self.base_model.config.hidden_size, output_dim)

    def forward(self, input_ids, attention_mask):
        output = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        cls_token = output.last_hidden_state[:, 0]
        return self.projection(cls_token)

# Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reload models
question_encoder = DualEncoder(model_name).to(device)
answer_encoder = DualEncoder(model_name).to(device)
question_encoder.load_state_dict(torch.load("q_encoder_finetuned.pth", map_location=device))
answer_encoder.load_state_dict(torch.load("a_encoder_finetuned.pth", map_location=device))
question_encoder.eval()
answer_encoder.eval()

# Load data
df = pd.read_csv("single_qna.csv")[['Question', 'Answer']].dropna()
df = df.sample(n=1000, random_state=42).reset_index(drop=True)

# Embedding helper
def get_embeddings(model, texts, batch_size=32):
    all_embeddings = []
    model.eval()
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            tokens = tokenizer(batch, padding=True, truncation=True, return_tensors='pt', max_length=64)
            if 'token_type_ids' in tokens:
                del tokens['token_type_ids']
            tokens = {k: v.to(device) for k, v in tokens.items()}
            embeddings = model(**tokens)
            all_embeddings.append(embeddings.cpu())
    return torch.cat(all_embeddings)

# Get all embeddings
question_embeddings = get_embeddings(question_encoder, df['Question'].tolist())  # [N, 128]
answer_embeddings = get_embeddings(answer_encoder, df['Answer'].tolist())        # [N, 128]

# L2 normalize
q_emb_norm = torch.nn.functional.normalize(question_embeddings, p=2, dim=1)
a_emb_norm = torch.nn.functional.normalize(answer_embeddings, p=2, dim=1)

# Compute cosine similarity matrix
similarity = torch.matmul(q_emb_norm, a_emb_norm.T).cpu().numpy()  # shape [N, N]

# Evaluation metrics
def top_k_accuracy(similarity, k):
    top_k = np.argsort(-similarity, axis=1)[:, :k]
    correct = sum([i in top_k[i] for i in range(len(top_k))])
    return correct / len(top_k)

def mean_reciprocal_rank(similarity):
    ranks = []
    for i in range(similarity.shape[0]):
        sorted_indices = np.argsort(-similarity[i])
        rank = np.where(sorted_indices == i)[0][0] + 1  # rank starts from 1
        ranks.append(1.0 / rank)
    return np.mean(ranks)

# Evaluate
print("\n🔎 Evaluation Results")
print(f"✅ Top-1 Accuracy: {top_k_accuracy(similarity, 1):.4f}")
print(f"✅ Top-5 Accuracy: {top_k_accuracy(similarity, 5):.4f}")
print(f"✅ Mean Reciprocal Rank (MRR): {mean_reciprocal_rank(similarity):.4f}")


  question_encoder.load_state_dict(torch.load("q_encoder_finetuned.pth", map_location=device))
  answer_encoder.load_state_dict(torch.load("a_encoder_finetuned.pth", map_location=device))



🔎 Evaluation Results
✅ Top-1 Accuracy: 0.4830
✅ Top-5 Accuracy: 0.7640
✅ Mean Reciprocal Rank (MRR): 0.6068


In [5]:
from transformers import pipeline
import pandas as pd
import torch
from tqdm import tqdm
import evaluate  # 👈 use this instead of datasets

# ✅ Load model and tokenizer
model_path = "trainer_squad_qa_model"
tokenizer_path = "tokenizer_squad_qa_model"

qa_pipeline = pipeline(
    "question-answering",
    model=model_path,
    tokenizer=tokenizer_path,
    device=0 if torch.cuda.is_available() else -1
)

# ✅ Load data
df = pd.read_csv("single_qna.csv")[['Question', 'Answer']].dropna()
df = df.iloc[100000:100200].reset_index(drop=True)  # 100 samples

# ✅ Load SQuAD evaluation metric
squad_metric = evaluate.load("squad")

# ✅ Evaluation loop
predictions = []
references = []

for i in tqdm(range(len(df))):
    question = df.loc[i, "Question"]
    context = df.loc[i, "Answer"]
    
    try:
        result = qa_pipeline({
            "question": question,
            "context": context
        })
        predicted_answer = result["answer"]
    except:
        predicted_answer = ""

    predictions.append({
        "id": str(i),
        "prediction_text": predicted_answer
    })
    
    references.append({
        "id": str(i),
        "answers": {
            "answer_start": [0],  # You can improve this if your GT has real spans
            "text": [context]
        }
    })

# ✅ Compute metrics
results = squad_metric.compute(predictions=predictions, references=references)

# ✅ Print results
print("\n📊 Evaluation Results:")
print(f"✅ Exact Match (EM): {results['exact_match']:.2f}")
print(f"✅ F1 Score: {results['f1']:.2f}")





Device set to use cuda:0
  0%|          | 1/200 [00:00<02:04,  1.59it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 200/200 [00:02<00:00, 89.53it/s] 


📊 Evaluation Results:
✅ Exact Match (EM): 35.50
✅ F1 Score: 50.14





In [None]:
from transformers import pipeline
import pandas as pd
import torch
from tqdm import tqdm
import evaluate

# ✅ Load model and tokenizer
model_path = "trainer_squad_qa_model"
tokenizer_path = "tokenizer_squad_qa_model"

qa_pipeline = pipeline(
    "question-answering",
    model=model_path,
    tokenizer=tokenizer_path,
    device=0 if torch.cuda.is_available() else -1
)

# ✅ Load CSV and subset
df = pd.read_csv("single_qna.csv")[['Question', 'Answer']].dropna()
df = df.iloc[100000:100200].reset_index(drop=True)

# ✅ Load SQuAD evaluation metric
squad_metric = evaluate.load("squad")

# ✅ Test a single QA pair before full evaluation
def test_single_qa_pair_from_csv(index, df, qa_pipeline, metric):
    question = df.loc[index, "Question"]
    ground_truth = df.loc[index, "Answer"]
    
    try:
        result = qa_pipeline({
            "question": question,
            "context": ground_truth
        })
        predicted = result["answer"]
    except Exception as e:
        predicted = ""
        print(f"⚠️ Pipeline error at index {index}: {e}")
    
    pred_format = {"id": "0", "prediction_text": predicted}
    ref_format = {"id": "0", "answers": {"answer_start": [0], "text": [ground_truth]}}

    results = metric.compute(predictions=[pred_format], references=[ref_format])
    
    print(f"\n🔍 Sample QA from CSV at index {index}")
    print(f"🧠 Question: {question}")
    print(f"📘 Ground Truth: {ground_truth}")
    print(f"🤖 Predicted Answer: {predicted}")
    print(f"✅ Exact Match: {results['exact_match']:.2f}")
    print(f"✅ F1 Score: {results['f1']:.2f}")

# 🔎 Try a specific sample before full loop
test_single_qa_pair_from_csv(0, df, qa_pipeline, squad_metric)







Device set to use cuda:0



🔍 Sample QA from CSV at index 0
🧠 Question: Just want to reiterate on the intercom volume control, it can be manually turned up and can it go so loud that it would be too loud? I ask because I'm
📘 Ground Truth: Yes it can easialy be adjusted. I found it too loud once and had to turn it down but it stays in the range where you've left it last.
🤖 Predicted Answer: Yes
✅ Exact Match: 0.00
✅ F1 Score: 6.90


In [6]:
from transformers import pipeline
import pandas as pd
from tqdm import tqdm
import torch
import evaluate
import nltk

# Download nltk punkt for tokenization
nltk.download('punkt')

# Load BLEU and ROUGE metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

# Load your QA pipeline
qa_pipeline = pipeline(
    "question-answering",
    model="trainer_squad_qa_model",
    tokenizer="tokenizer_squad_qa_model",
    device=0 if torch.cuda.is_available() else -1
)

# Load your data
df = pd.read_csv("single_qna.csv")[['Question', 'Answer']].dropna()
df = df.iloc[100000:100200].reset_index(drop=True)  # 200 examples for evaluation

predictions = []
references = []

print("\n🔍 Running Predictions...")
for i in tqdm(range(len(df))):
    question = df.loc[i, "Question"]
    context = df.loc[i, "Answer"]
    
    try:
        output = qa_pipeline({"question": question, "context": context})
        pred_answer = output['answer']
    except Exception:
        pred_answer = ""

    predictions.append(pred_answer)
    references.append(context)

# BLEU and ROUGE expect list of references (as list of list of tokens)
bleu_results = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
rouge_results = rouge.compute(predictions=predictions, references=references)

# Print results
print("\n📊 Evaluation Results:")
print(f"✅ BLEU Score: {bleu_results['bleu'] * 100:.2f}")
print(f"✅ ROUGE-1: {rouge_results['rouge1'] * 100:.2f}")
print(f"✅ ROUGE-2: {rouge_results['rouge2'] * 100:.2f}")
print(f"✅ ROUGE-L: {rouge_results['rougeL'] * 100:.2f}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use cuda:0



🔍 Running Predictions...


100%|██████████| 200/200 [00:01<00:00, 128.45it/s]



📊 Evaluation Results:
✅ BLEU Score: 0.45
✅ ROUGE-1: 50.14
✅ ROUGE-2: 40.58
✅ ROUGE-L: 49.76


In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import PeftModel

# Load tokenizer
trained_tokenizer = AutoTokenizer.from_pretrained("tokenizer_Generative_qa_model")

# Load the base FLAN-T5 model
base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to("cuda")

# Resize the base model's token embeddings to match the tokenizer
base_model.resize_token_embeddings(len(trained_tokenizer))

# Load the LoRA adapters on top of the resized base model
trained_model = PeftModel.from_pretrained(base_model, "trainer_Generative_qa_model").to("cuda")

def generate_answer(question, context, model, tokenizer):
    input_text = f"question: {question} context: {context}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to("cuda")
    outputs = model.generate(
        input_ids=input_ids,
        max_length=64,
        num_beams=4,
        early_stopping=True
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

import pandas as pd
df = pd.read_csv("single_qna.csv")[['Question', 'Answer']].dropna()
df = df.iloc[:110000].reset_index(drop=True)

# Select 5 random examples from the test dataset for evaluation
sample_test_df = df.iloc[100000:110000].sample(5, random_state=42).reset_index(drop=True)

print("\n--- Evaluating Trained Model on 5 Examples ---")
for index, row in sample_test_df.iterrows():
    question = row['Question']
    ground_truth_answer = row['Answer']
    generated_answer = generate_answer(question, ground_truth_answer, trained_model, trained_tokenizer)

    print(f"\nExample {index + 1}:")
    print(f"  Question: {question}")
    print(f"  Ground Truth Answer: {ground_truth_answer}")
    print(f"  Generated Answer: {generated_answer}")


--- Evaluating Trained Model on 5 Examples ---

Example 1:
  Question: What type of breaks does it have? Two operated by hands, or one hand and one pedal?
  Ground Truth Answer: Hello, Front hand brake. Rear coaster-brake (pedal operated when user installed). Regards, BMW - AutoGoodParts
  Generated Answer: Two operated by hands

Example 2:
  Question: What are the dimensions of the white spaces between the leaves? I want to add a label with words.
  Ground Truth Answer: It is approximately 5 1/2 " but it the label is with just the plain label maker that sticks on that won't work maybe get it engraved?
  Generated Answer: 5 1/2 "

Example 3:
  Question: Do these headlights come with the bulbs included ?
  Ground Truth Answer: The headlight bulbs are included, but *not* the blinker bulbs. However this is not a problem. When you remove the old headlights, you disconnect the main wires at the back and you twist & pull the entire blinker bulb assembly (the bulb comes out with it). Pull th

In [3]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import PeftModel
import pandas as pd

# Load tokenizer
trained_tokenizer = AutoTokenizer.from_pretrained("tokenizer_Generative_qa_model")

# Load base model and apply LoRA adapter
base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to("cuda")
base_model.resize_token_embeddings(len(trained_tokenizer))
trained_model = PeftModel.from_pretrained(base_model, "trainer_Generative_qa_model").to("cuda")

# Define generation function with better control
def generate_answer(question, context, model, tokenizer):
    input_text = f"question: {question} context: {context}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to("cuda")
    
    outputs = model.generate(
        input_ids=input_ids,
        max_length=128,             # 🔼 Increase max_length
        min_length=20,              # ✅ Encourage longer answers
        num_beams=5,                # 🔼 Slightly better diversity
        repetition_penalty=1.2,     # 🔄 Penalize repeats
        length_penalty=1.0,         # ⚖️ Balance between short and long
        early_stopping=True
    )
    
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer.strip()

# Load and sample dataset
df = pd.read_csv("single_qna.csv")[['Question', 'Answer']].dropna()
df = df.iloc[:110000].reset_index(drop=True)
sample_test_df = df.iloc[100000:110000].sample(5, random_state=42).reset_index(drop=True)

# Evaluation
print("\n--- Evaluating Trained Model on 5 Examples ---")
for index, row in sample_test_df.iterrows():
    question = row['Question']
    ground_truth_answer = row['Answer']
    generated_answer = generate_answer(question, ground_truth_answer, trained_model, trained_tokenizer)

    print(f"\nExample {index + 1}:")
    print(f"🧠 Question: {question}")
    print(f"📘 Ground Truth Answer: {ground_truth_answer}")
    print(f"🤖 Generated Answer: {generated_answer}")



--- Evaluating Trained Model on 5 Examples ---

Example 1:
🧠 Question: What type of breaks does it have? Two operated by hands, or one hand and one pedal?
📘 Ground Truth Answer: Hello, Front hand brake. Rear coaster-brake (pedal operated when user installed). Regards, BMW - AutoGoodParts
🤖 Generated Answer: Rear coaster-brake (pedal operated when user installed) and front hand brake

Example 2:
🧠 Question: What are the dimensions of the white spaces between the leaves? I want to add a label with words.
📘 Ground Truth Answer: It is approximately 5 1/2 " but it the label is with just the plain label maker that sticks on that won't work maybe get it engraved?
🤖 Generated Answer: 2" x 2" x 2" x 2" x 2" x 2" x 2"

Example 3:
🧠 Question: Do these headlights come with the bulbs included ?
📘 Ground Truth Answer: The headlight bulbs are included, but *not* the blinker bulbs. However this is not a problem. When you remove the old headlights, you disconnect the main wires at the back and you twi

# get_top_k_answers_dual_encoder

In [4]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn

# Define DualEncoder model
class DualEncoder(nn.Module):
    def __init__(self, model_name, output_dim=128):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(model_name)
        self.projection = nn.Linear(self.base_model.config.hidden_size, output_dim)

    def forward(self, input_ids, attention_mask):
        output = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        cls_token = output.last_hidden_state[:, 0]
        return self.projection(cls_token)

# Load model and tokenizer once
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load encoders and set to eval
question_encoder = DualEncoder(model_name).to(device)
answer_encoder = DualEncoder(model_name).to(device)
question_encoder.load_state_dict(torch.load("q_encoder_finetuned.pth", map_location=device))
answer_encoder.load_state_dict(torch.load("a_encoder_finetuned.pth", map_location=device))
question_encoder.eval()
answer_encoder.eval()

# Load data and precompute answer embeddings
df = pd.read_csv("single_qna.csv")[['Question', 'Answer']].dropna()
df = df.sample(n=1000, random_state=42).reset_index(drop=True)  # subset

# Helper to get embeddings
def get_embeddings(model, texts):
    tokens = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=64)
    tokens.pop("token_type_ids", None)
    tokens = {k: v.to(device) for k, v in tokens.items()}
    with torch.no_grad():
        return model(**tokens)

# Precompute answer embeddings
answer_texts = df['Answer'].tolist()
answer_embeddings = get_embeddings(answer_encoder, answer_texts)  # shape [N, 128]

# ✅ Function: Get top K answers for a given question
def get_top_k_answers_dual_encoder(question, top_k=3):
    q_emb = get_embeddings(question_encoder, [question])  # shape [1, 128]
    sims = torch.matmul(q_emb, answer_embeddings.T).squeeze()  # [N]
    topk_scores, topk_indices = torch.topk(sims, k=top_k)

    top_answers = []
    for score, idx in zip(topk_scores.tolist(), topk_indices.tolist()):
        top_answers.append({
            "answer": df.loc[idx, 'Answer'],
            "score": round(score, 4),
            "reference_question": df.loc[idx, 'Question']
        })
    
    return top_answers


  question_encoder.load_state_dict(torch.load("q_encoder_finetuned.pth", map_location=device))
  answer_encoder.load_state_dict(torch.load("a_encoder_finetuned.pth", map_location=device))


In [6]:
import pandas as pd

# Load your dataset (make sure this matches your environment)
df = pd.read_csv("single_qna.csv")[['Question', 'Answer']].dropna().reset_index(drop=True)

# Select a question from the dataset (e.g., index 42)
idx = 3
sample_question = df.loc[idx, 'Question']
print(f"\n🔍 Selected Question from CSV (Index {idx}): {sample_question}")

# Call the dual encoder function
results = get_top_k_answers_dual_encoder(sample_question)

# Display results
for i, item in enumerate(results, 1):
    print(f"\n🔝 Rank {i}:")
    print(f"✅ Answer: {item['answer']}")
    print(f"📊 Score: {item['score']}")
    print(f"📝 From Original Question: {item['reference_question']}")



🔍 Selected Question from CSV (Index 3): Does this come with power cord and dishwasher hook up?

🔝 Rank 1:
✅ Answer: No, I don't believe so.
📊 Score: 228.3117
📝 From Original Question: Can this filter be used as an inline filter for refrigerator ice maker (no water dispenser)? I have been using the yellow twist and lock product specifically marketed for the purpose, but was told the green filter was more effective.

🔝 Rank 2:
✅ Answer: IT USES TWO LIGHT BULBS.
📊 Score: 223.4112
📝 From Original Question: how many light bulbs does it use

🔝 Rank 3:
✅ Answer: I measured 74 inches
📊 Score: 217.8462
📝 From Original Question: How much height does the trim kit add?


# get-generated-answer

In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import PeftModel
import pandas as pd

# Load tokenizer and model once
trained_tokenizer = AutoTokenizer.from_pretrained("tokenizer_Generative_qa_model")
base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to("cuda")
base_model.resize_token_embeddings(len(trained_tokenizer))
trained_model = PeftModel.from_pretrained(base_model, "trainer_Generative_qa_model").to("cuda")

# Core generation function
def generate_answer(question, context, model, tokenizer):
    input_text = f"question: {question} context: {context}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    outputs = model.generate(
        input_ids=input_ids,
        max_length=128,
        min_length=20,
        num_beams=5,
        repetition_penalty=1.2,
        length_penalty=1.0,
        early_stopping=True
    )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer.strip()

# ✅ Function to try on natural question
def answer_natural_question(question, context=None, fallback_csv="single_qna.csv"):
    if context is None:
        # If no context is provided, get a similar one from CSV randomly (you can plug in retrieval here)
        df = pd.read_csv(fallback_csv)[['Question', 'Answer']].dropna().sample(1, random_state=42)
        context = df.iloc[0]['Answer']
    
    generated = generate_answer(question, context, trained_model, trained_tokenizer)

    return {
        "question": question,
        "used_context": context,
        "generated_answer": generated
    }


In [None]:
question = "What is the role of mitochondria in a cell?"
result = answer_natural_question(question)

print(f"\n🧠 Question: {result['question']}")
print(f"📘 Used Context: {result['used_context']}")
print(f"🤖 Generated Answer: {result['generated_answer']}")



🧠 Question: will they fit a 1996 toyota tacoma
📘 Used Context: EM GEAR will use USPS shipping for this item. Thanks
🤖 Generated Answer: No problem. Is there anything else I can help you with? Is there anything else I can help you with?


In [9]:
from transformers import pipeline
import pandas as pd
import torch
import evaluate

# Load model and tokenizer
model_path = "trainer_squad_qa_model"
tokenizer_path = "tokenizer_squad_qa_model"

device = 0 if torch.cuda.is_available() else -1
print(f"Device set to use cuda:{device if device>=0 else 'cpu'}")

qa_pipeline = pipeline(
    "question-answering",
    model=model_path,
    tokenizer=tokenizer_path,
    device=device
)

# Load CSV and subset
df = pd.read_csv("single_qna.csv")[['Question', 'Answer']].dropna()
df = df.iloc[100000:100200].reset_index(drop=True)

# Load SQuAD evaluation metric
squad_metric = evaluate.load("squad")

def test_single_qa_pair_from_csv(index, df, qa_pipeline, metric):
    question = df.loc[index, "Question"]
    ground_truth = df.loc[index, "Answer"]

    # For better prediction, ideally supply a longer context, not just ground truth answer.
    # Here, we use ground_truth as context as placeholder.
    context = ground_truth  # <-- replace this with a longer context if available

    try:
        # Use keyword arguments (not deprecated list input)
        result = qa_pipeline(question=question, context=context)
        predicted = result.get("answer", "")
        score = result.get("score", 0.0)
    except Exception as e:
        predicted = ""
        score = 0.0
        print(f"⚠️ Pipeline error at index {index}: {e}")

    pred_format = {"id": str(index), "prediction_text": predicted}
    ref_format = {"id": str(index), "answers": {"answer_start": [0], "text": [ground_truth]}}

    results = metric.compute(predictions=[pred_format], references=[ref_format])

    print(f"\n🔍 Sample QA from CSV at index {index}")
    print(f"🧠 Question: {question}")
    print(f"📘 Ground Truth: {ground_truth}")
    print(f"🤖 Predicted Answer: {predicted}")
    print(f"🔎 Confidence Score: {score:.3f}")
    print(f"✅ Exact Match: {results['exact_match']:.2f}")
    print(f"✅ F1 Score: {results['f1']:.2f}")

# Try sample index 0
test_single_qa_pair_from_csv(0, df, qa_pipeline, squad_metric)


Device set to use cuda:0


Device set to use cuda:0



🔍 Sample QA from CSV at index 0
🧠 Question: Just want to reiterate on the intercom volume control, it can be manually turned up and can it go so loud that it would be too loud? I ask because I'm
📘 Ground Truth: Yes it can easialy be adjusted. I found it too loud once and had to turn it down but it stays in the range where you've left it last.
🤖 Predicted Answer: Yes
🔎 Confidence Score: 0.000
✅ Exact Match: 0.00
✅ F1 Score: 6.90


In [10]:
from transformers import pipeline
import pandas as pd
import torch

# Load model and tokenizer
model_path = "trainer_squad_qa_model"
tokenizer_path = "tokenizer_squad_qa_model"

device = 0 if torch.cuda.is_available() else -1
print(f"Device set to use cuda:{device if device >= 0 else 'cpu'}")

qa_pipeline = pipeline(
    "question-answering",
    model=model_path,
    tokenizer=tokenizer_path,
    device=device
)

# Load CSV and subset
df = pd.read_csv("single_qna.csv")[['Question', 'Answer']].dropna()
df = df.iloc[100000:100200].reset_index(drop=True)

def get_answer_from_csv(index, df, qa_pipeline):
    question = df.loc[index, "Question"]
    ground_truth = df.loc[index, "Answer"]

    # IMPORTANT: replace this with a larger context related to the question if you have it!
    context = ground_truth  # This is just the answer text, which causes short predictions.

    try:
        result = qa_pipeline(question=question, context=context)
        predicted = result.get("answer", "")
        score = result.get("score", 0.0)
    except Exception as e:
        predicted = ""
        score = 0.0
        print(f"⚠️ Pipeline error at index {index}: {e}")

    print(f"\n🔍 QA Sample at index {index}")
    print(f"🧠 Question: {question}")
    print(f"📘 Ground Truth: {ground_truth}")
    print(f"🤖 Predicted Answer: {predicted}")
    print(f"🔎 Confidence Score: {score:.3f}")

# Test sample index 0
get_answer_from_csv(0, df, qa_pipeline)


Device set to use cuda:0


Device set to use cuda:0



🔍 QA Sample at index 0
🧠 Question: Just want to reiterate on the intercom volume control, it can be manually turned up and can it go so loud that it would be too loud? I ask because I'm
📘 Ground Truth: Yes it can easialy be adjusted. I found it too loud once and had to turn it down but it stays in the range where you've left it last.
🤖 Predicted Answer: Yes
🔎 Confidence Score: 0.000


In [None]:
def get_answer_from_csv(index, df, qa_pipeline):
    question = df.loc[index, "Question"]
    ground_truth = df.loc[index, "Answer"]

    # Use question + answer as naive larger context
    context = question + ". " + ground_truth

    try:
        result = qa_pipeline(question=question, context=context)
        predicted = result.get("answer", "")
        score = result.get("score", 0.0)
    except Exception as e:
        predicted = ""
        score = 0.0
        print(f"⚠️ Pipeline error at index {index}: {e}")

    print(f"\n🔍 QA Sample at index {index}")
    print(f"🧠 Question: {question}")
    print(f"📘 Ground Truth: {ground_truth}")
    print(f"🤖 Predicted Answer: {predicted}")
    print(f"🔎 Confidence Score: {score:.3f}")

# Test sample index 0
get_answer_from_csv(3, df, qa_pipeline)


⚠️ Pipeline error at index 3: name 'context' is not defined

🔍 QA Sample at index 3
🧠 Question: what comes in the box
📘 Ground Truth: Two Sena Bluetooth headsets, charger, windsocks. Go ahead and put the windsocks I. The mics. It will cut down on wind and give you better sound. We love our headsets.
🤖 Predicted Answer: 
🔎 Confidence Score: 0.000
