# Step 1: Decode the InsuranceQA Dataset

In [1]:
# Check the contents of the vocabulary file to understand its format
from itertools import islice

vocab_file_path = r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\dataset insurance qa\vocabulary"

# Read (up to) the first 10 lines of the file.
# islice stops cleanly at end-of-file, whereas calling next() ten times
# raises StopIteration when the file has fewer than 10 lines.
with open(vocab_file_path, "r", encoding="utf-8") as file:
    vocab_lines = list(islice(file, 10))

# Load vocabulary mapping from file.
# Each non-blank line is expected to be "<index><TAB><word>"; the mapping is
# stored as vocab_dict[index] = word.
vocab_dict = {}

with open(vocab_file_path, "r", encoding="utf-8") as file:
    for line_number, line in enumerate(file, start=1):
        entry = line.strip()
        if not entry:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the unpacking below.
            continue
        index, separator, word = entry.partition("\t")  # Split only on the first tab
        if not separator:
            # Same exception type as the failed tuple-unpacking it replaces,
            # but the message points at the offending line.
            raise ValueError(
                f"{vocab_file_path}: line {line_number} has no tab separator: {entry!r}"
            )
        vocab_dict[index] = word  # Store mapping

# Check a few entries in the dictionary
list(vocab_dict.items())[:10]

import gzip

# Function to decode an encoded dataset using the vocabulary dictionary
def decode_qa_pairs(encoded_file_path, vocab_dict):
    """Decode a gzip-compressed, tab-separated file of encoded token sequences.

    Every line is stripped and split on tabs. Lines with fewer than two
    columns are ignored. For the remaining lines, the first column becomes
    the question and every following column becomes one answer.

    Within a column, tokens are whitespace-separated; each token is replaced
    by ``vocab_dict[token]`` when present and left unchanged otherwise, then
    the tokens are re-joined with single spaces.

    NOTE(review): this assumes column 0 holds the question and columns 1+
    hold answer token sequences -- confirm against the dataset's documented
    column layout.

    Args:
        encoded_file_path: Path to the ``.gz`` file, read as UTF-8 text.
        vocab_dict: Mapping from encoded token to decoded word.

    Returns:
        A list of ``{'question': str, 'answers': list[str]}`` dicts, in file
        order.
    """
    def _decode_field(encoded_text):
        # Unknown tokens fall back to themselves rather than raising KeyError.
        return " ".join(vocab_dict.get(tok, tok) for tok in encoded_text.split())

    records = []

    with gzip.open(encoded_file_path, 'rt', encoding='utf-8') as handle:
        for raw_line in handle:
            columns = raw_line.strip().split("\t")
            if len(columns) < 2:
                # Nothing to pair the first column with; skip the line.
                continue
            question_field, *answer_fields = columns
            records.append({
                'question': _decode_field(question_field),
                'answers': [_decode_field(field) for field in answer_fields],
            })

    return records

# Paths to the dataset files
train_file_path = r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\dataset insurance qa\InsuranceQA.question.anslabel.raw.1500.pool.solr.train.encoded.gz"
test1_file_path = r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\dataset insurance qa\InsuranceQA.question.anslabel.raw.1500.pool.solr.test.encoded.gz"
test2_file_path = r"D:\NLPInsuranceProject\NLPINSURANCE-FINTECHPROJ\dataset insurance qa\InsuranceQA.question.anslabel.raw.1000.pool.solr.test.encoded.gz"

# Decode the datasets.
# Errors are intentionally not caught here: swallowing them would leave
# all_qa_decoded undefined, and the failure would only surface later as an
# unrelated NameError. A missing or unreadable file should stop the notebook
# at this point with its real traceback.
train_qa_decoded = decode_qa_pairs(train_file_path, vocab_dict)
test1_qa_decoded = decode_qa_pairs(test1_file_path, vocab_dict)
test2_qa_decoded = decode_qa_pairs(test2_file_path, vocab_dict)

# Combine all decoded datasets
# NOTE(review): this merges both test files with the training file, so no
# held-out split remains if these files are meant for evaluation -- confirm
# this is intended.
all_qa_decoded = train_qa_decoded + test1_qa_decoded + test2_qa_decoded

# Display a few decoded examples (last top-level expression, so the notebook
# renders it; inside a try block it was evaluated and discarded)
all_qa_decoded[:3]


# Step 2: Fine-tune a Sentence Transformer

In [2]:
pip install datasets 

Note: you may need to restart the kernel to use updated packages.


In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses, models, datasets
from torch.utils.data import DataLoader

# Load a base pre-trained model
base_model = "all-MiniLM-L6-v2"
model = SentenceTransformer(base_model)

# Prepare training data: one (question, answer) pair per answer of each
# decoded entry.
train_examples = [
    InputExample(texts=[qa_pair['question'], answer])
    for qa_pair in all_qa_decoded
    for answer in qa_pair['answers']
]
if not train_examples:
    # Fail here with a clear message rather than deep inside the data loader.
    raise ValueError("No training pairs were built from all_qa_decoded; check the decoding step.")

# Create DataLoader
# MultipleNegativesRankingLoss treats the other pairs in a batch as negatives.
# A question contributes one pair per answer, so a plainly shuffled loader can
# place the same question in a batch twice, turning one of its true answers
# into a false negative. NoDuplicatesDataLoader keeps every text unique
# within a batch.
train_batch_size = 16
train_dataloader = datasets.NoDuplicatesDataLoader(train_examples, batch_size=train_batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model)

# Fine-tune model
num_epochs = 3  # Adjust as needed
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=100
)

# Save trained model
model_save_path = "insurance_qa_transformers"
model.save(model_save_path)

print(f"Model saved at: {model_save_path}")





Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
