Before importing the next packages, be sure to install the following packages:
python -m pip install sentence_transformers pinecone-client datasets

In [1]:
!pip install sentence_transformers pinecone-client datasets



In [3]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import numpy as np
import warnings
from IPython.display import display
warnings.filterwarnings("ignore")

  from tqdm.autonotebook import tqdm, trange


# Change here to your own Pinecone API key, and the directory where you saved the trained model from file "model_training.ipynb"

In [4]:
PINECONE_API_KEY = "a28ff331-bc2b-41c0-b5bd-e1fc05448829"
save_directory = "./deberta_v3_base_model_enhanced"

In [5]:
from sentence_transformers import SentenceTransformer

EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(EMBEDDING_MODEL)

# Loading and Embedding the dataset

In [6]:
def load_and_embedd_dataset(
        dataset_name: str = 'hotpot_qa',
        split: str = 'train',
        model: SentenceTransformer = SentenceTransformer('all-MiniLM-L6-v2'),
        text_field: str = 'context',
        rec_num: int = 400,
        fraction: float = 0.15
) -> tuple:
    """
    Load a dataset and embedd the text field using a sentence-transformer model
    Args:
        dataset_name: The name of the dataset to load
        split: The split of the dataset to load
        model: The model to use for embedding
        text_field: The field in the dataset that contains the text
        rec_num: The number of records to load and embedd
    Returns:
        tuple: A tuple containing the dataset and the embeddings
    """
    from datasets import load_dataset

    print("Loading and embedding the dataset")

    # Load the dataset
    dataset = load_dataset(dataset_name, 'fullwiki', split=split)

    # Embed the first `rec_num` rows of the dataset
    embeddings = model.encode(dataset[text_field][:rec_num])

    print("Done!")
    return dataset, embeddings

In [7]:
DATASET_NAME = 'hotpot_qa'

dataset, embeddings = load_and_embedd_dataset(
    dataset_name=DATASET_NAME,
    split = 'validation',
    rec_num=40,
    model=embedding_model,
)
shape = embeddings.shape

Loading and embedding the dataset
Done!


In [9]:
print(len(dataset)*0.15)

1110.75


In [17]:
sample_indices = np.random.choice(len(dataset), size=int(len(dataset)*0.15), replace=False)
dataset=dataset.select(sample_indices)

In [18]:
pd_dataset = dataset.to_pandas()
pd_dataset.head(5)

Unnamed: 0,id,question,answer,type,level,supporting_facts,context
0,5ab9c7c355429939ce03dc17,What regular performer at the R-26 club in Par...,Josephine Baker,bridge,hard,"{'title': ['R-26 (salon)', 'Josephine Baker'],...","{'title': ['Gregg Rogell', 'Cary Brothers', 'P..."
1,5a8f45385542997ba9cb320a,Dahlia and Aruncus are both examples of what t...,genus,comparison,hard,"{'title': ['Dahlia', 'Aruncus'], 'sent_id': [0...","{'title': ['Ayla-Axum amphorae', 'Tuber', 'Aru..."
2,5a76fda05542994aec3b71b0,Martin Flavin and Peter Benchley both did what...,novelist,comparison,hard,"{'title': ['Martin Flavin', 'Peter Benchley'],...","{'title': ['Laughing Sinners', 'Convicted (195..."
3,5a884adb5542996e4f308938,Are Eryngium and Arnebia in the same kingdom o...,yes,comparison,hard,"{'title': ['Eryngium', 'Arnebia'], 'sent_id': ...","{'title': ['Antarah ibn Shaddad', 'Ahimsa in J..."
4,5a8b24f255429950cd6afc56,What retailer did Albert Heijn start up that e...,Koninklijke Ahold N.V.,bridge,hard,"{'title': ['Albert Heijn (born 1927)', 'Ahold'...","{'title': ['Food Lion', 'Albert Heijn', 'Ahold..."


In [19]:
pd_dataset.shape

(1110, 7)

In [20]:
print(f"The embeddings shape: {embeddings.shape}")

The embeddings shape: (40, 384)


# Creating the Pinecone Index, and upserting the vectors of the data to it

In [21]:
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
):
    """
    Create a pinecone index if it does not exist
    Args:
        index_name: The name of the index
        dimension: The dimension of the index
        metric: The metric to use for the index
    Returns:
        Pinecone: A pinecone object which can later be used for upserting vectors and connecting to VectorDBs
    """
    from pinecone import Pinecone, ServerlessSpec
    print("Creating a Pinecone index...")
    pc = Pinecone(api_key=PINECONE_API_KEY)
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            # Remember! It is crucial that the metric you will use in your VectorDB will also be a metric your embedding
            # model works well with!
            metric=metric,
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )
    print("Done!")
    return pc

In [22]:
def generate_embedding(text):
    return embedding_model.encode(text)


In [23]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')


## Dataset Preparation includes changing rows of data to multiple ones, by each context.

In [24]:
from tqdm import tqdm
import numpy as np
def prepare_hotpotqa_data_for_upsert(dataset, batch_size: int = 128):

    print("Preparing embeddings and metadata for HotpotQA dataset...")

    embeddings = []
    meta_data = []
    ids = []
    counter = 0

    if isinstance(dataset, pd.DataFrame):
        dataset = dataset.to_dict('records')

    for i, row in tqdm(enumerate(dataset)):
        row_id = row.get('_id', f'row_{i}')

        context = row.get('context', {})
        titles = context.get('title', [])
        sentences_list = context.get('sentences', [])

        if len(titles) == len(sentences_list):
            for title, sentences in zip(titles, sentences_list):
                title = str(title)
                sentences = list(sentences)

                concatenated_sentences = " ".join(sentences)

                text_to_embed = concatenated_sentences
                embedding = generate_embedding(text_to_embed)

                supporting_facts = row.get('supporting_facts', [])
                if isinstance(supporting_facts, list):
                    supporting_facts = [list(fact) if isinstance(fact, (tuple, list)) else fact for fact in supporting_facts]

                metadata = {
                    'title': title,
                    'context': concatenated_sentences,
                    'question': row['question'],
                    'supporting_facts': supporting_facts,
                    'answer': row.get('answer', ''),
                    'level': row.get('level', '')
                }

                embeddings.append(embedding)
                meta_data.append(metadata)
                ids.append(f"{row_id}_{counter}")
                counter += 1
        else:
            print(f"Mismatch between titles and sentences at row {i}")

    embeddings = np.array(embeddings)

    to_upsert = list(zip(ids, embeddings, meta_data))

    print(f"Total items prepared: {len(to_upsert)}")
    return to_upsert

import pandas as pd

prepared_data = prepare_hotpotqa_data_for_upsert(pd_dataset)\

print(f"First 5 items:")
for i, item in enumerate(prepared_data[:5]):
    print(f"ID: {item[0]}, Metadata: {item[2]}")


Preparing embeddings and metadata for HotpotQA dataset...


1110it [15:13,  1.22it/s]

Total items prepared: 11016
ID: row_0_0, Metadata: {'title': 'Gregg Rogell', 'context': 'Gregg Rogell was born on February 18, 1967, in Long Island, New York, USA.  He is a professional comedian who resides in New York City.  He has appeared on "The Tonight Show, Late Night with Conan O\'Brien", and "The Nanny".  He has had his own half hour special on Comedy Central, and was a featured performer in the movie "The Aristocrats".  He is a regular performer at New York City\'s Comedy Cellar.', 'question': 'What regular performer at the R-26 club in Paris was known for her banana-themed costume?', 'supporting_facts': {'title': array(['R-26 (salon)', 'Josephine Baker'], dtype=object), 'sent_id': array([2, 4], dtype=int32)}, 'answer': 'Josephine Baker', 'level': 'hard'}
ID: row_0_1, Metadata: {'title': 'Cary Brothers', 'context': 'Cary Brothers is an American indie rock singer-songwriter originally from Nashville, Tennessee, United States.  After moving to Los Angeles and becoming a regular 




In [25]:
from pinecone import Pinecone, ServerlessSpec
import numpy as np
from tqdm.auto import tqdm

def upsert_vectors(
        index: Pinecone,
        embeddings: np.ndarray,
        dataset: list,
        text_field: str = 'context',
        batch_size: int = 128
):

    print("Upserting the embeddings to the Pinecone index...")
    shape = embeddings.shape

    ids = [str(i) for i in range(shape[0])]

    meta = [{text_field: item[2][text_field]} for item in dataset]

    to_upsert = list(zip(ids, embeddings.tolist(), meta))

    for i in tqdm(range(0, shape[0], batch_size)):
        i_end = min(i + batch_size, shape[0])
        index.upsert(vectors=to_upsert[i:i_end])

    print("Upsert complete.")
    return index



INDEX_NAME = 'hotpotqa-final1'

pc = create_pinecone_index(INDEX_NAME, shape[1])

index = pc.Index(INDEX_NAME)

EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL)

rec_num = len(prepared_data)

context_texts = [item[2]['context'] for item in prepared_data]

embeddings = model.encode(context_texts[:rec_num])

index_upserted = upsert_vectors(index, np.array(embeddings), prepared_data)

Creating a Pinecone index...
Done!
Upserting the embeddings to the Pinecone index...


  0%|          | 0/87 [00:00<?, ?it/s]

Upsert complete.


In [26]:
dataset[0]

{'id': '5ab9c7c355429939ce03dc17',
 'question': 'What regular performer at the R-26 club in Paris was known for her banana-themed costume?',
 'answer': 'Josephine Baker',
 'type': 'bridge',
 'level': 'hard',
 'supporting_facts': {'title': ['R-26 (salon)', 'Josephine Baker'],
  'sent_id': [2, 4]},
 'context': {'title': ['Gregg Rogell',
   'Cary Brothers',
   'Patricia Racette',
   'Kumar (Singaporean entertainer)',
   'Linda November',
   'Virginia Biddle',
   'Billy Pontoni',
   "Herb Alpert's Vibrato Grill &amp; Jazz",
   'Godfrey (comedian)',
   'Stand Up for the Week'],
  'sentences': [['Gregg Rogell was born on February 18, 1967, in Long Island, New York, USA.',
    ' He is a professional comedian who resides in New York City.',
    ' He has appeared on "The Tonight Show, Late Night with Conan O\'Brien", and "The Nanny".',
    ' He has had his own half hour special on Comedy Central, and was a featured performer in the movie "The Aristocrats".',
    " He is a regular performer at N

# Test of pinecone index retrieval

In [27]:
query_text = "Were Scott Derrickson and Ed Wood of the same nationality?"
query_embedding = embedding_model.encode([query_text])[0]
print(f"Query Embedding Shape: {query_embedding.shape}")  # Should print (384,)


Query Embedding Shape: (384,)


In [28]:
results = [float(val) for val in list(embedding_model.encode(query_text))]
query_results = index.query(
    vector=results,
    top_k=3,
    include_values=True,
    include_metadata=True
)['matches']
print(query_results)

[{'id': '7498',
 'metadata': {'context': 'Ed Wood is a 1994 American biographical period '
                         'comedy-drama film directed and produced by Tim '
                         'Burton, and starring Johnny Depp as cult filmmaker '
                         "Ed Wood.  The film concerns the period in Wood's "
                         'life when he made his best-known films as well as '
                         'his relationship with actor Bela Lugosi, played by '
                         'Martin Landau.  Sarah Jessica Parker, Patricia '
                         'Arquette, Jeffrey Jones, Lisa Marie, and Bill Murray '
                         'are among the supporting cast.'},
 'score': 0.49413377,
 'values': [-0.113730691,
            -0.0195544306,
            0.04914077,
            -0.00683183735,
            -0.0271782614,
            -0.054887671,
            -0.00397560885,
            0.0242693815,
            0.0576351956,
            -0.0378859043,
            -0.061

In [29]:
# import torch
# import numpy as np
# from transformers import AutoTokenizer, AutoModelForQuestionAnswering
# from sentence_transformers import SentenceTransformer
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

# tokenizer1 = AutoTokenizer.from_pretrained("sjrhuschlee/deberta-v3-base-squad2-ext-v1")
# model1 = AutoModelForQuestionAnswering.from_pretrained("sjrhuschlee/deberta-v3-base-squad2-ext-v1")

# embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# def answer_query_with_context(query, true_answer, pinecone_index, top_k=3):
#     """
#     Retrieve the top contexts from Pinecone, run the QA model on those contexts,
#     and compare the generated answer with the true answer.

#     Args:
#         query (str): The question/query.
#         true_answer (str): The true answer for comparison.
#         pinecone_index: The Pinecone index object for retrieving contexts.
#         top_k (int): The number of top contexts to retrieve from Pinecone.

#     Returns:
#         None: Prints the best generated answer and its similarity with the true answer.
#     """
#     # Convert the query into an embedding using SentenceTransformer
#     query_embedding = embedding_model.encode(query)

#     # Convert the embedding to a list of floats (for Pinecone query)
#     query_vector = [float(val) for val in list(query_embedding)]

#     # Retrieve top contexts from Pinecone
#     query_results = pinecone_index.query(
#         vector=query_vector,
#         top_k=top_k,
#         include_values=True,
#         include_metadata=True
#     )['matches']

#     best_generated_answer = ""
#     best_similarity = -1.0

#     # Iterate over retrieved contexts and evaluate the model on each one
#     for match in query_results:
#         context = match['metadata']['context']  # Extract context from metadata

#         # Tokenize the question and context
#         inputs = tokenizer(query, context, return_tensors="pt", max_length=512, truncation=True)

#         # Perform inference to predict start and end positions
#         with torch.no_grad():
#             outputs = model(**inputs)
#             start_logits = outputs.start_logits
#             end_logits = outputs.end_logits

#             # Get the most probable start and end positions
#             start_idx = torch.argmax(start_logits)
#             end_idx = torch.argmax(end_logits)

#             # Check if the predicted indices are valid
#             if start_idx < end_idx and end_idx < inputs['input_ids'].size(1):
#                 generated_answer = tokenizer.decode(inputs['input_ids'][0][start_idx:end_idx + 1], skip_special_tokens=True)
#             else:
#                 generated_answer = ""  # If indices are invalid, return an empty string

#             # Calculate similarity between the generated answer and the true answer
#             if generated_answer:
#                 vectorizer = TfidfVectorizer().fit_transform([generated_answer, true_answer])
#                 similarity = cosine_similarity(vectorizer[0:1], vectorizer[1:2]).flatten()[0]
#             else:
#                 similarity = 0.0  # No answer means zero similarity

#             # Track the best answer based on similarity
#             if similarity > best_similarity:
#                 best_generated_answer = generated_answer
#                 best_similarity = similarity

#     # Print the results
#     print(f"Query: {query}")
#     print(f"Best Generated Answer: {best_generated_answer}")
#     print(f"True Answer: {true_answer}")
#     print(f"Best Similarity: {best_similarity}\n")

# # Example usage:

# # for query_text, true_answer in:
# query_text = "Were Scott Derrickson and Ed Wood of the same nationality?"
# true_answer = "yes"  # You can modify this if you have the correct answer

# # Assuming `index` is the Pinecone index you used earlier
# answer_query_with_context(query_text, true_answer, pinecone_index=index, top_k=3)


Query: Were Scott Derrickson and Ed Wood of the same nationality?
Best Generated Answer: 
True Answer: yes
Best Similarity: 0.0



In [30]:
dataset[0]

{'id': '5ab9c7c355429939ce03dc17',
 'question': 'What regular performer at the R-26 club in Paris was known for her banana-themed costume?',
 'answer': 'Josephine Baker',
 'type': 'bridge',
 'level': 'hard',
 'supporting_facts': {'title': ['R-26 (salon)', 'Josephine Baker'],
  'sent_id': [2, 4]},
 'context': {'title': ['Gregg Rogell',
   'Cary Brothers',
   'Patricia Racette',
   'Kumar (Singaporean entertainer)',
   'Linda November',
   'Virginia Biddle',
   'Billy Pontoni',
   "Herb Alpert's Vibrato Grill &amp; Jazz",
   'Godfrey (comedian)',
   'Stand Up for the Week'],
  'sentences': [['Gregg Rogell was born on February 18, 1967, in Long Island, New York, USA.',
    ' He is a professional comedian who resides in New York City.',
    ' He has appeared on "The Tonight Show, Late Night with Conan O\'Brien", and "The Nanny".',
    ' He has had his own half hour special on Comedy Central, and was a featured performer in the movie "The Aristocrats".',
    " He is a regular performer at N

# Evaluation on Pre-Trained Deberta based, HotpotQA fine-tuned model, with average similarity and Exact Match (EM) calculations.

In [None]:
import json
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


tokenizer = AutoTokenizer.from_pretrained("sjrhuschlee/deberta-v3-base-squad2-ext-v1")
qa_model = AutoModelForQuestionAnswering.from_pretrained("sjrhuschlee/deberta-v3-base-squad2-ext-v1")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

sim1=[]

def answer_query_with_context(query, true_answer, pinecone_index, top_k=5):
    query_embedding = embedding_model.encode(query)
    query_vector = [float(val) for val in query_embedding]

    query_results = pinecone_index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True
    )['matches']

    best_generated_answer = ""
    best_similarity = -1.0

    for match in query_results:
        context = match['metadata']['context']

        inputs = tokenizer(query, context, return_tensors="pt", max_length=512, truncation=True)

        with torch.no_grad():
            outputs = qa_model(**inputs)
            start_idx = torch.argmax(outputs.start_logits)
            end_idx = torch.argmax(outputs.end_logits)

            if start_idx < end_idx and end_idx < inputs['input_ids'].size(1):
                generated_answer = tokenizer.decode(inputs['input_ids'][0][start_idx:end_idx + 1], skip_special_tokens=True)
            else:
                generated_answer = ""

            if generated_answer:
                vectorizer = TfidfVectorizer().fit_transform([generated_answer, true_answer])
                similarity = cosine_similarity(vectorizer[0:1], vectorizer[1:2]).flatten()[0]
            else:
                similarity = 0.0

            if similarity > best_similarity:
                best_generated_answer = generated_answer
                best_similarity = similarity

    return best_generated_answer, best_similarity

def answer_query_with_context2(query, true_answer, pinecone_index, top_k=5):
    query_embedding = embedding_model.encode(query)
    query_vector = [float(val) for val in query_embedding]

    query_results = pinecone_index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True
    )['matches']

    best_generated_answer = ""
    best_similarity = -1.0

    for match in query_results:
        context = match['metadata']['context']
        context_segments = context.split(". ")

        for segment in context_segments:
            inputs = tokenizer(query, segment, return_tensors="pt", max_length=512, truncation=True)
            with torch.no_grad():
                outputs = qa_model(**inputs)
                start_idx = torch.argmax(outputs.start_logits)
                end_idx = torch.argmax(outputs.end_logits)

                if start_idx < end_idx and end_idx < inputs['input_ids'].size(1):
                    generated_answer = tokenizer.decode(inputs['input_ids'][0][start_idx:end_idx + 1], skip_special_tokens=True)
                else:
                    generated_answer = ""

                if generated_answer:
                    vectorizer = TfidfVectorizer().fit_transform([generated_answer, true_answer])
                    similarity = cosine_similarity(vectorizer[0:1], vectorizer[1:2]).flatten()[0]
                else:
                    similarity = 0.0

                if similarity > best_similarity:
                    best_generated_answer = generated_answer
                    best_similarity = similarity

    return best_generated_answer, best_similarity


def save_first_100_answers_from_hotpotqa(hotpotqa_data, pinecone_index, file_name="qa_results.json"):
    results = []

    for i in range(100):
        query = hotpotqa_data[i]["question"]
        true_answer = hotpotqa_data[i]["answer"]

        generated_answer, similarity = answer_query_with_context2(query, true_answer, pinecone_index)

        results.append({
            "question": query,
            "true_answer": true_answer,
            "generated_answer": generated_answer,
            "similarity": similarity
        })

        print(f"Processed question {i+1}:")
        print(f"Generated Answer: {generated_answer}")
        print(f"True Answer: {true_answer}")
        print(f"Similarity: {similarity}\n")
        sim1.append(similarity)

    with open(file_name, "w") as f:
        json.dump(results, f, indent=4)
    print(f"Results saved to {file_name}")

save_first_100_answers_from_hotpotqa(dataset, pinecone_index=index)
print(np.mean(sim1))


In [None]:
print(sim1)
count=0
for i in sim1:
  if i>=1:
    count+=1
print(count/len(sim1))

# Evaluation on our proposed model, as trained in "model_training.ipynb", with average similarity and Exact Match (EM) calculations.

In [None]:
import json
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

sim1=[]

tokenizer = AutoTokenizer.from_pretrained(save_directory)
qa_model = AutoModelForQuestionAnswering.from_pretrained(save_directory)

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def answer_query_with_context3(query, true_answer, pinecone_index, top_k=3):

    query_embedding = embedding_model.encode(query)
    query_vector = [float(val) for val in query_embedding]

    query_results = pinecone_index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=True,
        include_metadata=True
    )['matches']

    best_generated_answer = ""
    best_combined_score = -1.0

    for match in query_results:
        context = match['metadata']['context']
        sentences = context.split(". ")

        for sentence in sentences:
            inputs = tokenizer(query, sentence, return_tensors="pt", max_length=512, truncation=True)

            with torch.no_grad():
                outputs = model(**inputs)
                start_logits = outputs.start_logits
                end_logits = outputs.end_logits

                start_idx = torch.argmax(start_logits)
                end_idx = torch.argmax(end_logits)

                if start_idx < end_idx and end_idx < inputs['input_ids'].size(1):
                    generated_answer = tokenizer.decode(inputs['input_ids'][0][start_idx:end_idx + 1], skip_special_tokens=True)
                else:
                    generated_answer = ""

                if generated_answer:
                    vectorizer = TfidfVectorizer().fit_transform([generated_answer, true_answer])
                    similarity = cosine_similarity(vectorizer[0:1], vectorizer[1:2]).flatten()[0]

                    confidence = (start_logits[0][start_idx] + end_logits[0][end_idx]).item()
                    combined_score = similarity * confidence

                    if combined_score > best_combined_score:
                        best_generated_answer = generated_answer
                        best_combined_score = combined_score

    print(f"Query: {query}")
    print(f"Best Generated Answer: {best_generated_answer}")
    print(f"True Answer: {true_answer}")
    print(f"Best Combined Score (Similarity x Confidence): {best_combined_score}\n")
    return best_generated_answer, best_combined_score

def save_first_100_answers_from_hotpotqa2(hotpotqa_data, pinecone_index, file_name="qa_results.json"):
    results = []

    for i in range(len(hotpotqa_data)):
        query = hotpotqa_data[i]["question"]
        true_answer = hotpotqa_data[i]["answer"]

        generated_answer, similarity = answer_query_with_context3(query, true_answer, pinecone_index)

        results.append({
            "question": query,
            "true_answer": true_answer,
            "generated_answer": generated_answer,
            "similarity": similarity
        })

        print(f"Processed question {i+1}:")
        print(f"Generated Answer: {generated_answer}")
        print(f"True Answer: {true_answer}")
        print(f"Similarity: {similarity}\n")
        sim1.append(similarity)

    with open(file_name, "w") as f:
        json.dump(results, f, indent=4)
    print(f"Results saved to {file_name}")

save_first_100_answers_from_hotpotqa2(dataset, pinecone_index=index)
print(np.mean(sim1))


Query: What regular performer at the R-26 club in Paris was known for her banana-themed costume?
Best Generated Answer: What regular performer at the R-26 club in Paris was known for her banana-themed costume? Patrick Kelly
True Answer: Josephine Baker
Best Combined Score (Similarity x Confidence): -0.0

Processed question 1:
Generated Answer: What regular performer at the R-26 club in Paris was known for her banana-themed costume? Patrick Kelly
True Answer: Josephine Baker
Similarity: -0.0

Query: Dahlia and Aruncus are both examples of what type of plant catagorization?
Best Generated Answer: 
True Answer: genus
Best Combined Score (Similarity x Confidence): -1.0

Processed question 2:
Generated Answer: 
True Answer: genus
Similarity: -1.0

Query: Martin Flavin and Peter Benchley both did what occupation?
Best Generated Answer: Member of Parliament
True Answer: novelist
Best Combined Score (Similarity x Confidence): 0.0

Processed question 3:
Generated Answer: Member of Parliament
Tr

In [None]:
print(sim1)
count=0
for i in sim1:
  if i>=1:
    count+=1
print(count/len(sim1))

# Second evaluation on our proposed model, as trained in "model_training.ipynb", with average similarity and Exact Match (EM) calculations, using a slightly different method.

In [None]:
sim1=[]
def answer_query_with_context4(query, true_answer, pinecone_index, top_k=5, min_similarity_threshold=0.2):
    """
    Retrieve the top contexts from Pinecone, check context relevance, split them into smaller chunks,
    evaluate each chunk with the QA model, and return the best answer based on combined score.

    Args:
        query (str): The question/query.
        true_answer (str): The true answer for comparison.
        pinecone_index: The Pinecone index object for retrieving contexts.
        top_k (int): The number of top contexts to retrieve from Pinecone.
        min_similarity_threshold (float): Minimum similarity score to consider context relevant.

    Returns:
        Tuple: (best_generated_answer, best_combined_score) - best answer and its similarity score.
    """
    # Encode the query
    query_embedding = embedding_model.encode(query)
    query_vector = [float(val) for val in query_embedding]

    # Retrieve top contexts from Pinecone
    query_results = pinecone_index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=True,
        include_metadata=True
    )['matches']

    best_generated_answer = ""
    best_combined_score = -1.0

    # Filter irrelevant contexts before passing to QA model
    for match in query_results:
        context = match['metadata']['context']
        context_embedding = embedding_model.encode(context)
        similarity = cosine_similarity([query_embedding], [context_embedding])[0][0]

        # Skip contexts with low relevance to the query
        if similarity < min_similarity_threshold:
            continue

        # Split context into smaller chunks (e.g., sentences)
        sentences = context.split(". ")
        for sentence in sentences:
            # Tokenize the question and context chunk
            inputs = tokenizer(query, sentence, return_tensors="pt", max_length=512, truncation=True)

            # Perform inference
            with torch.no_grad():
                outputs = model(**inputs)
                start_logits = outputs.start_logits
                end_logits = outputs.end_logits

                # Get the most probable start and end positions
                start_idx = torch.argmax(start_logits)
                end_idx = torch.argmax(end_logits)

                if start_idx < end_idx and end_idx < inputs['input_ids'].size(1):
                    generated_answer = tokenizer.decode(inputs['input_ids'][0][start_idx:end_idx + 1], skip_special_tokens=True)
                else:
                    generated_answer = ""

                if generated_answer:
                    # Calculate similarity between generated and true answer
                    vectorizer = TfidfVectorizer().fit_transform([generated_answer, true_answer])
                    similarity = cosine_similarity(vectorizer[0:1], vectorizer[1:2]).flatten()[0]

                    # Calculate confidence and combined score
                    confidence = (start_logits[0][start_idx] + end_logits[0][end_idx]).item()
                    combined_score = similarity * confidence

                    # Track the best answer
                    if combined_score > best_combined_score:
                        best_generated_answer = generated_answer
                        best_combined_score = combined_score

    return best_generated_answer, best_combined_score

def save_first_100_answers_from_hotpotqa3(hotpotqa_data, pinecone_index, file_name="qa_results.json"):
    results = []

    for i in range(len(hotpotqa_data)):
        query = hotpotqa_data[i]["question"]
        true_answer = hotpotqa_data[i]["answer"]

        generated_answer, similarity = answer_query_with_context4(query, true_answer, pinecone_index)

        results.append({
            "question": query,
            "true_answer": true_answer,
            "generated_answer": generated_answer,
            "similarity": similarity
        })

        print(f"Processed question {i+1}:")
        print(f"Generated Answer: {generated_answer}")
        print(f"True Answer: {true_answer}")
        print(f"Best Combined Score (Similarity x Confidence): {similarity}\n")
        sim1.append(similarity)

    with open(file_name, "w") as f:
        json.dump(results, f, indent=4)
    print(f"Results saved to {file_name}")

# Example usage:
# Assuming `dataset` is loaded with HotpotQA data and `index` is your Pinecone index
save_first_100_answers_from_hotpotqa3(dataset, pinecone_index=index)
print(np.mean(sim1))


In [None]:
print(sim1)
count=0
for i in sim1:
  if i>=1:
    count+=1
print(count/len(sim1))

In [None]:
print(sim1)

In [29]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 73642}},
 'total_vector_count': 73642}

# Evaluation on BERT model, and DeBERTa model, with average similarity and Exact Match (EM) calculations.

In [None]:
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import torch

# Initialize a list to store similarity scores
sim1 = []

def answer_query_with_context4(query, true_answer, pinecone_index, model, tokenizer, embedding_model, top_k=5, min_similarity_threshold=0.2):
    """
    Retrieve the top contexts from Pinecone, check context relevance, split them into smaller chunks,
    evaluate each chunk with the QA model, and return the best answer based on combined score.

    Args:
        query (str): The question/query.
        true_answer (str): The true answer for comparison.
        pinecone_index: The Pinecone index object for retrieving contexts.
        model: The model to use for generating answers.
        tokenizer: The tokenizer corresponding to the model.
        embedding_model: The embedding model for context relevance.
        top_k (int): The number of top contexts to retrieve from Pinecone.
        min_similarity_threshold (float): Minimum similarity score to consider context relevant.

    Returns:
        Tuple: (best_generated_answer, best_combined_score) - best answer and its similarity score.
    """
    # Encode the query
    query_embedding = embedding_model.encode(query)
    query_vector = [float(val) for val in query_embedding]

    # Retrieve top contexts from Pinecone
    query_results = pinecone_index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=True,
        include_metadata=True
    )['matches']

    best_generated_answer = ""
    best_combined_score = -1.0

    # Filter irrelevant contexts before passing to QA model
    for match in query_results:
        context = match['metadata']['context']
        context_embedding = embedding_model.encode(context)
        similarity = cosine_similarity([query_embedding], [context_embedding])[0][0]

        # Skip contexts with low relevance to the query
        if similarity < min_similarity_threshold:
            continue

        # Split context into smaller chunks (e.g., sentences)
        sentences = context.split(". ")
        for sentence in sentences:
            # Tokenize the question and context chunk
            inputs = tokenizer(query, sentence, return_tensors="pt", max_length=512, truncation=True)

            # Perform inference
            with torch.no_grad():
                outputs = model(**inputs)
                start_logits = outputs.start_logits
                end_logits = outputs.end_logits

                # Get the most probable start and end positions
                start_idx = torch.argmax(start_logits)
                end_idx = torch.argmax(end_logits)

                if start_idx < end_idx and end_idx < inputs['input_ids'].size(1):
                    generated_answer = tokenizer.decode(inputs['input_ids'][0][start_idx:end_idx + 1], skip_special_tokens=True)
                else:
                    generated_answer = ""

                if generated_answer:
                    # Calculate similarity between generated and true answer
                    vectorizer = TfidfVectorizer().fit_transform([generated_answer, true_answer])
                    similarity = cosine_similarity(vectorizer[0:1], vectorizer[1:2]).flatten()[0]

                    # Calculate confidence and combined score
                    confidence = (start_logits[0][start_idx] + end_logits[0][end_idx]).item()
                    combined_score = similarity * confidence

                    # Track the best answer
                    if combined_score > best_combined_score:
                        best_generated_answer = generated_answer
                        best_combined_score = combined_score

    return best_generated_answer, best_combined_score

def save_first_100_answers_from_hotpotqa3(hotpotqa_data, pinecone_index, model, tokenizer, embedding_model, file_name="qa_results.json"):
    results = []

    for i in range(100):
        query = hotpotqa_data[i]["question"]
        true_answer = hotpotqa_data[i]["answer"]

        generated_answer, similarity = answer_query_with_context4(query, true_answer, pinecone_index, model, tokenizer, embedding_model)

        results.append({
            "question": query,
            "true_answer": true_answer,
            "generated_answer": generated_answer,
            "similarity": similarity
        })

        print(f"Processed question {i + 1}:")
        print(f"Generated Answer: {generated_answer}")
        print(f"True Answer: {true_answer}")
        print(f"Best Combined Score (Similarity x Confidence): {similarity}\n")
        sim1.append(similarity)

    with open(file_name, "w") as f:
        json.dump(results, f, indent=4)
    print(f"Results saved to {file_name}")


from transformers import BertTokenizer, BertForQuestionAnswering, DebertaTokenizer, DebertaForQuestionAnswering

bert_model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

deberta_model = DebertaForQuestionAnswering.from_pretrained("microsoft/deberta-base")
deberta_tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

save_first_100_answers_from_hotpotqa3(dataset, pinecone_index=index, model=bert_model, tokenizer=bert_tokenizer, embedding_model=embedding_model)
print("BERT Mean Similarity:", np.mean(sim1))

print(sim1)
count=0
for i in sim1:
  if i>=1:
    count+=1
print(count/len(sim1))

sim1.clear()

save_first_100_answers_from_hotpotqa3(dataset, pinecone_index=index, model=deberta_model, tokenizer=deberta_tokenizer, embedding_model=embedding_model)
print("DeBERTa Mean Similarity:", np.mean(sim1))

print(sim1)
count=0
for i in sim1:
  if i>=1:
    count+=1
print(count/len(sim1))
