## **Import Necessary Dependencies**

In [19]:
import os
import pandas as pd
from dotenv import load_dotenv

from pinecone import Pinecone, ServerlessSpec, init
import pinecone
#from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import VertexAIEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from datasets import load_dataset
from langchain_community.retrievers import PineconeHybridSearchRetriever
from tqdm.autonotebook import tqdm
import pickle

import warnings
warnings.filterwarnings('ignore')
print("Installation Complete.")

Installation Complete.


In [20]:
# Load the 'squad_v2' dataset from Huggingface
dataset = load_dataset("squad_v2")

print(dataset['train'][0])
print(dataset)

{'id': '56be85543aeaaa14008c9063', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'question': 'When did Beyonce start becoming popular?', 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}
DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
      

**Pre-Process Data**

In [22]:
def preprocess_squad(dataset):
    formatted_data = []

    for split in ["train", "validation"]:
        for row in dataset[split]:
            q, context = row["question"], row["context"]
            for a in row["answers"]["text"]:
                formatted_data.append({"question": q, "answer": a, 'context': context})

    df = pd.DataFrame(formatted_data)

    # Removing newlines
    df["question"] = df["question"].str.replace("\n", " ", regex=False)
    df["answer"] = df["answer"].str.replace("\n", " ", regex=False)
    df["context"] = df["context"].str.replace("\n", " ", regex=False)

    return df

In [23]:
#df = preprocess_squad(dataset)

# Checkpoint for preprocessed DataFrame
preprocessed_data_file = "preprocessed_squad.pkl"

if os.path.exists(preprocessed_data_file):
    # Load preprocessed data if it exists
    with open(preprocessed_data_file, "rb") as f:
        df = pickle.load(f)
else:
    # Preprocess the data
    df = preprocess_squad(dataset)

    # Save the preprocessed data
    with open(preprocessed_data_file, "wb") as f:
        pickle.dump(df, f)

print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

| question                                                         | answer              | context                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
|:-----------------------------------------------------------------|:--------------------|:-------------------------------------------------------------------------------------------------------------------------

In [24]:
# Sample Data
import random

# Checkpoint for sampled data
sampled_data_file = "sampled_squad.pkl"

if os.path.exists(sampled_data_file):
    # Load sampled data if it exists
    with open(sampled_data_file, "rb") as f:
        df_sample = pickle.load(f)
else:
    # Sample the data
    sample_size = int(0.10 * len(df))  
    df_sample = df.sample(n=sample_size, random_state=42)

    # Save the sampled data
    with open(sampled_data_file, "wb") as f:
        pickle.dump(df_sample, f)

In [25]:
print(df_sample.head().to_markdown(index=False, numalign="left", stralign="left"))

| question                                                                                                 | answer                                                  | context                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

In [26]:
load_dotenv()

api_key = os.environ.get("PINECONE_API_KEY")
environment = os.environ.get('PINECONE_ENV')

index_name = "hybridsearch-ragtime"

## Initialize the pinecone client
pc = pinecone.Pinecone(api_key=api_key)

## Create the index
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768, ## Dimension of dense vector
        metric = "dotproduct", ## Sparse value supported only for dot product
        spec = ServerlessSpec(cloud="aws", region="us-east-1")
    )

In [27]:
index = pc.Index(index_name)
index

<pinecone.data.index.Index at 0x2af22c5f770>

In [28]:
import pinecone

print(f"Pinecone client version: {pinecone.__version__}")


Pinecone client version: 5.0.1


In [6]:
pc.configure_index(index_name, metadata_config={"indexed": ["question", "answer"]})

TypeError: Pinecone.configure_index() got an unexpected keyword argument 'metadata_config'

**Embedding Data**

In [None]:
#from google.oauth2 import service_account



#credentials = service_account.Credentials.from_service_account_file(cred_path)

#embeddings = VertexAIEmbeddings(
#    model_name = "textembedding-gecko",
#    project = "inspired-studio-431021-m1",
#    location = "us-central1",
#    credentials_path = cred_path
#)

#batch_size = 32

#for i in range(0, len(df), batch_size):
#    i_end = min(i + batch_size, len(df))
    
#    batch_metadata = [
#        {
#            'question': raw['question'],
#           'answer': raw['answer'],
#            'context': raw['context']
#        } for _,raw in df.iloc[i:i_end].iterrows()
#    ]
    # Get embeddings for the batch
#    batch_embeddings = embeddings.embed_documents(df['context'].iloc[i:i_end].tolist())
    
    # Create list of (id, vector, metadata)
#    to_upsert = list(zip(df.index[i:i_end].astype(str), batch_embeddings,batch_metadata))
    
    # Upsert to pinecone
#    index.upsert(vectors=to_upsert)
    

In [29]:
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import HuggingFaceHub
from langchain.retrievers import PineconeHybridSearchRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.retrievers import BM25Retriever
from langchain.document_loaders import DataFrameLoader
from pinecone_text.sparse import BM25Encoder


In [59]:
huggingfacehub_api_token = os.environ.get("HF_TOKEN")

In [30]:
huggingfacehub_api_token = os.environ.get("HF_TOKEN")
import pickle

# Initialize Huggingface Embeddings
model_name = "sentence-transformers/all-mpnet-base-v2"
embed_model = SentenceTransformer(model_name)
embeddings = HuggingFaceBgeEmbeddings(model_name=model_name)

batch_size = 64

# Load documents from DataFrame to Langchain format
loader = DataFrameLoader(df_sample, page_content_column="context")
documents = loader.load()


bm25_encoder_file = "bm25_encoder.pkl"

if os.path.exists(bm25_encoder_file):
    # Load BM25 encoder if it exists
    with open(bm25_encoder_file, "rb") as f:
        bm25_encoder = pickle.load(f)
else:
    # Create and fit the BM25Encoder
    bm25_encoder = BM25Encoder()
    bm25_encoder.fit([doc.page_content for doc in documents])

    # Save the BM25 encoder
    with open(bm25_encoder_file, "wb") as f:
        pickle.dump(bm25_encoder, f)

# Save the encoder
#with open("bm25_encoder.pkl", "wb") as f:
#    pickle.dump(bm25_encoder, f)


    
# Embed context in batches
#batch_embeddings = embed_model.encode(df['context'].iloc[i:i_end].tolist())

# Create List
#to_upsert = list(zip(df.index[i:i_end].astype(str), batch_embeddings, batch_metadata))

# Upsert to pinecone
#index.upsert(vectors=to_upsert)   


In [11]:
# Upsert embeddings and metadata to Pinecone
for i in range(0, len(documents), batch_size):
    i_end = min(i + batch_size, len(documents))
    batch = documents[i:i_end]
    ids = [str(x) for x in range(i, i_end)]

    # Get metadata (question and answer)
    metadatas = []
    for idx, doc in enumerate(batch):
        metadata = {
            "question": df_sample.iloc[i + idx]["question"],
            "answer": df_sample.iloc[i + idx]["answer"],
            "context": doc.page_content
        }
        metadatas.append(metadata)

    # Embed and get sparse vectors
    batch_embeddings = embed_model.encode([doc.page_content for doc in batch])
    sparse_vectors = bm25_encoder.encode_documents([doc.page_content for doc in batch])

    # Create list of vector dictionaries
    to_upsert = [
        {
            'id': id_,
            'values': dense_vector.tolist(),
            'sparse_values': sparse_vector  # Changed to "sparse_values" (plural)
        }
        for id_, dense_vector, sparse_vector in zip(ids, batch_embeddings, sparse_vectors)
    ]

    # Upsert to Pinecone
    index.upsert(vectors=to_upsert)

In [31]:
# Now load the encoder
import pickle

with open("bm25_encoder.pkl", "rb") as f:
    bm25_encoder = pickle.load(f)

In [32]:
# Pinecone retriever
retriever = PineconeHybridSearchRetriever(
    index=index,
    embeddings=embeddings,
    sparse_encoder=bm25_encoder
)

In [33]:
# Initialize HuggingFace LLM
repo_id = "google/flan-t5-xxl"
llm = HuggingFaceHub(repo_id=repo_id, model_kwargs={"temperature":0.5},huggingfacehub_api_token=huggingfacehub_api_token)

**Question Answering Chain**

In [34]:
chain_type_kwargs = {"prompt":PromptTemplate(
    template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nAnswer:",
    input_variables=["context","question"]
)}

qa_chain = RetrievalQA.from_chain_type(llm=llm,chain_type="stuff", 
                                       retriever=retriever, return_source_documents=True,
                                       chain_type_kwargs=chain_type_kwargs)

**Metric Evaluation**

In [56]:
from datasets import load_metric
from huggingface_hub import HfApi, HfFolder, Repository
from huggingface_hub.utils import HfHubHTTPError

# Load the SQuAD evaluation metric
metric = load_metric("squad_v2")  

# Function to generate predictions
def get_prediction(query, top_k=5):
    xq = embed_model.encode(query).tolist() 

    result = index.query(
        vector=xq, 
        top_k=top_k, 
        include_metadata=True
    )

    # Check if any matches were found
    if not result.get('matches'):
        return "", [], []  # Return empty strings and lists if no matches

    # Extract contexts and metadata directly from Pinecone results
    contexts = [
        item.metadata["context"] for item in result["matches"] if item and item.metadata
    ]
    questions = [
        item.metadata["question"] for item in result["matches"] if item and item.metadata
    ]

    context_str = "\n\n".join(contexts)
    prompt = f"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context_str}\n\nQuestion: {query}\nAnswer:"
    answer = llm(prompt)
    
    try:
        answer = llm(prompt)
    except HfHubHTTPError as e:
        if e.response.status_code == 401:  # Unauthorized (invalid token)
            print("Error: Invalid Hugging Face Hub API token. Please check your credentials.")
            return "", [], []
        elif e.response.status_code == 504: # Gateway Timeout
            print("Model timed out. Retrying...")
        else:
            raise e
        
    return answer, contexts, questions






In [60]:
sample_size = 100

for i in range(sample_size):
    question = df_sample.iloc[i]["question"]
    context = df_sample.iloc[i]["context"]
    ground_truth_answers = [df_sample.iloc[i]["answer"]] 

    prediction, retrieved_contexts, retrieved_questions = get_prediction(question)

    # Skip if no prediction or contexts were found
    if not prediction or not retrieved_contexts:
        continue

    # Find the matching context and answer start
    retrieved_context = ""
    for retrieved_context, retrieved_question in zip(retrieved_contexts, retrieved_questions):
        if context in retrieved_context and question == retrieved_question:  # Match both context and question
            break
    else:
        continue  # Skip if no match is found

    metric.add_batch(
        predictions=[prediction],
        references=[{"id": str(i), "answers": {"text": ground_truth_answers, "answer_start": [retrieved_context.find(answer) for answer in ground_truth_answers]}}]
    )
    
    
# Compute and print the metrics
final_metrics = metric.compute()
print(final_metrics)

HfHubHTTPError: 504 Server Error: Gateway Timeout for url: https://api-inference.huggingface.co/models/google/flan-t5-xxl (Request ID: mZz5i3mcs9jxzWSEpLCFB)

Model google/flan-t5-xxl time out

In [22]:
# Get query fro user
query = "What is the theory of relativity?"

# Get answer
result = qa_chain({"query":query})

print(result['result'])

# Access and print the retrieved documents (modified)
for doc in result["source_documents"]:
    print("-" * 100)
    print(f"Context: {doc.page_content}")
    # Access metadata directly from the doc object (not ScoredVector)
    print(f"Question: {doc.metadata['question']}")
    print(f"Answer: {doc.metadata['answer']}")

PineconeApiAttributeError: ScoredVector has no attribute 'metadata' at ['['received_data', 'matches', 0]']['metadata']