## **Import Necessary Dependencies**

In [55]:
import os
import pandas as pd
from dotenv import load_dotenv

from pinecone import Pinecone, ServerlessSpec, init
import pinecone
#from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings import VertexAIEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from datasets import load_dataset
from langchain_community.retrievers import PineconeHybridSearchRetriever
from tqdm.autonotebook import tqdm

import warnings
warnings.filterwarnings('ignore')
print("Installation Complete.")

Installation Complete.


In [3]:
# Load the 'ELI5' dataset from Huggingface
dataset = load_dataset("squad_v2")

print(dataset['train'][0])
print(dataset)

{'id': '56be85543aeaaa14008c9063', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'question': 'When did Beyonce start becoming popular?', 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}
DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
      

**Pre-Process Data**

In [4]:
def preprocess_squad(dataset):
    formatted_data = []

    for split in ["train", "validation"]:
        for row in dataset[split]:
            q, context = row["question"], row["context"]
            for a in row["answers"]["text"]:
                formatted_data.append({"question": q, "answer": a, 'context': context})

    df = pd.DataFrame(formatted_data)

    # Removing newlines
    df["question"] = df["question"].str.replace("\n", " ", regex=False)
    df["answer"] = df["answer"].str.replace("\n", " ", regex=False)
    df["context"] = df["context"].str.replace("\n", " ", regex=False)

    return df

In [5]:
df = preprocess_squad(dataset)

print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

| question                                                         | answer              | context                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
|:-----------------------------------------------------------------|:--------------------|:-------------------------------------------------------------------------------------------------------------------------

In [66]:
# Sample Data
import random

sample_size = int(0.10 * len(df))  # 10% of the dataset size
df_sample = df.sample(n=sample_size, random_state=42)

print(df_sample.head().to_markdown(index=False, numalign="left", stralign="left"))

| question                                                                                                 | answer                                                  | context                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

In [114]:
load_dotenv()

#open_api_key = os.environ.get('OPEN_API_KEY')
api_key = os.environ.get("PINECONE_API_KEY")
environment = os.environ.get('PINECONE_ENV')

## Initialize the pinecone client
#pc = pinecone.Pinecone(api_key=api_key, environment=environment)

#cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
#region =  os.environ.get('PINECONE_ENV') or 'us-east-1'

#spec = ServerlessSpec(cloud=cloud, region=region)

#index_name = "ragtime"

#index = pinecone.Index(index_name, host="https://ragtime-luwubl1.svc.aped-4627-b74a.pinecone.io")

index_name = "hybridsearch-ragtime"

## Initialize the pinecone client
pc = pinecone.Pinecone(api_key=api_key)

## Create the index
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768, ## Dimension of dense vector
        metric = "dotproduct", ## Sparse value supported only for dot product
        spec = ServerlessSpec(cloud="aws", region="us-east-1"),
        metadata_config={"indexed": ["question", "answer"]},
    )

In [115]:
index = pc.Index(index_name)
index

<pinecone.data.index.Index at 0x22660f64e90>

In [112]:
#index.update(id='id_',metadata_config={"indexed": ["question", "answer"]})

**Embedding Data**

In [12]:
#from google.oauth2 import service_account



#credentials = service_account.Credentials.from_service_account_file(cred_path)

#embeddings = VertexAIEmbeddings(
#    model_name = "textembedding-gecko",
#    project = "inspired-studio-431021-m1",
#    location = "us-central1",
#    credentials_path = cred_path
#)

#batch_size = 32

#for i in range(0, len(df), batch_size):
#    i_end = min(i + batch_size, len(df))
    
#    batch_metadata = [
#        {
#            'question': raw['question'],
#           'answer': raw['answer'],
#            'context': raw['context']
#        } for _,raw in df.iloc[i:i_end].iterrows()
#    ]
    # Get embeddings for the batch
#    batch_embeddings = embeddings.embed_documents(df['context'].iloc[i:i_end].tolist())
    
    # Create list of (id, vector, metadata)
#    to_upsert = list(zip(df.index[i:i_end].astype(str), batch_embeddings,batch_metadata))
    
    # Upsert to pinecone
#    index.upsert(vectors=to_upsert)
    

In [116]:
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import HuggingFaceHub
from langchain.retrievers import PineconeHybridSearchRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.retrievers import BM25Retriever
from langchain.document_loaders import DataFrameLoader
from pinecone_text.sparse import BM25Encoder


In [117]:
huggingfacehub_api_token = os.environ.get("HF_TOKEN")
import pickle

# Initialize Huggingface Embeddings
model_name = "sentence-transformers/all-mpnet-base-v2"
embed_model = SentenceTransformer(model_name)
embeddings = HuggingFaceBgeEmbeddings(model_name=model_name)

batch_size = 32

# Load documents from DataFrame to Langchain format
loader = DataFrameLoader(df_sample, page_content_column="context")
documents = loader.load()

# Create and fit the BM25Encoder
bm25_encoder = BM25Encoder()
bm25_encoder.fit([doc.page_content for doc in documents])

# Save the encoder
with open("bm25_encoder.pkl", "wb") as f:
    pickle.dump(bm25_encoder, f)


    
# Embed context in batches
#batch_embeddings = embed_model.encode(df['context'].iloc[i:i_end].tolist())

# Create List
#to_upsert = list(zip(df.index[i:i_end].astype(str), batch_embeddings, batch_metadata))

# Upsert to pinecone
#index.upsert(vectors=to_upsert)   


100%|██████████| 10712/10712 [00:16<00:00, 632.19it/s]


In [70]:
# Upsert embeddings and metadata to Pinecone
for i in range(0, len(documents), batch_size):
    i_end = min(i + batch_size, len(documents))
    batch = documents[i:i_end]
    ids = [str(x) for x in range(i, i_end)]

    # Get metadata (question and answer)
    metadatas = []
    for idx, doc in enumerate(batch):
        metadata = {
            "question": df_sample.iloc[i + idx]["question"],
            "answer": df_sample.iloc[i + idx]["answer"],
            "context": doc.page_content
        }
        metadatas.append(metadata)

    # Embed and get sparse vectors
    batch_embeddings = embed_model.encode([doc.page_content for doc in batch])
    sparse_vectors = bm25_encoder.encode_documents([doc.page_content for doc in batch])

    # Create list of vector dictionaries
    to_upsert = [
        {
            'id': id_,
            'values': dense_vector.tolist(),
            'sparse_values': sparse_vector  # Changed to "sparse_values" (plural)
        }
        for id_, dense_vector, sparse_vector in zip(ids, batch_embeddings, sparse_vectors)
    ]

    # Upsert to Pinecone
    index.upsert(vectors=to_upsert)

In [80]:
# Now load the encoder
with open("bm25_encoder.pkl", "rb") as f:
    bm25_encoder = pickle.load(f)

In [85]:
# Pinecone retriever
retriever = PineconeHybridSearchRetriever(
    index=index,
    embeddings=embeddings,
    sparse_encoder=bm25_encoder
)

In [86]:
# Initialize HuggingFace LLM
repo_id = "google/flan-t5-xxl"
llm = HuggingFaceHub(repo_id=repo_id, model_kwargs={"temperature":0.5},huggingfacehub_api_token=huggingfacehub_api_token)

**Question Answering Chain**

In [87]:
chain_type_kwargs = {"prompt":PromptTemplate(
    template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nAnswer:",
    input_variables=["context","question"]
)}

qa_chain = RetrievalQA.from_chain_type(llm=llm,chain_type="stuff", 
                                       retriever=retriever, return_source_documents=True,
                                       chain_type_kwargs=chain_type_kwargs)

In [88]:
# Get query fro user
query = "What is the theory of relativity?"

# Get answer
result = qa_chain({"query":query})

print(result['result'])

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Thu, 01 Aug 2024 21:33:37 GMT', 'Content-Type': 'application/json', 'Content-Length': '132', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '211', 'x-pinecone-request-id': '6024689621172022955', 'x-envoy-upstream-service-time': '1', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Index configuration does not support sparse values - only indexes using dotproduct are supported","details":[]}
