In [1]:
! pip install elasticsearch fitz sentence-transformers numpy



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from elasticsearch import Elasticsearch


es = Elasticsearch("http://11.0.0.145:9200/")

# Define index with embedding field
index_name = "rag_hyde_advance"

mapping = {
    "mappings": {
        "properties": {
            "file_name": {"type": "text"},
            "chunk_id": {"type": "integer"},
            "content": {"type": "text"},
            "embedding": {"type": "dense_vector", "dims": 384}  # Dimension of chosen embedding model
        }
    }
}

# Create index
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=mapping)
    print(f"Index '{index_name}' created.")
else:
    print(f"Index '{index_name}' already exists.")


Index 'rag_hyde_advance' already exists.


In [4]:
import fitz  # PyMuPDF
import numpy as np
from transformers import AutoModel, AutoTokenizer
import torch
from elasticsearch import Elasticsearch
# from textsplit.tools import ge
# Load Hugging Face embedding model
embedding_model_name = "BAAI/bge-small-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name)

def get_embedding(text):
    """Generate embeddings using BAAI/bge-small-en-v1.5"""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()  # CLS token embedding
    return embedding.tolist()

# Read and extract text from PDF
pdf_path = "/home/sandeep-cc/Documents/rag/documents/Elastic_NV_Annual-Report-Fiscal-Year-2023.pdf"
doc = fitz.open(pdf_path)

# Extract text from each page
pdf_text = "\n".join([page.get_text("text") for page in doc])

# 
# pdf_text=[dict(doc_obj) for doc_obj in doc]

from semantic_text_splitter import TextSplitter

def semantic_chunking(text, chunk_size=500):
    splitter = TextSplitter(capacity=chunk_size) # Adjust overlap if needed
    chunks = splitter.split(text)
    return chunks


chunks = semantic_chunking(pdf_text, chunk_size=500)


for chunk_id, chunk in enumerate(chunks):
    embedding = get_embedding(chunk)  # Generate embedding
    
    document = {
        "file_name": pdf_path.split("/")[-1],
        "chunk_id": chunk_id,
        "content": chunk,
        "embedding": embedding,
    }

    es.index(index=index_name, body=document)  # Store in Elasticsearch
    print(f"✅ Indexed chunk {chunk_id}")

AttributeError: 'builtins.TextSplitter' object has no attribute 'split'

In [None]:
import logging
import requests
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

# IBM Mistral API Credentials
API_TOKEN_IBM = "qdyGtVucbt6PtoHyk5QAnFqjeat4WrbgSDcCPfk3VAyn"
PROJECT_ID_IBM = "2f606c5f-f67f-4397-8635-6ca6358f8440"

# Authenticate with IBM Cloud
authenticator = IAMAuthenticator(API_TOKEN_IBM)
service = "Bearer " + authenticator.token_manager.get_token()

# IBM Mistral API Endpoint
url = "https://us-south.ml.cloud.ibm.com/ml/v1/text/generation?version=2023-05-29"
splitter = TextSplitter(chunk_size=512, chunk_overlap=50)

# User query
query_text = "total revenue 2023"

# Step 1: Generate a Hypothetical Document using IBM Mistral
hyde_prompt = f'''
[INST] 
You are an expert AI generating a **hypothetical yet relevant** answer based **only on the given question**.  
Your goal is to simulate a response that **directly aligns with the query’s intent** and can aid in retrieving relevant documents.  
**Do not include disclaimers or generic information—focus only on generating a plausible response.**  


### Question:
{query_text}

### Hypothetical Answer:
[/INST]
'''


body = {
    "input": hyde_prompt,
    "parameters": {
        "decoding_method": "greedy",
        "max_new_tokens": 200,
        "stop_sequences": [],
        "repetition_penalty": 1
    },
    "model_id": "mistralai/mixtral-8x7b-instruct-v01",
    "project_id": PROJECT_ID_IBM,
    "moderations": {
        "hap": {
            "input": {"enabled": False},
            "output": {"enabled": False}
        }
    }
}

headers = {
    "Accept": "application/json",
    "Content-Type": "application/json",
    "Authorization": service
}

response = requests.post(url, headers=headers, json=body)
if response.status_code != 200:
    logging.warning(f"Status Code --> {response.status_code}")
    print("Model context window exceeded for this document" + str(response.text))
    exit()

data = response.json()
hypothetical_answer = data['results'][0]['generated_text']
print("\n🔹 Hypothetical Answer:", hypothetical_answer)

# Step 2: Embed the Hypothetical Document
query_embedding = get_embedding(hypothetical_answer)

# Step 3: Retrieve Relevant Chunks from Elasticsearch
query = {
    "size": 3,
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                "params": {"query_vector": query_embedding},
            },
        }
    },
}

response = es.search(index=index_name, body=query)
retrieved_chunks = [hit["_source"]["content"] for hit in response["hits"]["hits"]]
context = "\n\n".join(retrieved_chunks)
print("\n🔹 Retrieved Context:", context[:500], "...")




🔹 Hypothetical Answer: Based on historical trends and market analysis, it is projected that the total revenue in 2023 will experience a steady growth of approximately 3.5% compared to 2022. This estimation is derived from the average growth rate over the past five years, adjusted for current economic indicators. However, please note that this is a hypothetical projection and actual figures may vary due to unforeseen market conditions or business strategies.

🔹 Retrieved Context:  make it easy for customers to expand across use cases.
Our business has experienced rapid growth around the world. As of April 30, 2023, we had approximately 20,200 
customers compared to over 18,600 customers and over 15,000 customers as of April 30, 2022 and 2021, respectively. Our total 
revenue was $1.1 billion, $862.4 million, and $608.5 million for the years ended April 30, 2023, 2022 and 2021, respectively, 
representing year-over-year growth of 24% for the year ended April 30, 2023 and  ...


In [28]:
# Step 4: Generate the Final Answer using IBM Mistral
final_prompt = f'''
[INST]  
You are an AI assistant specializing in generating **precise answers** based **only** on the given context.  
Your response must strictly rely on the provided information—**do not add external knowledge or assumptions**.  

### **Instructions:**  
- If the context contains a clear answer, **extract and summarize it concisely**.  
- If the context **partially** answers the question, indicate **what is known and what is missing**.  
- If the context **does not contain** the required answer, **explicitly state that it is not available**—do not guess.  

### **Context:**  
{context}  

### **Question:**  
{query_text}  

### **Answer:**  
[/INST]
'''

body["input"] = final_prompt
response = requests.post(url, headers=headers, json=body)

if response.status_code != 200:
    logging.warning(f"Status Code --> {response.status_code}")
    print("Model context window exceeded for this document" + str(response.text))
    exit()

data = response.json()
final_answer = data['results'][0]['generated_text']
print("\n🔹 Final Answer:", final_answer)



🔹 Final Answer: The total revenue for the year ended April 30, 2023 was approximately $1.1 billion.
