**Note:** The current dataset is synthetic and quite limited. The pipeline demonstrates a basic approach to data processing and retrieval. For larger or different datasets, consider using specialized models and customized retrieval techniques.


INSTALL NECESSARY DEPENDENCIES

In [None]:
!pip install langchain pdfplumber transformers langchain-community

In [None]:
import json
import re
from google.colab import files
from langchain.document_loaders import PDFPlumberLoader
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.inmemory import InMemoryVectorStore
import pickle

LOAD THE DATA (PDF), CREATE EMBEDDINGS, AND STORE THEM IN A VECTOR STORE

In [None]:

# Path of the source PDF inside the Colab runtime.
pdf_filename = "/content/extended_data.pdf"

# Read the PDF; load() returns a list of Document objects.
loader = PDFPlumberLoader(pdf_filename)
raw_docs = loader.load()

# Stitch the text of every loaded Document into one newline-separated string
# so that content spanning several pages can be parsed as a whole.
combined_text = "\n".join(page.page_content for page in raw_docs)

# Show the first 500 characters as a quick sanity check.
print("Raw PDF text loaded. Preview:")
print(combined_text[:500])

def extract_all_json_arrays(text):
    """
    Extract every outermost balanced ``[...]`` span from ``text``.

    The text is scanned once, matching each ``]`` with the most recent
    unmatched ``[``. Only outermost spans are returned: a nested array is
    reported as part of its enclosing array, never on its own.

    A stray, never-closed ``[`` does not stop the scan; balanced arrays that
    appear after it are still returned. A stray ``]`` with no open bracket is
    ignored.

    Limitation: brackets are counted purely textually, so a ``[`` or ``]``
    inside a JSON string value is counted as well. Spans are not validated as
    JSON here; callers are expected to run ``json.loads`` on each result.

    Args:
        text: The string to scan.

    Returns:
        A list of substrings, in order of appearance, each starting with
        ``[`` and ending with its matching ``]``.
    """
    open_positions = []  # indices of '[' characters not yet matched
    spans = []           # (start, end) of outermost matched pairs found so far

    for index, char in enumerate(text):
        if char == '[':
            open_positions.append(index)
        elif char == ']' and open_positions:
            start = open_positions.pop()
            # Any span recorded after `start` lies inside the pair that was
            # just closed, so it is nested and must give way to the outer one.
            while spans and spans[-1][0] > start:
                spans.pop()
            spans.append((start, index))

    return [text[start:end + 1] for start, end in spans]

# Normalise control characters before parsing.
# Whitespace controls (tab, newline, vertical tab, form feed, carriage return)
# are replaced by a single space rather than deleted, so that words separated
# only by a line break in the PDF text are not glued together.
clean_text = re.sub(r'[\t\n\v\f\r]+', ' ', combined_text)
# The remaining control characters carry no text and are dropped.
clean_text = re.sub(r'[\x00-\x1F\x7F]', '', clean_text)

# Extract all JSON array substrings
json_arrays = extract_all_json_arrays(clean_text)

# Parse each candidate array and pool its items; spans that are not valid
# JSON are reported and skipped.
all_qa = []
for arr in json_arrays:
    try:
        data = json.loads(arr)
    except json.JSONDecodeError as e:
        print("Error parsing an array:", e)
        continue
    if isinstance(data, list):
        all_qa.extend(data)

print(f"Total Q&A pairs found: {len(all_qa)}")

# Create Document objects where each Q&A pair is preserved as one chunk.
# Parsed arrays can contain items that are not Q&A objects (for example a
# bracketed number in the PDF text parses as a valid JSON array), and fields
# may be null or non-string, so every item is validated before use instead of
# letting .get()/.strip() raise AttributeError.
qa_documents = []
skipped_items = 0
for qa in all_qa:
    if not isinstance(qa, dict):
        skipped_items += 1
        continue
    q = qa.get("question")
    a = qa.get("answer")
    if not (isinstance(q, str) and isinstance(a, str)):
        skipped_items += 1
        continue
    q = q.strip()
    a = a.strip()
    if q and a:
        # Format each Q&A pair so the Document contains one Q&A.
        qa_text = f"Question: {q}\nAnswer: {a}"
        qa_documents.append(Document(page_content=qa_text))
if skipped_items:
    print(f"Skipped {skipped_items} malformed items (not a Q&A object or non-text fields).")
print(f"Created {len(qa_documents)} Document objects from Q&A data.")

# Embedding model used for both indexing and querying (CPU-friendly).
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Index the Q&A documents in an in-memory vector store; with nothing to
# index, the store is still created but left empty.
vectorstore = InMemoryVectorStore(embedding=embeddings)
if not qa_documents:
    print("No Q&A documents to add.")
else:
    vectorstore.add_documents(qa_documents)
    print("Documents added to the vector store.")



TEST THE RETRIEVAL TO GET INSIGHTS

In [None]:
# Test retrieval: fetch the TOP_K most similar Q&A pairs for a sample query.
# The requested count and the printed header share one constant so they
# cannot drift apart.
TOP_K = 5
# NOTE(review): "collge" looks like a typo; left unchanged in case it is a
# deliberate check of how retrieval copes with misspelled queries.
query = "sports in kiet collge"
retrieved_with_scores = vectorstore.similarity_search_with_score(query, k=TOP_K)

print(f"\n----- Retrieved Top {TOP_K} Q&A Pairs -----")
for i, (doc, score) in enumerate(retrieved_with_scores, 1):
    print(f"Result {i}:")
    print("Similarity Score:", score)
    print(doc.page_content)
    print("-" * 40)

IF YOU THINK THE RETRIEVED DATA IS GOOD ENOUGH, SAVE THE EMBEDDINGS

In [None]:
# `json` and `files` come from the notebook's import cell.

# Embed all documents in one batched call instead of one model invocation
# per document; embed_documents is the API meant for corpus texts.
texts = [doc.page_content for doc in qa_documents]
vectors = embeddings.embed_documents(texts) if texts else []

# Pair each document's text with its embedding vector.
embeddings_data = [
    {"document": text, "embedding": vector}
    for text, vector in zip(texts, vectors)
]

# Save the embeddings data to a JSON file
with open("embeddings.json", "w", encoding="utf-8") as f:
    json.dump(embeddings_data, f)

# Download the JSON file (Colab only: triggers a browser download)
files.download("embeddings.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>