## RAG for .docx files

Install and import packages

In [None]:
pip install python-docx sentence-transformers faiss-cpu transformers

Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [None]:
import os
from docx import Document
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline

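Define helper functions for loading the document, building the vector index, retrieving relevant paragraphs, and generating an answer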

In [None]:
def load_word_document(doc_path):
    """Read a Word (.docx) file and return its non-empty paragraphs.

    Args:
        doc_path: Path handed to ``Document`` to open the file.

    Returns:
        list[str]: The text of each entry in ``document.paragraphs`` with
        surrounding whitespace stripped, in document order. Paragraphs that
        are empty after stripping are skipped.
    """
    stripped_texts = (para.text.strip() for para in Document(doc_path).paragraphs)
    # An empty string is falsy, so this filter drops blank paragraphs.
    return [text for text in stripped_texts if text]

def create_vector_store(paragraphs, model_name='paraphrase-MiniLM-L6-v2'):
    """Embed paragraphs and store the vectors in a flat L2 FAISS index.

    Args:
        paragraphs: Sequence of paragraph strings to embed.
        model_name: Name of the SentenceTransformer model used for embedding.

    Returns:
        tuple: ``(index, embeddings_np, paragraphs)`` where ``index`` is a
        ``faiss.IndexFlatL2`` holding one vector per paragraph,
        ``embeddings_np`` is the float32 array that was added to it, and
        ``paragraphs`` is the input object, returned unchanged.

    Raises:
        ValueError: If ``paragraphs`` is empty, since there is nothing to
            embed and the embedding dimension cannot be determined.
    """
    if len(paragraphs) == 0:
        raise ValueError("paragraphs is empty; nothing to index")

    model = SentenceTransformer(model_name)
    # Ask for a NumPy array directly instead of round-tripping through a tensor.
    embeddings = model.encode(paragraphs, convert_to_numpy=True)
    # FAISS expects C-contiguous float32 input; enforce it explicitly.
    embeddings_np = np.ascontiguousarray(embeddings, dtype=np.float32)

    dim = embeddings_np.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings_np)
    return index, embeddings_np, paragraphs
def retrieve_relevant_paragraphs(query, index, paragraphs, model_name='paraphrase-MiniLM-L6-v2', k=3):
    """Return the paragraphs whose embeddings are nearest to the query.

    Args:
        query: Question text to embed and search with.
        index: Index object exposing ``search(vectors, k)``, e.g. the one
            built by ``create_vector_store``.
        paragraphs: Paragraph texts, positionally aligned with the vectors
            stored in ``index``.
        model_name: Name of the SentenceTransformer model used to embed the
            query; it should match the model used to build ``index``.
        k: Maximum number of paragraphs to return.

    Returns:
        list[str]: Up to ``k`` paragraphs in the order returned by
        ``index.search``. Empty when there are no paragraphs or ``k <= 0``.
    """
    # Never ask for more neighbours than there are paragraphs.
    k = min(k, len(paragraphs))
    if k <= 0:
        return []

    model = SentenceTransformer(model_name)
    query_embedding = np.ascontiguousarray(
        model.encode([query], convert_to_numpy=True), dtype=np.float32
    )
    _, neighbour_ids = index.search(query_embedding, k)

    # FAISS pads missing results with -1; as a list index that would silently
    # select the last paragraph, so keep only ids that are valid positions.
    return [paragraphs[i] for i in neighbour_ids[0] if 0 <= i < len(paragraphs)]
def generate_answer(query, relevant_paragraphs):
    """Extract an answer to ``query`` from the retrieved paragraphs.

    The paragraphs are joined with single spaces into one context string,
    which is passed with the question to a ``question-answering`` pipeline
    using the ``deepset/roberta-base-squad2`` model.

    Args:
        query: The question to answer.
        relevant_paragraphs: Iterable of paragraph strings used as context.

    Returns:
        str: The ``'answer'`` field of the pipeline result, or ``""`` when
        there is no non-blank context to search.
    """
    context = " ".join(relevant_paragraphs)
    if not context.strip():
        # The question-answering pipeline rejects an empty context, so skip
        # the model load and report "no answer" instead.
        return ""

    # NOTE: the pipeline (and its model) is constructed on every call.
    question_answering_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
    result = question_answering_pipeline(question=query, context=context)
    return result['answer']
def answer_question_from_doc(doc_path, query):
    """Answer ``query`` from the contents of the Word document at ``doc_path``.

    Chains the helper functions: load the document's paragraphs, build a
    vector index over them, retrieve the paragraphs closest to the query,
    and generate an answer from those paragraphs.

    Args:
        doc_path: Path to the .docx file.
        query: The question to answer.

    Returns:
        Whatever ``generate_answer`` returns for the retrieved paragraphs.
    """
    doc_paragraphs = load_word_document(doc_path)
    # Only the index is used here; the other two returned values are ignored.
    vector_index, _embeddings, _texts = create_vector_store(doc_paragraphs)
    top_paragraphs = retrieve_relevant_paragraphs(query, vector_index, doc_paragraphs)
    return generate_answer(query, top_paragraphs)
# Path of the .docx file to query.
# NOTE(review): hard-coded absolute path; change it to the location of your
# own document before running.
doc_path = '/content/C_fakepathSyllabus 2024-1.docx'


Ask a question and get an answer

In [None]:
# The question to answer from the document's content.
query = "when did Shah Ismail come to the throne?"

# Run the whole load -> index -> retrieve -> answer chain on doc_path.
answer = answer_question_from_doc(doc_path, query)
print("Answer:", answer)


Device set to use cuda:0


Answer: 1555) and Istanbul (1590
