# *Depot-D.NLR*

### Extracting Text From PDF

In [1]:
import os      
import fitz    
from langchain.text_splitter import RecursiveCharacterTextSplitter


def extract_text_from_pdfs(project_dataset):
    """Extracts text from all PDFs in a given folder."""
    all_text_chunks = []  

    for pdf_file in os.listdir(project_dataset): 
        if pdf_file.endswith(".pdf"):  
            pdf_path = os.path.join(project_dataset, pdf_file)  
            doc = fitz.open(pdf_path)  
            
            for page_num, page in enumerate(doc): 
                text = page.get_text("text")  
                if text.strip():  
                    all_text_chunks.append({  
                        "content": text,  
                        "metadata": {"source": pdf_file, "page": page_num + 1}  
                    })
    
    return all_text_chunks  

### Chunking Them Into Documents

In [2]:
from langchain_core.documents import Document


def process_pdf(pdf_path):
    """Extracts text from a PDF and splits it into smaller chunks with metadata."""
    pdf_texts = extract_text_from_pdfs(pdf_path)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    
    # all_chunks = []
    documents= []
    for text_entry in pdf_texts:
        
        chunks = text_splitter.split_text(text_entry["content"])
        
        pagenumber = text_entry["metadata"]["page"]
        metadata = text_entry["metadata"]

        for idx, chunk in enumerate(chunks):
            # print(chunk[:10])
            # print(metadata)
            # print(f"{pagenumber}-{idx}")

            doc = Document(
                page_content=chunk,
                metadata=metadata,
                id=f"{pagenumber}-{idx}"
                
            )

            # print(doc)
            documents.append(doc)
            # all_chunks.append({"content": chunk, "metadata": text_entry["metadata"]})

    return documents
    
    
documents = process_pdf("./")

### Embedding Function / Embedding Setup for the Chunks / Documents

In [1]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

### Setting up The ChromaDB Vector Store

In [4]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="Depot-D.NLR_db",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

### Add Documents to The Vector Store

In [5]:
from uuid import uuid4

uuids = [str(uuid4()) for _ in range(len(documents))]

vector_store.add_documents(documents=documents, ids=uuids)

['4398fa3e-49f7-4e98-9c94-161e7f1da472',
 '2f7966f7-e720-437c-9ed4-69f32bdb919e',
 'f38cbb71-e3e7-4e9f-bfa8-7269ff0298d7',
 '3c7ed7ec-26a6-470e-a5ee-d8745360360e',
 'bd330890-7090-4300-97b9-a65546afb9f8',
 'b1e5f3e5-bd3e-478a-85f3-f4ea35077a08',
 'b34d9a45-382f-4e8f-b8a0-52a139a71a24',
 'e12732ab-213d-4be8-9fdd-6a6a06b9dacb',
 '7508cf91-a820-454a-b86f-a68f0406af97',
 '3001b8ab-7d6e-4edf-8e35-56b057169fb7',
 '443f0094-cc25-4f03-9b3c-9777963f7a74',
 'fc79d336-2ae0-407f-a1cb-0c3ea49a2d20',
 '8c9d6e13-8119-4cbd-8270-c1bb42a327aa',
 '5d7a9753-d1e9-402d-883f-ceece1ec3087',
 'db8ef679-ba35-427d-b612-d42617187911',
 'f388680f-de6a-4586-86ff-8908e487ba0d',
 'a8d766ab-583a-46b7-b52b-ab5fbb384fda',
 'cd904568-00ab-4069-b9f5-2bf59f20da77',
 'f7684c4f-82de-4a02-b182-0f7883087e49',
 '860ce774-a4cc-400c-bad1-c10f73430ef1',
 '839e9cde-191c-48cb-9923-88a452b2d33d',
 'e8674efa-a2c1-483f-8df7-4d59c2b0e128',
 '0461e38a-bc07-4942-829f-c4f88ddf0886',
 'f211b16c-a45b-410f-862c-3d5797fc70d4',
 '601995a5-b01d-

### Retriever Setup / Retriever Function

In [6]:
retriever = vector_store.as_retriever(
    search_type="mmr", search_kwargs={"k": 5, "fetch_k": 10}
)

### LLM Model

In [24]:
from langchain_community.llms import Ollama
from langchain_core.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Optional: for streaming output
callbacks = [StreamingStdOutCallbackHandler()]

llm = Ollama(
    model="llama3.1",  # this should match the model name shown in `ollama list`
    temperature=0.01,
    top_p=0.95,
    repeat_penalty=1.03,
    callbacks=callbacks,
)

# Test it
#response = llm.invoke("What is Retrieval Augmented Generation?")
#print(response)


### Response Cleaner Function

In [25]:
def clean_response(text):
    text = text.strip()
    if text.lower().startswith("train accident is"):
        text = text[text.find('.') + 1:].strip()
    
    keywords = ["include", "types of", "are as follows"]
    for word in keywords:
        if word in text.lower():
            text = text.replace("•", "-")
            break

    return text


### Prompt Template

In [None]:
from langchain_core.prompts import PromptTemplate

SYSTEM_TEMPLATE = """
You are an assistant specialized in Indian Railways documentation. 
Answer the following question using only the provided context. 
Be concise, skip generic definitions, and prioritize information relevant to Indian Railways.

If the answer involves categories, list them clearly in bullet points or numbered format.
Always include source metadata like PDF name and page number if available.

Question: {question}

Context:
{context}
    
    """

question_answering_prompt = PromptTemplate(template=SYSTEM_TEMPLATE, input_variables=["question", "context"])

### THE RAG CHAIN

In [27]:

from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


def format_docs(docs):
    context = "\n\n".join(doc.page_content for doc in docs)
    print(f"----n{context}\n----")
    return context

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | question_answering_prompt
    | llm
    | StrOutputParser()
)

### Qyerying

In [28]:
query = "What is the procedure for train accident investigation?"
response = rag_chain.invoke(query)

----nthe section where provided with signals etc.  
vii. prima facie cause of accident;  
g)  ensure that the Train Signal Register, Log Book, Private Number Book, Line Admission 
Book, speed Recorder Chart and other relevant records are seized,  
h)  obtain the statements of staff involved in the accident, as far as possible;  
i)   prepare a rough sketch showing the position of derailed vehicles, position of track and 
OHE etc. make a quick survey of the extent of damage ;

they have to be performed. On occurrence of an accident to his train, the Guard of the train 
shall immediately:  
1. Note the time of accident. 
2. Make a quick survey of the accident site for casualties, injuries, if any, and for deciding 
the assistance required.  
3. Send the first information of accident to the control office and then to the nearest 
Station Master furnishing the following information, through mobile phone or portable 
telephone, or walkie-talkie or gate phone, etc.,

Accident inquiries done 

### Cleaning the Response

In [29]:
# Clean the LLM response
cleaned = clean_response(response)
# Add metadata from the top 2-3 retrieved documents
docs = retriever.get_relevant_documents(query)
sources = []
for doc in docs:
    meta = doc.metadata
    sources.append(f"[Source: {meta.get('source', 'unknown')} | Page: {meta.get('page', '?')}]")
# Remove duplicates
sources = list(set(sources))
# Final formatted output
final_output = cleaned + "\n\n" + "\n".join(sources)
print(final_output)

The procedure for train accident investigation in Indian Railways involves the following steps:

**Initial Steps**

1. Note the time of accident (Source: PDF, page 14)
2. Make a quick survey of the accident site for casualties, injuries, and assistance required (Source: PDF, page 14)
3. Send the first information of accident to the control office and nearest Station Master (Source: PDF, page 14)

**Investigation**

* Seize relevant records such as Train Signal Register, Log Book, Private Number Book, Line Admission Book, speed Recorder Chart, and other relevant records (Source: PDF, page vii)
* Obtain statements of staff involved in the accident (Source: PDF, page vii)
* Prepare a rough sketch showing the position of derailed vehicles, track, and OHE (Source: PDF, page vii)

**Additional Steps**

* In case of serious accidents, the Commissioner of Railway Safety conducts the inquiry (Source: PDF, page 14)
* In case of other accidents, departmental officials conduct the inquiry (Source: