In [1]:
%pip install langchain langchain-huggingface langchain-community langchain-text-splitters faiss-cpu pypdf sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
import os
import PyPDF2



  from .autonotebook import tqdm as notebook_tqdm


‚úÖ All packages installed and imported successfully!


In [None]:
from langchain_community.document_loaders import PyPDFLoader

def read_document(file_path):
   
    if file_path.endswith(".pdf"):
        loader = PyPDFLoader(file_path)
        docs = loader.load()  # returns List[Document], each doc = 1 page
        print(f"‚úÖ PDF loaded successfully: {len(docs)} pages")
        return docs

    elif file_path.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        docs = [
            Document(
                page_content=text,
                metadata={"page": 1, "source": file_path},
            )
        ]
        print(f"TXT file loaded: {len(text)} characters")
        return docs

    else:
        raise ValueError("Only PDF and TXT files are supported")


In [None]:
def split_text(documents, chunk_size=500, chunk_overlap=100):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    chunks = text_splitter.split_documents(documents)
    print(f"Text split into {len(chunks)} chunks")
    return chunks


In [71]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [72]:
def create_vector_store(chunks, embeddings):
    vector_store = FAISS.from_documents(chunks, embeddings)
    print("Vector store created successfully")
    return vector_store

In [73]:
def save_vector_store(vector_store, save_path="vector_store"):
    vector_store.save_local(save_path)
    print(f" Vector store saved to: {save_path}")


In [None]:
def query_vector_store(vector_store, query, k=3):
    results = vector_store.similarity_search(query, k=k)
    print(f"Found {len(results)} relevant chunks")
    return results

In [None]:
import re

def beautify_text(text):
    """Clean and format extracted PDF text into readable paragraphs."""

    # Add space between words smashed together: e.g., "Whatis" -> "What is"
    text = re.sub(r'([a-zA-Z])([A-Z])', r'\1 \2', text)

    # Add space before capital letters inside words: Databaseisacollection -> Database is a collection
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)

    # Add space after punctuation if missing
    text = re.sub(r'([.,;!?])(?=[A-Za-z])', r'\1 ', text)

    # Add space after parentheses if missing: SQL(Notes -> SQL (Notes
    text = re.sub(r'\(', '( ', text)
    text = re.sub(r'\)', ') ', text)

    # Fix cases like: Notesby -> Notes by
    text = re.sub(r'([a-zA-Z])by', r'\1 by', text)

    # Ensure bullets format nicely
    text = text.replace("‚óè", "\n\n‚Ä¢ ")

    # Put a newline before headings like "What is"
    text = re.sub(r'(What is)', r'\n\n\1', text, flags=re.IGNORECASE)

    # Normalize spaces
    text = re.sub(r'\s{2,}', ' ', text)

    # Normalize line breaks
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()


In [89]:
def format_results(results, top_k=3):
    if not results:
        return "No relevant information found."
    
    formatted_chunks = []
    
    for i, doc in enumerate(results[:top_k], start=1):
        meta = doc.metadata or {}
        page = meta.get("page")
        source = meta.get("source")
        start_index = meta.get("start_index")
        
        # Clean and beautify text
        clean_text = beautify_text(doc.page_content)
        
        formatted_chunks.append(
            f"[Result {i}] page={page}, source={source}, start_index={start_index}\n\n{clean_text}"
        )
    
    return ("\n" + "=" * 80 + "\n").join(formatted_chunks)


In [90]:
print("\n Enter the path to your document (PDF or TXT):")
file_name = input("File path: ").strip()


 Enter the path to your document (PDF or TXT):


In [91]:
text = read_document(file_name)


‚úÖ PDF loaded successfully: 29 pages


In [92]:
chunks = split_text(text, chunk_size=500, chunk_overlap=100)

Text split into 79 chunks


In [93]:
chunks[0]

Document(metadata={'producer': 'Skia/PDF m117 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'SQL Notes by Apna College', 'source': 'E:\\Langchain-Project\\SQL Notes by Apna College (1).pdf', 'total_pages': 29, 'page': 0, 'page_label': '1'}, page_content='SQL(NotesbyApnaCollege)\nWhatisDatabase?Databaseisacollectionofinterrelateddata.\nWhatisDBMS?DBMS(DatabaseManagement System)issoftwareusedtocreate,manage,andorganizedatabases.\nWhatisRDBMS?‚óè RDBMS(RelationalDatabaseManagement System)-isaDBMSbasedontheconceptoftables(alsocalledrelations).‚óè Dataisorganizedintotables(alsoknownasrelations)withrows(records)andcolumns(attributes).‚óè Eg-MySQL,PostgreSQL,Oracleetc.')

In [94]:
vector_store = create_vector_store(chunks, embeddings)


Vector store created successfully


In [95]:
save_path = input("\n Enter path to save vector store (default: ./vector_store): ").strip() or "./vector_store"
save_vector_store(vector_store, save_path)

 Vector store saved to: sql


In [96]:
print("\n" + "="*50)
print(" System Ready! Ask questions about your document")
print("Type 'quit' to exit")
print("="*50 + "\n")

while True:
    query = input("\n Your Question: ")
    
    if query.lower() == 'quit':
        print("üëã Goodbye!")
        break
    
    results = query_vector_store(vector_store, query, k=3)
    
    
    answer = format_results(results)
    print("\nüí° Answer:")
    print("-" * 50)
    print(answer)
    print("-" * 50)



 System Ready! Ask questions about your document
Type 'quit' to exit

Found 3 relevant chunks

üí° Answer:
--------------------------------------------------
[Result 1] page=0, source=E:\Langchain-Project\SQL Notes by Apna College (1).pdf, start_index=None

Whatis SQL? SQLis Structured Query Language-usedtostore, manipulateandretrievedatafrom RDBMS.(Itisnotadatabase, itisalanguageusedtointeractwithdatabase)
Weuse SQLfor CRUDOperations:

‚Ä¢ CREATE-Tocreatedatabases, tables, inserttuplesintablesetc

‚Ä¢ READ-Toreaddatapresentinthedatabase.

‚Ä¢ UPDATE-Modifyalreadyinserteddata.

‚Ä¢ DELETE-Deletedatabase, tableorspecificdatapoint/tuple/rowormultiplerows.
*Note-SQLkeywordsare NOTcasesensitive. Eg: selectisthesameas SELECTin SQL.
[Result 2] page=0, source=E:\Langchain-Project\SQL Notes by Apna College (1).pdf, start_index=None

SQL(Notesby Apna College)
Whatis Database? Databaseisacollectionofinterrelateddata.
Whatis DBMS? DBMS(Database Management System)issoftwareusedtocreate, manage, 