In [None]:
import os
import pandas as pd
import numpy as np
from dotenv import load_dotenv 

import langchain
from typing import List, Dict, Any


In [2]:
from langchain_core.documents import Document 
'''
It is better to keep our data or text in the document structure expected by LangChain.
That is why we use this class: it converts raw text into a LangChain Document. Each Document object
has page_content and metadata attributes: page_content holds the text, and metadata holds descriptive information about the document.
'''
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
) # These are the splitters that will split the text into chunks

In [4]:
# Build a sample LangChain Document by hand: the raw text goes into
# page_content and the descriptive key/value pairs go into metadata.
sample_text = "This is just for testing purposes. But let me tell you one thing - One day I will be working a big product based company like google"
sample_metadata = {
    "source": "Pramod's thought",
    "Author": "Pramod Vepada",
    "date": "04-09-2025",
    "category": "Ambition",
}
doc = Document(page_content=sample_text, metadata=sample_metadata)

# Show both attributes of the freshly created document.
print(f"content = {doc.page_content}")
print(f"metadata = {doc.metadata}")


content = This is just for testing purposes. But let me tell you one thing - One day I will be working a big product based company like google
metadata = {'source': "Pramod's thought", 'Author': 'Pramod Vepada', 'date': '04-09-2025', 'category': 'Ambition'}


#### The metadata field in LangChain's Document object is crucial for RAG applications because it enables:

Key Benefits
1. Source Attribution

- Track which file, URL, or database the content came from
- Provide citations in responses to users
- Enable fact-checking and verification

2. Filtering & Routing

- Filter documents by type, date, author, or domain
- Route queries to specific document subsets
- Implement access control based on metadata

3. Context Enhancement

- Store document titles, summaries, or categories
- Include creation dates, authors, or document types
- Add custom tags for better retrieval

4. Retrieval Optimization

- Use metadata for hybrid search (semantic + metadata filtering)
- Implement time-based or relevance-based scoring
- Enable multi-modal retrieval strategies