In [12]:
import os
from typing import List,Dict,Any
import pandas as pd

In [13]:
from langchain_core.documents import Document 
from langchain_text_splitters import(
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)

# Understanding document structure in LangChain

In [14]:
# Minimal Document example: the text that gets embedded/searched, plus a
# metadata dict carrying arbitrary descriptive fields about that text.
doc = Document(
    page_content="Random text content that will be embedded and searched",
    metadata=dict(
        source="example.txt",
        author="Pulkit",
        page=1,
        date_created="01-01-2026",
        custom_field="random info",
    ),
)

# Show the two parts of the document, one item per output line.
print(
    "Document structure",
    f"Content : {doc.page_content}",
    f"Metadata : {doc.metadata}",
    sep="\n",
)

Document structure
Content : Random text content that will be embedded and searched
Metadata : {'source': 'example.txt', 'author': 'Pulkit', 'page': 1, 'date_created': '01-01-2026', 'custom_field': 'random info'}


Text File (.txt)

In [None]:
# Make sure the folder that will hold the sample text files is present.
# exist_ok=True keeps the call from raising when the folder already exists.
os.makedirs(name="data/text_files", exist_ok=True)


In [16]:
# Sample corpus: maps each target file path to the text written into it.
# The text is stored verbatim (including the tab characters on some lines).
sample_texts = {
    "data/text_files/git_cheatsheet.txt" : """Git Cheatsheet

git init                    # initialize new repo
git clone URL               # clone remote repo

Check Status & History-
git status                  # current state
git log                     # commit history
git log --oneline           # compact history
git diff                    # unstaged changes
git diff --staged           # staged changes

Stage & Commit-
git add <file.txt>           # stage/track file
git add .                    # stage everything
git commit -m "message"      # commit staged files
git commit --amend	     # Amend last commit

Branching-
git branch                   # list branches
git branch new-branch        # create branch
git checkout branch-name     # switch branch
git checkout -b new-branch   # create + switch
git merge branch-name	     # Merge branch

Fetch vs Pull-
git fetch        # download changes (no merge)
git pull         # fetch + merge

Unstage a file-
git restore --staged file.txt

Discard local changes-
git restore file.txt

Undo last commit (keep changes)-
git reset --soft HEAD~1

Undo last commit (delete changes ⚠)-
git reset --hard HEAD~1

git revert HEAD~1

GitHub
git remote -v      		# check remote origin URL of git repo
git remote remove origin	# remove remote origin URL of git repo
git remote add origin URL
git branch -M main
git push -u origin main""",
    "data/text_files/uv_package_manager.txt": """uv package manager

uv init -> Initialize a Python project

run add "libraries" -> adds libraries to dependencies

uv run python main.py -> Runs the file

uv venv -> Create a virtual environment

source .venv/bin/activate -> activate virtual env

uv sync -> sync env

uv list -> list installed dependencies

uv info -> show project info

uv python install 3.11  -> use specific python version
uv venv --python 3.11

uv cache clean -> clear uv cache"""
}

# Write every sample to disk as UTF-8, overwriting any existing file.
for filepath, content in sample_texts.items():
    # Create the parent folder here so this cell does not depend on a separate
    # directory-creating cell having been run first; without it, open() raises
    # FileNotFoundError on a fresh checkout.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:  # a bare filename has no parent; os.makedirs("") would raise
        os.makedirs(parent_dir, exist_ok=True)
    with open(filepath, 'w', encoding="utf-8") as f:
        f.write(content)

print("Sample file created")

Sample file created


# TextLoader - Reads a single file

In [17]:
from langchain_community.document_loaders import TextLoader

# Read one text file; load() gives back a sequence of documents.
loader = TextLoader("data/text_files/git_cheatsheet.txt", encoding="utf-8")
documents = loader.load()

# Report how many documents came back, then preview the first one:
# its first 100 characters of content and its metadata.
print(
    f"Loaded {len(documents)} document",
    f"Content Preview :{documents[0].page_content[:100]}",
    f"Metadata :{documents[0].metadata}",
    sep="\n",
)


Loaded 1 document
Content Preview :Git Cheatsheet

git init                    # initialize new repo
git clone URL               # clon
Metadata :{'source': 'data/text_files/git_cheatsheet.txt'}


# DirectoryLoader - Loads multiple .txt files

In [24]:
from langchain_community.document_loaders import DirectoryLoader

# Load every .txt file under data/text_files, handing each one to TextLoader.
dir_loader = DirectoryLoader(
    "data/text_files",
    glob="**/*.txt",          # controls which files are loaded
    loader_cls=TextLoader,    # loader class used for each matched file
    # Canonical codec name, matching the single-file TextLoader call above.
    # The previous spelling "utf - 8" only resolved through Python's lenient
    # codec-name normalization.
    loader_kwargs={"encoding": "utf-8"},
    show_progress=True,       # display a progress bar while loading
)
documents = dir_loader.load()

print(f"Loaded {len(documents)} documents")
# start=1 so the printed numbering is 1-based without extra arithmetic.
for i, doc in enumerate(documents, start=1):
    print(f"\nDocument {i}:")
    print(f"  Source: {doc.metadata['source']}")
    print(f"  Length: {len(doc.page_content)} characters")
   

100%|██████████| 2/2 [00:00<00:00, 3666.35it/s]

Loaded 2 documents

Document 1:
  Source: data\text_files\git_cheatsheet.txt
  Length: 1336 characters

Document 2:
  Source: data\text_files\uv_package_manager.txt
  Length: 448 characters



