In [4]:
import os
from typing import List, Dict, Any
import pandas as pd

In [1]:
from langchain_core.documents import Document
from langchain.text_splitter import(
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)
print("Set up Completed!")

Set up Completed!


In [6]:
## create a simple document
doc=Document(
    page_content="This is the main text content that will be embedded and searched.",
    metadata={
        "source":"example.txt",
        "page":1,
        "author":"Krish Naik",
        "date_created":"2024-01-01",
        "cutom_field":"any_value"

    }
)
print("Document Structure")

print(f"Content :{doc.page_content}")
print(f"Metadata :{doc.metadata}")

# Why metadata matters:
print("\n📝 Metadata is crucial for:")
print("- Filtering search results")
print("- Tracking document sources")
print("- Providing context in responses")
print("- Debugging and auditing")

Document Structure
Content :This is the main text content that will be embedded and searched.
Metadata :{'source': 'example.txt', 'page': 1, 'author': 'Krish Naik', 'date_created': '2024-01-01', 'cutom_field': 'any_value'}

📝 Metadata is crucial for:
- Filtering search results
- Tracking document sources
- Providing context in responses
- Debugging and auditing


In [7]:
page_content="This is the main text content that will be embedded and searched."
print(len(page_content))
words = page_content.split()
print(len(words))

65
12


In [8]:
## Create a simple txt file
import os
os.makedirs("data/text_files",exist_ok=True)

In [9]:
sample_texts={
    "data/text_files/python_intro.txt":"""Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability.
Created by Guido van Rossum and first released in 1991, Python has become one of the most popular
programming languages in the world.

Key Features:
- Easy to learn and use
- Extensive standard library
- Cross-platform compatibility
- Strong community support

Python is widely used in web development, data science, artificial intelligence, and automation.""",
    
    "data/text_files/machine_learning.txt": """Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems
    
    
    """

}

for filepath,content in sample_texts.items():
    with open(filepath,'w',encoding="utf-8") as f:
        f.write(content)

print("✅ Sample text files created!")

✅ Sample text files created!


In [10]:
from langchain_community.document_loaders import DirectoryLoader

## load all the text files from the directory
dir_loader=DirectoryLoader(
    "data/text_files",
    glob="**/*.txt", ## Pattern to match files  
    loader_cls= TextLoader, ##loader class to use
    loader_kwargs={'encoding': 'utf-8'},
    show_progress=True

)

documents=dir_loader.load()

print(f"📁 Loaded {len(documents)} documents")
for i, doc in enumerate(documents):
    print(f"\nDocument {i+1}:")
    print(f"  Source: {doc.metadata['source']}")
    print(f"  Length: {len(doc.page_content)} characters")


# 📊 Analysis
print("\n📊 DirectoryLoader Characteristics:")
print("✅ Advantages:")
print("  - Loads multiple files at once")
print("  - Supports glob patterns")
print("  - Progress tracking")
print("  - Recursive directory scanning")

print("\n❌ Disadvantages:")
print("  - All files must be same type")
print("  - Limited error handling per file")
print("  - Can be memory intensive for large directories")

NameError: name 'TextLoader' is not defined

In [None]:
from langchain.document_loaders import TextLoader
from langchain_community.document_loaders import TextLoader

## Loading a single text file
loader=TextLoader("data/text_files/python_intro.txt", encoding="utf-8")

documents=loader.load()
print(f"📄 Loaded {len(documents)} document")
print(f"Content preview: {documents[0].page_content[:100]}...")
print(f"Metadata: {documents[0].metadata}")



📄 Loaded 1 document
Content preview: Python Programming Introduction

Python is a high-level, interpreted programming language known for ...
Metadata: {'source': 'data/text_files/python_intro.txt'}


In [2]:
from langchain.document_loaders import TextLoader

# Create an instance of TextLoader with the file path and encoding
loader = TextLoader("C:\\Users\\Shaur\\OneDrive\\Desktop\\ML\\test.txt", encoding="utf-8")

# Load the documents
documents = loader.load()

# Print the loaded documents to verifyuv 
print(documents)

[Document(metadata={'source': 'C:\\Users\\Shaur\\OneDrive\\Desktop\\ML\\test.txt'}, page_content='this is going to be my first test file for loading a text file to vs code and applying parsing techniques to it\nafter this we will apply parsing tecchniques \nand then store it in a vector database\n')]


In [3]:
for doc in documents:
    print(doc.page_content)

this is going to be my first test file for loading a text file to vs code and applying parsing techniques to it
after this we will apply parsing tecchniques 
and then store it in a vector database



In [4]:
# Assuming 'documents' is the list loaded by TextLoader
first_document = documents[0]

# Now, access the page_content of that specific document
page = first_document.page_content
print(page)
print(len(page))
a = page.split()
print(len(a))

this is going to be my first test file for loading a text file to vs code and applying parsing techniques to it
after this we will apply parsing tecchniques 
and then store it in a vector database

197
38


In [6]:
from langchain.text_splitter import CharacterTextSplitter
# Access the page_content of the first document in the list
text_to_split = documents[0].page_content

print("1️⃣ CHARACTER TEXT SPLITTER")
char_splitter = CharacterTextSplitter(
    separator=" ",
    chunk_size=60,
    chunk_overlap=20,
    length_function=len
)

# Pass the string content (text_to_split) to the splitter
char_chunks = char_splitter.split_text(text_to_split)

print(f"Created {len(char_chunks)} chunks")
print(f"First chunk: {char_chunks[0][:100]}...")

1️⃣ CHARACTER TEXT SPLITTER
Created 5 chunks
First chunk: this is going to be my first test file for loading a text...


In [7]:
for i, chunk in enumerate(char_chunks):
    print(f"Chunk {i + 1}: {chunk}")

Chunk 1: this is going to be my first test file for loading a text
Chunk 2: for loading a text file to vs code and applying parsing
Chunk 3: and applying parsing techniques to it
after this we will
Chunk 4: this we will apply parsing tecchniques 
and then store it in
Chunk 5: then store it in a vector database


In [8]:
print(char_chunks[0])

this is going to be my first test file for loading a text


In [9]:
print(documents[0].metadata)

{'source': 'C:\\Users\\Shaur\\OneDrive\\Desktop\\ML\\test.txt'}


In [None]:
# Method 1: Character-based splitting
print("1️⃣ CHARACTER TEXT SPLITTER")
char_splitter = CharacterTextSplitter(
    separator="\n",  # Split on newlines
    chunk_size=50,  # Max chunk size in characters
    chunk_overlap=20,  # Overlap between chunks
    length_function=len  # How to measure chunk size
)

char_chunks=char_splitter.split_text(text_to_split)
print(f"Created {len(char_chunks)} chunks")
print(f"First chunk: {char_chunks[0][:100]}...")

Created a chunk of size 111, which is longer than the specified 50


1️⃣ CHARACTER TEXT SPLITTER
Created 3 chunks
First chunk: this is going to be my first test file for loading a text file to vs code and applying parsing techn...


In [None]:
# Method 2: Recursive character splitting (RECOMMENDED)
print("\n2️⃣ RECURSIVE CHARACTER TEXT SPLITTER")
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=[" "],  # Try these separators in order
    chunk_size=50,
    chunk_overlap=20,
    length_function=len
)

recursive_chunks = recursive_splitter.split_text(text_to_split)
print(f"Created {len(recursive_chunks)} chunks")
print(f"First chunk: {recursive_chunks[0][:100]}...")


2️⃣ RECURSIVE CHARACTER TEXT SPLITTER
Created 6 chunks
First chunk: this is going to be my first test file for loading...


In [None]:
# Create text without natural break points
simple_text = "This is sentence one and it is quite long. This is sentence two and it is also quite long. This is sentence three which is even longer than the others. This is sentence four. This is sentence five. This is sentence six."

splitter = RecursiveCharacterTextSplitter(
    separators=[" "],  # Only split on spaces
    chunk_size=80,
    chunk_overlap=20,
    length_function=len
)

chunks = splitter.split_text(simple_text)

print(f"\nSimple text example - {len(chunks)} chunks:\n")

for i in range(len(chunks) - 1):
    print(f"Chunk {i+1}: '{chunks[i]}'")
    print(f"Chunk {i+2}: '{chunks[i+1]}'")
    
    
    print()


Simple text example - 4 chunks:

Chunk 1: 'This is sentence one and it is quite long. This is sentence two and it is also'
Chunk 2: 'two and it is also quite long. This is sentence three which is even longer than'

Chunk 2: 'two and it is also quite long. This is sentence three which is even longer than'
Chunk 3: 'is even longer than the others. This is sentence four. This is sentence five.'

Chunk 3: 'is even longer than the others. This is sentence four. This is sentence five.'
Chunk 4: 'is sentence five. This is sentence six.'



In [None]:
# Method 3: Token-based splitting
print("\n3️⃣ TOKEN TEXT SPLITTER")
token_splitter = TokenTextSplitter(
    chunk_size=50,  # Size in tokens (not characters)
    chunk_overlap=10
)

token_chunks = token_splitter.split_text(text_to_split)
print(f"Created {len(token_chunks)} chunks")
print(f"First chunk: {token_chunks[0][:100]}...")


3️⃣ TOKEN TEXT SPLITTER


NameError: name 'TokenTextSplitter' is not defined

In [None]:
print("KO")

KO
