In [1]:
import langchain

Introduction to Data Ingestion

### Introduction To Data Ingestion

In [2]:
import os
from typing import List, Dict, Any
import pandas as pd

In [3]:
from langchain_core.documents import Document
from langchain.text_splitter import(
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)
print("Setup is completed")

Setup is completed


## Understanding Document Structure in LangChain

In [4]:
# Build a minimal Document: the text goes in page_content and any
# descriptive key/value pairs go in the metadata dict.
doc_metadata = {
    "source": "example.txt",
    "page1": 1,
    "author": "John Doe",
    "date_created": "2025-10-01",
    "custom_data": "any value",
}
doc = Document(
    page_content="This is the main text content that will be embedded and searched.",
    metadata=doc_metadata,
)

# Show both parts of the document, one per line.
print("Document Structure")
for label, value in (("Content", doc.page_content), ("Metadata", doc.metadata)):
    print(f"{label} :{value}")

Document Structure
Content :This is the main text content that will be embedded and searched.
Metadata :{'source': 'example.txt', 'page1': 1, 'author': 'John Doe', 'date_created': '2025-10-01', 'custom_data': 'any value'}


In [5]:
# Inspect the concrete class of the `doc` object
type(doc)

langchain_core.documents.base.Document

## Text Files (.txt) - The Simplest Case {#2-text-files}

In [6]:
## Create a simple text file

In [7]:
import os
# Create the folder that will hold the sample .txt files.
# exist_ok=True keeps this cell safe to re-run when the folder is already there.
os.makedirs("data/text_files", exist_ok=True)

In [8]:
# Sample corpus for the loader cells: maps each file path to the text written to it.
# python_intro.txt is part of the corpus because the single-file TextLoader cell
# reads it; without this entry that cell only works when the file happens to be
# left on disk from an earlier session.
sample_text={
    "data/text_files/Machine_learning.txt":"""Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties

Applications include image recognition, speech processing, and recommendation systems

    """,
    "data/text_files/python_intro.txt":"""Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability. Created by Guido van Rossum and first released in 1991, Python has become one of the most popular programming languages in the world.

Key Features:

Easy to learn and use
Extensive standard library
Cross-platform compatibility
Strong community support
Python is widely used in web development, data science, artificial intelligence, and automation.

    """,
}
for filepath, content in sample_text.items():
    # Create the parent folder here as well so this cell can run on its own.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    # Mode 'w' overwrites, so re-running the cell always leaves the same content on disk.
    with open(filepath, 'w', encoding="utf-8") as f:
        f.write(content)
print("Sample text file is created")


Sample text file is created


In [9]:
# TextLoader is imported from langchain_community only; the legacy
# `langchain.document_loaders` path is deprecated and was immediately
# shadowed by this import anyway.
from langchain_community.document_loaders import TextLoader

## Loading a single text file
loader=TextLoader("data/text_files/python_intro.txt", encoding="utf-8")
documents=loader.load()
print(f"loaded {len(documents)} document")
# The preview is capped at the first 1000 characters of the content
print(f"Content Preview: {documents[0].page_content[:1000]}...")
print(f"Metadata: {documents[0].metadata}")

loaded 1 document
Content Preview: Python Programming Introduction

Python is a high-level, interpreted programming language known for its simplicity and readability. Created by Guido van Rossum and first released in 1991, Python has become one of the most popular programming languages in the world.

Key Features:

Easy to learn and use
Extensive standard library
Cross-platform compatibility
Strong community support
Python is widely used in web development, data science, artificial intelligence, and automation.

    ...
Metadata: {'source': 'data/text_files/python_intro.txt'}


## DirectoryLoader - Multiple Text Files

In [10]:
from langchain_community.document_loaders import DirectoryLoader

## Load all the text files from the directory
dir_loader=DirectoryLoader(
    "data/text_files",
    glob="**/*.txt",  ## Pattern to match files
    loader_cls= TextLoader,  ## loader class to use
    loader_kwargs={'encoding':'utf-8'},
    show_progress=True
)
# Sort by source path so the order is the same on every run and platform:
# later cells read documents[0], and the order files are discovered in is
# not something this notebook controls.
documents=sorted(dir_loader.load(), key=lambda loaded: loaded.metadata["source"])

print(f"Loaded {len(documents)} documents")
# The loop variable is deliberately not called `doc`, so the example Document
# created earlier in the notebook is not overwritten.
for position, loaded_doc in enumerate(documents, start=1):
    print(f"\nDocument {position}:")
    print(f"   Source: {loaded_doc.metadata['source']}")
    print(f"   Length: {len(loaded_doc.page_content)} characters")

100%|██████████| 4/4 [00:00<00:00, 1448.93it/s]

Loaded 4 documents

Document 1:
   Source: data\text_files\Machine_learning.txt
   Lenth:  574 characters

Document 2:
   Source: data\text_files\machine_learning1.txt
   Lenth:  575 characters

Document 3:
   Source: data\text_files\python_intro.txt
   Lenth:  487 characters

Document 4:
   Source: data\text_files\python_intro1.txt
   Lenth:  489 characters





## Text Splitting Strategies

In [11]:
### Different text splitting strategies
from langchain.text_splitter import(
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter
)

In [12]:
# Sample input for the splitter cells: the text of the first loaded document
first_document = documents[0]
text = first_document.page_content
text

'Machine Learning Basics\n\nMachine learning is a subset of artificial intelligence that enables systems to learn and improve\nfrom experience without being explicitly programmed. It focuses on developing computer programs\nthat can access data and use it to learn for themselves.\n\nTypes of Machine Learning:\n1. Supervised Learning: Learning with labeled data\n2. Unsupervised Learning: Finding patterns in unlabeled data\n3. Reinforcement Learning: Learning through rewards and penalties\n\nApplications include image recognition, speech processing, and recommendation systems\n\n    '

In [13]:
# Method 1: character-based splitting with a single separator
print("CHARACTER TEXT SPLITTER")

char_splitter_options = {
    "separator": "\n",       # split on newlines
    "chunk_size": 200,       # maximum chunk size, measured by length_function
    "chunk_overlap": 20,     # overlap between consecutive chunks
    "length_function": len,  # len => sizes are counted in characters
}
char_splitter = CharacterTextSplitter(**char_splitter_options)
char_chunks = char_splitter.split_text(text)

# Report how many chunks came back and preview the start of the first one.
chunk_total = len(char_chunks)
first_chunk_preview = char_chunks[0][:100]
print(f"Created {chunk_total} chunks")
print(f"First chunk: {first_chunk_preview}...")


CHARACTER TEXT SPLITTER
Created 4 chunks
First chunk: Machine Learning Basics
Machine learning is a subset of artificial intelligence that enables systems...


In [14]:
# Print every character chunk with a divider line between consecutive chunks.
# Joining the whole list (instead of indexing 0..3) keeps this cell working
# when the splitter settings change and a different number of chunks is produced.
print("\n--------------------\n".join(char_chunks))

Machine Learning Basics
Machine learning is a subset of artificial intelligence that enables systems to learn and improve
--------------------
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.
Types of Machine Learning:
--------------------
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties
--------------------
Applications include image recognition, speech processing, and recommendation systems


In [15]:
# Method 2: Recursive character splitting (RECOMMENDED)
print("\n2️⃣ RECURSIVE CHARACTER TEXT SPLITTER")
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=['\n'],  # Separators are tried in order; only the newline is supplied here
    chunk_size=200,
    chunk_overlap=20,
    length_function=len
)

recursive_chunks = recursive_splitter.split_text(text)
print(f"Created {len(recursive_chunks)} chunks")
print(f"First chunk: {recursive_chunks[0][:100]}...")

# Print every chunk with a divider line between consecutive chunks.
# Joining the whole list (instead of indexing 0..3) avoids an IndexError when
# fewer than four chunks are produced and shows all of them when there are more.
print("\n--------------------\n".join(recursive_chunks))



2️⃣ RECURSIVE CHARACTER TEXT SPLITTER
Created 4 chunks
First chunk: Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables system...
Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables systems to learn and improve
--------------------
from experience without being explicitly programmed. It focuses on developing computer programs
that can access data and use it to learn for themselves.

Types of Machine Learning:
--------------------
1. Supervised Learning: Learning with labeled data
2. Unsupervised Learning: Finding patterns in unlabeled data
3. Reinforcement Learning: Learning through rewards and penalties
--------------------
Applications include image recognition, speech processing, and recommendation systems


In [16]:
import os

import certifi

# Method 3: token-based splitting.
# Point SSL_CERT_FILE at certifi's CA bundle before the splitter is used
# (presumably so tokenizer data can be fetched over HTTPS; confirm it is still needed).
ca_bundle_path = certifi.where()
os.environ['SSL_CERT_FILE'] = ca_bundle_path

print("\n3️⃣ TOKEN TEXT SPLITTER")
# Both sizes are measured in tokens, not characters.
token_splitter = TokenTextSplitter(chunk_overlap=10, chunk_size=50)

token_chunks = token_splitter.split_text(text)
token_chunk_total = len(token_chunks)
first_token_chunk = token_chunks[0]
print(f"Created {token_chunk_total} chunks")
print(f"First chunk: {first_token_chunk[:100]}...")


3️⃣ TOKEN TEXT SPLITTER
Created 3 chunks
First chunk: Machine Learning Basics

Machine learning is a subset of artificial intelligence that enables system...
