In [22]:
import os
from typing import List, Dict, Any

import pandas as pd

In [23]:
from langchain_core.documents import Document
from langchain_text_splitters import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)

print("Setup completed")


Setup completed


In [24]:
doc= Document(
    page_content="This is a test document",
    metadata={"source": "test", "page": 1, "date": "2020-01-01", "author": "John Doe", "tags": ["test", "example"]}
)

print("Document created")

print(doc.page_content)
print(doc.metadata)

Document created
This is a test document
{'source': 'test', 'page': 1, 'date': '2020-01-01', 'author': 'John Doe', 'tags': ['test', 'example']}


In [25]:
import os
os.makedirs("data/text_files", exist_ok=True) 

In [26]:
sample_data={
    "data/text_files/file1.txt": """Python
Python is a high-level, interpreted language known for its clean syntax and readability. It's often praised for being easy to learn.

Primary Use Cases:

Data Science, AI & Machine Learning: This is its biggest domain. It's the standard for libraries like Pandas, NumPy, Scikit-learn, TensorFlow, and PyTorch.

Web Development (Backend): Used to build server-side applications with frameworks like Django and Flask.

Scripting & Automation: Widely used for automating repetitive tasks, writing system scripts, and building developer tools.

Popularity Data:

TIOBE Index: Consistently ranks as the #1 most popular programming language worldwide.

Stack Overflow Survey: Regularly voted the most "wanted" language (meaning developers who don't use it want to learn it) and is one of the most commonly used.

Community: Has an extremely large and active community, especially in scientific and data-focused fields""",
    "data/text_files/file2.txt": """JavaScript (JS)
JavaScript is a high-level, dynamic language that is the core technology of the web. It's the only language that runs natively in web browsers.

Primary Use Cases:

Frontend Web Development: Used to create all interactive elements on a website. It powers frameworks like React, Angular, and Vue.

Backend Web Development: With Node.js, JavaScript can run on the server, allowing for full-stack development using a single language.

Mobile Apps: Used in frameworks like React Native to build cross-platform mobile apps.

Desktop Apps: Used with frameworks like Electron (which powers VS Code and Slack).

Popularity Data:

Stack Overflow Survey: Has been the most commonly used programming language for over a decade.

GitHub: Consistently has the most repositories and active projects.

Ecosystem: The npm (Node Package Manager) is the largest ecosystem of open-source libraries in the world.""",
    "data/text_files/file3.txt": """Java is a compiled, class-based, object-oriented language known for its "Write Once, Run Anywhere" (WORA) philosophy, meaning compiled Java code can run on all platforms that support Java without recompilation. It is known for its stability, security, and performance.

Primary Use Cases:

Enterprise Applications: The dominant language for large-scale, backend systems at corporations, banks, and financial institutions, often using the Spring framework.

Android App Development: The original and still primary language for building native Android applications.

Big Data Systems: Many major big data tools, like Apache Hadoop, Kafka, and Spark, are written in Java or Scala (which runs on the Java Virtual Machine).

Popularity Data:

TIOBE Index: A long-standing top-tier language, consistently ranking in the top 3 or 4 for decades, indicating massive, sustained global use.

Job Market: The demand for Java developers remains extremely high, especially in enterprise-level and finance jobs, due to the enormous existing codebase in the world."""
}


for filepath,content in sample_data.items():
    with open(filepath,'w', encoding="utf-8") as f:
        f.write(content)

        print(f"File {filepath} created")

File data/text_files/file1.txt created
File data/text_files/file2.txt created
File data/text_files/file3.txt created


In [27]:
# from langchain.document_loaders import TextLoader
from langchain_community.document_loaders import TextLoader


loader = TextLoader("data/text_files/file1.txt")
documents = loader.load()

print(type(documents))
print(documents)


print(documents[0].metadata)
print(documents[0].page_content)

<class 'list'>
[Document(metadata={'source': 'data/text_files/file1.txt'}, page_content='Python\nPython is a high-level, interpreted language known for its clean syntax and readability. It\'s often praised for being easy to learn.\n\nPrimary Use Cases:\n\nData Science, AI & Machine Learning: This is its biggest domain. It\'s the standard for libraries like Pandas, NumPy, Scikit-learn, TensorFlow, and PyTorch.\n\nWeb Development (Backend): Used to build server-side applications with frameworks like Django and Flask.\n\nScripting & Automation: Widely used for automating repetitive tasks, writing system scripts, and building developer tools.\n\nPopularity Data:\n\nTIOBE Index: Consistently ranks as the #1 most popular programming language worldwide.\n\nStack Overflow Survey: Regularly voted the most "wanted" language (meaning developers who don\'t use it want to learn it) and is one of the most commonly used.\n\nCommunity: Has an extremely large and active community, especially in scienti

In [28]:
### Direcytory Loaders

from langchain_community.document_loaders import DirectoryLoader

loader = DirectoryLoader("data/text_files", glob="*.txt",
                         loader_cls=TextLoader, ### LOADER CLASS TO USE
                         loader_kwargs={"encoding": "utf-8"},
                         show_progress=True)
documents = loader.load()

print(type(documents))

print(f" Loaded {len(documents)} documents")
for i, doc in enumerate(documents):

  print(f"\nDocument {i+1}:")
  print(f"source: {doc.metadata['source']}")
  print(f". Length: {len(doc.page_content)} characters")




100%|██████████| 3/3 [00:00<00:00, 3405.39it/s]

<class 'list'>
 Loaded 3 documents

Document 1:
source: data/text_files/file2.txt
. Length: 908 characters

Document 2:
source: data/text_files/file3.txt
. Length: 1048 characters

Document 3:
source: data/text_files/file1.txt
. Length: 918 characters





In [29]:
# method - Character-Based Text Splitting

char_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=10,separator="\n",length_function=len)

char_chunks = char_splitter.split_text(documents[0].page_content)

print(len(char_chunks))
print(char_chunks[0])
print("---------")
print(char_chunks[1])

6
JavaScript (JS)
JavaScript is a high-level, dynamic language that is the core technology of the web. It's the only language that runs natively in web browsers.
Primary Use Cases:
---------
Frontend Web Development: Used to create all interactive elements on a website. It powers frameworks like React, Angular, and Vue.


In [31]:
###  recursive Character-Based Text Splitting

recursive_char_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=10,separators=[" "],length_function=len)

recursive_char_chunks = recursive_char_splitter.split_text(documents[0].page_content)

print(len(recursive_char_chunks))
print(char_chunks[0])
print("---------")
print(char_chunks[1])

5
JavaScript (JS)
JavaScript is a high-level, dynamic language that is the core technology of the web. It's the only language that runs natively in web browsers.
Primary Use Cases:
---------
Frontend Web Development: Used to create all interactive elements on a website. It powers frameworks like React, Angular, and Vue.


In [34]:
### Token Bsased Splitting

token_splitter= TokenTextSplitter(
    chunk_size=40,
    chunk_overlap=10,
    length_function=len
)

token_chunks= token_splitter.split_text(documents[0].page_content)

print(f"Created {len(token_chunks)} chunks")
print(token_chunks[0])
print("---------")
print(token_chunks[1])

Created 7 chunks
JavaScript (JS)
JavaScript is a high-level, dynamic language that is the core technology of the web. It's the only language that runs natively in web browsers.


---------
 that runs natively in web browsers.

Primary Use Cases:

Frontend Web Development: Used to create all interactive elements on a website. It powers frameworks like React, Angular, and
