# Introduction to Data Ingestion

In [2]:
import os
from typing import List, Dict, Any
import pandas as pd

In [5]:
from langchain_core.documents import Document
from langchain_text_splitters import (RecursiveCharacterTextSplitter, CharacterTextSplitter, TokenTextSplitter)

print("Setup done.")

Setup done.


# Understanding Document Structure in Langchain

In [4]:
## create sample document
doc= Document(page_content="This is the main text content that will be embedded and searched", metadata={"source": "sample.txt",
                                                                                                         "author": "John Doe",
                                                                                                         "page": 1,
                                                                                                         "date_created": "2024-06-15",
                                                                                                         "length": 67,
                                                                                                         "keywords": ["sample", "document", "langchain"]
                                                                                                         })

print("Document Structure")

print(f"Content: {doc.page_content}")
print(f"Metadata: {doc.metadata}")
print(type(doc))

Document Structure
Content: This is the main text content that will be embedded and searched
Metadata: {'source': 'sample.txt', 'author': 'John Doe', 'page': 1, 'date_created': '2024-06-15', 'length': 67, 'keywords': ['sample', 'document', 'langchain']}
<class 'langchain_core.documents.base.Document'>


# Text Files

In [6]:
## Create a text file
import os
os.makedirs("data/text_files", exist_ok=True)

In [16]:
sample_texts = {
    "data/text_files/sample1.txt": """This is the content of sample text file one. It contains some example text for testing.

    Machine Learning with text in Python involves applying various algorithms and techniques to extract insights, classify, or generate text. This process generally follows a structured pipeline:
Text Preprocessing:
Lowercasing/Uppercasing: Standardizing case to treat words like "The" and "the" as the same.
Eliminating Stopwords: Removing common words (e.g., "a", "the", "is") that often carry little semantic meaning.
Stemming/Lemmatization: Reducing words to their root form (e.g., "running", "ran" to "run") to handle variations.
Removing Punctuation and Numbers: Cleaning the text of non-alphabetic characters.
Feature Vectorization:
Bag-of-Words (BoW): Representing text as a collection of word counts, ignoring grammar and word order.
TF-IDF (Term Frequency-Inverse Document Frequency): Weighing words based on their frequency in a document and rarity across the entire corpus.
Word Embeddings (e.g., Word2Vec, GloVe): Representing words as dense vectors in a continuous vector space, capturing semantic relationships.
Building and Training ML Models:
Supervised Learning: Training models for tasks like text classification (e.g., sentiment analysis, spam detection), where labeled data is available. Common algorithms include Naive Bayes, Support Vector Machines (SVMs), and deep learning models like Recurrent Neural Networks (RNNs) and Transformers.
Unsupervised Learning: Applying models for tasks like topic modeling (e.g., Latent Dirichlet Allocation - LDA) or clustering, where data is unlabeled.
Model Evaluation:
Assessing the model's performance using metrics relevant to the task (e.g., accuracy, precision, recall, F1-score for classification).
Key Python Libraries:
NLTK (Natural Language Toolkit): For basic text processing, tokenization, stemming, and lemmatization.
spaCy: For advanced NLP tasks, including named entity recognition, dependency parsing, and part-of-speech tagging.
scikit-learn: For implementing various machine learning algorithms and feature extraction techniques (e.g., CountVectorizer, TfidfVectorizer).
pandas: For data manipulation and handling text data in DataFrames.
TensorFlow/PyTorch: For building and training deep learning models for more complex text tasks.
""",
    "data/text_files/sample2.txt": """This is the content of sample text file two. It provides additional example text for testing.

    Natural Language Processing (NLP) is a subfield of artificial intelligence that focuses on the interaction between computers and human language. The goal of NLP is to enable computers to understand, interpret, and generate human language in a way that is valuable. NLP encompasses a variety of tasks, including:
Tokenization: Breaking down text into smaller units, such as words or sentences.
Part-of-Speech Tagging: Identifying the grammatical parts of speech (nouns, verbs, adjectives, etc.) in a sentence.
Named Entity Recognition (NER): Detecting and classifying named entities (people, organizations, locations, etc.) in text.
Sentiment Analysis: Determining the sentiment or emotional tone of a piece of text (positive, negative, neutral).
Machine Translation: Automatically translating text from one language to another."""
}

for file_path, content in sample_texts.items():
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(content)

print("Sample text files created.")

Sample text files created.


# Text Loader - Read single file

In [15]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("data/text_files/sample1.txt", encoding="utf-8")

documents = loader.load()

print(f"Loaded {len(documents)} Documents")
print(type(documents))
print(f"Content preview: {documents[0].page_content[:100]}")  
print(f"Metadata: {documents[0].metadata}")

print(documents)

Loaded 1 Documents
<class 'list'>
Content preview: This is the content of sample text file one. It contains some example text for testing.

    Machine
Metadata: {'source': 'data/text_files/sample1.txt'}
[Document(metadata={'source': 'data/text_files/sample1.txt'}, page_content='This is the content of sample text file one. It contains some example text for testing.\n\n    Machine Learning with text in Python involves applying various algorithms and techniques to extract insights, classify, or generate text. This process generally follows a structured pipeline:\nText Preprocessing:\nLowercasing/Uppercasing: Standardizing case to treat words like "The" and "the" as the same.\nEliminating Stopwords: Removing common words (e.g., "a", "the", "is") that often carry little semantic meaning.\nStemming/Lemmatization: Reducing words to their root form (e.g., "running", "ran" to "run") to handle variations.\nRemoving Punctuation and Numbers: Cleaning the text of non-alphabetic characters.\nFeatur

# DirectoryLoader - Multiple text files

In [18]:
from langchain_community.document_loaders import DirectoryLoader

# Load all text files from a directory
dir_loader = DirectoryLoader(
    "data/text_files", 
    glob="**/*.txt", ## Pattern to match files
    loader_cls= TextLoader, ## Loader class to use
    loader_kwargs={"encoding":"utf-8"},
    show_progress=True
    )

documents = dir_loader.load()

print(f"Loaded {len(documents)} Documents from directory")
for i, doc in enumerate(documents):
    print(f"\nDocument {i+1}:")
    print(f"Content length: {len(doc.page_content)} characters")  
    print(f"Source: {doc.metadata['source']}")

100%|██████████| 2/2 [00:00<00:00, 1990.18it/s]

Loaded 2 Documents from directory

Document 1:
Content length: 2301 characters
Source: data\text_files\sample1.txt

Document 2:
Content length: 926 characters
Source: data\text_files\sample2.txt



