# Import Required Libraries

Import the necessary libraries: io, zipfile, requests, and frontmatter.

In [10]:
import io
import zipfile
import requests
import frontmatter

# Define the read_repo_data Function

Define a function to download a GitHub repository as a zip file, extract markdown files, parse them with frontmatter, and return a list of dictionaries with content and metadata.

In [11]:
def read_repo_data(repo_owner, repo_name, branch='main', timeout=30):
    """
    Download and parse all markdown files from a GitHub repository.

    The repository is fetched as a single zip archive from codeload.github.com
    and processed in memory. Every ``.md`` / ``.mdx`` entry (case-insensitive)
    is parsed with ``frontmatter``; files that fail to parse are reported with
    ``print`` and skipped.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
        branch: Branch to download (defaults to 'main')
        timeout: Seconds to wait for the HTTP request before giving up

    Returns:
        List of dictionaries containing file content and metadata. Each
        dictionary is ``post.to_dict()`` plus a 'filename' key holding the
        entry's path inside the archive.

    Raises:
        Exception: If the download does not return HTTP 200.
    """
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}'
    # Without a timeout, requests may wait indefinitely on a stalled connection.
    resp = requests.get(url, timeout=timeout)

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []

    # The context manager closes the archive even if an error escapes the loop.
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename

            if not filename.lower().endswith(('.md', '.mdx')):
                continue

            try:
                with zf.open(file_info) as f_in:
                    # 'utf-8-sig' decodes like UTF-8 but drops a leading BOM,
                    # which would otherwise remain at the start of the text
                    # (and may keep the front matter block from being detected).
                    content = f_in.read().decode('utf-8-sig', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
            except Exception as e:
                # Best effort: one bad file must not abort the whole repository.
                print(f"Error processing {filename}: {e}")
                continue

    return repository_data

# Download and Process Repository Data

Use the read_repo_data function to download and process data from specified GitHub repositories, such as 'DataTalksClub/faq' and 'evidentlyai/docs'.

In [None]:
# Homework: pick a GitHub repo that contains documentation -> evidentlyai/docs
evidently_docs = read_repo_data(repo_owner='evidentlyai', repo_name='docs')

# Other repositories you can optionally try
# dtc_faq = read_repo_data('DataTalksClub', 'faq')
# fastai_docs = read_repo_data('fastai', 'fastbook')

# Print Document Counts

Print the number of documents retrieved from each repository.

In [None]:
# Report how many markdown documents were loaded from the Evidently repo
evidently_doc_count = len(evidently_docs)
print(f"Evidently documents: {evidently_doc_count}")

# Uncomment to report the optional repositories as well
# print(f"FAQ documents: {len(dtc_faq)}")
# print(f"FastAI documents: {len(fastai_docs)}")

Evidently documents: 95


# Day 2: Chunking and Intelligent Processing for Data

Welcome to Day 2 of our 7-Day AI Agents Email Crash-Course.

In the first part of the course, we focus on data preparation – the process of properly preparing data before it can be used for AI agents.

## Small and Large Documents

Yesterday (Day 1), we downloaded the data from a GitHub repository and processed it. For some use cases, like the FAQ database, this is sufficient. The questions and answers are small enough. We can put them directly into the search engine.

But it's different for the Evidently documentation. These documents are quite large. Let's take a look at this one: https://github.com/evidentlyai/docs/blob/main/docs/library/descriptors.mdx.

We could use it as is, but we risk overwhelming our LLMs.

## Why We Need to Prepare Large Documents Before Using Them

Large documents create several problems:

- Token limits: Most LLMs have maximum input token limits
- Cost: Longer prompts cost more money
- Performance: LLMs perform worse with very long contexts
- Relevance: Not all parts of a long document are relevant to a specific question

So we need to split documents into smaller subdocuments. For AI applications like RAG (which we will discuss tomorrow), this process is referred to as "chunking."

Today, we will cover multiple ways of chunking data:

1. Simple character-based chunking
2. Paragraph and section-based chunking
3. Intelligent chunking with LLM

Just so you know, for the last section, you will need a Gemini API key.

## 1. Simple Chunking

Let's start with simple chunking. This will be sufficient for most cases.

We can continue with the notebook from Day 1. We already downloaded the data from Evidently docs. We put them into the evidently_docs list.

This is what the document at index 45 looks like:

{'title': 'LLM regression testing',
 'description': 'How to run regression testing for LLM outputs.',
 'content': 'In this tutorial, you will learn...'
}

The content field is 21,712 characters long. The simplest thing we can do is cut it into pieces of equal length. For example, with a chunk size of 2,000 characters, we will have:

Chunk 1: 0..2000
Chunk 2: 2000..4000
Chunk 3: 4000..6000

And so on.

However, this approach has disadvantages:

- Context loss: Important information might be split in the middle
- Incomplete sentences: Chunks might end mid-sentence
- Missing connections: Related information might end up in different chunks

That's why, in practice, we usually make sure there's overlap between chunks. For size 2000 and overlap 1000, we will have:

Chunk 1: 0..2000
Chunk 2: 1000..3000
Chunk 3: 2000..4000
...

This is better for AI because:

- Continuity: Important information isn't lost at chunk boundaries
- Context preservation: Related sentences stay together in at least one chunk
- Better search: Queries can match information even if it spans chunk boundaries

This approach is known as the "sliding window" method.

In [None]:
def sliding_window(seq, size, step):
    """
    Cut a sequence into overlapping windows.

    Windows start at offsets 0, step, 2*step, ... and each one holds up to
    ``size`` items (the last window may be shorter). Iteration stops as soon
    as a window reaches the end of the sequence.

    :param seq: Any sliceable sequence with a length (e.g. a string or list)
    :param size: Maximum number of items per window; must be positive
    :param step: Distance between consecutive window starts; must be positive
    :return: List of dicts with keys 'start' (offset) and 'chunk' (the slice)
    :raises ValueError: If size or step is not positive
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    total = len(seq)
    windows = []
    start = 0

    while start < total:
        end = start + size
        windows.append({'start': start, 'chunk': seq[start:end]})
        # This window already covers the tail of the sequence: we are done.
        if end >= total:
            break
        start += step

    return windows

In [None]:
# Chunk document 45 with a 2000-character window that advances by 1000.
# For the 21,712-character document described above this gives 21 chunks:
# 0..2000, 1000..3000, ..., 19000..21000, 20000..21712

if len(evidently_docs) <= 45:
    print("Document 45 not available")
else:
    doc_45_content = evidently_docs[45]['content']
    chunks_45 = sliding_window(doc_45_content, 2000, 1000)
    print(f"Document 45 has {len(chunks_45)} chunks")

# Now do the same for every document, copying each document's metadata
# (everything except 'content') onto each of its chunks:

evidently_chunks = []

for doc in evidently_docs:
    doc_metadata = {key: value for key, value in doc.items() if key != 'content'}
    for window in sliding_window(doc['content'], 2000, 1000):
        window.update(doc_metadata)
        evidently_chunks.append(window)

print(f"Total chunks created: {len(evidently_chunks)} from {len(evidently_docs)} documents")

## 2. Splitting by Paragraphs and Sections

Splitting by paragraphs is relatively easy:

In [None]:
import re

# Paragraphs are separated by one or more blank (whitespace-only) lines.
if len(evidently_docs) <= 45:
    print("Document 45 not available")
else:
    text = evidently_docs[45]['content']
    stripped_text = text.strip()
    paragraphs = re.split(r"\n\s*\n", stripped_text)
    print(f"Document 45 has {len(paragraphs)} paragraphs")
    first_paragraph_preview = paragraphs[0][:200]
    print(f"First paragraph: {first_paragraph_preview}...")

Let's now look at section splitting. Here, we take advantage of the documents' structure. Markdown documents have this structure:

# Heading 1
## Heading 2  
### Heading 3

What we can do is split by headers.

In [None]:
def split_markdown_by_level(text, level=2, keep_preamble=False):
    """
    Split markdown text by a specific header level.

    A header is a line that starts with exactly ``level`` '#' characters
    followed by a space. Each returned section is the header line, a blank
    line, and the stripped text up to the next header of the same level
    (or just the header when nothing follows it).

    Note: the match is purely line-based, so a line that looks like a header
    inside a fenced code block is treated as a header too.

    :param text: Markdown text as a string
    :param level: Header level to split on (positive integer)
    :param keep_preamble: When True, non-empty text that precedes the first
        matching header is returned as the first section instead of being
        discarded. Defaults to False, which drops it (so a document with no
        matching header yields an empty list).
    :return: List of sections as strings
    :raises ValueError: If level is smaller than 1
    """
    level = int(level)
    if level < 1:
        raise ValueError("level must be a positive integer")

    # This regex matches markdown headers
    # For level 2, it matches lines starting with "## "
    header_pattern = r'^(#{' + str(level) + r'} )(.+)$'
    pattern = re.compile(header_pattern, re.MULTILINE)

    # split() with two capturing groups returns
    # [before_first_match, group1, group2, after_match, group1, group2, ...]
    # so the list always has 1 + 3 * (number of headers) items.
    parts = pattern.split(text)

    sections = []

    preamble = parts[0].strip()
    if keep_preamble and preamble:
        sections.append(preamble)

    for i in range(1, len(parts), 3):
        # group1 is "## ", group2 is the header text
        header = (parts[i] + parts[i + 1]).strip()
        # Text between this header and the next one (or the end of the text)
        content = parts[i + 2].strip()

        if content:
            sections.append(f'{header}\n\n{content}')
        else:
            sections.append(header)

    return sections

In [None]:
# Splitting document 45 on second-level ("## ") headers:

if len(evidently_docs) <= 45:
    print("Document 45 not available")
else:
    text = evidently_docs[45]['content']
    sections = split_markdown_by_level(text, level=2)
    print(f"Document 45 has {len(sections)} sections")
    if sections:
        print(f"First section: {sections[0][:200]}...")

# Repeat for every document: each section becomes its own record that
# carries the document's metadata (everything except 'content').

evidently_chunks_sections = []

for doc in evidently_docs:
    doc_metadata = {key: value for key, value in doc.items() if key != 'content'}
    evidently_chunks_sections.extend(
        {**doc_metadata, 'section': section_text}
        for section_text in split_markdown_by_level(doc['content'], level=2)
    )

print(f"Total sections created: {len(evidently_chunks_sections)} from {len(evidently_docs)} documents")

## 3. Intelligent Chunking with LLM

In some cases, we want to be more intelligent with chunking. Instead of doing simple splits, we delegate this work to AI.

This makes sense when:

- Complex structure: Documents have complex, non-standard structure
- Semantic coherence: You want chunks that are semantically meaningful
- Custom logic: You need domain-specific splitting rules
- Quality over cost: You prioritize quality over processing cost

This costs money. In most cases, we don't need intelligent chunking.

Simple approaches are sufficient. Use intelligent chunking only when:

- You already evaluated simpler methods and you can confirm that they produce poor results
- You have complex, unstructured documents
- Quality is more important than cost
- You have the budget for LLM processing

Let's create a prompt:

In [None]:
import google.generativeai as genai

# Set up Gemini API
# Configure your API key before calling the model. Prefer reading it from an
# environment variable over pasting the key into the notebook, e.g.:
# import os
# genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Prompt used for LLM-based chunking.
# The {document} placeholder is filled in with str.format() by
# intelligent_chunking, and the '---' separators requested in the output
# format are what intelligent_chunking splits the model's response on.
prompt_template = """
Split the provided document into logical sections
that make sense for a Q&A system.

Each section should be self-contained and cover
a specific topic or concept.

<DOCUMENT>
{document}
</DOCUMENT>

Use this format:

## Section Name

Section content with all relevant details

---

## Another Section Name

Another section content

---
""".strip()

def intelligent_chunking(text, model_name='gemini-1.5-flash'):
    """
    Ask a Gemini model to split a document into logical sections.

    The document is inserted into ``prompt_template``, sent to the model, and
    the response is split on separator lines made only of dashes ('---').

    :param text: Document text to split
    :param model_name: Name of the Gemini model to use
    :return: List of non-empty section strings, stripped of surrounding
        whitespace
    """
    prompt = prompt_template.format(document=text)

    model = genai.GenerativeModel(model_name)
    response = model.generate_content(prompt)

    # Split only on lines that contain nothing but three or more dashes.
    # A plain str.split('---') would also cut through any '---' inside a
    # section, e.g. the separator row of a markdown table ("| --- | --- |").
    raw_sections = re.split(
        r'^[ \t]*-{3,}[ \t]*\r?$', response.text, flags=re.MULTILINE
    )
    return [section.strip() for section in raw_sections if section.strip()]

In [None]:
# Now we apply this to every document:

from tqdm.auto import tqdm

evidently_chunks_intelligent = []

# Configure your API key before running this cell. Prefer an environment
# variable over pasting the key into the notebook, e.g.:
# import os
# genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Process only the first 5 docs for the demo (each call costs money).
# Keeping the slice in a variable lets the summary below report the real
# number of documents processed, even when fewer than 5 are available.
demo_docs = evidently_docs[:5]

for doc in tqdm(demo_docs):
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')

    sections = intelligent_chunking(doc_content)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        evidently_chunks_intelligent.append(section_doc)

print(f"Total intelligent sections created: {len(evidently_chunks_intelligent)} from {len(demo_docs)} documents")

# Note: This process requires time and incurs costs. As mentioned before, use this only when really necessary.
# For most applications, you don't need intelligent chunking.

## Bonus: Processing Code in Your GitHub Repository

You can use this approach for processing the code in your GitHub repository. You can use a variation of the following prompt:

"Summarize the code in plain English. Briefly describe each class and function/method (their purpose and role), then give a short overall summary of how they work together. Avoid low-level details."

Then add both the source code and the summary to your documents.