### Fetch, clean, and save chunks of text

In [17]:
import sys
import json
from pathlib import Path
# Make the project's src/ directory importable from this notebook.
# The project root is taken to be the parent of the current working directory.
project_root = Path.cwd().parent
src_dir = str(project_root / "src")
# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Import the pipeline functions
from data_loader import fetch_and_clean, chunk_text

In [18]:
# Raw-data directory under the project root; created (with any missing parents) if absent
DATA_DIR = project_root.joinpath("data", "raw")
DATA_DIR.mkdir(exist_ok=True, parents=True)

In [19]:
# Source text (A Study in Scarlet) and the local file it is saved to
url = "https://www.gutenberg.org/cache/epub/244/pg244.txt"
save_path = DATA_DIR.joinpath("a_study_in_scarlet.txt")

# Hand the URL and destination (as a string) to the pipeline helper.
# NOTE(review): presumably downloads, cleans and writes the text to save_path — see data_loader.
clean_text = fetch_and_clean(url, save_path=str(save_path))

# Print a heading followed by the first 500 characters of the returned text
print("\n--- Preview ---\n", clean_text[:500], sep="\n")


--- Preview ---

A STUDY IN SCARLET

By A. Conan Doyle




CONTENTS

 A STUDY IN SCARLET.

 PART I.
 CHAPTER I. MR. SHERLOCK HOLMES.
 CHAPTER II. THE SCIENCE OF DEDUCTION.
 CHAPTER III. THE LAURISTON GARDENS MYSTERY
 CHAPTER IV. WHAT JOHN RANCE HAD TO TELL.
 CHAPTER V. OUR ADVERTISEMENT BRINGS A VISITOR.
 CHAPTER VI. TOBIAS GREGSON SHOWS WHAT HE CAN DO.
 CHAPTER VII. LIGHT IN THE DARKNESS.

 PART II. THE COUNTRY OF THE SAINTS
 CHAPTER I. ON THE GREAT ALKALI PLAIN.
 CHAPTER II. THE FLOWER OF


In [20]:
# Chunk data
# Read the saved text back from disk; read_text() opens and closes the file itself,
# so no file handle is left dangling in the kernel.
text = (DATA_DIR / "a_study_in_scarlet.txt").read_text(encoding="utf-8")

# Flatten line breaks into spaces, then collapse doubled spaces.
# NOTE(review): the single "  " -> " " pass only partly collapses runs of three or more spaces.
text = text.replace("\n", " ").replace("\r", " ").replace("  ", " ")

# Start at the opening sentence of the narrative to skip the title page and table of contents.
opening = "In the year 1878 I took my degree of Doctor of Medicine of the"
start_idx = text.find(opening)
if start_idx == -1:
    # str.find() returns -1 when the marker is missing; slicing from -1 would
    # silently keep only the final character of the text.
    raise ValueError(f"Opening sentence not found in text: {opening!r}")
text = text[start_idx:]

chunks = chunk_text(text, max_chars=4000, overlap=1000)
print(f"Chunks: {len(chunks)}")


Chunks: 61


In [21]:
# Write all chunks to one indented JSON document (a single file, not JSON Lines)
chunks_path = DATA_DIR.joinpath("a_study_in_scarlet_chunks.json")
with chunks_path.open("w", encoding="utf-8") as out_file:
    json.dump(chunks, out_file, indent=2, ensure_ascii=False)