In [34]:
import os
import warnings 
import re
import unicodedata

from bs4 import BeautifulSoup
from langchain_community.document_loaders import RecursiveUrlLoader

warnings.filterwarnings("ignore")

In [None]:
def strip_replacement_chars(text):
    return text.replace('\uFFFD', '')

In [36]:
def bs4_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    return re.sub(r"\n\n+", "\n\n", soup.text).strip()

In [None]:
def scrape_data_and_store(url:str, max_depth:int):
    loader = RecursiveUrlLoader(url,max_depth=max_depth, extractor=bs4_extractor) # configuring loader
    docs = loader.load() # doing web scraping

    for index, doc in enumerate(docs):
        base = os.path.join("..", "data")
        os.makedirs(base, exist_ok=True)

        try:

            file_name_uncleaned = f"{doc.metadata['title']}"
            file_name = f"{re.sub(r'[^A-Za-z0-9_ ]+', '', file_name_uncleaned)}.txt" # to avoid errors in file names
            path = os.path.join(base, file_name)

            with open(path, "w") as f:
                page_content = strip_replacement_chars(doc.page_content)
                f.write(page_content)
        except:
            print(f"Something wrong with {doc} at index {index}")


In [41]:
scrape_data_and_store("https://docs.python.org/3.9/", 2)

Something wrong with page_content='Python
Search Python 3.9.24 documentation
utf-8

Python 3.9.24 documentation
https://www.python.org/images/favicon16x16.ico' metadata={'source': 'https://docs.python.org/3.9/_static/opensearch.xml', 'content_type': 'text/xml'} at index 18
Something wrong with page_content='What’s New In Python 3.9 — Python 3.9.24 documentation

Table of Contents

What’s New In Python 3.9
Summary – Release highlights
New Features
Dictionary Merge & Update Operators
New String Methods to Remove Prefixes and Suffixes
Type Hinting Generics in Standard Collections
New Parser

Other Language Changes
New Modules
zoneinfo
graphlib

Improved Modules
ast
asyncio
compileall
concurrent.futures
curses
datetime
distutils
fcntl
ftplib
gc
hashlib
http
IDLE and idlelib
imaplib
importlib
inspect
ipaddress
math
multiprocessing
nntplib
os
pathlib
pdb
poplib
pprint
pydoc
random
signal
smtplib
socket
time
sys
tempfile
tracemalloc
typing
unicodedata
venv
xml

Optimizations
Deprecated
Remove

We can skip one or two files. Now let's do data cleaning, I see some "unicode replacement character"s. When we scrape, beautiful soup handles non utf-8 characters by encoding them with this replacement chracters. Let's remove it.

In [57]:
import re

SHORT_SPACES = re.compile(r'(?<=\S) {1,3}(?=\S)')  # only collapse spaces that sit between non-space chars

def normalize_text(text: str) -> str:
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    lines = text.split('\n')

    out = []
    in_code = False

    for orig in lines:
        # detect fence toggle
        stripped = orig.strip()
        if stripped.startswith('```'):
            out.append(stripped)  # normalized fence line
            in_code = not in_code
            continue

        if in_code:
            out.append(orig)
            continue

        # preserve preformatted lines that start with a tab or 4+ spaces
        if orig.startswith('\t') or re.match(r' {4,}', orig):
            out.append(orig.rstrip())
            continue

        # for normal lines: remove trailing whitespace, collapse short internal runs,
        # but keep leading/trailing single spaces intact
        line = orig.rstrip()  # remove trailing spaces only
        if line:
            line = SHORT_SPACES.sub(' ', line)

            # also collapse leading sequences of 2-3 spaces to a single leading space
            # while still preserving 4+ leading spaces would have matched preformatted earlier
            line = re.sub(r'^( {2,3})', ' ', line)

            # collapse trailing sequences of 2-3 spaces to a single space if any remain
            line = re.sub(r'( {2,3})$', ' ', line)

        # collapse multiple blank lines later
        out.append(line)

    # collapse multiple blank lines to single blank line
    collapsed = []
    prev_blank = False
    for l in out:
        is_blank = (l == '')
        if is_blank and prev_blank:
            continue
        collapsed.append(l)
        prev_blank = is_blank

    return '\n'.join(collapsed) + '\n'


In [58]:
from pathlib import Path

def fix_file(path):
    raw = path.read_bytes()
    text = raw.decode("cp1252", errors="ignore")
    normalized_text = normalize_text(text)
    path.write_text(normalized_text, encoding="utf8")

def fix_dir(dir_path):
    p = Path(dir_path)
    for f in p.iterdir():
        if f.is_file():
            fix_file(f)

fix_dir(r"..\data")



In [12]:
import re

s = "a/b\\c"
clean = re.sub(r'[/\\\\]', '_', s)
print(clean)


a_b_c


In [None]:


loader = RecursiveUrlLoader(
    "https://docs.python.org/3.9/",
    max_depth=2,
    # use_async=False,
    # extractor=None,
    # metadata_extractor=None,
    # exclude_dirs=(),
    # timeout=10,
    # check_response_status=True,
    # continue_on_failure=True,
    # prevent_outside=True,
    # base_url=None,
    # ...
)

In [None]:
docs = loader.load()

{'source': 'https://docs.python.org/3.9/',
 'content_type': 'text/html',
 'title': '3.9.24 Documentation',
 'language': None}

In [2]:
type(docs[0].metadata)

NameError: name 'docs' is not defined

In [16]:
docs[0].page_content



In [None]:


def bs4_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    return re.sub(r"\n\n+", "\n\n", soup.text).strip()

loader = RecursiveUrlLoader("https://docs.python.org/3.9/", extractor=bs4_extractor)
docs = loader.load()

base = os.path.join("..", "data")
os.makedirs(base, exist_ok=True)

path = os.path.join(base, "data.txt")

if not os.path.exists(path):
    with open(path, "w"):
        pass

with open(path, "a") as f:
    f.write(docs[0].page_content)


In [None]:
for doc in docs:
    with open(path, "a", encoding="utf-8") as f:
        f.write(doc.page_content)

UnicodeEncodeError: 'charmap' codec can't encode character '\u0141' in position 1963: character maps to <undefined>

In [None]:
# write code to store files in AWS S3
# write code to fetch data from s3
# see how to setup cronjob, can we use airflow for this?