<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Preprocess-HTML-files" data-toc-modified-id="Preprocess-HTML-files-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Preprocess HTML files</a></span></li><li><span><a href="#RAG" data-toc-modified-id="RAG-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>RAG</a></span><ul class="toc-item"><li><span><a href="#PDF" data-toc-modified-id="PDF-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>PDF</a></span></li></ul></li></ul></div>

# Imports

In [17]:
import requests
import os
import shutil
import re
from datetime import datetime
from dotenv import load_dotenv

from bs4 import BeautifulSoup
import openai

from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

In [46]:
import pkg_resources

def get_package_version(package_name):
    """Print the installed version of a distribution.

    Args:
        package_name: Name of the installed distribution to look up
            (e.g. ``"langchain_community"``).

    Prints ``"<name> version: <version>"`` when the distribution is found and
    ``"<name> is not installed"`` otherwise. Returns nothing.
    """
    # Imported locally so this function no longer relies on setuptools'
    # deprecated ``pkg_resources`` API; ``importlib.metadata`` is stdlib.
    from importlib import metadata

    try:
        version = metadata.version(package_name)
    except metadata.PackageNotFoundError:
        print(f"{package_name} is not installed")
    else:
        print(f"{package_name} version: {version}")

# Distributions whose installed versions are reported below.
packages = ['langchain_community', 'langchain', 'openai','langchain_openai','requests']

# One output line per distribution, produced by get_package_version.
for package in packages:
    get_package_version(package)


langchain_community version: 0.2.7
langchain version: 0.2.9
openai version: 1.35.14
langchain_openai version: 0.1.17
requests version: 2.32.2


In [35]:
# Function to print package version
def print_package_version(package_name):
    """Import ``package_name`` and print its ``__version__`` attribute.

    Prints ``"<name>: not installed"`` when the import raises ImportError and
    ``"<name>: version info not available"`` when an AttributeError is raised
    (for example because the module defines no ``__version__``).
    """
    try:
        version = __import__(package_name).__version__
        print(f"{package_name}: {version}")
    except ImportError:
        print(f"{package_name}: not installed")
    except AttributeError:
        print(f"{package_name}: version info not available")

# Importable module names to report on. This rebinds the `packages` list used by
# the previous version-check cell. Standard-library modules are listed as well.
packages = [
    'requests', 'os', 'shutil', 're', 'datetime', 'dotenv', 
    'bs4', 'openai', 'langchain_community', 'langchain', 'langchain_openai','chromadb'
]

# One output line per module, produced by print_package_version.
for package in packages:
    print_package_version(package)

requests: 2.32.2
os: version info not available
shutil: version info not available
re: 2.2.1
datetime: version info not available
dotenv: version info not available
bs4: 4.12.3
openai: 1.35.14
langchain_community: 0.2.7
langchain: 0.2.9
langchain_openai: version info not available
chromadb: 0.5.4


# HTML files handling

## Request HTML files and metadata

In [19]:
def save_webpage_and_metadata(url, output_dir, counter, timeout=30):
    """Download one web page and store it next to a small metadata file.

    The page body is written unchanged (as bytes) to
    ``<output_dir>/webpage_<counter>.html`` and the source URL is written to
    ``<output_dir>/webpage_<counter>.html.meta`` as a single ``URL: ...`` line.

    Args:
        url: Address of the page to fetch.
        output_dir: Existing directory that receives both files.
        counter: Number used to build the file name.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive host.

    Request failures (connection errors, timeouts, HTTP error statuses) are
    reported with a printed message and nothing is written. File-system errors
    are not caught and propagate to the caller.
    """
    try:
        # Fetch the page with certificate verification and a bounded wait.
        response = requests.get(url, verify=True, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
    except requests.exceptions.RequestException as e:
        print(f"Failed to save {url}: {e}")
        return

    # Build the numbered file name for this page.
    filename = f"webpage_{counter}.html"
    filepath = os.path.join(output_dir, filename)

    # Store the raw response bytes so the original encoding is preserved.
    with open(filepath, 'wb') as file:
        file.write(response.content)

    # Store the source URL in a companion .meta file.
    metadata_filename = f"{filename}.meta"
    metadata_filepath = os.path.join(output_dir, metadata_filename)
    with open(metadata_filepath, 'w') as metafile:
        metafile.write(f"URL: {url}\n")

    print(f"Webpage saved to {filepath}")
    print(f"Metadata saved to {metadata_filepath}")

def save_multiple_webpages_and_metadata(urls, output_dir):
    """Download every URL in ``urls`` into ``output_dir``.

    The directory is created when missing. Pages are numbered from 1 in the
    order they appear in ``urls``; the number is passed on to
    ``save_webpage_and_metadata`` and ends up in the file names.
    """
    os.makedirs(output_dir, exist_ok=True)
    for position, page_url in enumerate(urls, start=1):
        save_webpage_and_metadata(page_url, output_dir, position)

# Pages to download. The position in this list (starting at 1) becomes the
# number in the saved file name (webpage_<n>.html).
urls = [
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/almen-praksis',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/jobsogende',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/uddannelse',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/uddannelse/grunduddannelser',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/uddannelse/specialeuddannelser',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/uddannelse/specialeuddannelser/uddannelsesplan',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/uddannelse/specialeuddannelser/journal-club',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/sygepleje',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/sygepleje/excellent-sygepleje-en-taenkehorisont',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/sygepleje/excellent-sygepleje-en-taenkehorisont/velvaere',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/sygepleje/excellent-sygepleje-en-taenkehorisont/relation',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/sygepleje/excellent-sygepleje-en-taenkehorisont/etik',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/sygepleje/excellent-sygepleje-en-taenkehorisont/faglighed',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/sygepleje/excellent-sygepleje-en-taenkehorisont/tillid',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/sygepleje/excellent-sygepleje-en-taenkehorisont/naervaer',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/sygepleje/kompetenceudvikling',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/sygepleje/noglepersoner',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/forskning',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/hjemmesygeplejen',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/kontakt-onkologisk-afdeling',
    'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/sikker-post-onkologisk-afdeling-vejle'  
]

# Destination folder for the downloaded pages and their .meta files; it is
# created by save_multiple_webpages_and_metadata when it does not exist.
output_dir = 'HTML træningsdata'  # directory
# Download every page in `urls`; each call performs network requests.
save_multiple_webpages_and_metadata(urls, output_dir)

Webpage saved to HTML træningsdata\webpage_1.html
Metadata saved to HTML træningsdata\webpage_1.html.meta
Webpage saved to HTML træningsdata\webpage_2.html
Metadata saved to HTML træningsdata\webpage_2.html.meta
Webpage saved to HTML træningsdata\webpage_3.html
Metadata saved to HTML træningsdata\webpage_3.html.meta
Webpage saved to HTML træningsdata\webpage_4.html
Metadata saved to HTML træningsdata\webpage_4.html.meta
Webpage saved to HTML træningsdata\webpage_5.html
Metadata saved to HTML træningsdata\webpage_5.html.meta
Webpage saved to HTML træningsdata\webpage_6.html
Metadata saved to HTML træningsdata\webpage_6.html.meta
Webpage saved to HTML træningsdata\webpage_7.html
Metadata saved to HTML træningsdata\webpage_7.html.meta
Webpage saved to HTML træningsdata\webpage_8.html
Metadata saved to HTML træningsdata\webpage_8.html.meta
Webpage saved to HTML træningsdata\webpage_9.html
Metadata saved to HTML træningsdata\webpage_9.html.meta
Webpage saved to HTML træningsdata\webpage_10.

## Preprocess HTML files

In [20]:
directory_path = 'HTML træningsdata' # Directory containing the saved HTML files and their .meta files
# Boilerplate phrases that were still present in the output after preprocessing.
# They are listed explicitly so html_to_markdown can remove the text nodes that contain them.
unwanted_phrases = [
    "Du forsøger muligvis at få adgang til dette websted fra en sikret browser på serveren. Aktivér scripts, og genindlæs siden.",
    "Det ser ud til, at JavaScript ikke er aktiveret for din browser. Aktivér JavaScript, og prøv igen.",
    "Se alt om"
]

# Function to read metadata from a .meta file if it exists
def read_metadata(filepath):
    """Read the source URL from the ``.meta`` file that accompanies ``filepath``.

    Args:
        filepath: Path of the HTML file; the metadata is looked up at
            ``<filepath>.meta``.

    Returns:
        ``{}`` when no ``.meta`` file exists, otherwise ``{"URL": <url>}``.
        The URL is an empty string when the file has no ``URL: `` or
        ``url: `` entry.
    """
    meta_filepath = f"{filepath}.meta" # Companion file: same path with ".meta" appended
    if not os.path.exists(meta_filepath):
        return {}

    # Read with the platform default encoding, the same way the .meta files
    # are written by save_webpage_and_metadata.
    with open(meta_filepath, 'r') as metafile:
        lines = metafile.read().strip().splitlines()

    # Work line by line so that any further metadata entries following the
    # URL line are not swallowed into the URL. "URL: " takes precedence over
    # "url: ", and the last matching line wins.
    for prefix in ("URL: ", "url: "):
        matching = [line for line in lines if prefix in line]
        if matching:
            return {"URL": matching[-1].split(prefix)[-1].strip()}
    return {"URL": ""}

# Function to convert HTML to Markdown
def html_to_markdown(html_content, meta_tags):
    '''
    Convert an HTML document into Markdown-style text prefixed with its metadata.

    Two kinds of input files are handled: pages that were downloaded (open
    access), whose URL comes from a companion .meta file and arrives via
    ``meta_tags``, and pages that were saved manually (closed access), where
    the URL was typed into a <meta> tag inside the HTML file itself.

    Parameters:
        html_content: The HTML source as a string.
        meta_tags: Dict of metadata gathered so far. It is updated in place
            with every <meta name="..." content="..."> pair found in the HTML.

    Returns:
        A string consisting of one "key: value" line per metadata entry, an
        empty line, and then the converted text.

    Uses the module-level ``unwanted_phrases`` list.
    '''
    soup = BeautifulSoup(html_content, 'html.parser') # Parse the HTML so tags can be inspected and rewritten

    # Collect metadata embedded in the HTML (<meta name=... content=...>)
    for meta in soup.find_all('meta'):
        if 'name' in meta.attrs and 'content' in meta.attrs:
            meta_tags[meta.attrs['name']] = meta.attrs['content'] # Later entries overwrite earlier ones with the same name

    # Remove script and style elements; they carry no readable content
    for script_or_style in soup(['script', 'style']):
        script_or_style.decompose()

    # Remove text nodes containing one of the unwanted phrases.
    # `string=` replaces the deprecated `text=` keyword, which triggers a
    # DeprecationWarning in current BeautifulSoup releases.
    for phrase in unwanted_phrases:
        for element in soup.find_all(string=re.compile(re.escape(phrase))):
            element.extract()

    # Remove all links, including their text. To keep link text for the RAG
    # model, the <a> tags could be unwrapped instead of removed.
    for a in soup.find_all('a', href=True):
        a.decompose()

    # Convert headings h1..h6 to Markdown "#" headings
    for i in range(1, 7):
        for tag in soup.find_all(f'h{i}'):
            tag.replace_with(f"{'#' * i} {tag.get_text()}\n")

    # Convert unordered lists to "- item" lines, skipping empty items
    for ul in soup.find_all('ul'):
        items = []
        for li in ul.find_all('li'):
            text = li.get_text().strip()
            if text:
                items.append(f"- {text}")
        ul.replace_with("\n".join(items) + "\n")

    # Convert ordered lists to "N. item" lines, skipping empty items.
    # The number is the item's position in the list, so skipped empty items
    # leave gaps in the numbering.
    for ol in soup.find_all('ol'):
        items = []
        for i, li in enumerate(ol.find_all('li'), 1):
            text = li.get_text().strip()
            if text:
                items.append(f"{i}. {text}")
        ol.replace_with("\n".join(items) + "\n")

    # Convert tables to one line per row with cells separated by " | "
    for table in soup.find_all('table'):
        rows = []
        for tr in table.find_all('tr'):
            cells = [td.get_text().strip() for td in tr.find_all(['td', 'th'])] # Data cells and header cells
            rows.append(" | ".join(cells))
        if rows: # Tables without rows are left untouched
            table_md = "\n".join(rows)
            table.replace_with(table_md + "\n")

    # Extract the remaining text and collapse runs of blank lines
    text = soup.get_text()
    text = re.sub(r'\n\s*\n', '\n', text)

    # Remove lines that consist of a single hyphen
    text = re.sub(r'\n-\n', '\n', text)

    # Put the metadata block first, separated from the text by an empty line
    meta_text = "\n".join([f"{key}: {value}" for key, value in meta_tags.items()])
    return f"{meta_text}\n\n{text}"

# Traverse the directory and convert every HTML file to a preprocessed .md file.
# All results go under <directory_path>/preprocessed, mirroring the folder
# layout of the source files.
preprocessed_directory = os.path.join(directory_path, 'preprocessed')

for root, dirs, files in os.walk(directory_path):
    # Prune the output folder from the walk so results of earlier runs are
    # not traversed. Existing output files are overwritten, not deleted.
    if 'preprocessed' in dirs:
        dirs.remove('preprocessed')

    for filename in files:
        if not filename.endswith('.html'): # Only HTML files are preprocessed
            continue

        file_path = os.path.join(root, filename)

        # Read metadata from the .meta file if it exists, so the source can be cited later
        meta_tags = read_metadata(file_path)

        # Read the HTML file
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()

        # Preprocess the HTML and convert it to Markdown
        markdown_content = html_to_markdown(html_content, meta_tags)

        # Recreate the source folder structure below the preprocessed directory.
        # NOTE: this rebinds the notebook-level `output_dir` set by the download cell.
        relative_path = os.path.relpath(root, directory_path)
        output_dir = os.path.join(preprocessed_directory, relative_path)
        os.makedirs(output_dir, exist_ok=True) # Also creates the preprocessed directory itself
        output_file_path = os.path.join(output_dir, f'preprocessed_{filename[:-5]}.md') # Drop ".html", add ".md"

        # The output is a .md file holding the metadata lines followed by the
        # extracted text, with the HTML tags removed.
        with open(output_file_path, 'w', encoding='utf-8') as file:
            file.write(markdown_content)

        print(f'Processed file: {filename}')


  for element in soup.find_all(text=re.compile(re.escape(phrase))): #Find the element


Processed file: webpage_1.html
Processed file: webpage_10.html
Processed file: webpage_11.html
Processed file: webpage_12.html
Processed file: webpage_13.html
Processed file: webpage_14.html
Processed file: webpage_15.html
Processed file: webpage_16.html
Processed file: webpage_17.html
Processed file: webpage_18.html
Processed file: webpage_19.html
Processed file: webpage_2.html
Processed file: webpage_20.html
Processed file: webpage_21.html
Processed file: webpage_22.html
Processed file: webpage_3.html
Processed file: webpage_4.html
Processed file: webpage_5.html
Processed file: webpage_6.html
Processed file: webpage_7.html
Processed file: webpage_8.html
Processed file: webpage_9.html


# Adding HTML data to Vector DB

In [21]:
# Load environment variables via python-dotenv (typically from a local .env file).
load_dotenv()
# Take the OpenAI API key from the environment; a missing OPENAI_API_KEY raises KeyError here.
openai.api_key = os.environ['OPENAI_API_KEY']

In [22]:
def extract_url_from_markdown(file_path):
    """Split a preprocessed Markdown file into its body text and source URL.

    The file is expected to start with "key: value" metadata lines, ended by
    the first blank line. The URL is taken from the last metadata line that
    starts with "url: " or "URL: ". When the file has no blank line, every
    line is scanned for the URL and the whole file counts as body.

    Returns:
        A ``(main_content, url)`` tuple; ``url`` is 'No URL available' when
        no URL line was found.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        rows = handle.read().split('\n')

    # Position of the first blank row, which terminates the metadata header.
    blank_at = next((pos for pos, row in enumerate(rows) if not row.strip()), None)
    if blank_at is None:
        header_rows, body_start = rows, 0
    else:
        header_rows, body_start = rows[:blank_at], blank_at + 1

    url = 'No URL available'
    for row in header_rows:
        if row.startswith(("url: ", 'URL: ')):
            url = row.split(": ", 1)[1].strip()

    # Everything after the header is the document body.
    return '\n'.join(rows[body_start:]).strip(), url

# Directory containing the preprocessed Markdown files (read by load_documents)
DATA_PATH = 'HTML træningsdata/preprocessed'
# Directory in which the Chroma vector database is stored
CHROMA_PATH = "chroma"

def load_documents():
    """Load every ``.md`` file below ``DATA_PATH`` as a ``Document``.

    Each document holds the file's body text as ``page_content`` and the
    extracted source URL under the ``"url"`` metadata key. Files are returned
    in the order ``os.walk`` yields them.
    """
    markdown_paths = [
        os.path.join(folder, name)
        for folder, _, names in os.walk(DATA_PATH)
        for name in names
        if name.endswith('.md')
    ]

    loaded = []
    for path in markdown_paths:
        body, source_url = extract_url_from_markdown(path)
        loaded.append(Document(page_content=body, metadata={"url": source_url}))
    return loaded

# Load all preprocessed Markdown files as Document objects
docs = load_documents()

# Show each document's metadata and the first 200 characters of its content for verification
for doc in docs:
    print("Metadata:", doc.metadata)
    print("Content:", doc.page_content[:200], "...\n")


Metadata: {'url': 'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling'}
Content: Onkologisk Afdeling, Vejle. Specialiseret ikke-kirurgisk kræftbehandling. 
Menu
- Onkologisk Afdeling
# Onkologisk Afdeling
        Udskriv
Onkologisk Afdeling (Kræftafdelingen) på Sygehus Lillebælt,  ...

Metadata: {'url': 'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/sygepleje/excellent-sygepleje-en-taenkehorisont'}
Content: En tænkehorisont for sygeplejen
Menu
- Excellent sygepleje - en tænkehorisont
# Excellent sygepleje - en tænkehorisont
        Udskriv
Udviklet i Onkologisk Afdeling
## Definition på Excellent sygeple ...

Metadata: {'url': 'https://sygehuslillebaelt.dk/afdelinger/vejle-sygehus/onkologisk-afdeling/fagfolk-og-uddannelse/sygepleje/excellent-sygepleje-en-taenkehorisont/velvaere'}
Content: Velvære, målet med excellent sygepleje
Menu
- Velvære
# Velvære
        Udskriv
Kernebegrebet velvære beskrives i forhold til a

In [23]:
def split_text(documents: list[Document], chunk_size: int = 1000, chunk_overlap: int = 200):
    """Split documents into overlapping chunks for embedding.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
            The default of 1000 was chosen iteratively: large enough to hold
            useful context while keeping retrieval performant.
        chunk_overlap: Number of characters shared between neighbouring
            chunks. The default of 200 was also chosen iteratively: enough to
            keep context across a split without much redundant text.

    Returns:
        The list of chunks produced by the splitter. The number of input
        documents and resulting chunks is printed as a side effect.
    """
    # The recursive splitter picks split points from the text itself rather
    # than cutting every chunk at a fixed length, which tends to give more
    # logical chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents) # Split all the documents
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks
# Rebuild the chunks from the files on disk instead of from `docs` itself, so
# re-running this cell does not split already-split chunks a second time.
# `docs` holds the chunks from here on.
docs = split_text(load_documents())

Split 22 documents into 76 chunks.


In [28]:
# Paths used when saving chunks to Chroma. Both constants repeat, with the
# same values, the definitions made in the document-loading cell.
DATA_PATH = 'HTML træningsdata/preprocessed'
CHROMA_PATH = "chroma"
def save_to_chroma(chunks: list[Document]):
    """Embed ``chunks`` and store them in a fresh Chroma database at ``CHROMA_PATH``.

    Any existing database directory is deleted first, so every call starts
    from an empty store. Embeddings are created with ``OpenAIEmbeddings``; an
    open-source embedding model could be used instead, at the cost of more
    local compute and possibly lower quality.

    Args:
        chunks: Document chunks to embed and store.
    """
    # Clear out the database first so stale chunks do not accumulate
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new DB from the chunks. With Chroma 0.4 and later, passing
    # `persist_directory` is enough for the data to be written to disk; the
    # former explicit `persist()` call is deprecated and was dropped.
    Chroma.from_documents(
        chunks, OpenAIEmbeddings(), persist_directory=CHROMA_PATH
    )
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")
    
# `docs` holds the chunks returned by split_text at this point
save_to_chroma(docs)

Saved 76 chunks to chroma.


  warn_deprecated(
