In [31]:
import stealth_requests as requests
import requests as r
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from dotenv import load_dotenv
import os
import json
from tqdm import tqdm
from time import sleep
import base64
load_dotenv()

True

In [32]:
lightrag_url = os.getenv('LIGHTRAG_URL', 'http://localhost:9621')

with open('data/urls.txt', 'r') as f:
    page_urls = [line.strip() for line in f if line.strip()]

with open('data/pdfs.txt', 'r') as f:
    pdf_urls = [line.strip() for line in f if line.strip()]

In [50]:
def scrape_content(url):
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, 'html.parser')

    # Find the 'div' with class 'content-wrap'
    content_wrap_div = soup.find('div', class_='content-wrap')

    # Get the HTML content of this div
    if content_wrap_div:
        content_wrap_html = str(content_wrap_div)
        md_content = md(content_wrap_html)
        #print(md_content)
        return md_content
    else:
        print(f"Div with class 'content-wrap' not found in {url}")
        return None

def insert_page_to_rag(page):
    data = {
        "text": page["content"],
        "file_path": page["url"],
        "metadata": {
            "url": page["url"]
        },
        "source": "web"
    }
    resp = r.post(f"{lightrag_url}/documents/text", json=data)
    return resp.status_code

def insert_pdf_to_rag(pdf_url):
    resp = requests.get(pdf_url)
    if resp.status_code == 200:
        filename = pdf_url.split('/')[-1] or "file.pdf"
        files = {
            'file': (filename, resp.content, 'application/pdf')
        }
        headers = {
            'accept': 'application/json'
            # Do not set Content-Type, requests will set it for multipart/form-data
        }
        upload_resp = r.post(f"{lightrag_url}/documents/file", files=files, headers=headers)
        return upload_resp
    else:
        print(f"Failed to download PDF: {pdf_url}")
        return None


In [None]:
pdf_pbar = tqdm(pdf_urls)

for url in pdf_pbar:
    pdf_pbar.set_description(f"Processing PDF: {url}")
    status_code = insert_pdf_to_rag(url)
    if status_code == 200:
        print(f"Successfully inserted PDF: {url}")
    else:
        print(f"Failed to insert PDF: {url}, Status Code: {status_code.content}")

Processing PDF: https://summer.skku.edu/_res/summer/etc/2025_ebrochure.pdf
Failed to insert PDF: https://summer.skku.edu/_res/summer/etc/2025_ebrochure.pdf, Status Code: b'{"status":"success","message":"File \'2025_ebrochure.pdf\' saved successfully. Processing will continue in background."}'
Failed to insert PDF: https://summer.skku.edu/_res/summer/etc/2025_ebrochure.pdf, Status Code: b'{"status":"success","message":"File \'2025_ebrochure.pdf\' saved successfully. Processing will continue in background."}'


In [None]:
url_pbar = tqdm(page_urls)

markdown_dicts = []
for url in url_pbar:
    url_pbar.set_description(f"Scraping {url}")
    content = scrape_content(url)
    if content:
        markdown_dicts.append({
            "url": url,
            "content": content
        })
    sleep(0.5)  # To avoid overwhelming the server


Scraping h:   0%|          | 0/50 [00:00<?, ?it/s]


DNSError: Failed to perform, curl: (6) Could not resolve host: h. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.

In [35]:
for page in markdown_dicts:
    print(insert_page_to_rag(page))

In [35]:
import base64

markdown_dicts = []
for url in page_urls:
    print(f"Scraping {url}...")
    url = url
    content = scrape_content(url)
    markdown_bytes = content.encode('utf-8')
    base64_bytes = base64.b64encode(markdown_bytes)
    if content:
        markdown_dicts.append({
            "url": url,
            "content": content,
            "base64": base64_bytes.decode('utf-8')
        })


Scraping https://summer.skku.edu/summer/index.do...
Div with class 'content-wrap' not found.


AttributeError: 'NoneType' object has no attribute 'encode'

In [None]:
api_url = "http://localhost:8080/"
api_key = os.getenv("OPEN_WEBUI_API_KEY")
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json",
    'Accept': 'application/json'
}

In [None]:
knowledge_base_data = {
    "name": "ISS Info",
    "description": "Knowledge Base with information about International Summer Semester",
    "data": {},
    "access_control": {}
}

# The api_url is defined in cell 4
create_kb_url = f"{api_url}api/v1/knowledge/create"
response = r.post(create_kb_url, headers=headers, json=knowledge_base_data)
if response.status_code == 200:
    kb_creation_response = response.json()
    kb_creation_id = kb_creation_response.get("id")
    print(kb_creation_response)
else:
    print(f"Failed to create knowledge base: {response.status_code} - {response.text}")

In [None]:
kb_creation_id

In [None]:
api_add_file_url = "/api/v1/files/"

In [None]:
def add_file_to_kb(source_url, base64_content):
    api_add_file_url = "/api/v1/files/"
    files = {'file': base64_content}
    response = r.post(f"{api_url}{api_add_file_url}", headers=headers, files=files)
    return response

In [None]:
add_file_to_kb