# Scrape


In [None]:
%pip install -q requests markdownify

In [None]:
import requests
from markdownify import markdownify as md
import os
from urllib.parse import urlparse

# First page of the Help Center articles listing; fetch_articles starts here
# and then follows each response's "next_page" link.
BASE_URL = "https://support.optisigns.com/api/v2/help_center/articles.json"
# Folder that receives one Markdown file per scraped article.
OUTPUT_DIR = "articles_md"

# Create the output folder up front; exist_ok=True makes re-running this cell
# harmless when the folder already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)


def slugify(url):
    """Return the last path segment of *url* with "-" replaced by "_".

    Trailing slashes are ignored, and the query string and fragment are not
    part of the result. A URL with no path yields an empty string.
    """
    path = urlparse(url).path.rstrip("/")
    # rpartition leaves the whole string in the last slot when there is no "/".
    _, _, last_segment = path.rpartition("/")
    return last_segment.replace("-", "_")


def fetch_articles(timeout=30):
    """Download every article from the paginated listing that starts at BASE_URL.

    Each response is expected to be a JSON object with an "articles" list and
    a "next_page" URL; the loop stops when "next_page" is missing or falsy.

    Args:
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout, ``requests.get`` can block indefinitely.

    Returns:
        A list with the article dicts from all pages, in page order.

    Raises:
        requests.HTTPError: If any page answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    articles = []
    url = BASE_URL
    while url:
        resp = requests.get(url, timeout=timeout)
        # Fail loudly on an error status: an error payload has no "articles"
        # or "next_page" key, so the loop would otherwise end quietly and
        # return a truncated list.
        resp.raise_for_status()
        data = resp.json()
        articles.extend(data.get("articles", []))
        url = data.get("next_page")  # pagination
    return articles


def save_article_as_md(article):
    """Convert one article's HTML body to Markdown and write it to OUTPUT_DIR.

    The file name is the slug of the article's "html_url". When the URL is
    missing, empty, or produces an empty slug, the name falls back to
    ``article_<id>``. The file starts with a level-1 heading holding the
    article title, followed by the converted body.

    Args:
        article: Dict describing the article. Reads the "body", "html_url"
            and "title" keys, and "id" only when the fallback name is needed.

    Raises:
        KeyError: If the fallback name is needed and the article has no "id".
    """
    # A missing or null body becomes an empty document instead of handing
    # None to markdownify.
    markdown_content = md(article.get("body") or "")

    html_url = article.get("html_url")
    slug = slugify(html_url) if html_url else ""
    if not slug:
        # Only touch article["id"] when it is actually needed, so an article
        # with a usable URL but no "id" key still saves.
        slug = f"article_{article['id']}"

    filename = os.path.join(OUTPUT_DIR, f"{slug}.md")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(f"# {article.get('title')}\n\n")
        f.write(markdown_content)


def scrape_all():
    """Fetch every article and write each one as a Markdown file.

    Progress is reported on stdout: a start message, the number of articles
    fetched, and a final message naming the output folder.
    """
    print("Fetching articles...")
    fetched = fetch_articles()
    total = len(fetched)
    print(f"Total articles fetched: {total}")

    # One file per article; save_article_as_md picks the file name.
    for item in fetched:
        save_article_as_md(item)

    print(f"Saved all Markdown files in '{OUTPUT_DIR}' folder.")

In [None]:
# Run the scrape: fetch all articles and write them into OUTPUT_DIR.
scrape_all()

## Chunking


In [None]:
%pip install -q langchain

In [None]:
%pip install -q rich

In [None]:
import os


def get_files_in_directory(directory):
    """List the ".md" and ".json" entries of *directory* as joined paths.

    Entries are returned in the order ``os.listdir`` yields them; other
    extensions are skipped and subdirectories are not searched.
    """
    wanted_suffixes = (".md", ".json")
    return [
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.endswith(wanted_suffixes)
    ]


In [None]:
import json
from langchain.text_splitter import MarkdownTextSplitter
import rich

# Split every scraped file into overlapping chunks and write them as one JSON
# object per line, one output file per input file.
md_files = get_files_in_directory(OUTPUT_DIR)

JSONL_DIR = "jsonl_files"
os.makedirs(JSONL_DIR, exist_ok=True)

# The splitter is configured identically for every file, so build it once
# instead of once per loop iteration.
splitter = MarkdownTextSplitter(chunk_size=800, chunk_overlap=200)

for file in md_files:
    # Read the whole article, then release the handle before writing output.
    with open(file, "r", encoding="utf-8") as f:
        content = f.read()

    # Fields shared by every chunk of this file; "chunk_index" is overwritten
    # per chunk below and is listed here to fix the key order in the output.
    metadata = {
        "file_name": os.path.basename(file),
        "file_path": file,
        "chunk_index": 0,
    }
    chunks = splitter.split_text(content)

    # Output keeps the input's stem. The extension stays ".json" (although
    # the content is JSON Lines) because later cells select these files with
    # endswith(".json").
    outfile = os.path.join(
        JSONL_DIR, os.path.splitext(os.path.basename(file))[0] + ".json"
    )

    with open(outfile, "w", encoding="utf-8") as out:
        for i, chunk in enumerate(chunks):
            chunk_metadata = {**metadata, "chunk_index": i, "content": chunk}
            out.write(json.dumps(chunk_metadata, ensure_ascii=False) + "\n")

# OpenAI Vector store


In [None]:
%pip install -q openai python-dotenv

## Create OpenAI Client


In [None]:
from openai import OpenAI
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment, then build
# the API client from OPENAI_API_KEY. os.environ.get returns None when the
# variable is unset, so no error is raised on this line in that case.
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


## Create Vector Store


In [None]:
# Create a vector store named "Support FAQ" and print the returned object.
# A create call is issued each time this cell runs.
vector_store = client.vector_stores.create(name="Support FAQ")
print(vector_store)

In [None]:
# ID of the vector store that the cells below operate on.
# NOTE(review): hard-coded rather than read from `vector_store.id`;
# presumably copied from an earlier run — confirm it is the intended store.
vector_store_id = "vs_687a68688564819184ac96b514b8f083"

## Check vector store storage


In [None]:
# Fetch the vector store listing and print the raw response for inspection.
vector_stores = client.vector_stores.list()
print(vector_stores)

## Upload files to vector store


In [None]:
# Collect the files in OUTPUT_DIR and open the first two in binary mode.
# The handles stay open because later cells pass `file_streams` to the upload
# calls.
# NOTE(review): nothing in this file closes these handles — close them once
# the uploads are done.
# NOTE(review): this reads OUTPUT_DIR (the Markdown files), not JSONL_DIR
# (the chunked files) — confirm which set is meant to be uploaded.
files = get_files_in_directory(OUTPUT_DIR)
file_streams = [open(file, "rb") for file in files[:2]]


In [None]:
# Show the first collected path (raises IndexError if `files` is empty).
files[0]

In [None]:
# Upload only the first opened stream (file_streams[:1]) to the vector store
# and print the returned batch object.
# NOTE(review): upload_and_poll presumably blocks until the batch has been
# processed — confirm against the SDK documentation.
file_batch = client.vector_stores.file_batches.upload_and_poll(
    vector_store_id=vector_store_id, files=file_streams[:1]
)
print(file_batch)

## Individual file upload


In [None]:
# Upload the local file "a.jsonl" with purpose "assistants". The `with` block
# closes the file handle once the upload call returns; passing a bare open()
# left it open for the rest of the session.
with open("a.jsonl", "rb") as jsonl_file:
    result = client.files.create(file=jsonl_file, purpose="assistants")

In [None]:
# Show the ID of the object returned by client.files.create.
result.id

In [None]:
# Attach a file to the vector store by its file ID.
# NOTE(review): the file ID is hard-coded; presumably it should be
# `result.id` from the single-file upload — confirm.
client.vector_stores.file_batches.create(
    vector_store_id=vector_store_id, file_ids=["file-2casDiXq1CLeUXPnX9Ad3U"]
)

## Clear all files in vector store


In [None]:
import rich

# Pretty-print the response of client.files.list().
rich.print(client.files.list())

In [None]:
# client.files.list() is called without a vector store ID, so this collects
# the ID of every file that listing returns, not only the files attached to
# `vector_store_id`. The message reports exactly that.
file_ids = [file.id for file in client.files.list()]
print(len(file_ids), "files returned by client.files.list().")

In [None]:
# Delete every file whose ID was collected in `file_ids`.
# NOTE(review): `file_ids` comes from client.files.list(), which is not
# filtered by vector_store_id, so this presumably removes files that were
# never attached to the vector store as well — confirm before running.
for file_id in file_ids:
    client.files.delete(file_id)

## Update files in vector store


In [None]:
# NOTE(review): this call passes no file ID and no vector_store_id, unlike
# the other vector_stores.files calls in this file; it presumably fails with
# a missing-argument error — confirm the intended arguments.
client.vector_stores.files.retrieve(extra_query={})

In [None]:
import rich

# List the files attached to the vector store, print how many entries the
# response's `.data` holds, and pretty-print the whole response.
vector_store_files = client.vector_stores.files.list(vector_store_id=vector_store_id)
print(len(vector_store_files.data))
rich.print(vector_store_files)

# Pick the first listed entry as the deletion target. Nothing is deleted in
# this cell; it only records and announces the ID. Indexing [0] raises
# IndexError when `.data` is empty.
id_to_delete = vector_store_files.data[0].id
print(f"Deleting file with ID: {id_to_delete}")

In [None]:
# client.vector_stores.files.delete does not actually delete the file; it only
# removes the file from the vector store. Kept here for reference:
# deleted_vector_store_file = client.vector_stores.files.delete(
#     vector_store_id=vector_store_id,
#     file_id=id_to_delete
# )
# Delete the file itself via the files API instead.
client.files.delete(id_to_delete)
print(f"Deleted file with ID: {id_to_delete}")

In [None]:
def update_file_in_vector_store(
    client, vector_store_id, deleted_file_id, file_streams: list
):
    """Replace a file: delete the old upload, then upload the new streams.

    Args:
        client: Object exposing ``files.delete`` and
            ``vector_stores.file_batches.upload_and_poll``.
        vector_store_id: ID of the vector store receiving the new files.
        deleted_file_id: ID of the file to delete first.
        file_streams: Open binary file objects to upload.

    Returns:
        Whatever ``upload_and_poll`` returns for the new batch.
    """
    try:
        client.files.delete(deleted_file_id)
    except Exception as err:
        # Best effort: a failed delete is reported but must not stop the upload.
        print(f"Error deleting file {deleted_file_id}: {err}")

    batches = client.vector_stores.file_batches
    return batches.upload_and_poll(
        vector_store_id=vector_store_id, files=file_streams
    )

In [None]:
# Upload the second opened stream (file_streams[1:2]) to the vector store.
# This calls the SDK directly; update_file_in_vector_store is not used here.
update_file = client.vector_stores.file_batches.upload_and_poll(
    vector_store_id=vector_store_id, files=file_streams[1:2]
)

## Example retrieval


In [None]:
# Query the vector store for "youtube", asking for at most five results.
response = client.vector_stores.search(
    vector_store_id=vector_store_id, query="youtube", max_num_results=5
)

In [None]:
# Pretty-print the first search result (IndexError if `.data` is empty).
rich.print(response.data[0])

# Check for new updates


## Connect to Supabase PostgreSQL


In [None]:
import os
from supabase import create_client, Client
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
load_dotenv()

# Connection settings for Supabase. os.environ.get returns None when a
# variable is unset, so the `str` annotations only hold if both are defined.
url: str = os.environ.get("SUPABASE_URL")
key: str = os.environ.get("SUPABASE_KEY")
supabase: Client = create_client(url, key)

In [None]:
import hashlib


def get_hash_from_files(file_paths):
    """Return the SHA-256 hex digest of each file's contents.

    Every file gets its own hash object, so a digest depends only on that
    file's bytes. Sharing one hash object across files would make each digest
    cumulative over all earlier files and sensitive to listing order.

    Args:
        file_paths: Iterable of paths to read in binary mode.

    Returns:
        A list of hex digest strings, one per path, in input order.

    Raises:
        OSError: If a path cannot be opened or read.
    """
    hashes = []

    for file_path in file_paths:
        sha256_hash = hashlib.sha256()
        with open(file_path, "rb") as f:
            # Read in fixed-size blocks so large files are never held in
            # memory whole; iter() stops at the empty bytes sentinel (EOF).
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
        hashes.append(sha256_hash.hexdigest())

    return hashes


# Preview: hash at most the first four ".json" entries that os.listdir
# returns for JSONL_DIR; the cell's value is the resulting list.
get_hash_from_files(
    [
        os.path.join(JSONL_DIR, file)
        for file in os.listdir(JSONL_DIR)
        if file.endswith(".json")
    ][:4]
)

In [None]:
# Read the "hash" column of the "scraped_articles" table and collect the
# values into a list, then display it.
response = supabase.table("scraped_articles").select("hash").execute()
old_hashes = [item["hash"] for item in response.data]
old_hashes

In [None]:
from datetime import datetime, timezone

# Hash the current chunk files and keep a record for each file whose digest
# is not among the hashes already stored (`old_hashes`).
new_files = get_files_in_directory(JSONL_DIR)
new_hashes = get_hash_from_files(new_files)

delta_files = [
    dict(
        # Record ID is the file name: the text after the last path separator.
        id=path.rsplit(os.sep, 1)[-1],
        hash=digest,
        updated_at=datetime.now(timezone.utc).isoformat(),
    )
    for path, digest in zip(new_files, new_hashes)
    if digest not in old_hashes
]

In [None]:
# Show at most the first four new-or-changed records.
delta_files[:4]

In [None]:
from datetime import datetime, timezone

# Print the current UTC time as an ISO 8601 string, the same format used for
# the "updated_at" field of the delta records.
now_utc = datetime.now(timezone.utc).isoformat()
print(now_utc)
