# Scrape

In [None]:
%pip install -q requests markdownify

In [1]:
import requests
from markdownify import markdownify as md
import os
from urllib.parse import urlparse

BASE_URL = "https://support.optisigns.com/api/v2/help_center/articles.json"
OUTPUT_DIR = "articles_md"

os.makedirs(OUTPUT_DIR, exist_ok=True)

def slugify(url):
    parsed = urlparse(url)
    slug = parsed.path.rstrip('/').split('/')[-1]
    return slug.replace('-', '_')

def fetch_articles():
    articles = []
    url = BASE_URL
    while url:
        resp = requests.get(url)
        data = resp.json()
        articles.extend(data.get("articles", []))
        url = data.get("next_page")  # pagination
    return articles

def save_article_as_md(article):
    html_content = article.get("body", "")
    markdown_content = md(html_content)

    slug = slugify(article.get("html_url", f"article_{article['id']}"))
    filename = os.path.join(OUTPUT_DIR, f"{slug}.md")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(f"# {article.get('title')}\n\n")
        f.write(markdown_content)

def scrape_all():
    print("Fetching articles...")
    articles = fetch_articles()
    print(f"Total articles fetched: {len(articles)}")

    for article in articles:
        save_article_as_md(article)

    print(f"Saved all Markdown files in '{OUTPUT_DIR}' folder.")

In [2]:
scrape_all()

Fetching articles...
Total articles fetched: 399
Saved all Markdown files in 'articles_md' folder.


## Chunking

In [41]:
%pip install -q langchain

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [49]:
%pip install -q rich

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import os

def get_files_in_directory(directory):
    files = []
    for filename in os.listdir(directory):
        if filename.endswith(".md") or filename.endswith(".json"):
            files.append(os.path.join(directory, filename))
    return files


In [89]:
import json
from langchain.text_splitter import MarkdownTextSplitter
import rich
import builtins

md_files = get_files_in_directory(OUTPUT_DIR)

JSONL_DIR = "jsonl_files"
os.makedirs(JSONL_DIR, exist_ok=True)

for file in md_files:
    with open(file, "r", encoding="utf-8") as f:
        content = f.read()
        metadata = {
            "file_name": os.path.basename(file),
            "file_path": file,
            "chunk_index": 0
        }
        # print(metadata)

        splitter = MarkdownTextSplitter(chunk_size=800, chunk_overlap=200)
        chunks = splitter.split_text(content)
        outfile = os.path.join(JSONL_DIR, os.path.splitext(file)[0].split(os.sep)[-1] + ".json")
        # print("outfile:", outfile, type(outfile))

        with open(outfile, "w", encoding="utf-8") as out:
            for i, chunk in enumerate(chunks):
                chunk_metadata = metadata.copy()
                chunk_metadata["chunk_index"] = i
                chunk_metadata["content"] = chunk
                # rich.print(chunk_metadata)
                out.write(json.dumps(chunk_metadata, ensure_ascii=False) + "\n")
                # break
    # break

# OpenAI Vector store

In [3]:
%pip install -q openai python-dotenv

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Create OpenAI Client

In [14]:
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


## Create Vector Store

In [75]:
vector_store = client.vector_stores.create(
  name="Support FAQ"
)
print(vector_store)

VectorStore(id='vs_687a06cbcfb48191a6027bd29ba44c7f', created_at=1752827595, file_counts=FileCounts(cancelled=0, completed=0, failed=0, in_progress=0, total=0), last_active_at=1752827595, metadata={}, name='Support FAQ', object='vector_store', status='completed', usage_bytes=0, expires_after=None, expires_at=None)


In [76]:
vector_store_id = "vs_687a06cbcfb48191a6027bd29ba44c7f"

## Check vector store storage

In [77]:
vector_stores = client.vector_stores.list()
print(vector_stores)

SyncCursorPage[VectorStore](data=[VectorStore(id='vs_687a06cbcfb48191a6027bd29ba44c7f', created_at=1752827595, file_counts=FileCounts(cancelled=0, completed=0, failed=0, in_progress=0, total=0), last_active_at=1752827595, metadata={}, name='Support FAQ', object='vector_store', status='completed', usage_bytes=0, expires_after=None, expires_at=None)], has_more=False, object='list', first_id='vs_687a06cbcfb48191a6027bd29ba44c7f', last_id='vs_687a06cbcfb48191a6027bd29ba44c7f')


## Upload files to vector store

In [90]:
files = get_files_in_directory(JSONL_DIR)
file_streams = [open(file, "rb") for file in files[:40]]


In [91]:
file_batch = client.vector_stores.file_batches.upload_and_poll(
  vector_store_id=vector_store_id, files=file_streams
)
print(file_batch)

VectorStoreFileBatch(id='vsfb_2a638ea8fdc2457fa3a6d0cac4e11e85', created_at=1752827840, file_counts=FileCounts(cancelled=0, completed=40, failed=0, in_progress=0, total=40), object='vector_store.file_batch', status='completed', vector_store_id='vs_687a06cbcfb48191a6027bd29ba44c7f')


## Clear all files in vector store

In [35]:
# Clear the whole vector store
client.vector_stores.delete(vector_store_id=vector_store_id)
print(f"Vector store {vector_store_id} deleted.")

Vector store vs_68792514975881919f957c135a4e5233 deleted.


## Example retrieval

In [95]:
response = client.vector_stores.search(
    vector_store_id=vector_store_id,
    query="youtube",
    max_num_results=5
)

In [100]:
rich.print(response.data[0])

# Check for new updates

## Connect to Supabase PostgreSQL

In [101]:
import os
from supabase import create_client, Client
from dotenv import load_dotenv

load_dotenv()

url: str = os.environ.get("SUPABASE_URL")
key: str = os.environ.get("SUPABASE_KEY")
supabase: Client = create_client(url, key)

In [108]:
import hashlib

def get_hash_from_files(file_paths):
    sha256_hash = hashlib.sha256()
    hashes = []
    
    for file_path in file_paths:
        with open(file_path, "rb") as f:
            for byte_block in iter(lambda: f.read(4096), b""):
                sha256_hash.update(byte_block)
        hashes.append(sha256_hash.hexdigest())
        
    return hashes

get_hash_from_files([os.path.join(JSONL_DIR, file) for file in os.listdir(JSONL_DIR) if file.endswith(".json")][:4])

['13f23053968bba6523f356dce43f77f1b409532884a20aa5a07496f277910a74',
 '1ef6c17b104a131995e13899dd7744e6ab2275aa1dee814a8363536a9fe9e4c2',
 '367d3125be88ef0e570b0d4ae2931fe6d027ff3c5afffd9328d69051d4c3f879',
 'ccfb05ec0bd31fd0dc1ac32a46d8597a34a9d63b77a99952dd26912b67f793b4']

In [113]:
response = (
    supabase.table("scraped_articles")
    .select("hash")
    .execute()
)
old_hashes = [item['hash'] for item in response.data]
old_hashes

['bc19de...', '098y7gyefstvhjidwa0e9hubhd']

In [121]:
from datetime import datetime, timezone

new_files = get_files_in_directory(JSONL_DIR)
new_hashes = get_hash_from_files(new_files)

delta_files = [
    {"id": file.split(os.sep)[-1], "hash": hash_value, "updated_at": datetime.now(timezone.utc).isoformat()}
    for file, hash_value in zip(new_files, new_hashes)
    if hash_value not in old_hashes
]

In [124]:
delta_files[:4]

[{'id': '10159088954387_How_to_put_Live_TV_on_your_screens_with_OptiSigns.json',
  'hash': '13f23053968bba6523f356dce43f77f1b409532884a20aa5a07496f277910a74',
  'updated_at': '2025-07-18T09:35:26.987686+00:00'},
 {'id': '10259143299219_How_to_use_Website_Screenshot_app.json',
  'hash': '1ef6c17b104a131995e13899dd7744e6ab2275aa1dee814a8363536a9fe9e4c2',
  'updated_at': '2025-07-18T09:35:26.987706+00:00'},
 {'id': '10517038025363_How_to_Customize_Branding_and_Fonts_in_the_OptiSigns_Designer_Brand_Kit.json',
  'hash': '367d3125be88ef0e570b0d4ae2931fe6d027ff3c5afffd9328d69051d4c3f879',
  'updated_at': '2025-07-18T09:35:26.987714+00:00'},
 {'id': '11032964830227_How_to_use_Monday_com_App.json',
  'hash': 'ccfb05ec0bd31fd0dc1ac32a46d8597a34a9d63b77a99952dd26912b67f793b4',
  'updated_at': '2025-07-18T09:35:26.987719+00:00'}]

In [115]:
from datetime import datetime, timezone

now_utc = datetime.now(timezone.utc).isoformat()
print(now_utc)


2025-07-18T09:32:59.130523+00:00
