In [4]:
%pip install -q requests markdownify

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


| Area                                | Points |
| ----------------------------------- | ------ |
| Scrape & clean quality              | 25     |
| API-based vector-store upload works | 20     |
| Daily job deployment & logs         | 15     |
| Code clarity + README               | 10     |
| Bonus (tests, improvements)         | +5     |


In [None]:
import requests
from markdownify import markdownify as md
import os
from urllib.parse import urlparse

BASE_URL = "https://support.optisigns.com/api/v2/help_center/articles.json"
OUTPUT_DIR = "articles_md"

os.makedirs(OUTPUT_DIR, exist_ok=True)

def slugify(url):
    parsed = urlparse(url)
    slug = parsed.path.rstrip('/').split('/')[-1]
    return slug.replace('-', '_')

def fetch_articles():
    articles = []
    url = BASE_URL
    while url:
        resp = requests.get(url)
        data = resp.json()
        articles.extend(data.get("articles", []))
        url = data.get("next_page")  # pagination
    return articles

def save_article_as_md(article):
    html_content = article.get("body", "")
    markdown_content = md(html_content)

    slug = slugify(article.get("html_url", f"article_{article['id']}"))
    filename = os.path.join(OUTPUT_DIR, f"{slug}.md")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(f"# {article.get('title')}\n\n")
        f.write(markdown_content)

def main():
    print("Fetching articles...")
    articles = fetch_articles()
    print(f"Total articles fetched: {len(articles)}")

    for article in articles:
        save_article_as_md(article)

    print(f"Saved all Markdown files in '{OUTPUT_DIR}' folder.")



In [None]:
main()


Fetching articles...
Total articles fetched: 400
Saved all Markdown files in 'articles_md' folder.


In [5]:
%pip install -q openai python-dotenv

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


In [8]:

vector_store = client.vector_stores.create(
  name="Support FAQ"
)
print(vector_store)


VectorStore(id='vs_68792514975881919f957c135a4e5233', created_at=1752769812, file_counts=FileCounts(cancelled=0, completed=0, failed=0, in_progress=0, total=0), last_active_at=1752769812, metadata={}, name='Support FAQ', object='vector_store', status='completed', usage_bytes=0, expires_after=None, expires_at=None)


In [50]:
vector_stores = client.vector_stores.list()
print(vector_stores)

SyncCursorPage[VectorStore](data=[VectorStore(id='vs_68792514975881919f957c135a4e5233', created_at=1752769812, file_counts=FileCounts(cancelled=0, completed=39, failed=0, in_progress=0, total=39), last_active_at=1752824213, metadata={}, name='Support FAQ', object='vector_store', status='completed', usage_bytes=215673, expires_after=None, expires_at=None)], has_more=False, object='list', first_id='vs_68792514975881919f957c135a4e5233', last_id='vs_68792514975881919f957c135a4e5233')


In [4]:
vector_store_files = client.vector_stores.files.list(
  vector_store_id="vs_68792514975881919f957c135a4e5233"
)
print(vector_store_files[0])

TypeError: 'SyncCursorPage[VectorStoreFile]' object is not subscriptable

In [16]:
# get all files in articales_md directory
import os

def get_files_in_directory(directory):
    files = []
    for filename in os.listdir(directory):
        if filename.endswith(".md"):
            files.append(os.path.join(directory, filename))
    return files


In [22]:
files = get_files_in_directory("articles_md")
file_streams = [open(file, "rb") for file in files[:40]]

In [23]:
file_batch = client.vector_stores.file_batches.upload_and_poll(
  vector_store_id="vs_68792514975881919f957c135a4e5233", files=file_streams
)

In [25]:
print(file_batch)

VectorStoreFileBatch(id='vsfb_9d7fcf8392524d8996274ecf1e0e274f', created_at=1752770996, file_counts=FileCounts(cancelled=0, completed=40, failed=0, in_progress=0, total=40), object='vector_store.file_batch', status='completed', vector_store_id='vs_68792514975881919f957c135a4e5233')


In [31]:
print(vector_store_files.to_dict())

{'data': [{'id': 'file-3GqxRzLak5wythBK9x27pC', 'created_at': 1752770993, 'last_error': None, 'object': 'vector_store.file', 'status': 'completed', 'usage_bytes': 3587, 'vector_store_id': 'vs_68792514975881919f957c135a4e5233', 'attributes': {}, 'chunking_strategy': {'static': {'chunk_overlap_tokens': 400, 'max_chunk_size_tokens': 800}, 'type': 'static'}}, {'id': 'file-JgGbkvytLj8eSujsigJcF8', 'created_at': 1752770993, 'last_error': None, 'object': 'vector_store.file', 'status': 'completed', 'usage_bytes': 5206, 'vector_store_id': 'vs_68792514975881919f957c135a4e5233', 'attributes': {}, 'chunking_strategy': {'static': {'chunk_overlap_tokens': 400, 'max_chunk_size_tokens': 800}, 'type': 'static'}}, {'id': 'file-RzpcknQw6cYRD4C7c4An4N', 'created_at': 1752770993, 'last_error': None, 'object': 'vector_store.file', 'status': 'completed', 'usage_bytes': 6101, 'vector_store_id': 'vs_68792514975881919f957c135a4e5233', 'attributes': {}, 'chunking_strategy': {'static': {'chunk_overlap_tokens': 40

In [41]:
response = client.vector_stores.search(
    vector_store_id="vs_68792514975881919f957c135a4e5233",
    query="youtube",
    max_num_results=5
)

In [None]:
response.data[4].to_dict()

AttributeError: 'VectorStores' object has no attribute 'embeddings'

In [33]:
%pip install -q supabase

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [34]:
import os
from supabase import create_client, Client
from dotenv import load_dotenv

load_dotenv()

url: str = os.environ.get("SUPABASE_URL")
key: str = os.environ.get("SUPABASE_KEY")
supabase: Client = create_client(url, key)

In [39]:
response = (
    supabase.table("scraped_articles")
    .upsert({"id": 1, "updated_at": "2024-06-29T09:20:00Z", "hash": "bc19de..."})
    .execute()
)

In [45]:
import json
import hashlib

def hash(data):
    data_json = json.dumps(data, sort_keys=True)
    hash_digest = hashlib.sha256(data_json.encode("utf-8")).hexdigest()
    
    return hash_digest


In [49]:
client.vector_stores.files.delete(
    vector_store_id="vs_68792514975881919f957c135a4e5233",
    file_id="file-6m1cFquGdJhUaWz212Qod1"
)

VectorStoreFileDeleted(id='file-6m1cFquGdJhUaWz212Qod1', deleted=True, object='vector_store.file.deleted')