#### Because of time constraints, I wasn't able to complete the data embedding with OpenAI

In [1]:
import sys
import os
from dotenv import load_dotenv

# Resolve the parent of the current working directory; everything below is
# addressed relative to it.
root_dir = os.path.abspath("..")
# The .env file is looked up directly inside that directory.
dotenv_path = os.path.join(root_dir, ".env")

# Put the resolved directory on the module search path.
sys.path.append(root_dir)
# Load the .env file last so the cell still displays load_dotenv's return value.
load_dotenv(dotenv_path)

True

In [2]:
from pathlib import Path
import math
import polars as pl
import openai
import json
import glob
import hashlib

In [3]:
# Every artefact this notebook reads or writes sits under <root_dir>/data.
_data_dir = os.path.join(root_dir, "data")

# Input table that is scanned for the text column.
PARQUET_PATH = os.path.join(_data_dir, "processed_flight_features_train.parquet")
# Path of a single batch-input JSONL file.
BATCH_INPUT_JSONL = os.path.join(_data_dir, "batch_input.jsonl")
# Directory that receives the per-part batch request files.
BATCH_DIR = os.path.join(_data_dir, "batch_job")
# NOTE: despite the ".parquet" suffix, BatchEmbed treats this as a directory
# and writes its parquet files inside it.
OUTPUT_DIR = os.path.join(_data_dir, "embedded_flight_feature.parquet")

# Column whose text is embedded, and the embedding model requested for it.
COL_NAME = "flight_text"
MODEL_NAME = "text-embedding-3-small"

In [4]:
# Read the API key from the process environment; it is None when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [5]:
# Lazy query: keep only the text column and attach a running "row_id" index,
# which later becomes the custom_id of each batch request.
scan = (
    pl.scan_parquet(PARQUET_PATH)
    .select(COL_NAME)
    .with_row_index("row_id")
)

# Count the rows with the streaming engine and unwrap the single-cell result.
row_count = scan.select(pl.len()).collect(engine="streaming").item()
print(f"[INFO] Total rows: {row_count}")

[INFO] Total rows: 18145372


In [None]:
# Split the scanned table into at most SPLIT JSONL request files, one
# /v1/embeddings request per row, for later upload as batch input.
SPLIT = 400

# Rows per file, rounded up so that SPLIT files are always enough.
chunk_size = math.ceil(row_count / SPLIT)
print(f"[INFO] Each file will have up to {chunk_size} rows")

# open() below does not create missing parent directories, so make sure the
# target directory exists before the first write.
os.makedirs(BATCH_DIR, exist_ok=True)

# Stepping by chunk_size visits exactly the non-empty chunks; max(..., 1)
# keeps range() valid when the table is empty (chunk_size == 0).
for split_idx, start in enumerate(range(0, row_count, max(chunk_size, 1))):
    end = min(start + chunk_size, row_count)

    print(f"[INFO] Processing rows {start}..{end} into part {split_idx}")

    # Collect only this slice, using the same `engine=` spelling as the row
    # count query rather than the older `streaming=True` flag.
    batch_df = scan.slice(start, chunk_size).collect(engine="streaming")

    # Write one request object per line.
    out_path = os.path.join(BATCH_DIR, f"batch_part_{split_idx}.jsonl")
    with open(out_path, "w", encoding="utf-8") as f:
        # Iterate rows directly instead of building two full Python lists.
        for rid, txt in batch_df.select("row_id", COL_NAME).iter_rows():
            line = {
                # custom_id carries the row index so results can be matched
                # back to their source row.
                "custom_id": f"row_{rid}",
                "method": "POST",
                "url": "/v1/embeddings",
                "body": {
                    "model": MODEL_NAME,
                    "input": txt,
                },
            }
            f.write(json.dumps(line, ensure_ascii=False) + "\n")

    print(f"[INFO] Saved {batch_df.height} rows to {out_path}")

In [6]:
# Module-level OpenAI client.
client = openai.OpenAI()

# Glob pattern matching the chunked JSONL request files.
CHUNK_PATTERN = os.path.join(BATCH_DIR, "batch_part_*.jsonl")


def _part_sort_key(path):
    """Sort key that orders ``batch_part_N.jsonl`` paths by the integer N.

    Paths whose suffix is not a plain number sort after all numbered parts,
    in name order, instead of raising.
    """
    suffix = Path(path).stem.rpartition("_")[2]
    if suffix.isdigit():
        return (0, int(suffix), "")
    return (1, 0, path)


# Order by part number. A plain sorted() is lexicographic ("batch_part_10"
# sorts before "batch_part_2"), so chunk_files[0:10] would be parts
# 0, 1, 10, 100..106 rather than 0..9.
# NOTE(review): any wave already submitted under the old lexicographic order
# covers different parts than the same slice does now; reconcile before
# submitting further waves.
chunk_files = sorted(glob.glob(CHUNK_PATTERN), key=_part_sort_key)

In [7]:
class BatchEmbed:
    def __init__(self, jsonl_files, output_path: str | Path, file_tag: str):
        """
        jsonl_files: list of up to 10 paths to your JSONL input files
        """
        if len(jsonl_files) > 10:
            raise ValueError("You can only handle up to 10 files per batch wave.")
        self.jsonl_files = jsonl_files
        
        # Internal memory
        self.file_ids = []   # will hold OpenAI file IDs after send()
        self.batch_ids = []  # will hold OpenAI batch IDs after batch()
        
        # Client
        self.client = openai.OpenAI()
        
        # Save
        self.output_path = output_path
        self.file_tag = file_tag

    def send(self):
        """Upload all JSONL files and save file IDs."""
        for path in self.jsonl_files:
            with open(path, "rb") as f:
                uploaded = self.client.files.create(file=f, purpose="batch")
                self.file_ids.append(uploaded.id)
                print(f"[INFO] Uploaded {path} → file_id: {uploaded.id}")
        print("[INFO] All file_ids:", self.file_ids)

    def manual_file_id_insert(self, *file_ids):
        self.file_ids = file_ids

    def batch(self):
        """Create batch jobs from uploaded file IDs."""
        if not self.file_ids:
            raise RuntimeError("No file_ids found. Run send() first.")
        for idx, fid in enumerate(self.file_ids):
            batch = self.client.batches.create(
                input_file_id=fid,
                endpoint="/v1/embeddings",
                completion_window="24h",
                metadata={"description": f"flight embeddings job part {idx}"}
            )
            self.batch_ids.append(batch.id)
            print(f"[INFO] Created batch job for file_id {fid} → batch_id: {batch.id}")
        print("[INFO] All batch_ids:", self.batch_ids)

    def manual_batch_id_insert(self, *batch_ids):
        self.batch_ids = batch_ids

    def retrieve(self):
        """
        Retrieve results for each completed batch and write to parquet.
        Run only after you confirm via dashboard that all batch_ids are completed.
        """
        os.makedirs(self.output_path, exist_ok=True)
        if not self.batch_ids:
            raise RuntimeError("No batch_ids found. Run batch() first.")

        all_rows, all_embs = [], []
        for bid in self.batch_ids:
            status = self.client.batches.retrieve(bid)
            if status.status != "completed":
                print(f"[WARN] Batch {bid} not completed (status={status.status}), skipping.")
                continue
            output_file_id = status.output_file_id
            print(f"[INFO] Retrieving output for batch_id {bid}")

            output_content = self.client.files.content(output_file_id)

            # parse lines
            for line in output_content.iter_lines():
                if not line:
                    continue
                data = json.loads(line)
                if data.get("error"):
                    continue
                rid = int(data["custom_id"].replace("row_", ""))
                emb = data["response"]["body"]["data"][0]["embedding"]
                all_rows.append(rid)
                all_embs.append(emb)

        if not all_embs:
            print("[WARN] No embeddings found in any batch.")
            return

        # build one merged dataframe
        emb_cols = {f"emb_{j}": [v[j] for v in all_embs] for j in range(len(all_embs[0]))}
        df = pl.DataFrame({"row_id": all_rows, **emb_cols}).sort("row_id")

        out_path = os.path.join(self.output_path, f"embeddings_{self.file_tag}.parquet")
        df.write_parquet(out_path)
        print(f"[INFO] Saved merged parquet with {len(all_rows)} rows → {out_path}")

In [None]:
# Slice of chunk_files handled by this wave, plus a hash of the bounds that
# ends up in the output parquet file name.
START, END = 10, 20
FILE_TAG = hashlib.sha256(f"{START}_{END}".encode()).hexdigest()

embedder = BatchEmbed(
    jsonl_files=chunk_files[START:END],
    output_path=OUTPUT_DIR,
    file_tag=FILE_TAG,
)

# Step 1: upload the request files.
# Keep a copy of the printed file_ids somewhere safe.
embedder.send()

# Step 2: create one batch job per uploaded file.
# Keep a copy of the printed batch_ids somewhere safe.
embedder.batch()

# Step 3: run later, once the OpenAI dashboard shows the jobs as completed.
embedder.retrieve()

In [None]:
# First wave: collect results using batch ids that are already known.
START, END = 0, 10
FILE_TAG = hashlib.sha256(f"{START}_{END}".encode()).hexdigest()
# No input files are needed because nothing is uploaded in this cell.
embedder = BatchEmbed(jsonl_files=[], output_path=OUTPUT_DIR, file_tag=FILE_TAG)

# Ids of the batch jobs whose output is downloaded below.
FIRST_WAVE_BATCH_IDS = (
    "batch_687deaf1221881909a54e7f29c9a4ea7",
    "batch_687deae736648190b90313d7c362920a",
    "batch_687deade9e908190b72ccea7b1d3e551",
    "batch_687dead3b3a88190a9af44ab66e65a8d",
    "batch_687deacaabe08190b338c5db0bf26e75",
    "batch_687deabfc11881909545df5bfa6d5a90",
    "batch_687deab3db74819080aaf944aa255d53",
    "batch_687deaab1aec8190abf09cb13acb5756",
    "batch_687deaa275e08190a29f67bf9c9cb4be",
    "batch_687dea91e1c481908202a9c930b91dd1",
)
embedder.manual_batch_id_insert(*FIRST_WAVE_BATCH_IDS)
embedder.retrieve()

[INFO] Retrieving output for batch_id batch_687deaf1221881909a54e7f29c9a4ea7
[INFO] Retrieving output for batch_id batch_687deae736648190b90313d7c362920a
[INFO] Retrieving output for batch_id batch_687deade9e908190b72ccea7b1d3e551
[INFO] Retrieving output for batch_id batch_687dead3b3a88190a9af44ab66e65a8d
[INFO] Retrieving output for batch_id batch_687deacaabe08190b338c5db0bf26e75
[INFO] Retrieving output for batch_id batch_687deabfc11881909545df5bfa6d5a90
[INFO] Retrieving output for batch_id batch_687deab3db74819080aaf944aa255d53
[INFO] Retrieving output for batch_id batch_687deaab1aec8190abf09cb13acb5756
[INFO] Retrieving output for batch_id batch_687deaa275e08190a29f67bf9c9cb4be
[INFO] Retrieving output for batch_id batch_687dea91e1c481908202a9c930b91dd1


### Engineering Job Record

| Batch Start, End | Start DT | End DT |
|------------------|----------|--------|
|0, 10| 25/07/21 16:20 | 25/07/22 10:30 |
|10, 20 | 25/07/22 10:30 |  |
|20, 30 |              |       |
|30, 40 |              |       |
|40, 50 |              |       |
|50, 60 |              |       |
|60, 70 |              |       |
|70, 80 |              |       |
|80, 90 |              |       |
|90, 100 |              |       |
|100, 110 |              |       |
|110, 120 |              |       |
|120, 130 |              |       |
|130, 140 |              |       |
|140, 150 |              |       |
|150, 160 |              |       |
|160, 170 |              |       |
|170, 180 |              |       |
|180, 190 |              |       |
|190, 200 |              |       |
|200, 210 |              |       |
|210, 220 |              |       |
|220, 230 |              |       |
|230, 240 |              |       |
|240, 250 |              |       |
|250, 260 |              |       |
|260, 270 |              |       |
|270, 280 |              |       |
|280, 290 |              |       |
|290, 300 |              |       |
|300, 310 |              |       |
|310, 320 |              |       |
|320, 330 |              |       |
|330, 340 |              |       |
|340, 350 |              |       |
|350, 360 |              |       |
|360, 370 |              |       |
|370, 380 |              |       |
|380, 390 |              |       |
|390, 400 |              |       |



## Optional - Embedding with HuggingFace Qwen

* I couldn't get this to run on my Mac

```python
# Imports this snippet needs in addition to the notebook's own
# (os, pl and root_dir come from the cells above).
import numpy as np
import torch
from sentence_transformers import SentenceTransformer

# Config
PARQUET_PATH = os.path.join(root_dir, "data", "processed_flight_features_train.parquet")
OUTPUT_DIR = os.path.join(root_dir, "data", "embed")
BATCH_SIZE = 32      # texts per model.encode() mini-batch
CHUNK_SIZE = 8192    # rows loaded from parquet and saved per .npy part
DEVICE = "mps"
COL_NAME = "flight_text"
MODEL = "Qwen/Qwen3-Embedding-0.6B"

# np.save() does not create missing directories.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the model
model = SentenceTransformer(MODEL)
model.to(torch.device(DEVICE))  # move model to the configured device

scan = pl.scan_parquet(PARQUET_PATH).select([COL_NAME])
row_count = scan.select(pl.len()).collect(engine="streaming")[0, 0]
print(f"[INFO] Total rows: {row_count}")

for start in range(0, row_count, CHUNK_SIZE):
    part = start // CHUNK_SIZE
    print(f"[INFO] Processing rows {start}..{min(start+CHUNK_SIZE, row_count)}")

    # load a chunk from parquet
    batch_df = scan.slice(start, CHUNK_SIZE).collect(engine="streaming")
    texts = batch_df.get_column(COL_NAME).to_list()
    del batch_df  # free as soon as possible

    # embed without tracking gradients
    with torch.no_grad():
        embeddings = model.encode(
            texts,
            batch_size=BATCH_SIZE,
            device=DEVICE,
            convert_to_tensor=True,
            show_progress_bar=True,
            normalize_embeddings=True,
        )

    # copy to host memory and persist this part
    np.save(os.path.join(OUTPUT_DIR, f"embeddings_part_{part}.npy"),
            embeddings.cpu().numpy())
    print(f"[INFO] Saved part {part}")

    # drop references and clear the MPS cache before the next chunk
    del texts
    del embeddings
    torch.mps.empty_cache()

print("[✅] All done.")
```