In [1]:
import sys
import os
from dotenv import load_dotenv

# Resolve the project root (the parent of the current working directory) and
# put it on sys.path so modules located there can be imported from this notebook.
root_dir = os.path.abspath("..")
if root_dir not in sys.path:
    # Guard keeps sys.path free of duplicate entries when this cell is re-run.
    sys.path.append(root_dir)

# Load environment variables from the .env file at the project root.
# The call stays as the cell's last expression so its return value is shown as
# the cell output (presumably a flag for whether anything was loaded — verify
# against the python-dotenv docs).
dotenv_path = os.path.join(root_dir, ".env")
load_dotenv(dotenv_path)

False

In [None]:
from pathlib import Path
import polars as pl
import numpy as np
from sentence_transformers import SentenceTransformer

In [None]:
# --- Configuration -----------------------------------------------------------
# All inputs and outputs live under <root_dir>/data.
_data_dir = os.path.join(root_dir, "data")

# Source parquet file that holds the column to embed.
PARQUET_PATH = os.path.join(_data_dir, "processed_flight_features_test.parquet")

# Directory that receives the per-chunk .npz embedding files.
OUTPUT_DIR = os.path.join(_data_dir, "embedded_flight_feature_lite_test")

# Directory presumably intended for parquet-format embedding output.
PARQUET_OUT_DIR = os.path.join(_data_dir, "embedded_flight_feature_lite_parquet_test")

# Column read from the parquet file, and the SentenceTransformer model to load.
COL_NAME = "flight_text"
MODEL_NAME = "all-MiniLM-L6-v2"

In [None]:
# Lazily reference only the text column and tag every row with a running
# "row_id" index.
scan = (
    pl.scan_parquet(PARQUET_PATH)
    .select([COL_NAME])
    .with_row_index("row_id")
)

# Count rows with the streaming engine; the collected frame is 1x1, so .item()
# extracts its single value.
row_count = scan.select(pl.len()).collect(engine="streaming").item()
print(f"[INFO] Total rows: {row_count}")

In [None]:
# Instantiate the sentence-embedding model identified by MODEL_NAME; the
# resulting `model` object is what later cells call .encode() on.
model = SentenceTransformer(MODEL_NAME)

In [None]:
# Embed COL_NAME chunk by chunk and write one compressed .npz file per chunk.
# Each file stores the chunk's row ids ("row_ids") next to its embedding
# matrix ("embeddings") so the parts can be merged and re-ordered later.
BATCH_SIZE = 128  # rows per chunk file; also passed to encode() as batch_size

# np.savez_compressed does not create missing directories, so make sure the
# output directory exists before the first write.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Build the lazy query once; every iteration only adds a slice on top of it
# instead of re-declaring the scan and filtering on the row index.
indexed_texts = (
    pl.scan_parquet(PARQUET_PATH)
    .select([COL_NAME])
    .with_row_index("row_id")
)

start = 0
chunk_idx = 0

while start < row_count:
    end = min(start + BATCH_SIZE, row_count)
    print(f"[INFO] Processing rows {start} to {end} of {row_count}")

    # Slicing *after* with_row_index keeps the file-wide row numbers, so this
    # selects exactly the rows with start <= row_id < end.
    df_chunk = indexed_texts.slice(start, end - start).collect(engine="streaming")

    if df_chunk.is_empty():
        # row_count was computed in an earlier cell; if the file has shrunk
        # since then there is nothing left to encode.
        print(f"[WARN] No rows returned for {start} to {end}; stopping early")
        break

    texts = df_chunk[COL_NAME].to_list()
    row_ids = df_chunk["row_id"].to_list()

    # A chunk never holds more than BATCH_SIZE texts, so a single encode()
    # call covers it; the result has one row per text.
    all_embeddings = model.encode(
        texts,
        batch_size=BATCH_SIZE,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )

    # Save embeddings and row_ids
    out_file = os.path.join(OUTPUT_DIR, f"embeddings_part{chunk_idx:05d}.npz")
    np.savez_compressed(out_file, row_ids=np.array(row_ids), embeddings=all_embeddings)
    print(f"[INFO] Saved {len(row_ids)} embeddings to {out_file}")

    # Next chunk
    start = end
    chunk_idx += 1


In [None]:
# Merge every per-chunk .npz file into a single table with one "row_id" column
# plus one "emb_<i>" column per embedding dimension, then write it as parquet.

# Find all .npz files. OUTPUT_DIR is a plain string path, so it is wrapped in
# Path to get .glob(); sorting gives a stable load order.
npz_files = sorted(Path(OUTPUT_DIR).glob("embeddings_part*.npz"))
print(f"[INFO] Found {len(npz_files)} chunk files")
if not npz_files:
    # Fail with a clear message instead of an opaque error from pl.concat([]).
    raise FileNotFoundError(f"No embeddings_part*.npz files found in {OUTPUT_DIR}")

# Destination of the merged table.
# NOTE(review): MERGED_PARQUET was referenced but never defined in the visible
# cells; it is placed inside PARQUET_OUT_DIR here and the file name is a
# choice made during review — confirm the intended location.
os.makedirs(PARQUET_OUT_DIR, exist_ok=True)
MERGED_PARQUET = os.path.join(PARQUET_OUT_DIR, "embeddings_merged.parquet")

all_tables = []

for f in npz_files:
    # The context manager closes the archive once both arrays have been read.
    with np.load(f) as data:
        row_ids = data["row_ids"]           # one id per embedded row
        embeddings = data["embeddings"]     # 2-D: (rows, embedding dim)
    n_samples, dim = embeddings.shape

    # Build column names for embeddings: one column per embedding dimension
    embed_cols = {f"emb_{i}": embeddings[:, i] for i in range(dim)}

    # Create a Polars DataFrame for this chunk
    df = pl.DataFrame({
        "row_id": row_ids,
        **embed_cols
    })

    all_tables.append(df)
    print(f"[INFO] Loaded {n_samples} embeddings from {f}")

# Concatenate all DataFrames
merged_df = pl.concat(all_tables, how="vertical")

# Sort by row_id so the rows follow the order assigned when embedding
merged_df = merged_df.sort("row_id")

# Save to Parquet
merged_df.write_parquet(MERGED_PARQUET)
print(f"[INFO] Saved merged embeddings to {MERGED_PARQUET}")