In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss, numpy as np

# --- Load -------------------------------------------------------------
# Read the shipments parquet straight from S3 using signed (non-anonymous)
# access.
s3_path = "s3://fcopilot-shreemathitumkur-20250708-oh/raw/shipments.parquet"
df = pd.read_parquet(s3_path, storage_options=dict([('anon', False)]))

# --- Text column ------------------------------------------------------
# Missing delay reasons become empty strings so every row yields a str
# (adjust the column name if your schema differs).
delay_reason_filled = df['delay_reason'].fillna("")
texts = delay_reason_filled.astype(str).tolist()

# --- Model ------------------------------------------------------------
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# --- Embed ------------------------------------------------------------
# Encode in batches of 64, then coerce to a float32 ndarray, which is the
# dtype the faiss index is fed with later on.
embeddings = np.asarray(
    model.encode(texts, batch_size=64, show_progress_bar=True)
).astype("float32")

# One row per text; the second dimension is the embedding width.
print("Embeddings shape:", embeddings.shape)


ModuleNotFoundError: No module named 'pandas'

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss, numpy as np

# Source parquet on S3, read with signed (non-anonymous) access.
s3_path = "s3://fcopilot-shreemathitumkur-20250708-oh/raw/shipments.parquet"
df = pd.read_parquet(s3_path, storage_options=dict([('anon', False)]))

# One string per row; missing delay reasons are replaced by "".
delay_reason_filled = df['delay_reason'].fillna("")
texts = delay_reason_filled.astype(str).tolist()

# Sentence-embedding model used both here and for queries later on.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Encode in batches of 64 and cast the result to float32.
raw_vectors = model.encode(texts, batch_size=64, show_progress_bar=True)
embeddings = raw_vectors.astype("float32")

print("Embeddings shape:", embeddings.shape)


  from .autonotebook import tqdm as notebook_tqdm
severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.

Batches: 100%|███████████████████████████████████████| 16/16 [00:03<00:00,  4.50it/s]

Embeddings shape: (1000, 384)





In [2]:
import faiss, numpy as np, pathlib, pickle

# Build a flat L2 index (no training step) sized to the embedding width,
# then add every embedding row to it.
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)
print("Index size:", index.ntotal)    # expected to equal the number of embedded rows

# Make sure the output directory exists, then write the index to disk.
index_dir = pathlib.Path("data/index")
index_dir.mkdir(parents=True, exist_ok=True)
faiss.write_index(index, "data/index/shipments.faiss")

# Persist the look-up table next to the index: a list of
# {'delay_reason': ...} records in DataFrame row order.
# NOTE(review): values come from df, not from `texts`, so missing reasons
# are stored as-is rather than as "" — confirm that is what loaders expect.
delay_reason_records = df[['delay_reason']].to_dict("records")
with open("data/index/id_map.pkl", "wb") as fh:
    pickle.dump(delay_reason_records, fh)

print("Index & mapping saved ✔︎")


Index size: 1000
Index & mapping saved ✔︎


In [3]:
def search(query, k=3):
    """Print the indexed delay reasons nearest to ``query``.

    The query is encoded with the notebook-level ``model`` and looked up in
    the notebook-level FAISS ``index``. Each hit is printed as
    ``rank. (distance) text`` with the text truncated to 120 characters; the
    distance is whatever the index reports (squared L2 for ``IndexFlatL2``).

    Args:
        query: Free-text query string.
        k: Number of neighbours to request from the index.

    Returns:
        None. Results are written to stdout.
    """
    q_emb = model.encode([query]).astype("float32")
    distances, idx = index.search(q_emb, k)
    for rank, (i, d) in enumerate(zip(idx[0], distances[0]), 1):
        if i < 0:
            # FAISS fills unused result slots with id -1 when it cannot
            # return k neighbours; there is no text to show for those.
            continue
        # Ids returned by the index are insertion positions, so read from
        # `texts` (the list that was embedded) instead of a label-based
        # df.loc lookup. `texts` also has "" in place of missing values,
        # so slicing can never hit a NaN.
        print(f"{rank}. ({d:.2f}) {texts[i][:120]}")

search("truck delayed due to traffic", k=3)


1. (1.11) Traffic
2. (1.11) Traffic
3. (1.11) Traffic
