In [3]:
import torch
import pandas as pd, numpy as np, os
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# --- Run configuration and encoder setup ---
print("Libraries imported successfully.")

# Use the GPU when PyTorch reports one is available; otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

MODEL_NAME = "all-MiniLM-L6-v2"  # sentence-transformers checkpoint to load
BATCH_SIZE = 64  # batch size passed to model.encode
print(f"Using device: {DEVICE}")

# Instantiate the encoder directly on the chosen device.
model = SentenceTransformer(MODEL_NAME, device=DEVICE)
print("SentenceTransformer model loaded.")


# Load the raw train/test splits and stack them into one frame so both are
# embedded in a single pass; `is_train` records which split each row came from.
train_df = pd.read_csv("../data/raw/dataset/train.csv")
test_df = pd.read_csv("../data/raw/dataset/test.csv")
full_df = pd.concat([train_df.assign(is_train=1), test_df.assign(is_train=0)], ignore_index=True)

# Use the 'catalog_content' for embeddings.
# pandas reads empty CSV cells as float NaN, which is not valid text input for
# the encoder. Replace missing values with "" and coerce to str so every row
# still gets an embedding and row order stays aligned with full_df.
n_missing = int(full_df['catalog_content'].isna().sum())
if n_missing:
    print(f"Warning: {n_missing} rows have missing 'catalog_content'; encoding them as empty strings.")
texts_to_encode = full_df['catalog_content'].fillna("").astype(str).tolist()

print("Starting text embedding generation...")
text_embeddings = model.encode(texts_to_encode, batch_size=BATCH_SIZE, show_progress_bar=True)
print(f"Text embeddings generated with shape: {text_embeddings.shape}")

# Downstream code relies on positional alignment between embeddings and rows.
assert text_embeddings.shape[0] == len(full_df), "embedding rows must match full_df rows"

# One column per embedding dimension: txt_emb_0, txt_emb_1, ...
embedding_cols = ["txt_emb_%d" % dim for dim in range(text_embeddings.shape[1])]
text_features_df = pd.DataFrame(data=text_embeddings, columns=embedding_cols)

# Make sure the output directory exists before writing.
PROCESSED_DIR = "../data/processed"
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Persist the features as parquet, without the DataFrame index.
SAVE_PATH = os.path.join(PROCESSED_DIR, "text_embeddings_v1.parquet")
text_features_df.to_parquet(SAVE_PATH, index=False)
print(f"\n Text embedding features saved to: {SAVE_PATH}")

Libraries imported successfully.
Using device: cpu
SentenceTransformer model loaded.
Starting text embedding generation...


Batches:   0%|          | 0/2344 [00:00<?, ?it/s]

Text embeddings generated with shape: (150000, 384)

 Text embedding features saved to: ../data/processed\text_embeddings_v1.parquet
