In [1]:
!pip install git+https://github.com/huggingface/transformers accelerate

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-8feqteur
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-8feqteur
  Resolved https://github.com/huggingface/transformers to commit 3927ffed31e3c0d2929bf98bd05b7c61fcc48b62
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting huggingface-hub==1.0.0.rc5 (from transformers==5.0.0.dev0)
  Downloading huggingface_hub-1.0.0rc5-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers==5.0.0.dev0)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.1

In [2]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m61.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [3]:
!pip install num2words

Collecting num2words
  Downloading num2words-0.5.14-py3-none-any.whl.metadata (13 kB)
Collecting docopt>=0.6.2 (from num2words)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading num2words-0.5.14-py3-none-any.whl (163 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.5/163.5 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=a9fd4a6ccd59a27f9027040634609060746ddc73117f34b53f7df3446b4bbedc
  Stored in directory: /root/.cache/pip/wheels/1a/b0/8c/4b75c4116c31f83c8f9f047231251e13cc74481cca4a78a9ce
Successfully built docopt
Installing collected packages: docopt, num2words
Successfully installed docopt-0.6.2 num2words-0.5.14


In [4]:
import re
import pandas as pd
import numpy as np
import os
import json
from tqdm import tqdm
import torch
from num2words import num2words
from transformers import AutoTokenizer, AutoModel

# --- Helper Functions ---
def replace_numbers(match):
    """Spell out the integer captured by a regex match as words.

    Intended as a replacement callback for ``re.sub``.

    Args:
        match: A regex match object whose full match is a run of digits.

    Returns:
        Whatever ``num2words`` produces for the matched integer.
    """
    digits = match.group(0)
    number = int(digits)
    return num2words(number)

# Built once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002700-\U000027BF"
    "\U0001F900-\U0001F9FF"
    "\U00002600-\U000026FF"
    "\U00002B00-\U00002BFF"
    "]+", flags=re.UNICODE
)

# Stand-alone runs of digits; each one is handed to replace_numbers.
_INTEGER_PATTERN = r'\b\d+\b'


def preprocess_product_text(text, func=str.lower):
    """Normalize one raw catalog entry into a compact, labelled text block.

    Steps:
      1. Strip characters in the emoji/symbol ranges of ``_EMOJI_PATTERN``.
      2. Replace stand-alone integers with words via ``replace_numbers``.
      3. Pull out the ``Item Name:``, ``Value:`` and ``Units:`` fields
         (case-insensitive labels; a missing field becomes an empty string).
      4. Append the ``Product Description:`` if there is one; otherwise append
         up to the first three ``Bullet Point N:`` entries.

    Args:
        text: Raw catalog text. Anything that is not a ``str`` (e.g. ``None``
            or a NaN from a DataFrame) is treated as empty text.
        func: Callable applied to every extracted value; lower-cases by default.

    Returns:
        A newline-joined string with the lines ``Item Name: ...``,
        ``Value: ...``, ``Units: ...`` followed by either ``Description: ...``
        or ``Bullet Points i: ...`` lines.
    """
    if not isinstance(text, str):
        text = ''

    without_emoji = _EMOJI_PATTERN.sub(r'', text)
    worded = re.sub(_INTEGER_PATTERN, replace_numbers, without_emoji)

    def extract_field(label):
        # First occurrence of "<label>:" followed by the rest of that line.
        found = re.search(rf'{label}:\s*(.*)', worded, re.IGNORECASE)
        return func(found.group(1).strip()) if found else ''

    item_name = extract_field('Item Name')
    value = extract_field('Value')
    units = extract_field('Units')

    product_desc_match = re.search(
        r'Product Description:\s*(.*?)(?:\n\w+:|$)', worded, re.IGNORECASE | re.DOTALL
    )

    output_lines = [f"Item Name: {item_name}", f"Value: {value}", f"Units: {units}"]

    if product_desc_match:
        description = func(product_desc_match.group(1).strip())
        output_lines.append(f"Description: {description}")
    else:
        # Bullet labels must be located BEFORE numbers are spelled out: once
        # "Bullet Point 1:" has become "Bullet Point one:" the \d+ in the label
        # pattern can no longer match and every bullet would be dropped.
        bullet_points = re.findall(
            r'Bullet Point \d+:\s*(.*)', without_emoji, re.IGNORECASE
        )
        for position, bullet in enumerate(bullet_points[:3], start=1):
            bullet = re.sub(_INTEGER_PATTERN, replace_numbers, bullet)
            output_lines.append(f"Bullet Points {position}: {func(bullet.strip())}")

    return '\n'.join(output_lines)


# --- Config ---
file_path = '/kaggle/input/maalchallenge/student_resource/dataset/train.csv'
batch_size = 12   # texts per forward pass; increase for better GPU utilization
hf_token = os.getenv("HF_TOKEN")  # None when the variable is not set

# The CSV is processed in fixed-size row chunks; this run handles chunk
# number `init_idx`, i.e. rows [start_idx, end_idx).
chunk_size = 9375
init_idx = 7
start_idx = init_idx * chunk_size
end_idx = start_idx + chunk_size

In [5]:
# Re-export the token into the process environment (presumably so that any
# Hugging Face call that reads HF_TOKEN from the environment sees it too).
if hf_token:
    os.environ['HF_TOKEN'] = hf_token

# --- Load Data ---
# Only the rows of this run's chunk are kept. The .copy() detaches the chunk
# from the full frame so that adding a column below writes to an independent
# DataFrame rather than to a slice (avoids pandas' SettingWithCopyWarning).
df = pd.read_csv(file_path).iloc[start_idx:end_idx].copy()
df['processed_text'] = df['catalog_content'].apply(preprocess_product_text)

# --- Load HF model + tokenizer with device_map="auto" ---
model_name = "Qwen/Qwen3-Embedding-4B"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModel.from_pretrained(
    model_name,
    device_map="auto",
    token=hf_token
)
# Switch to evaluation mode; as the cell's last expression this also displays
# the module tree in the cell output.
model.eval()

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Qwen3Model(
  (embed_tokens): Embedding(151665, 2560)
  (layers): ModuleList(
    (0-35): 36 x Qwen3DecoderLayer(
      (self_attn): Qwen3Attention(
        (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
        (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
        (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
        (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
        (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
        (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
      )
      (mlp): Qwen3MLP(
        (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
        (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
        (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
      (post_attention_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
    )
  )
  (norm): Qwen3RM

In [6]:
# --- Helper: mean pooling like sentence-transformers ---
def mean_pooling(model_output, attention_mask):
    """Average the token vectors of each sequence, ignoring masked positions.

    Args:
        model_output: Object exposing ``last_hidden_state`` with shape
            (batch, seq_len, hidden).
        attention_mask: Tensor of shape (batch, seq_len); positions with 0
            contribute nothing to the average.

    Returns:
        Tensor of shape (batch, hidden) holding one pooled vector per sequence.
    """
    hidden_states = model_output.last_hidden_state
    # Broadcast the mask over the hidden dimension so it can weight every feature.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * weights, dim=1)
    # The lower bound keeps a fully masked row from dividing by zero.
    token_counts = torch.clamp(weights.sum(dim=1), min=1e-9)
    return weighted_sum / token_counts


# --- Generate embeddings ---
# NOTE(review): pooling here is a masked mean over tokens. The Qwen3-Embedding
# model card describes last-token pooling -- confirm that mean pooling is the
# intended choice. Switching would make this file incomparable with any chunk
# files already written by earlier runs.
embeddings_dict = {}
texts = df['processed_text'].tolist()
sample_ids = df['sample_id'].tolist()
num_batches = (len(texts) + batch_size - 1) // batch_size  # ceiling division

for batch_idx in tqdm(range(num_batches), desc="Processing batches"):
    with torch.no_grad():  # inference only; no autograd bookkeeping
        start = batch_idx * batch_size
        end = min(start + batch_size, len(texts))
        batch_texts = texts[start:end]
        batch_ids = sample_ids[start:end]

        # Pad to the longest text in the batch; cut anything beyond max_length tokens.
        encodings = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512
        )

        # Put the input tensors on the device the model reports as its own.
        encodings = {k: v.to(model.device) for k, v in encodings.items()}

        outputs = model(**encodings)
        batch_embeddings = mean_pooling(outputs, encodings["attention_mask"])

        # Copy the whole batch to the host once (instead of once per row) and
        # convert to plain nested lists so the result is JSON-serializable.
        batch_vectors = batch_embeddings.cpu().tolist()
        embeddings_dict.update(zip(batch_ids, batch_vectors))

        # Drop references to this batch's tensors before releasing cached GPU memory.
        del batch_embeddings, batch_vectors, batch_texts, batch_ids, encodings, outputs
        torch.cuda.empty_cache()

# --- Save embeddings ---
# One file per chunk, named after the chunk index. JSON object keys are always
# strings, so non-string ids (e.g. ints) are written out as strings.
output_file = f"embeddings_{init_idx}.json"
with open(output_file, "w") as f:
    json.dump(embeddings_dict, f)

print(f"Saved {len(embeddings_dict)} embeddings to {output_file}")

Processing batches: 100%|██████████| 782/782 [2:10:22<00:00, 10.00s/it]


Saved 9375 embeddings to embeddings_7.json
