In [1]:
!pip install sentence-transformers faiss-cpu lightgbm pandas


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-

In [2]:
import pandas as pd
import json
from pathlib import Path
import logging
import faiss
import numpy as np
import pickle
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
import re

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


2025-07-23 09:34:53.155901: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753263293.390866      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753263293.461341      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
def clean_html(text):
    """Removes HTML tags from a string."""
    if not isinstance(text, str):
        return ""
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

def process_json_file(file_path):
    """
    Loads a single JSON file and extracts relevant product data.
    Creates a composite text field for robustness by including more metadata.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        product = data.get('data', {})
        
        # --- Create a more descriptive, composite text field ---
        name = product.get('productDisplayName', '')
        category = product.get('masterCategory', {}).get('typeName', '')
        sub_category = product.get('subCategory', {}).get('typeName', '')
        article_type = product.get('articleType', {}).get('typeName', '')
        
        # --- NEW: Add more descriptive fields ---
        base_colour = product.get('baseColour', '')
        season = product.get('season', '')
        usage = product.get('usage', '')
        
        # Process displayCategories (e.g., "Bottomwear,Casual Wear") into "Bottomwear Casual Wear"
        display_categories = product.get('displayCategories', '').replace(',', ' ')
        
        # Process articleAttributes dictionary (e.g., {'Pattern': 'Printed'}) into "Printed"
        attributes = product.get('articleAttributes', {})
        attribute_text = ' '.join(str(v) for v in attributes.values())
        
        # The original description is optional
        description_value = product.get('productDescriptors', {}).get('description', {}).get('value', '')
        
        # Combine all text fields into a single, rich string.
        full_text = ' '.join([
            name, category, sub_category, article_type, 
            base_colour, season, usage, display_categories, 
            attribute_text, clean_html(description_value)
        ]).strip()
        
        # Replace multiple spaces with a single space for cleanliness
        full_text = re.sub(r'\s+', ' ', full_text)
        
        if not name or not full_text:
            return None
            
        return {
            'id': product.get('id'),
            'name': name,
            'full_text': full_text,
            'brand': product.get('brandName', ''),
            'gender': product.get('gender', ''),
            'masterCategory': category,
            'price': product.get('price', 0),
            'usage': usage, # Also store usage separately
            'baseColour': base_colour # Also store color separately
        }
    except Exception as e:
        return None

# --- CORRECTED PATH ---
input_directory = Path("/kaggle/input/fashion-product-images-dataset/fashion-dataset/styles")
output_path_data = "/kaggle/working/fashion_data_bert_faiss.pkl"
output_path_faiss = "/kaggle/working/fashion_faiss.index"

logging.info("Loading and processing product data...")
json_files = list(input_directory.glob("*.json"))

if not json_files:
    raise FileNotFoundError(f"No JSON files found in the directory: {input_directory}. Please ensure the dataset is added and the path is correct.")

products = [process_json_file(f) for f in json_files]
products = [p for p in products if p is not None]

logging.info(f"Attempted to process {len(json_files)} files. Successfully loaded data for {len(products)} products.")

if not products:
    raise ValueError("The 'products' list is empty. No valid product data could be extracted from the JSON files.")

products_df = pd.DataFrame(products)

products_df.dropna(subset=['id', 'full_text'], inplace=True)
products_df.reset_index(drop=True, inplace=True)

logging.info(f"Created DataFrame with {len(products_df)} products.")
print(products_df.head())


      id                                   name  \
0   1566                  Artengo Men Black Cap   
1  39342              Fila Men Zoom Black Shoes   
2  13182  FILA Men Dls Maxim Black Sports Shoes   
3  21762             Fossil Men Mead Black Belt   
4  48692           Lucera Women Silver Earrings   

                                           full_text    brand gender  \
0  Artengo Men Black Cap Accessories Headwear Cap...  Artengo    Men   
1  Fila Men Zoom Black Shoes Footwear Shoes Casua...     FILA    Men   
2  FILA Men Dls Maxim Black Sports Shoes Footwear...     FILA    Men   
3  Fossil Men Mead Black Belt Accessories Belts B...   Fossil    Men   
4  Lucera Women Silver Earrings Accessories Jewel...   Lucera  Women   

  masterCategory   price   usage baseColour  
0    Accessories   299.0  Sports      Black  
1       Footwear  2199.0  Casual      Black  
2       Footwear  2499.0  Sports      Black  
3    Accessories  1695.0  Casual      Black  
4    Accessories  1525.0  Casu

In [4]:
logging.info("Loading SentenceTransformer model...")
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name) 

logging.info("Generating embeddings for product text (this will take some time on CPU)...")
text_to_embed = products_df['full_text'].tolist()
embeddings = model.encode(text_to_embed, show_progress_bar=True, convert_to_numpy=True)

logging.info(f"Embeddings generated with shape: {embeddings.shape}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1389 [00:00<?, ?it/s]

In [5]:
logging.info("Building Faiss index on CPU...")
embedding_dimension = embeddings.shape[1]

index = faiss.IndexFlatL2(embedding_dimension)
index.add(embeddings.astype('float32'))

logging.info(f"Faiss index built. Total vectors in index: {index.ntotal}")

faiss.write_index(index, output_path_faiss)
logging.info(f"Faiss index saved to {output_path_faiss}")

with open(output_path_data, 'wb') as f:
    pickle.dump({'data': products_df}, f)
logging.info(f"Product data saved to {output_path_data}")


In [6]:
logging.info("Generating training data for the ranker...")

faiss_index_cpu = faiss.read_index(output_path_faiss)

sample_queries_df = products_df.sample(100, random_state=42)
query_embeddings = model.encode(sample_queries_df['full_text'].tolist(), show_progress_bar=True)

distances, indices = faiss_index_cpu.search(query_embeddings.astype('float32'), k=100)

training_data = []
for i in range(len(indices)):
    candidate_indices = indices[i]
    candidates_df = products_df.iloc[candidate_indices].copy()
    
    candidates_df['retrieval_score'] = 1 - (distances[i]**2 / 2)
    
    labels = [1] * 10 + [0] * 90
    candidates_df['label'] = labels
    training_data.append(candidates_df)

ranker_df = pd.concat(training_data, ignore_index=True)
logging.info(f"Generated ranking dataset with {len(ranker_df)} samples.")
print(ranker_df.head())


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

      id                                      name  \
0   5576   Nike Men's Free Run Grey Red White Shoe   
1   5533      Nike Men's Free Run Blue Orange Shoe   
2  36141    Nike Men Free 4.0 V2 Grey Sports Shoes   
3  14345    Nike Men Free Run +2 Grey Sports Shoes   
4  36837  Nike Women Free 4.0 V2 Grey Sports Shoes   

                                           full_text brand gender  \
0  Nike Men's Free Run Grey Red White Shoe Footwe...  Nike    Men   
1  Nike Men's Free Run Blue Orange Shoe Footwear ...  Nike    Men   
2  Nike Men Free 4.0 V2 Grey Sports Shoes Footwea...  Nike    Men   
3  Nike Men Free Run +2 Grey Sports Shoes Footwea...  Nike    Men   
4  Nike Women Free 4.0 V2 Grey Sports Shoes Footw...  Nike  Women   

  masterCategory   price   usage baseColour  retrieval_score  label  
0       Footwear  5595.0  Sports       Grey         1.000000      1  
1       Footwear  5595.0  Sports       Blue         0.998468      1  
2       Footwear  5795.0  Sports       Grey       

In [7]:
logging.info("Engineering features and training the LightGBM ranker...")

features = [
    'retrieval_score',
    'price' 
]

X = ranker_df[features]
y = ranker_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

lgb_ranker = lgb.LGBMClassifier(objective='binary', random_state=42, n_estimators=100)
lgb_ranker.fit(X_train, y_train,
               eval_set=[(X_test, y_test)],
               eval_metric='logloss',
               callbacks=[lgb.early_stopping(10, verbose=False)])

logging.info("Ranker training complete.")


[LightGBM] [Info] Number of positive: 800, number of negative: 7200
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001233 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 509
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.100000 -> initscore=-2.197225
[LightGBM] [Info] Start training from score -2.197225


In [8]:
output_path_ranker = "/kaggle/working/ranker_model.pkl"

with open(output_path_ranker, 'wb') as f:
    pickle.dump(lgb_ranker, f)

logging.info(f"✅ Ranker model saved to {output_path_ranker}")
