In [1]:
# Install all necessary libraries
!pip install -q pandas numpy scikit-learn lightgbm timm torch torchvision sentence-transformers tqdm

# Import libraries
import pandas as pd
import numpy as np
import os
import shutil # For deleting directories
from PIL import Image
from tqdm.notebook import tqdm
import requests
from io import BytesIO
import time

# Image and Text Processing
import torch
import timm
from torchvision import transforms
from sentence_transformers import SentenceTransformer

# Modeling
import lightgbm as lgb

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation notices and pandas
# assignment warnings raised by later cells will not be displayed either.
import warnings
warnings.filterwarnings('ignore')

# Confirmation message only; it is reached once all imports above have succeeded.
print("Libraries installed and imported successfully.")

Libraries installed and imported successfully.


In [2]:

# --- Paths ---
DATA_DIR = '/content/'  # Folder that must contain train.csv and test.csv
TEMP_IMAGE_DIR = '/content/temp_images/' # Temporary folder for batch image downloads

# --- Settings ---
# Run the models on the GPU when PyTorch can see one, otherwise on the CPU.
MODEL_DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
BATCH_SIZE = 1000 # Rows handled per download/featurize batch (2000 is suggested on Colab for more speed)

print(f"Using device: {MODEL_DEVICE}") # Expect 'cuda' on a GPU runtime
print(f"Processing in batches of: {BATCH_SIZE}")

# --- Load the datasets from DATA_DIR ---
try:
    train_df = pd.read_csv(f'{DATA_DIR}train.csv')
    test_df = pd.read_csv(f'{DATA_DIR}test.csv')
except FileNotFoundError:
    print(f"Error: Make sure you have uploaded train.csv and test.csv to the '{DATA_DIR}' folder.")
    # Re-raise so the run stops here. Merely printing would leave train_df/test_df
    # undefined and surface later as an unrelated-looking NameError.
    raise
else:
    print("Datasets loaded successfully.")

Using device: cuda
Processing in batches of: 1000
Datasets loaded successfully.


In [3]:
# --- Model setup (loads pretrained weights, so run it only once) ---
IMG_MODEL_NAME = 'efficientnet_b0'
TEXT_MODEL_NAME = 'all-MiniLM-L6-v2'

# Image encoder. num_classes=0 asks timm for the network without a classifier
# head, so a forward pass yields feature vectors instead of class scores.
img_model = timm.create_model(IMG_MODEL_NAME, pretrained=True, num_classes=0)
img_model = img_model.to(MODEL_DEVICE).eval()  # eval(): inference behaviour for dropout/batch-norm

# Preprocessing applied to every image before it reaches img_model:
# fixed 224x224 resize, tensor conversion, per-channel normalisation.
img_transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Sentence encoder used for the catalog text, placed on the same device.
text_model = SentenceTransformer(TEXT_MODEL_NAME, device=MODEL_DEVICE)


# --- Helper Functions ---
def download_batch_images(df_batch, output_dir):
    """Download the image of every row in ``df_batch`` into ``output_dir``.

    Each image is fetched from the row's ``image_link``, converted to RGB and
    written as ``<sample_id>.jpg``. Rows whose target file already exists are
    skipped. The download is best-effort: a row that fails (network error,
    bad URL, undecodable image, ...) never aborts the batch.

    Args:
        df_batch: DataFrame with ``sample_id`` and ``image_link`` columns.
        output_dir: Directory that receives the JPEG files; created if missing.

    Returns:
        int: Number of rows whose image could not be downloaded and saved.
    """
    os.makedirs(output_dir, exist_ok=True)
    failed = 0
    for _, row in df_batch.iterrows():
        filepath = os.path.join(output_dir, f"{row['sample_id']}.jpg")
        if os.path.exists(filepath):
            continue
        try:
            response = requests.get(row['image_link'], timeout=15)
            response.raise_for_status()
            with Image.open(BytesIO(response.content)) as img:
                img.convert('RGB').save(filepath, 'JPEG')
        except Exception:
            # Deliberately broad: one bad row must not stop the batch.
            failed += 1
            # The file did not exist before this attempt, so anything present now
            # is a partial write; remove it so it is not mistaken for a cached image.
            try:
                if os.path.exists(filepath):
                    os.remove(filepath)
            except OSError:
                pass
    return failed

def get_image_embedding(image_path, model, transform):
    """Return the embedding of a single image as a flat NumPy vector.

    The image is opened, converted to RGB, passed through ``transform``,
    given a leading batch dimension, moved to ``MODEL_DEVICE`` and fed to
    ``model`` with gradients disabled.

    Args:
        image_path: Path of the image file to embed.
        model: Callable image model. If it exposes ``num_features`` (as timm
            models do), that value sizes the fallback vector.
        transform: Callable turning a PIL image into a tensor.

    Returns:
        numpy.ndarray: 1-D embedding. When anything fails (missing or
        unreadable file, transform/model error) a float32 zero vector of
        length ``model.num_features`` (1280 if the attribute is absent) is
        returned, so callers can always stack the results.
    """
    try:
        # Context manager closes the underlying file as soon as pixels are copied.
        with Image.open(image_path) as img:
            image = img.convert('RGB')
        image_tensor = transform(image).unsqueeze(0).to(MODEL_DEVICE)
        with torch.no_grad():
            return model(image_tensor).cpu().numpy().flatten()
    except Exception:
        # Best-effort by design: a missing or corrupt image yields a zero vector.
        # float32 keeps one failure from upcasting the whole stacked batch.
        return np.zeros(getattr(model, 'num_features', 1280), dtype=np.float32)

def process_batch(df_batch, temp_img_dir):
    """Build the feature matrix for one batch of rows.

    Steps: download the batch's images into ``temp_img_dir``, embed each one
    with ``img_model``, embed ``catalog_content`` with ``text_model``, derive
    two hand-crafted text features, and join everything on ``df_batch``'s index.
    Downloaded images are NOT deleted here; the caller removes the directory.

    Args:
        df_batch: DataFrame with ``sample_id``, ``image_link`` and
            ``catalog_content`` columns. It is not modified.
        temp_img_dir: Directory used for this batch's downloaded images.

    Returns:
        pandas.DataFrame: indexed like ``df_batch`` with columns ``ipq``,
        ``text_len``, ``img_<i>`` (image embedding) and ``txt_<j>`` (text
        embedding). The prefixes keep the two embedding blocks from sharing
        the integer labels 0..n, which produced duplicate column names.
    """
    # 1. Download images for this batch
    download_batch_images(df_batch, temp_img_dir)

    # 2. Image features: one vector per row, in row order
    img_features = [
        get_image_embedding(os.path.join(temp_img_dir, f"{sample_id}.jpg"), img_model, img_transform)
        for sample_id in df_batch['sample_id']
    ]
    img_features_df = pd.DataFrame(np.vstack(img_features), index=df_batch.index).add_prefix('img_')

    # 3. Text features. Missing descriptions become '' so the encoder only sees strings.
    catalog_text = df_batch['catalog_content'].fillna('').astype(str)
    text_features = text_model.encode(catalog_text.tolist(), batch_size=64)
    text_features_df = pd.DataFrame(text_features, index=df_batch.index).add_prefix('txt_')

    # 4. Engineered text features, built in a fresh frame so the caller's
    #    DataFrame (usually a slice of the full dataset) is left untouched.
    engineered_feats_df = pd.DataFrame(
        {
            # Number inside "(IPQ <n>)"; rows without the marker default to 1.
            'ipq': catalog_text.str.extract(r'\(IPQ (\d+)\)', expand=False).fillna(1).astype(int),
            'text_len': catalog_text.str.len(),
        },
        index=df_batch.index,
    )

    # 5. Combine all features for the batch
    return pd.concat([engineered_feats_df, img_features_df, text_features_df], axis=1)

print("Helper functions and models are ready.")

model.safetensors:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Helper functions and models are ready.


In [4]:
# --- Model setup (loads pretrained weights, so run it only once) ---
# NOTE(review): this cell repeats the model setup and helper definitions of the
# previous cell; re-running it rebinds the same names to freshly loaded models.
IMG_MODEL_NAME = 'efficientnet_b0'
TEXT_MODEL_NAME = 'all-MiniLM-L6-v2'

# Image encoder. num_classes=0 asks timm for the network without a classifier
# head, so a forward pass yields feature vectors instead of class scores.
img_model = timm.create_model(IMG_MODEL_NAME, pretrained=True, num_classes=0)
img_model = img_model.to(MODEL_DEVICE).eval()  # eval(): inference behaviour for dropout/batch-norm

# Preprocessing applied to every image before it reaches img_model:
# fixed 224x224 resize, tensor conversion, per-channel normalisation.
img_transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Sentence encoder used for the catalog text, placed on the same device.
text_model = SentenceTransformer(TEXT_MODEL_NAME, device=MODEL_DEVICE)


# --- Helper Functions ---
def download_batch_images(df_batch, output_dir):
    """Download the image of every row in ``df_batch`` into ``output_dir``.

    Each image is fetched from the row's ``image_link``, converted to RGB and
    written as ``<sample_id>.jpg``. Rows whose target file already exists are
    skipped. The download is best-effort: a row that fails (network error,
    bad URL, undecodable image, ...) never aborts the batch.

    Args:
        df_batch: DataFrame with ``sample_id`` and ``image_link`` columns.
        output_dir: Directory that receives the JPEG files; created if missing.

    Returns:
        int: Number of rows whose image could not be downloaded and saved.
    """
    os.makedirs(output_dir, exist_ok=True)
    failed = 0
    for _, row in df_batch.iterrows():
        filepath = os.path.join(output_dir, f"{row['sample_id']}.jpg")
        if os.path.exists(filepath):
            continue
        try:
            response = requests.get(row['image_link'], timeout=15)
            response.raise_for_status()
            with Image.open(BytesIO(response.content)) as img:
                img.convert('RGB').save(filepath, 'JPEG')
        except Exception:
            # Deliberately broad: one bad row must not stop the batch.
            failed += 1
            # The file did not exist before this attempt, so anything present now
            # is a partial write; remove it so it is not mistaken for a cached image.
            try:
                if os.path.exists(filepath):
                    os.remove(filepath)
            except OSError:
                pass
    return failed

def get_image_embedding(image_path, model, transform):
    """Return the embedding of a single image as a flat NumPy vector.

    The image is opened, converted to RGB, passed through ``transform``,
    given a leading batch dimension, moved to ``MODEL_DEVICE`` and fed to
    ``model`` with gradients disabled.

    Args:
        image_path: Path of the image file to embed.
        model: Callable image model. If it exposes ``num_features`` (as timm
            models do), that value sizes the fallback vector.
        transform: Callable turning a PIL image into a tensor.

    Returns:
        numpy.ndarray: 1-D embedding. When anything fails (missing or
        unreadable file, transform/model error) a float32 zero vector of
        length ``model.num_features`` (1280 if the attribute is absent) is
        returned, so callers can always stack the results.
    """
    try:
        # Context manager closes the underlying file as soon as pixels are copied.
        with Image.open(image_path) as img:
            image = img.convert('RGB')
        image_tensor = transform(image).unsqueeze(0).to(MODEL_DEVICE)
        with torch.no_grad():
            return model(image_tensor).cpu().numpy().flatten()
    except Exception:
        # Best-effort by design: a missing or corrupt image yields a zero vector.
        # float32 keeps one failure from upcasting the whole stacked batch.
        return np.zeros(getattr(model, 'num_features', 1280), dtype=np.float32)

def process_batch(df_batch, temp_img_dir):
    """Build the feature matrix for one batch of rows.

    Steps: download the batch's images into ``temp_img_dir``, embed each one
    with ``img_model``, embed ``catalog_content`` with ``text_model``, derive
    two hand-crafted text features, and join everything on ``df_batch``'s index.
    Downloaded images are NOT deleted here; the caller removes the directory.

    Args:
        df_batch: DataFrame with ``sample_id``, ``image_link`` and
            ``catalog_content`` columns. It is not modified.
        temp_img_dir: Directory used for this batch's downloaded images.

    Returns:
        pandas.DataFrame: indexed like ``df_batch`` with columns ``ipq``,
        ``text_len``, ``img_<i>`` (image embedding) and ``txt_<j>`` (text
        embedding). The prefixes keep the two embedding blocks from sharing
        the integer labels 0..n, which produced duplicate column names.
    """
    # 1. Download images for this batch
    download_batch_images(df_batch, temp_img_dir)

    # 2. Image features: one vector per row, in row order
    img_features = [
        get_image_embedding(os.path.join(temp_img_dir, f"{sample_id}.jpg"), img_model, img_transform)
        for sample_id in df_batch['sample_id']
    ]
    img_features_df = pd.DataFrame(np.vstack(img_features), index=df_batch.index).add_prefix('img_')

    # 3. Text features. Missing descriptions become '' so the encoder only sees strings.
    catalog_text = df_batch['catalog_content'].fillna('').astype(str)
    text_features = text_model.encode(catalog_text.tolist(), batch_size=64)
    text_features_df = pd.DataFrame(text_features, index=df_batch.index).add_prefix('txt_')

    # 4. Engineered text features, built in a fresh frame so the caller's
    #    DataFrame (usually a slice of the full dataset) is left untouched.
    engineered_feats_df = pd.DataFrame(
        {
            # Number inside "(IPQ <n>)"; rows without the marker default to 1.
            'ipq': catalog_text.str.extract(r'\(IPQ (\d+)\)', expand=False).fillna(1).astype(int),
            'text_len': catalog_text.str.len(),
        },
        index=df_batch.index,
    )

    # 5. Combine all features for the batch
    return pd.concat([engineered_feats_df, img_features_df, text_features_df], axis=1)

print("Helper functions and models are ready.")

Helper functions and models are ready.


In [5]:
# --- Split the test data into consecutive batches of at most BATCH_SIZE rows ---
# Slicing by row position covers every row, also works when len(test_df) is
# smaller than BATCH_SIZE, and never yields a batch larger than BATCH_SIZE.
# (Deriving the batch count with len(test_df) // BATCH_SIZE spread any remainder
# over the batches, making them bigger than the configured size.)
test_batches = [
    test_df.iloc[start:start + BATCH_SIZE]
    for start in range(0, len(test_df), BATCH_SIZE)
]
if not test_batches:
    raise ValueError("test_df is empty; there is nothing to featurize.")

# --- Start Feature Generation for TEST data ---
all_test_features = []
print("\n--- Starting Feature Generation for TEST data ---")
for batch in tqdm(test_batches, desc="Test Batches"):
    try:
        # The same process_batch function is used for every batch
        all_test_features.append(process_batch(batch, TEMP_IMAGE_DIR))
    finally:
        # Always drop this batch's images, even after an error, to save disk space.
        # ignore_errors keeps a cleanup problem from hiding the real exception.
        shutil.rmtree(TEMP_IMAGE_DIR, ignore_errors=True)

# --- Combine all processed batches ---
print("\nConcatenating all generated test features...")
X_test = pd.concat(all_test_features)

# Ensure column names are strings
X_test.columns = [str(col) for col in X_test.columns]

# Every test row must have produced exactly one feature row.
assert len(X_test) == len(test_df), "feature rows do not match test rows"

print("Test feature generation complete!")
print("Final Test Features Shape:", X_test.shape)

# --- Save the generated features to a file ---
# The index is dropped and sample_id is not among the feature columns, so rows
# in the CSV line up with test_df by position only.
X_test.to_csv("X_test_features.csv", index=False)
print("\nSuccessfully saved test features to 'X_test_features.csv'")


--- Starting Feature Generation for TEST data ---


Test Batches:   0%|          | 0/75 [00:00<?, ?it/s]


Concatenating all generated test features...
Test feature generation complete!
Final Test Features Shape: (75000, 1666)

Successfully saved test features to 'X_test_features.csv'


In [8]:
!ls /content/

sample_data  test.csv  train.csv  X_test_features.csv


In [9]:
# Colab-only step: ask the browser to download the generated feature file.
# NOTE(review): the features were saved with a relative path, so this absolute
# path assumes the notebook's working directory is /content/ — confirm if the
# notebook is run elsewhere.
from google.colab import files
files.download("/content/X_test_features.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>