In [1]:
import glob
from matplotlib import pyplot as plt
import tqdm
from transformers import CLIPProcessor, CLIPModel
import torch
from PIL import Image
import sys
import json
from sklearn.metrics import f1_score
import numpy as np
from typing import List, Tuple
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import json
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sentence_transformers import SentenceTransformer
import re
import rich
from rich import print as rprint
import os
import hashlib
from datetime import datetime
import json


In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [13]:
CONFIG = {
    "embedder": "intfloat/multilingual-e5-large-instruct",
    "embedding_dimension": 1024,
    "use_description": True,
    "use_instruct": False,
}

EMBEDDINGS_DIMENSION = CONFIG["embedding_dimension"]
USE_DESCRIPTION = CONFIG["use_description"]
USE_INSTRUCT = CONFIG["use_instruct"]



In [14]:
class ClassificationHead(nn.Module):
    def __init__(self, input_dim, num_classes, dropout_rate=0.3):
        super().__init__()
        # Input dimension depends on whether we're using description
        combined_dim = input_dim * 2 if USE_DESCRIPTION else input_dim
        self.classifier = nn.Sequential(
            nn.Linear(combined_dim, 512),
            nn.BatchNorm1d(512),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, num_classes)
        )
    
    def forward(self, x1, x2=None):
        if USE_DESCRIPTION:
            combined = torch.cat((x1, x2), dim=1)
        else:
            combined = x1
        return self.classifier(combined)


# load model from path
path = "models/intfloat/multilingual-e5-large-instruct-0.86f1.pt"

text_classifier = ClassificationHead(EMBEDDINGS_DIMENSION, 27)
text_classifier.load_state_dict(torch.load(path))
text_classifier.to(device)

  text_classifier.load_state_dict(torch.load(path))


ClassificationHead(
  (classifier): Sequential(
    (0): Linear(in_features=2048, out_features=512, bias=True)
    (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): GELU(approximate='none')
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=512, out_features=256, bias=True)
    (5): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): GELU(approximate='none')
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=256, out_features=27, bias=True)
  )
)

In [19]:
def get_cache_path(texts, embedder_name, use_instruct):
    """Generate a unique cache path based on input texts, embedder, and instruct setting"""
    text_hash = hashlib.md5(''.join(texts).encode()).hexdigest()
    embedder_hash = hashlib.md5(embedder_name.encode()).hexdigest()
    instruct_suffix = '_instruct' if use_instruct else ''
    return f'cache/embeddings_{embedder_hash}_{text_hash}{instruct_suffix}.pt'
# Function to get embeddings in batches

# Load model
model = SentenceTransformer(CONFIG["embedder"], trust_remote_code=True).to('cuda')
model.train()

def get_embeddings(texts, batch_size=32):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Getting embeddings"):
        batch = texts[i:i + batch_size]
        # Only apply instruct formatting if USE_INSTRUCT is True
        processed_batch = [
            get_detailed_instruct("Match similar products based on their features, characteristics, and intended use.", text) 
            if USE_INSTRUCT and text.strip()
            else text 
            for text in batch
        ]
        with torch.no_grad():
            batch_embeddings = model.encode(processed_batch, max_length=EMBEDDINGS_DIMENSION)
            if isinstance(batch_embeddings, np.ndarray):
                batch_embeddings = torch.from_numpy(batch_embeddings)
            assert batch_embeddings.shape[1] == EMBEDDINGS_DIMENSION, f"Model output dimension mismatch. Expected {EMBEDDINGS_DIMENSION}, got {batch_embeddings.shape[1]}"
            embeddings.append(batch_embeddings)
    return torch.cat(embeddings, dim=0)

def load_or_compute_embeddings(texts, embedder_name, batch_size=32):
    """Load embeddings from cache if they exist, otherwise compute and cache them"""
    # Create cache directory if it doesn't exist
    os.makedirs('cache', exist_ok=True)
    
    cache_path = get_cache_path(texts, embedder_name, USE_INSTRUCT)
    
    # Try to load from cache
    if os.path.exists(cache_path):
        print(f"Loading embeddings from cache: {cache_path}")
        return torch.load(cache_path)
    
    # Compute embeddings
    print("Computing new embeddings...")
    embeddings = get_embeddings(texts, batch_size)
    
    # Save to cache
    print(f"Saving embeddings to cache: {cache_path}")
    torch.save(embeddings, cache_path)
    
    return embeddings


def preprocess(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = text.lower()
    text = text.strip()
    return text


In [20]:
# Custom dataset class
class TextClassificationDataset(Dataset):
    def __init__(self, designation_embeddings, description_embeddings, labels):
        assert designation_embeddings.shape[1] == EMBEDDINGS_DIMENSION, f"Designation embeddings dimension mismatch. Expected {EMBEDDINGS_DIMENSION}, got {designation_embeddings.shape[1]}"
        if USE_DESCRIPTION:
            assert description_embeddings.shape[1] == EMBEDDINGS_DIMENSION, f"Description embeddings dimension mismatch. Expected {EMBEDDINGS_DIMENSION}, got {description_embeddings.shape[1]}"
        self.designation_embeddings = designation_embeddings
        self.description_embeddings = description_embeddings if USE_DESCRIPTION else None
        self.label_encoder = LabelEncoder()
        self.labels = self.label_encoder.fit_transform(labels)

    def __len__(self):
        return len(self.designation_embeddings)
    
    def __getitem__(self, idx):
        if USE_DESCRIPTION:
            return (self.designation_embeddings[idx], 
                    self.description_embeddings[idx], 
                    self.labels[idx])
        return (self.designation_embeddings[idx], self.labels[idx])


df = pd.read_csv('X_test.csv')
designations = [preprocess(text) for text in df['designation'].tolist()]
descriptions = [preprocess(text) for text in df['description'].tolist()]

labels = [0] * len(designations)
for i in range(len(labels)):
    labels[i] = i

designations_embeddings = load_or_compute_embeddings(designations, CONFIG["embedder"])
descriptions_embeddings = load_or_compute_embeddings(descriptions, CONFIG["embedder"])

dataset = TextClassificationDataset(designations_embeddings, descriptions_embeddings, labels)


Computing new embeddings...


Getting embeddings: 100%|██████████| 432/432 [00:13<00:00, 32.48it/s]


Saving embeddings to cache: cache/embeddings_571f3efdd580e6d678ec91a1b96b1ca1_ff21f37c2835453e2ef54e19667bb025.pt
Computing new embeddings...


Getting embeddings: 100%|██████████| 432/432 [02:05<00:00,  3.44it/s]

Saving embeddings to cache: cache/embeddings_571f3efdd580e6d678ec91a1b96b1ca1_7247ebafd4fcd6572425f0d8704a9543.pt





In [21]:

dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

all_outputs = []

with torch.no_grad():
    for batch in dataloader:
        if USE_DESCRIPTION:
            batch_des, batch_desc, batch_labels = batch
            batch_desc = batch_desc.to(device)
        else:
            batch_des, batch_labels = batch
            batch_desc = None
        
        batch_des = batch_des.to(device)
        outputs = text_classifier(batch_des, batch_desc)
        # apply softmax
        outputs = F.softmax(outputs, dim=1)
        all_outputs.append(outputs)

all_outputs = torch.cat(all_outputs, dim=0)
print(all_outputs.shape)

# save to pt
torch.save(all_outputs, 'text_softmaxes_test.pt')

torch.Size([13812, 27])


In [22]:
# Global model and processor cache
_model = None
_processor = None


def get_model_and_processor():
    """Cache and return the model and processor"""
    global _model, _processor
    if _model is None or _processor is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        _model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
        _processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        
        # Optimize model with torch.compile() if using PyTorch 2.0+
        if hasattr(torch, 'compile'):
            _model = torch.compile(_model)
        
        _model.eval()  # Set model to evaluation mode
    return _model, _processor



def process_batch(image_paths: List[str], categories: List[str], batch_size: int = 32) -> List[List[Tuple[str, float]]]:
    """Process images in batches for better performance"""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, processor = get_model_and_processor()
    
    all_similarities = []
    
    # Process images in batches
    for i in range(0, len(image_paths), batch_size):
        batch_paths = image_paths[i:i + batch_size]
        images = []
        valid_indices = []
        
        # Load images
        for idx, path in enumerate(batch_paths):
            try:
                image = Image.open(path)
                images.append(image)
                valid_indices.append(idx)
            except Exception as e:
                print(f"Error loading image {path}: {e}")
                all_similarities.append(None)
        
        if not images:
            continue
            
        # Process batch
        with torch.no_grad():
            inputs = processor(
                images=images,
                text=[f"a photo of {category}" for category in categories],
                return_tensors="pt",
                padding=True
            ).to(device)
            
            outputs = model(**inputs)
            similarities = outputs.logits_per_image.softmax(dim=-1)
            all_similarities.append(similarities)
    
    return all_similarities

In [25]:
import pandas as pd
df = pd.read_csv("X_test.csv")


all_paths = []

for row in df.itertuples():
    path = f"images/image_test/image_{getattr(row, 'imageid')}_product_{getattr(row, 'productid')}.jpg"
    all_paths.append(path)


print(len(all_paths))

13812


In [26]:
category_ids = {
        "10": "Single book",
        "40": "Video Game Covers",
        "50": "Game Accessories",
        "60": "Game Console",
        "1140": "Video Game Figurines",
        "1160": "Cards",
        "1180": "Movies Figurine",
        "1280": "Plush toy",
        "1281": "Tabletop Game",
        "1300": "Toy Car",
        "1301": "Game Room Accessories",
        "1302": "Outdoor Toys",
        "1320": "Baby accessories",
        "1560": "Furnitures",
        "1920": "Pillows",
        "1940": "Food & Beverages",
        "2060": "Flags and decorations",
        "2220": "Pet Supplies",
        "2280": "Journals",
        "2403": "Book collection",
        "2462": "Game Console (occasion)",
        "2522": "Office Supplies",
        "2582": "Outdoor & Garden",
        "2583": "Spa Supplies",
        "2585": "Tools & Home Improvement",
        "2705": "Literature book",
        "2905": "PC games"
    }

In [27]:
res = process_batch(all_paths, category_ids.values(), 32)
print(type(res))
print(len(res))


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


<class 'list'>
432


In [28]:
full_res = torch.tensor([])

for tensor in res:
    full_res = torch.cat((full_res, tensor.to("cpu")), dim=0)

full_res = full_res.cpu().numpy()
print(full_res.shape)

(13812, 27)


In [29]:
# convert to pt
full_res = torch.tensor(full_res)

# save as pt
torch.save(full_res, "image_softmaxes_test.pt")
full_res.shape

torch.Size([13812, 27])

In [3]:
# save labels

df = pd.read_csv('train.csv')


labels = df['class']

labels



0          10
1        2280
2          50
3        1280
4        2705
         ... 
84911      40
84912    2583
84913    2280
84914    1560
84915    2522
Name: class, Length: 84916, dtype: int64

# Model creation

In [14]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import f1_score
import numpy as np
from sklearn.preprocessing import LabelEncoder

import pandas as pd


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Custom Dataset
class CombinedDataset(Dataset):
    def __init__(self, text_data_path, image_data_path):
        self.text_data = torch.load(text_data_path).to(device)
        self.image_data = torch.load(image_data_path).to(device)
        assert len(self.text_data) == len(self.image_data), "Datasets must have same length"
        df = pd.read_csv("output_X_train_update.csv")
        self.le = LabelEncoder()
        encoded_labels = self.le.fit_transform(df["class"].values)
        self.labels = torch.tensor(encoded_labels, dtype=torch.long).to(device)
        print(len(self.labels), self.labels)
        
    def __len__(self):
        return len(self.text_data)
    
    def __getitem__(self, idx):
        text_tensor = self.text_data[idx]  # Already on device
        image_tensor = self.image_data[idx]  # Already on device
        combined_input = torch.mean(torch.stack([0.7*text_tensor, 0.3*image_tensor]), dim=0)
        label = self.labels[idx]
        return combined_input, label

# MLP Model
class MLP(nn.Module):
    def __init__(self, input_size=54):  # 27 + 27 = 54 input features
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 27)
        )
    
    def forward(self, x):
        return self.layers(x)

# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, device=device):
    model = model.to(device)
    best_val_loss = float('inf')
    patience = 20
    patience_counter = 0
    
    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            
        # Validation
        model.eval()
        val_preds = []
        val_labels = []
        val_loss = 0
        
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                preds = torch.argmax(outputs, dim=1)
                val_preds.extend(preds.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                
        avg_val_loss = val_loss/len(val_loader)
        val_f1 = f1_score(val_labels, val_preds, average='weighted')
        
        # Create a nice formatted output with tqdm
        from tqdm.auto import tqdm
        
        # Update progress bar description with metrics
        desc = (f'Epoch {epoch+1}/{num_epochs} | '
               f'Train Loss: {train_loss/len(train_loader):.4f} | '
               f'Val Loss: {avg_val_loss:.4f} | '
               f'Val F1: {val_f1:.4f}')
        
        # Use tqdm.write to avoid interfering with progress bars
        tqdm.write("\n" + "=" * 80)
        tqdm.write(desc)
        tqdm.write("=" * 80)
        
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            tqdm.write(f'\n🔥 New best validation loss: {avg_val_loss:.4f}! Saving model...\n')
            torch.save(model.state_dict(), 'late_merger.pt')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                tqdm.write(f'\nEarly stopping triggered after {epoch+1} epochs - no improvement in validation loss for {patience} epochs\n')
                break


cuda


In [17]:
from sklearn.model_selection import train_test_split

# Main execution
# Hyperparameters
BATCH_SIZE = 32
LEARNING_RATE = 0.001
NUM_EPOCHS = 1000

# Load dataset
dataset = CombinedDataset('text_softmaxes.pt', 'image_softmaxes.pt')

# Split dataset
train_dataset, val_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
# take only 1% of each 
train_dataset = train_dataset[:int(len(train_dataset)*0.01)]
val_dataset = val_dataset[:int(len(val_dataset)*0.01)]


# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Initialize model, criterion, and optimizer
model = MLP()
criterion = nn.CrossEntropyLoss()
#optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Train model
train_model(model, train_loader, val_loader, criterion, None, NUM_EPOCHS)


  self.text_data = torch.load(text_data_path).to(device)
  self.image_data = torch.load(image_data_path).to(device)


84916 tensor([ 0, 18,  2,  ..., 18, 13, 21], device='cuda:0')


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [19]:
# show 5 vlaues of trainloader
for batch in train_loader:
    print(batch)
    break

[tensor([[2.7354e-04, 7.9283e-08, 3.4605e-08,  ..., 5.2863e-04, 3.8494e-01,
         1.7149e-04],
        [2.6582e-07, 1.7240e-08, 5.0679e-08,  ..., 3.5986e-02, 8.9124e-04,
         2.3434e-02],
        [1.1106e-07, 5.6565e-08, 9.1660e-08,  ..., 2.9585e-04, 5.0665e-02,
         2.9756e-01],
        ...,
        [1.5287e-05, 1.2892e-04, 6.5116e-06,  ..., 8.1616e-03, 1.7940e-02,
         3.4464e-02],
        [7.8493e-11, 2.2006e-09, 7.6715e-10,  ..., 5.7081e-04, 1.2668e-04,
         1.5415e-03],
        [1.4320e-08, 2.9513e-08, 4.9837e-08,  ..., 7.0822e-03, 4.5104e-03,
         1.4598e-03]], device='cuda:0'), tensor([19, 16, 15, 26,  5, 18,  8,  3,  7, 13, 16,  5, 16, 11, 11, 16, 23,  8,
        11, 21,  5,  4,  7,  0, 23,  2, 14,  7,  7,  9, 17, 15],
       device='cuda:0')]


# Model inference

In [9]:
late_merger = MLP()
late_merger.load_state_dict(torch.load('late_merger.pt'))

dataset = CombinedDataset("text_softmaxes_test.pt", "image_softmaxes_test.pt")
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)


  late_merger.load_state_dict(torch.load('late_merger.pt'))
  self.text_data = torch.load(text_data_path).to(device)
  self.image_data = torch.load(image_data_path).to(device)


84916 tensor([ 0, 18,  2,  ..., 18, 13, 21], device='cuda:0')


In [18]:
late_merger.eval()
print(device)
late_merger.to(device)

all_preds = []
with torch.no_grad():
    for inputs, labels  in dataloader:
        inputs, _ = inputs.to(device), labels.to(device)
        best = torch.argmax(inputs, dim=1)
        all_preds.extend(best.cpu().numpy())

len(all_preds)

real_preds = dataset.le.inverse_transform(all_preds)



cuda


In [19]:
df_test = pd.read_csv('X_test.csv')
res_df = pd.DataFrame({"": df_test['Unnamed: 0'], 'prdtypecode': real_preds})
res_df.to_csv('submission.csv', index=False)