In [17]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import re
from sentence_transformers import SentenceTransformer



In [18]:
df_train = pd.read_csv('train.csv')

label_encoder_train = LabelEncoder()
label_encoder_train.fit(df_train['class'])

print(label_encoder_train.classes_)

[  10   40   50   60 1140 1160 1180 1280 1281 1300 1301 1302 1320 1560
 1920 1940 2060 2220 2280 2403 2462 2522 2582 2583 2585 2705 2905]


In [25]:


model = SentenceTransformer("intfloat/multilingual-e5-large-instruct", trust_remote_code=True).to('cuda')
model.train()

EMBEDDINGS_DIMENSION = 1024
USE_DESCRIPTION = True

def preprocess(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = text.lower()
    text = text.strip()
    return text

class TextClassificationDataset(Dataset):
    def __init__(self, designation_embeddings, description_embeddings, labels):
        assert designation_embeddings.shape[1] == EMBEDDINGS_DIMENSION, f"Designation embeddings dimension mismatch. Expected {EMBEDDINGS_DIMENSION}, got {designation_embeddings.shape[1]}"
        if USE_DESCRIPTION:
            assert description_embeddings.shape[1] == EMBEDDINGS_DIMENSION, f"Description embeddings dimension mismatch. Expected {EMBEDDINGS_DIMENSION}, got {description_embeddings.shape[1]}"
        self.designation_embeddings = designation_embeddings
        self.description_embeddings = description_embeddings if USE_DESCRIPTION else None
        self.label_encoder = LabelEncoder()
        self.labels = self.label_encoder.fit_transform(labels)

    def __len__(self):
        return len(self.designation_embeddings)
    
    def __getitem__(self, idx):
        if USE_DESCRIPTION:
            return (self.designation_embeddings[idx], 
                    self.description_embeddings[idx], 
                    self.labels[idx])
        return (self.designation_embeddings[idx], self.labels[idx])

def get_embeddings(texts, batch_size=32):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Getting embeddings"):
        batch = texts[i:i + batch_size]
        with torch.no_grad():
            batch_embeddings = model.encode(batch, max_length=EMBEDDINGS_DIMENSION)
            if isinstance(batch_embeddings, np.ndarray):
                batch_embeddings = torch.from_numpy(batch_embeddings)
            assert batch_embeddings.shape[1] == EMBEDDINGS_DIMENSION, f"Model output dimension mismatch. Expected {EMBEDDINGS_DIMENSION}, got {batch_embeddings.shape[1]}"
            embeddings.append(batch_embeddings)
    return torch.cat(embeddings, dim=0)


# Classification head remains the same
class ClassificationHead(nn.Module):
    def __init__(self, input_dim, num_classes, dropout_rate=0.3):
        super().__init__()
        # Input dimension depends on whether we're using description
        combined_dim = input_dim * 2 if USE_DESCRIPTION else input_dim
        self.classifier = nn.Sequential(
            nn.Linear(combined_dim, 512),
            nn.BatchNorm1d(512),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, num_classes)
        )
    
    def forward(self, x1, x2=None):
        if USE_DESCRIPTION:
            combined = torch.cat((x1, x2), dim=1)
        else:
            combined = x1
        return self.classifier(combined)

num_classes = 27

In [29]:
# load classifier
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
classifier = ClassificationHead(input_dim=EMBEDDINGS_DIMENSION, num_classes=num_classes).to('cuda')
classifier.load_state_dict(torch.load('best_model.pt'))

df_test = pd.read_csv('X_test.csv')
designation_texts = [preprocess(text) for text in df_test['designation'].tolist()]
designation_embeddings = get_embeddings(designation_texts)
designation_embeddings = F.normalize(designation_embeddings, p=2, dim=1)

description_texts = [preprocess(text) for text in df_test['description'].tolist()]
description_embeddings = get_embeddings(description_texts)
description_embeddings = F.normalize(description_embeddings, p=2, dim=1)


dummy_labels = [0] * len(designation_embeddings)
for i in range(len(dummy_labels)):
    dummy_labels[i] = i
test_dataset = TextClassificationDataset(designation_embeddings, description_embeddings, dummy_labels)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Evaluate on test set
classifier.eval()
all_preds = []

with torch.no_grad():
    for batch in test_dataloader:
        if USE_DESCRIPTION:
            batch_des, batch_desc, batch_labels = batch
            batch_desc = batch_desc.to(device)
        else:
            batch_des, batch_labels = batch
            batch_desc = None
        
        batch_des = batch_des.to(device)
        outputs = classifier(batch_des, batch_desc)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)

# Convert numeric predictions back to original class labels
print(all_preds)
predicted_classes = label_encoder_train.inverse_transform(all_preds)
print("Predicted classes:", predicted_classes)


  classifier.load_state_dict(torch.load('best_model.pt'))
Getting embeddings: 100%|██████████| 432/432 [00:13<00:00, 32.46it/s]
Getting embeddings: 100%|██████████| 432/432 [02:05<00:00,  3.45it/s]


[np.int64(7), np.int64(5), np.int64(23), np.int64(23), np.int64(21), np.int64(9), np.int64(15), np.int64(18), np.int64(13), np.int64(23), np.int64(18), np.int64(7), np.int64(9), np.int64(1), np.int64(5), np.int64(9), np.int64(4), np.int64(18), np.int64(8), np.int64(20), np.int64(4), np.int64(24), np.int64(21), np.int64(21), np.int64(6), np.int64(12), np.int64(12), np.int64(23), np.int64(8), np.int64(20), np.int64(16), np.int64(25), np.int64(21), np.int64(24), np.int64(14), np.int64(23), np.int64(16), np.int64(2), np.int64(17), np.int64(4), np.int64(26), np.int64(24), np.int64(20), np.int64(7), np.int64(8), np.int64(7), np.int64(25), np.int64(23), np.int64(8), np.int64(21), np.int64(9), np.int64(18), np.int64(14), np.int64(20), np.int64(23), np.int64(23), np.int64(0), np.int64(1), np.int64(18), np.int64(14), np.int64(23), np.int64(23), np.int64(18), np.int64(24), np.int64(18), np.int64(18), np.int64(5), np.int64(18), np.int64(23), np.int64(24), np.int64(18), np.int64(23), np.int64(22), 

In [31]:
res_df = pd.DataFrame({"": df_test['Unnamed: 0'], 'prdtypecode': predicted_classes})
res_df.to_csv('submission.csv', index=False)
