In [1]:
import pandas as pd
import torch
from sklearn.metrics import accuracy_score

df_features = pd.read_json('/kaggle/input/indoml-phase2/train.features',lines=True)
df_labels = pd.read_json('/kaggle/input/indoml-phase2/train.labels',lines=True)

In [2]:
df = pd.merge(df_features,df_labels,on="indoml_id")

In [3]:
# df = df[:1000]

In [4]:
from sklearn.preprocessing import LabelEncoder
group_encoder = LabelEncoder()
supergroup_encoder = LabelEncoder()
module_encoder = LabelEncoder()
brand_encoder = LabelEncoder()

# Fit and transform each column
df['group'] = group_encoder.fit_transform(df['group'])
df['supergroup'] = supergroup_encoder.fit_transform(df['supergroup'])
df['module'] = module_encoder.fit_transform(df['module'])
df['brand'] = brand_encoder.fit_transform(df['brand'])
df['description'] = df['description'] + df['retailer']

In [5]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import DebertaModel, DebertaTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
from torch.distributions import Categorical
import numpy as np

In [6]:
# For using the new one
# tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

# For loading the saved one
model_dir = "/kaggle/input/new-transformer-experiment-12-embedding-tmp/New_model"
tokenizer = DebertaTokenizer.from_pretrained(model_dir)

In [7]:
MAX_LENGTH = 12
BATCH_SIZE = 64
LEARNING_RATE = 5e-5
NUM_EPOCHS = 30
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device=DEVICE
PATIENCE = 5  # Early stopping patience
PATIENCE_LR = 3  # Reduce LR on plateau patience

In [8]:
from sklearn.metrics import accuracy_score
import torch

In [9]:
class ProductDataset(Dataset):
    def __init__(self, texts, labels1, labels2, labels3, labels4):
        self.texts = texts
        self.labels1 = labels1
        self.labels2 = labels2
        self.labels3 = labels3
        self.labels4 = labels4
        self.encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=MAX_LENGTH, return_tensors="pt")

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        item = {key: val[idx].to(DEVICE) for key, val in self.encodings.items()}
        item['labels1'] = torch.tensor(self.labels1[idx], device=DEVICE)
        item['labels2'] = torch.tensor(self.labels2[idx], device=DEVICE)
        item['labels3'] = torch.tensor(self.labels3[idx], device=DEVICE)
        item['labels4'] = torch.tensor(self.labels4[idx], device=DEVICE)
        return item
        
def compute_accuracy(preds, labels):
    # Convert each tensor in the list to numpy arrays
    preds_np = [p.cpu().numpy() for p in preds]
    labels_np = [l.cpu().numpy() for l in labels]
    # Individual accuracies for each of the 4 labels
    accuracies = [accuracy_score(labels_np[i], preds_np[i]) for i in range(4)]
    # Overall accuracy where all 4 labels match
    overall_accuracy = accuracy_score(
        np.all([labels_np[i] == preds_np[i] for i in range(4)], axis=0), 
        np.ones(len(labels_np[0]))
    )
    # Return the 5 accuracies (4 individual, 1 overall)
    return accuracies + [overall_accuracy]

In [10]:
# Split data
train_texts, val_texts, train_labels1, val_labels1, train_labels2, val_labels2, train_labels3, val_labels3, train_labels4, val_labels4 = train_test_split(
    df['description'], 
    df['supergroup'], 
    df['group'], 
    df['module'], 
    df['brand'], 
    test_size=0.2, 
    random_state=42
)

train_dataset = ProductDataset(
    train_texts.tolist(), 
    train_labels1.tolist(), 
    train_labels2.tolist(), 
    train_labels3.tolist(), 
    train_labels4.tolist()
)

val_dataset = ProductDataset(
    val_texts.tolist(), 
    val_labels1.tolist(), 
    val_labels2.tolist(), 
    val_labels3.tolist(), 
    val_labels4.tolist()
)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import DebertaModel, DebertaTokenizer
from torch.distributions import Categorical
import os

class AdvancedHierarchicalClassifier(nn.Module):
    def __init__(self, num_supergroups, num_groups, num_modules, num_brands, hidden_size=768, projection_dim=128):
        super().__init__()
        # For loading the new model
        self.deberta = DebertaModel.from_pretrained("microsoft/deberta-base")
        self.hidden_size = hidden_size
        # Classifiers for each hierarchy level
        self.supergroup_classifier = nn.Linear(hidden_size, num_supergroups)
        self.group_classifier = nn.Linear(hidden_size + num_supergroups, num_groups)
        self.module_classifier = nn.Linear(hidden_size + num_supergroups + num_groups, num_modules)
        self.brand_classifier = nn.Linear(hidden_size + num_supergroups + num_groups + num_modules, num_brands)
        # RL Policy networks for each level
        self.supergroup_policy = nn.Linear(hidden_size, num_supergroups)
        self.group_policy = nn.Linear(hidden_size + num_supergroups, num_groups)
        self.module_policy = nn.Linear(hidden_size + num_supergroups + num_groups, num_modules)
        self.brand_policy = nn.Linear(hidden_size + num_supergroups + num_groups + num_modules, num_brands)
        # Contrastive learning projection head
        self.projection = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, projection_dim)
        )
        # Few-shot learning prototypes
        self.prototypes = nn.Parameter(torch.randn(num_supergroups + num_groups + num_modules + num_brands, hidden_size))
    def forward(self, input_ids, attention_mask):
        outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state[:, 0, :]  # Use [CLS] token representation
        # Supervised classification logits
        supergroup_logits = self.supergroup_classifier(hidden_states)
        group_input = torch.cat([hidden_states, torch.softmax(supergroup_logits, dim=1)], dim=1)
        group_logits = self.group_classifier(group_input)
        module_input = torch.cat([group_input, torch.softmax(group_logits, dim=1)], dim=1)
        module_logits = self.module_classifier(module_input)
        brand_input = torch.cat([module_input, torch.softmax(module_logits, dim=1)], dim=1)
        brand_logits = self.brand_classifier(brand_input)
        # RL policy logits
        supergroup_policy = self.supergroup_policy(hidden_states)
        group_policy = self.group_policy(group_input)
        module_policy = self.module_policy(module_input)
        brand_policy = self.brand_policy(brand_input)
        # Contrastive learning projection
        projection = self.projection(hidden_states)
        # Few-shot learning
        prototype_distances = torch.cdist(hidden_states, self.prototypes)
        few_shot_logits = -prototype_distances  # Negative distance as logits
        return (supergroup_logits, group_logits, module_logits, brand_logits), \
               (supergroup_policy, group_policy, module_policy, brand_policy), \
               projection, few_shot_logits
    def sample_actions(self, policies):
        return [Categorical(logits=policy).sample() for policy in policies]
        
class JointAccuracyTrainer:
    def __init__(self, model, supervised_lr=1e-5, rl_lr=1e-4, contrastive_temperature=0.07, loss_weights=None):
        self.model = model
        self.device = next(model.parameters()).device  # Get the device from the model's parameters
        self.supervised_optimizer = torch.optim.AdamW(model.parameters(), lr=supervised_lr)
        self.rl_optimizer = torch.optim.AdamW(model.parameters(), lr=rl_lr)
        self.criterion = nn.CrossEntropyLoss()
        self.contrastive_temperature = contrastive_temperature
        if loss_weights is None:
            self.loss_weights = [1.0, 1.0, 1.0, 1.0]
        else:
            self.loss_weights = loss_weights
    def compute_joint_loss(self, all_outputs, true_labels):
        # Unpack the model outputs
        supervised_logits, policy_logits, projection, few_shot_logits = all_outputs
        logits_supergroup, logits_group1, logits_group2, logits_group3 = supervised_logits
        # Compute classification losses for all levels
        loss_supergroup = self.criterion(logits_supergroup, true_labels['supergroup'])
        loss_group1 = self.criterion(logits_group1, true_labels['group1'])
        loss_group2 = self.criterion(logits_group2, true_labels['group2'])
        loss_group3 = self.criterion(logits_group3, true_labels['group3'])
        # Combine losses using weighted sum
        total_loss = (
            self.loss_weights[0] * loss_supergroup + 
            self.loss_weights[1] * loss_group1 + 
            self.loss_weights[2] * loss_group2 + 
            self.loss_weights[3] * loss_group3
        )
        return total_loss
        
    def supervised_step(self, batch, true_labels):
        self.supervised_optimizer.zero_grad()
        # Forward pass through the model
        all_outputs = self.model(batch['input_ids'], batch['attention_mask'])
        # Compute joint loss for all levels
        total_loss = self.compute_joint_loss(all_outputs, true_labels)
        # Backpropagate and update model weights
        total_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=5.0)
        self.supervised_optimizer.step()
        return total_loss.item()
        
    def validation_step(self, batch, true_labels):
        with torch.no_grad():
            # Forward pass during validation
            all_outputs = self.model(batch['input_ids'], batch['attention_mask'])
            supervised_logits, _, _, _ = all_outputs
            logits_supergroup, logits_group1, logits_group2, logits_group3 = supervised_logits
            # Compute predictions
            preds_supergroup = torch.argmax(logits_supergroup, dim=-1)
            preds_group1 = torch.argmax(logits_group1, dim=-1)
            preds_group2 = torch.argmax(logits_group2, dim=-1)
            preds_group3 = torch.argmax(logits_group3, dim=-1)
            # Compute accuracies
            supergroup_acc = (preds_supergroup == true_labels['supergroup']).float().mean().item()
            group1_acc = (preds_group1 == true_labels['group1']).float().mean().item()
            group2_acc = (preds_group2 == true_labels['group2']).float().mean().item()
            group3_acc = (preds_group3 == true_labels['group3']).float().mean().item()
            # Joint accuracy
            item_acc = ((preds_supergroup == true_labels['supergroup']) &
                       (preds_group1 == true_labels['group1']) &
                       (preds_group2 == true_labels['group2']) &
                       (preds_group3 == true_labels['group3'])).float().mean().item()
        return supergroup_acc, group1_acc, group2_acc, group3_acc, item_acc
        
def train_and_evaluate(model, train_loader, val_loader, num_epochs=10):
    trainer = JointAccuracyTrainer(model)
    for epoch in range(num_epochs):
        model.train()
        total_sup_loss = 0.0
        # Training loop
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            true_labels = {
                'supergroup': batch['labels1'],
                'group1': batch['labels2'],
                'group2': batch['labels3'],
                'group3': batch['labels4']
            }
            sup_loss = trainer.supervised_step(batch, true_labels)
            total_sup_loss += sup_loss
        # Validation loop
        model.eval()
        val_accuracies = [0, 0, 0, 0, 0]
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            true_labels = {
                'supergroup': batch['labels1'],
                'group1': batch['labels2'],
                'group2': batch['labels3'],
                'group3': batch['labels4']
            }
            accs = trainer.validation_step(batch, true_labels)
            val_accuracies = [sum(x) for x in zip(val_accuracies, accs)]
        # Average accuracies
        val_accuracies = [x / len(val_loader) for x in val_accuracies]
        print(f"Epoch {epoch + 1}/{num_epochs} - "
              f"Train Loss: {total_sup_loss / len(train_loader):.4f}, "
              f"Val Accuracies - Supergroup: {val_accuracies[0]:.4f}, Group1: {val_accuracies[1]:.4f}, "
              f"Group2: {val_accuracies[2]:.4f}, Group3: {val_accuracies[3]:.4f}, Item Accuracy: {val_accuracies[4]:.4f}")

# Usage
model = AdvancedHierarchicalClassifier(num_supergroups=32, num_groups=228, num_modules=449, num_brands=5679).to(DEVICE)
# For loading the model saved after pre-training the saved dict
model_path = '/kaggle/input/new-transformer-experiment-12-embedding-tmp/New_model/model.pth'
model.load_state_dict(torch.load(model_path))
# Assuming you have train_loader and val_loader already defined
train_and_evaluate(model, train_loader, val_loader, num_epochs=NUM_EPOCHS)

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

  model.load_state_dict(torch.load(model_path))


Epoch 1/30 - Train Loss: 0.0863, Val Accuracies - Supergroup: 0.8945, Group1: 0.8641, Group2: 0.8532, Group3: 0.9049, Item Accuracy: 0.8039
Epoch 2/30 - Train Loss: 0.0832, Val Accuracies - Supergroup: 0.8929, Group1: 0.8639, Group2: 0.8525, Group3: 0.9060, Item Accuracy: 0.8035
Epoch 3/30 - Train Loss: 0.0795, Val Accuracies - Supergroup: 0.8924, Group1: 0.8637, Group2: 0.8528, Group3: 0.9057, Item Accuracy: 0.8040
Epoch 4/30 - Train Loss: 0.0760, Val Accuracies - Supergroup: 0.8929, Group1: 0.8646, Group2: 0.8532, Group3: 0.9053, Item Accuracy: 0.8038
Epoch 5/30 - Train Loss: 0.0721, Val Accuracies - Supergroup: 0.8943, Group1: 0.8652, Group2: 0.8542, Group3: 0.9056, Item Accuracy: 0.8048
Epoch 6/30 - Train Loss: 0.0697, Val Accuracies - Supergroup: 0.8947, Group1: 0.8652, Group2: 0.8549, Group3: 0.9056, Item Accuracy: 0.8059
Epoch 7/30 - Train Loss: 0.0691, Val Accuracies - Supergroup: 0.8937, Group1: 0.8646, Group2: 0.8539, Group3: 0.9057, Item Accuracy: 0.8040
Epoch 8/30 - Train L

In [12]:
import os
# Define the directory to save the model and tokenizer
save_directory = "New_model"
# Create the directory if it doesn't exist

if not os.path.exists(save_directory):
    os.makedirs(save_directory)
# Save the model's state dictionary

model_save_path = os.path.join(save_directory, "model.pth")
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")
# Save the tokenizer

tokenizer.save_pretrained(save_directory)
print(f"Tokenizer saved to {save_directory}")

Model saved to New_model/model.pth
Tokenizer saved to New_model


In [13]:
df_test_feat = pd.read_json('/kaggle/input/indoml-phase2/final_test_data.features',lines=True)

In [14]:
df_test_feat.head()

Unnamed: 0,indoml_id,description,retailer,price
0,0,14 in hybrid blade,wilko,4.5
1,1,2 pk vent stick a fres,noshify,0.69
2,2,4 tyrefix 450 ml,noshify,2.99
3,3,4 x 4 tyrefix 450 ml,noshify,2.99
4,4,5 l adbluescr diesel vehicles,noshify,4.99


In [15]:
def predict(model, tokenizer, text):
    inputs = tokenizer(
        text, 
        return_tensors="pt", 
        truncation=True, 
        padding='max_length', 
        max_length=MAX_LENGTH
    ).to(DEVICE)
    with torch.no_grad():
        logits, _, _, _ = model(inputs['input_ids'], inputs['attention_mask'])
    predictions = [torch.argmax(logit, dim=1).item() for logit in logits]
    return predictions

# Example prediction
sample_text = "Short product description here"
predictions = predict(model, tokenizer, sample_text)
print(f"Supergroup: {predictions[0]}, Group: {predictions[1]}, Module: {predictions[2]}, Brand: {predictions[3]}")

Supergroup: 29, Group: 202, Module: 377, Brand: 4078


In [16]:
# test_tmp = df_test_feat[:5]

In [17]:
def make_test_pred_and_save(df_test_feat):
    supergroups_list = []
    groups_list = []
    modules_list = []
    brands_list = []
    indoml_id_list = range(0, len(df_test_feat))
    length_df = df_test_feat.shape[0]
    with torch.no_grad():
        for i in range(length_df):
            if i % 1000 == 0:
                print(f"Processing {i} of {length_df - 1}")
            inputs = tokenizer(
                df_test_feat.iloc[i].description, 
                return_tensors="pt", 
                truncation=True, 
                padding='max_length', 
                max_length=MAX_LENGTH
            ).to(DEVICE)
            logits, _, _, _ = model(inputs['input_ids'], inputs['attention_mask'])
            predictions = [torch.argmax(logit, dim=1).item() for logit in logits]
            # Append predictions to respective lists
            supergroups_list.append(predictions[0])
            groups_list.append(predictions[1])
            modules_list.append(predictions[2])
            brands_list.append(predictions[3])
        try:
            supergroups_names = supergroup_encoder.inverse_transform(supergroups_list)
        except ValueError as e:
            print(f"Error in supergroups: {e}")
            supergroups_names = ['Unknown' if x not in supergroup_encoder.classes_ else x for x in supergroups_list]
        try:
            groups_names = group_encoder.inverse_transform(groups_list)
        except ValueError as e:
            print(f"Error in groups: {e}")
            groups_names = ['Unknown' if x not in group_encoder.classes_ else x for x in groups_list]
        try:
            modules_names = module_encoder.inverse_transform(modules_list)
        except ValueError as e:
            print(f"Error in modules: {e}")
            modules_names = ['Unknown' if x not in module_encoder.classes_ else x for x in modules_list]
        try:
            brands_names = brand_encoder.inverse_transform(brands_list)
        except ValueError as e:
            print(f"Error in brands: {e}")
            brands_names = ['Unknown' if x not in brand_encoder.classes_ else x for x in brands_list]
        # Create a DataFrame with predictions
        predictions_df = pd.DataFrame({
            'indoml_id': indoml_id_list,
            'supergroup': supergroups_names,
            'group': groups_names,
            'module': modules_names,
            'brand': brands_names
        })
        predictions_df.to_json('/kaggle/working/predictions.predict', orient='records', lines=True)
        print("predictions.predict saved")
print(make_test_pred_and_save(df_test_feat))

Processing 0 of 184663
Processing 1000 of 184663
Processing 2000 of 184663
Processing 3000 of 184663
Processing 4000 of 184663
Processing 5000 of 184663
Processing 6000 of 184663
Processing 7000 of 184663
Processing 8000 of 184663
Processing 9000 of 184663
Processing 10000 of 184663
Processing 11000 of 184663
Processing 12000 of 184663
Processing 13000 of 184663
Processing 14000 of 184663
Processing 15000 of 184663
Processing 16000 of 184663
Processing 17000 of 184663
Processing 18000 of 184663
Processing 19000 of 184663
Processing 20000 of 184663
Processing 21000 of 184663
Processing 22000 of 184663
Processing 23000 of 184663
Processing 24000 of 184663
Processing 25000 of 184663
Processing 26000 of 184663
Processing 27000 of 184663
Processing 28000 of 184663
Processing 29000 of 184663
Processing 30000 of 184663
Processing 31000 of 184663
Processing 32000 of 184663
Processing 33000 of 184663
Processing 34000 of 184663
Processing 35000 of 184663
Processing 36000 of 184663
Processing 370