In [1]:
import numpy as np
import pandas as pd
from transformers import CamembertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch 
import torch.nn as nn
import lightning.pytorch as pl
from transformers import PreTrainedTokenizer
from typing import List
from sklearn.preprocessing import StandardScaler
from transformers import CamembertModel
from lightning.pytorch.loggers import WandbLogger
from transformers import CamembertTokenizer
from typing import List, Dict
import wandb

In [4]:

# Load the raw training table.
# NOTE(review): this is a machine-specific absolute path; consider routing it
# through a configurable data directory.
df = pd.read_csv('/home/amazon_ml_challenge/dataset/train.csv')
# TITLE is the column that gets tokenized further down, so rows with a missing
# TITLE are dropped too: a NaN (float) entry is not valid tokenizer input.
df.dropna(subset=['TITLE', 'DESCRIPTION', 'BULLET_POINTS'], inplace=True)
# Work on a reproducible 1000-row subsample. `assign` returns a new frame, so the
# integer cast of the target never writes into an object derived from `df`
# (avoids pandas' SettingWithCopyWarning).
data = (
    df.sample(n=1000, random_state=7)
      .assign(PRODUCT_LENGTH=lambda frame: frame['PRODUCT_LENGTH'].astype(int))
)

In [5]:
# Load the pretrained 'camembert-base' tokenizer; it is used below to encode
# the TITLE column.
# NOTE(review): CamemBERT is pretrained on French text -- confirm this
# checkpoint suits the language of the titles.
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

In [6]:
# Encode every sampled title into fixed-width rows: each sequence is padded
# (or truncated) to exactly 300 token ids, with a matching attention mask.
titles = data.TITLE.tolist()
encoding_options = dict(
    add_special_tokens=True,
    padding='max_length',
    truncation='longest_first',
    max_length=300,
    return_attention_mask=True,
)
encoded_corpus = tokenizer(text=titles, **encoding_options)
input_ids, attention_mask = (encoded_corpus['input_ids'],
                             encoded_corpus['attention_mask'])

In [7]:
import numpy as np
def filter_long_descriptions(tokenizer, descriptions, max_len):
    """Return the positions of the descriptions that are short enough to keep.

    The descriptions are tokenized without padding or truncation, and a
    position is kept when its reported token length is at most ``max_len - 2``.

    Args:
        tokenizer: callable accepting the descriptions plus the ``padding``,
            ``truncation`` and ``return_length`` keywords, and returning a
            mapping with a ``'length'`` entry (one length per description).
        descriptions: sequence of texts to measure.
        max_len: length budget; two positions are held back from it.

    Returns:
        List of integer positions into ``descriptions``, in ascending order.
    """
    token_counts = tokenizer(descriptions, padding=False,
                             truncation=False, return_length=True)['length']
    budget = max_len - 2
    return [pos for pos in range(len(descriptions))
            if token_counts[pos] <= budget]

# Positions (within the sampled `data` frame) of the titles that fit in 300 tokens.
short_descriptions = filter_long_descriptions(tokenizer,
                               data.TITLE.tolist(), 300)
input_ids = np.array(input_ids)[short_descriptions]
attention_mask = np.array(attention_mask)[short_descriptions]
# The positions above index `data` (the 1000-row sample), so the labels must be
# taken from `data` as well. Reading them from the full `df` would pair each
# title with the target of an unrelated row.
labels = data.PRODUCT_LENGTH.to_numpy()[short_descriptions]

# Every kept title needs exactly one id row, one mask row and one label.
assert len(input_ids) == len(attention_mask) == len(labels)

In [8]:
from sklearn.model_selection import train_test_split
test_size = 0.1
seed = 42
# Split ids, masks and labels in ONE call so the three arrays share the same
# row partition by construction, instead of relying on two separate calls that
# only stay aligned because they repeat the same seed.
# Outputs come back as (train, test) pairs in the order the arrays are passed.
(train_inputs, test_inputs,
 train_masks, test_masks,
 train_labels, test_labels) = train_test_split(
    input_ids, attention_mask, labels,
    test_size=test_size, random_state=seed)

In [9]:
from sklearn.preprocessing import StandardScaler

# Standardize the regression target. The scaler is fitted on the training
# labels only and then applied to both splits, so no statistics from the
# held-out labels leak into the transform.
train_column = train_labels.reshape(-1, 1)
test_column = test_labels.reshape(-1, 1)

price_scaler = StandardScaler().fit(train_column)

train_labels = price_scaler.transform(train_column)
test_labels = price_scaler.transform(test_column)

In [55]:
import torch

from torch.utils.data import TensorDataset, DataLoader

batch_size = 64

def create_dataloaders(inputs, masks, labels, batch_size, shuffle=True):
    """Wrap token ids, attention masks and labels in a batched DataLoader.

    Args:
        inputs: array-like of token ids, one row per example.
        masks: array-like of attention masks, aligned with ``inputs``.
        labels: array-like of targets, aligned with ``inputs``.
        batch_size: number of examples per batch.
        shuffle: whether to reshuffle the examples every epoch. Defaults to
            True (the previous, hard-coded behaviour); pass False for
            evaluation data so batches come out in a fixed order.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, attention_mask, labels)`` batches.
    """
    dataset = TensorDataset(torch.tensor(inputs),
                            torch.tensor(masks),
                            torch.tensor(labels))
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

train_dataloader = create_dataloaders(train_inputs, train_masks,
                                      train_labels, batch_size)

# Held-out data is only evaluated, never trained on: keep its order fixed.
test_dataloader = create_dataloaders(test_inputs, test_masks,
                                     test_labels, batch_size,
                                     shuffle=False)

In [59]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from transformers import CamembertModel
import wandb


class CamembertRegressor(pl.LightningModule):
    """Frozen 'camembert-base' encoder followed by a dropout + linear regression head.

    Only the head is trained: every encoder parameter has ``requires_grad``
    switched off and the optimizer is given the head's parameters alone.

    Args:
        drop_rate: dropout probability applied in front of the linear head.
        lr: learning rate handed to AdamW (defaults to the previous
            hard-coded value of 2e-5).
    """

    def __init__(self, drop_rate=0.2, lr=2e-5):
        super().__init__()
        self.lr = lr

        self.camembert = CamembertModel.from_pretrained('camembert-base')
        for param in self.camembert.parameters():
            param.requires_grad = False  # Freeze the pre-trained weights
        self.camembert.eval()

        # Read the encoder width from its config instead of hard-coding it.
        D_in, D_out = self.camembert.config.hidden_size, 1

        # Regression head that replaces the language-model head.
        self.head = nn.Sequential(
            nn.Dropout(drop_rate),
            nn.Linear(D_in, D_out))
        self.loss_fn = nn.MSELoss()
        # The model is intentionally NOT cast to float64: the labels are cast
        # to the prediction dtype in `_shared_step` instead, so the large
        # frozen encoder runs in its native precision.

    def train(self, mode=True):
        """Switch train/eval mode while keeping the frozen encoder in eval mode.

        The encoder is never updated, so its internal dropout layers are kept
        off; only the head follows ``mode``.
        """
        super().train(mode)
        self.camembert.eval()
        return self

    def forward(self, input_ids, attention_masks):
        """Return one regression output per sequence, shaped by the linear head."""
        outputs = self.camembert(input_ids, attention_mask=attention_masks)
        # Index 1 of the encoder outputs is the pooled sequence representation.
        pooled_output = outputs[1]
        return self.head(pooled_output)

    def _shared_step(self, batch):
        """Compute the MSE loss for one ``(input_ids, attention_masks, labels)`` batch."""
        input_ids, attention_masks, labels = batch
        predictions = self(input_ids, attention_masks).view(-1)
        # Labels may arrive in another float precision (e.g. float64 from
        # numpy); align them with the predictions so the loss accepts both.
        targets = labels.view(-1).to(predictions.dtype)
        return self.loss_fn(predictions, targets)

    def training_step(self, batch, batch_idx):
        """Log and return the training loss for one batch."""
        loss = self._shared_step(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log and return the validation loss for one batch."""
        loss = self._shared_step(batch)
        self.log('val_loss', loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Optimize the head only; the encoder is frozen."""
        return torch.optim.AdamW(self.head.parameters(), lr=self.lr)






In [60]:
# Build the regressor; drop_rate is the dropout probability of its head.
model = CamembertRegressor(drop_rate=0.2)


Some weights of the model checkpoint at camembert-base were not used when initializing CamembertModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [61]:
# Send the logged metrics to the 'Amazon_ML_Challenge' Weights & Biases project.
wandb_logger = pl.loggers.WandbLogger(project='Amazon_ML_Challenge')

# Train for 20 epochs on whatever accelerator is available. The held-out
# loader is passed as the validation data.
trainer = pl.Trainer(
    max_epochs=20,
    accelerator='auto',
    logger=wandb_logger,
)
trainer.fit(model, train_dataloader, test_dataloader)


  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type           | Params
---------------------------------------------
0 | camembert | CamembertModel | 110 M 
1 | head      | Sequential     | 769   
---------------------------------------------
769       Trainable params
110 M     Non-trainable params
110 M     Total params
442.491   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=20` reached.


In [62]:
# Close the Weights & Biases run now that training is done.
wandb.finish()