In [196]:
#!g1.4
# Install the third-party packages this notebook uses into the kernel's environment.
# NOTE(review): versions are not pinned, so a fresh run may resolve to different releases.
# `optuna` is installed here, but its import further down is commented out.
%pip install transformers
%pip install wandb
%pip install optuna

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m


In [197]:
#!g1.4
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModel
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score, roc_auc_score
from tqdm.notebook import trange, tqdm
from IPython.display import clear_output
import wandb
import os
# import optuna

In [198]:
#!g1.4

class ConfigDict(dict):
    """A ``dict`` whose items can also be read, written and deleted as attributes.

    * ``cfg.key`` returns ``cfg.get('key')`` -- a missing key yields ``None``
      instead of raising, so a misspelt option name fails silently.
    * ``cfg.key = value`` stores ``cfg['key'] = value``.
    * ``del cfg.key`` removes the item and raises ``KeyError`` if it is absent.
    """

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails.
        return dict.get(self, name)

    def __setattr__(self, name, value):
        dict.__setitem__(self, name, value)

    def __delattr__(self, name):
        dict.__delitem__(self, name)

In [199]:
#!g1.4
# Baseline hyper-parameters. This object is also what `Preprocessor` and
# `Trainer` capture as their default `config` argument when those classes are
# defined further down.
config = ConfigDict({
    # Reproducibility / hardware
    'seed': 0,
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    # Optimisation
    'epochs': 10,
    'batch_size': 16,
    'max_length': 256,
    'lr': 3e-4,
    'eps': 1e-8,
    # Model / tokenizer checkpoints
    'pretrained_model': 'cointegrated/rubert-tiny2',
    'tokenizer': 'cointegrated/rubert-tiny2',
    'info': 'no freeeze',
    # Hold-out split ('test_size' used to be listed twice; the duplicate is removed)
    'test_size': 0.15,
    'optimizer': AdamW,
    'scheduler': get_linear_schedule_with_warmup,
    'random_state': 0,
    # Tokenizer call options
    'truncation': True,
    'add_special_tokens': True,
    'return_attention_mask': True,
    'pad_to_max_length': True,
    'do_lower_case': False,
    'return_tensors': 'pt'
})

In [201]:
#!g1.4
# Ask cuDNN for deterministic kernels, then seed the three RNGs used by the
# notebook (PyTorch, NumPy and the stdlib `random` module) from the config.
torch.backends.cudnn.deterministic = True
torch.manual_seed(config.seed)
np.random.seed(config.seed)
random.seed(config.seed)

In [202]:
#!g1.4
def seed_everything(seed):
    """Seed every random number generator the notebook relies on.

    Seeds the stdlib ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Exported for child processes; changing it here does not alter string
    # hashing of the already-running interpreter.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed every other visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels from run to run, which defeats the
    # deterministic flag above.
    torch.backends.cudnn.benchmark = False

# Apply the seed from the active config before the data is split and any model is created.
seed_everything(config.seed)

# Preprocessing

In [203]:
#!g1.4
# Load the labelled training set.
train = pd.read_csv('/home/jupyter/mnt/s3/newbacketttt/train.csv', encoding = 'utf-8')

# One encoder per target column; the fitted encoders are kept so that integer
# predictions can be mapped back to the original label strings later.
ctg_encoder, snt_encoder = LabelEncoder(), LabelEncoder()
train['1category'] = ctg_encoder.fit_transform(train['1category'])
train['sentiment'] = snt_encoder.fit_transform(train['sentiment'])

# Distinct encoded labels per task (their count sets the classifier head size).
ctg_labels = train['1category'].unique()
snt_labels = train['sentiment'].unique()

# Both tasks share the same input sentences and differ only in the target.
x_ctg, y_ctg = train['sentence'].values, train['1category'].values
x_snt, y_snt = train['sentence'].values, train['sentiment'].values

In [204]:
#!g1.4
class Preprocessor():
    """Tokenise raw sentences and wrap them in PyTorch ``DataLoader`` objects.

    All tokenizer options (padding, truncation, ``max_length`` ...), the target
    device, the hold-out split parameters and the batch size are read from
    ``self.config``.
    """

    # NOTE: the default `config=config` is evaluated once, when this class is
    # defined. Rebinding the global `config` afterwards does NOT change the
    # default, so pass `config=` explicitly when a different config is active.
    def __init__(self, tokenizer, config=config):
        """Store the tokenizer and the configuration used for every encode call.

        Args:
            tokenizer: Object exposing a Hugging Face style ``batch_encode_plus``.
            config: ``ConfigDict`` holding tokenizer, split and loader options.
        """
        self.config = config
        self.tokenizer = tokenizer

    def _encode(self, texts):
        """Tokenise ``texts`` and return ``(input_ids, attention_mask)`` on the configured device.

        Args:
            texts: Sequence of strings (NumPy array or plain list).
        """
        text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        encoded = self.tokenizer.batch_encode_plus(
            text_list,
            add_special_tokens=self.config.add_special_tokens,
            return_attention_mask=self.config.return_attention_mask,
            # `pad_to_max_length` is deprecated in transformers; `padding` is
            # its documented replacement.
            padding='max_length' if self.config.pad_to_max_length else False,
            truncation=self.config.truncation,
            max_length=self.config.max_length,
            return_tensors=self.config.return_tensors
        ).to(self.config.device)
        return encoded['input_ids'], encoded['attention_mask']

    def create_test_dataloader(self, x_test_final):
        """Build an ordered, label-free loader for inference.

        Args:
            x_test_final: Sequence of sentences to predict on.

        Returns:
            ``DataLoader`` yielding ``(input_ids, attention_mask)`` batches in
            the same order as ``x_test_final``.
        """
        input_ids_test_final, attention_masks_test_final = self._encode(x_test_final)

        dataset_test_final = TensorDataset(
            input_ids_test_final,
            attention_masks_test_final,
        )

        # No sampler: predictions must line up with the input rows.
        return DataLoader(
            dataset_test_final,
            batch_size=self.config.batch_size
        )

    def create_dataloaders(self, x, y, x_test_final=None):
        """Split ``(x, y)`` into stratified train / validation sets and build loaders.

        Args:
            x: Sequence of sentences.
            y: Integer class labels aligned with ``x``.
            x_test_final: Accepted for backward compatibility and ignored. The
                loader previously built from it was never returned; use
                ``create_test_dataloader`` for unlabeled inference data.

        Returns:
            ``(dataloader_train, dataloader_test)``; each batch is
            ``(input_ids, attention_mask, labels)``.
        """
        x_train, x_test, y_train, y_test = train_test_split(
            x, y,
            test_size=self.config.test_size,
            random_state=self.config.random_state,
            stratify=y
        )

        input_ids_train, attention_masks_train = self._encode(x_train)
        input_ids_test, attention_masks_test = self._encode(x_test)

        labels_train = torch.tensor(y_train).to(self.config.device)
        labels_test = torch.tensor(y_test).to(self.config.device)

        dataset_train = TensorDataset(
            input_ids_train,
            attention_masks_train,
            labels_train
        )

        dataset_test = TensorDataset(
            input_ids_test,
            attention_masks_test,
            labels_test
        )

        dataloader_train = DataLoader(
            dataset_train,
            sampler=RandomSampler(dataset_train),
            batch_size=self.config.batch_size
        )

        dataloader_test = DataLoader(
            dataset_test,
            sampler=RandomSampler(dataset_test),
            batch_size=self.config.batch_size
        )

        return dataloader_train, dataloader_test

# Training

In [205]:
#!g1.4
class Trainer():
    """Runs the fine-tuning loop and per-epoch validation for a classifier.

    The model is called with ``input_ids``, ``attention_mask`` and ``labels``
    and is expected to return the loss at index 0 and the logits at index 1.
    Metrics are reported through ``wandb.log``.
    """

    def __init__(self,  model, optimizer, scheduler, config=config):
        self.config = config
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler

    def _labelled_inputs(self, batch):
        """Move a ``(input_ids, attention_mask, labels)`` batch to the device as model kwargs."""
        on_device = tuple(t.to(self.config.device) for t in batch)
        return {
            'input_ids': on_device[0],
            'attention_mask': on_device[1],
            'labels': on_device[2],
        }

    def evaluate(self, dataloader):
        """Score the model on ``dataloader`` without tracking gradients.

        Returns:
            ``(mean_loss, probabilities, logits, true_labels)`` where the last
            three are NumPy arrays concatenated over all batches.
        """
        self.model.eval()

        loss_sum = 0
        prob_chunks, logit_chunks, label_chunks = [], [], []

        with torch.no_grad():
            for batch in dataloader:
                inputs = self._labelled_inputs(batch)
                outputs = self.model(**inputs)

                loss_sum += outputs[0].item()
                batch_logits = outputs[1]

                prob_chunks.append(torch.softmax(batch_logits, dim=1).cpu().numpy())
                logit_chunks.append(batch_logits.cpu().numpy())
                label_chunks.append(inputs['labels'].cpu().numpy())

        # Mean of the per-batch losses.
        mean_loss = loss_sum / len(dataloader)
        return (
            mean_loss,
            np.concatenate(prob_chunks, axis=0),
            np.concatenate(logit_chunks, axis=0),
            np.concatenate(label_chunks, axis=0),
        )

    def train(self, dataloader_train, dataloader_test, save=False):
        """Train for ``config.epochs`` epochs, validating after each one.

        Args:
            dataloader_train: Loader of labelled training batches.
            dataloader_test: Loader of labelled validation batches.
            save: When true, write the model ``state_dict`` to disk after
                every epoch.
        """
        global_step = 0
        n_epochs = self.config.epochs

        for epoch in trange(n_epochs, desc=f"Traning Model on {n_epochs} Epochs"):
            self.model.train()
            epoch_loss_sum = 0

            for batch in tqdm(dataloader_train, desc=f'Epoch {epoch}'):
                outputs = self.model(**self._labelled_inputs(batch))
                loss = outputs[0]
                epoch_loss_sum += loss.item()

                self.optimizer.zero_grad()
                loss.backward()
                # Clip the global gradient norm to 1 before the update.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
                self.optimizer.step()
                self.scheduler.step()

                # Report the batch loss and learning rate every 100 optimizer steps.
                if global_step % 100 == 0:
                    wandb.log({
                        'batch_loss': loss.item(),
                        'learning_rate': self.scheduler.get_last_lr()[0]
                    })
                global_step += 1

            if save:
                torch.save(self.model.state_dict(), f'_BERT_epoch_{epoch}_{self.config.model_id}.model')

            train_loss = epoch_loss_sum / len(dataloader_train)
            test_loss, y_pred_probs, _, y_true = self.evaluate(dataloader_test)
            test_roc_auc = roc_auc_score(
                y_true,
                y_pred_probs,
                multi_class='ovr',
                labels=range(self.model.num_labels)
            )
            wandb.log({
                    'train_loss': train_loss,
                    'test_loss': test_loss,
                    'test_roc_auc': test_roc_auc
                })

In [206]:
#!g1.4
# Log in to Weights & Biases; the training cells below call wandb.init / wandb.log.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhalaction[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Category models

In [None]:
#!g1.4
# Hyper-parameters for category experiment `ctg_0` (single learning rate).
config_ctg_0 = ConfigDict({
    'model_id': 'ctg_0',
    'seed': 0,
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    'epochs': 100,
    'batch_size': 16,
    'max_length': 256,
    'lr': 3e-4,
    'eps': 1e-8,
    'pretrained_model': 'cointegrated/rubert-tiny2',
    'tokenizer': 'cointegrated/rubert-tiny2',
    'info': 'embedding freeze',
    # 'test_size' used to be listed twice; the duplicate key is removed.
    'test_size': 0.15,
    'optimizer': AdamW,
    'scheduler': get_linear_schedule_with_warmup,
    'random_state': 0,
    'truncation': True,
    'add_special_tokens': True,
    'return_attention_mask': True,
    'pad_to_max_length': True,
    'do_lower_case': False,
    'return_tensors': 'pt'
})

In [None]:
#!g1.4
# Experiment ctg_0: train only the classification head on top of a frozen encoder.
config = config_ctg_0

wandb.init(project='hse-hack-ctg', config=config)

model_ctg = BertForSequenceClassification.from_pretrained(
    config.pretrained_model, 
    num_labels = len(ctg_labels),
    output_attentions = False,
    output_hidden_states = False
).to(config.device)

wandb.watch(model_ctg)

# Freeze every parameter of the BERT encoder.
# NOTE(review): config 'info' says "embedding freeze", but this loop freezes the
# whole encoder, not just the embeddings -- confirm which one is intended.
for param in model_ctg.bert.parameters():
    param.requires_grad = False

tokenizer_ctg = AutoTokenizer.from_pretrained(config.tokenizer)
# Pass the active config explicitly: the default of Preprocessor's `config`
# argument was bound when the class was defined and is not this experiment's.
preprocessor = Preprocessor(tokenizer_ctg, config=config)
dataloader_train, dataloader_test = preprocessor.create_dataloaders(x_ctg, y_ctg)

optimizer_ctg = AdamW(
    model_ctg.parameters(),
    lr=config.lr, 
    eps=config.eps
)

# Linear decay over the total number of optimizer steps, no warm-up.
scheduler_ctg = get_linear_schedule_with_warmup(
    optimizer_ctg, 
    num_warmup_steps=0,
    num_training_steps=len(dataloader_train)*config.epochs 
)

Trainer(model_ctg, optimizer_ctg, scheduler_ctg, config=config).train(dataloader_train, dataloader_test)

# Close the wandb run so the next experiment starts a fresh one (as the ctg_1 cell does).
wandb.finish()

In [None]:
#!g1.4


In [None]:
#!g1.4
# Hyper-parameters for category experiment `ctg_1`: separate learning rates for
# the pretrained encoder and the classifier head, plus scheduler warm-up.
config_ctg_1 = ConfigDict(
    model_id='ctg_1',
    seed=0,
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    epochs=100,
    batch_size=16,
    max_length=256,
    pretrained_lr=5e-6,
    classifier_lr=5e-4,
    eps=1e-8,
    pretrained_model='cointegrated/rubert-tiny2',
    tokenizer='cointegrated/rubert-tiny2',
    info='freeze embedding + diff lr + dropout',
    test_size=0.2,
    # Stored as plain strings here (names only, for logging).
    optimizer='AdamW',
    scheduler='get_linear_schedule_with_warmup',
    num_warmup_steps=100,
    random_state=0,
    truncation=True,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    do_lower_case=False,
    return_tensors='pt',
)

In [None]:
#!g1.4
# Experiment ctg_1: frozen embeddings, extra dropout, and a much smaller
# learning rate for the pretrained encoder than for the classifier head.
config = config_ctg_1

wandb.init(project='hse-hack-ctg', config=config)

model_ctg = BertForSequenceClassification.from_pretrained(
    config.pretrained_model, 
    num_labels = len(ctg_labels),
    output_attentions = False,
    output_hidden_states = False, 
    attention_probs_dropout_prob = 0.2,
    classifier_dropout = 0.2
).to(config.device)
wandb.watch(model_ctg)

# Freeze only the embedding layer; the rest of the encoder stays trainable.
for param in model_ctg.bert.embeddings.parameters():
    param.requires_grad = False

tokenizer_ctg = AutoTokenizer.from_pretrained(config.tokenizer)
# Pass the active config explicitly: the default of Preprocessor's `config`
# argument was bound when the class was defined, so without this the split
# would not use this experiment's `test_size` of 0.2.
preprocessor = Preprocessor(tokenizer_ctg, config=config)
dataloader_train, dataloader_test = preprocessor.create_dataloaders(x_ctg, y_ctg)

# Two parameter groups with different learning rates. The former
# `model_ctg.dropout.parameters()` group is dropped: a dropout layer has no
# learnable parameters, so that group was always empty.
optimizer_ctg = AdamW(
    [
        {"params": model_ctg.bert.parameters(), "lr": config.pretrained_lr},
        {"params": model_ctg.classifier.parameters(), "lr": config.classifier_lr},
    ],
    lr=config.classifier_lr, 
    eps=config.eps
)

scheduler_ctg = get_linear_schedule_with_warmup(
    optimizer_ctg, 
    num_warmup_steps=config.num_warmup_steps,
    num_training_steps=len(dataloader_train)*config.epochs 
)

# save=True writes a checkpoint of the model weights after every epoch.
Trainer(model_ctg, optimizer_ctg, scheduler_ctg, config=config).train(dataloader_train, dataloader_test, save=True)

wandb.finish()

In [178]:
#!g1.4
# Re-invokes training with the model, optimizer and scheduler objects already in
# the kernel, so it continues from their current state rather than starting fresh.
# NOTE(review): this repeats the Trainer call of the ctg_1 cell and looks like a
# leftover manual re-run -- confirm whether it should stay in the notebook.
Trainer(model_ctg, optimizer_ctg, scheduler_ctg, config=config).train(dataloader_train, dataloader_test, save=True)

HBox(children=(HTML(value='Traning Model on 100 Epochs'), FloatProgress(value=0.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 0'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))




HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=1029.0), HTML(value='')))





KeyboardInterrupt: 

In [210]:
#!g1.4

# Inference-time configuration. It repeats the `ctg_1` settings so this cell can
# be run without executing the training cells first.
config = ConfigDict({
    'model_id': 'ctg_1',
    'seed': 0,
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    'epochs': 100,
    'batch_size': 16,
    'max_length': 256,
    'pretrained_lr': 5e-6,
    'classifier_lr': 5e-4,
    'eps': 1e-8,
    'pretrained_model': 'cointegrated/rubert-tiny2',
    'tokenizer': 'cointegrated/rubert-tiny2',
    'info': 'freeze embedding + diff lr + dropout',
    'test_size': 0.2,
    'optimizer': 'AdamW',
    'scheduler': 'get_linear_schedule_with_warmup',
    'num_warmup_steps': 100,
    'random_state': 0,
    'truncation': True,
    'add_special_tokens': True,
    'return_attention_mask': True,
    'pad_to_max_length': True,
    'do_lower_case': False,
    'return_tensors': 'pt'
})

# Rebuild the architecture with the same options used for training so the
# checkpoint's keys match.
final_ctg_model = BertForSequenceClassification.from_pretrained(
    config.pretrained_model, 
    num_labels = len(ctg_labels),
    output_attentions = False,
    output_hidden_states = False, 
    attention_probs_dropout_prob = 0.2,
    classifier_dropout = 0.2
).to(config.device)

# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU also loads on a CPU-only machine.
final_ctg_model.load_state_dict(
    torch.load(
        '/home/jupyter/mnt/s3/newbacketttt/ctg-models/_BERT_epoch_0_ctg_1.model',
        map_location=config.device
    )
)

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

<All keys matched successfully>

In [211]:
#!g1.4

# Load the unlabeled sentences to predict on.
test = pd.read_csv('/home/jupyter/mnt/s3/newbacketttt/1sentencenewtest.csv', encoding = 'utf-8')

# Keep only the column named '0' as an array of sentences.
test = test['0'].values

tokenizer_ctg = AutoTokenizer.from_pretrained(config.tokenizer)
# Pass the active config explicitly: the default of Preprocessor's `config`
# argument was bound when the class was defined, not to the config set above.
preprocessor = Preprocessor(tokenizer_ctg, config=config)

dataloader = preprocessor.create_test_dataloader(test)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=401.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1080667.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1741842.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=112.0), HTML(value='')))






In [212]:
#!g1.4

def evaluate_test(model, dataloader, device=None):
    """Predict class probabilities for unlabeled batches.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes a ``.logits`` tensor.
        dataloader: Iterable of batches whose first two elements are
            ``input_ids`` and ``attention_mask`` tensors.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter; if the model has no parameters the
            tensors are left where they are.

    Returns:
        NumPy array with one row of softmax probabilities per input example,
        in dataloader order.

    Raises:
        ValueError: If ``dataloader`` yields no batches (nothing to concatenate).
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else None

    model.eval()

    y_pred_probs = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask = batch[0], batch[1]
            if device is not None:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)

            # Use the model passed in, not a module-level global.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()
            y_pred_probs.append(probs)

    return np.concatenate(y_pred_probs, axis=0)

In [213]:
#!g1.4
# Class probabilities for every test sentence, one row per sentence.
probs = evaluate_test(final_ctg_model, dataloader)
probs

array([[8.53377283e-01, 1.21374629e-01, 2.22989661e-03, 2.26409994e-02,
        3.77206947e-04],
       [1.18600495e-01, 1.83579773e-01, 7.64918150e-05, 6.97699785e-01,
        4.35307193e-05],
       [7.56129622e-01, 1.79559439e-01, 3.02545290e-04, 6.37237057e-02,
        2.84719979e-04],
       ...,
       [7.72497892e-01, 4.97287847e-02, 1.41088367e-01, 3.63387838e-02,
        3.46147281e-04],
       [5.32602593e-02, 3.94340545e-01, 4.89725571e-05, 5.52084029e-01,
        2.66234943e-04],
       [9.38944072e-02, 5.24938881e-01, 8.84576514e-03, 3.72314364e-01,
        6.56227621e-06]], dtype=float32)

In [215]:
#!g1.4
# Name each probability column after its original category label.
# `classes_` holds the labels in encoded order (index i -> encoded value i), so
# this works for any number of categories instead of a hard-coded 5.
result = pd.DataFrame(probs, columns=ctg_encoder.classes_)

result

Unnamed: 0,?,Communication,Price,Quality,Safety
0,0.853377,0.121375,0.002230,0.022641,0.000377
1,0.118600,0.183580,0.000076,0.697700,0.000044
2,0.756130,0.179559,0.000303,0.063724,0.000285
3,0.127886,0.186262,0.000094,0.552460,0.133298
4,0.423456,0.113762,0.001832,0.456505,0.004445
...,...,...,...,...,...
944,0.184641,0.272150,0.000113,0.543049,0.000047
945,0.846399,0.121995,0.006126,0.025285,0.000195
946,0.772498,0.049729,0.141088,0.036339,0.000346
947,0.053260,0.394341,0.000049,0.552084,0.000266


In [216]:
#!g1.4
# Write the per-category probabilities to the working directory.
# NOTE(review): no `index=` argument is given -- confirm whether the row index
# column is wanted in the submission file.
result.to_csv('submit_ctg.csv')

In [None]:
#!g1.4
