### Imports

In [None]:
#!g1.1


In [32]:
%pip install wandb

Defaulting to user installation because normal site-packages is not writeable
Collecting wandb
  Downloading wandb-0.14.2-py3-none-any.whl (2.0 MB)
     |████████████████████████████████| 2.0 MB 1.7 MB/s            
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting setproctitle
  Downloading setproctitle-1.3.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.19.1-py2.py3-none-any.whl (199 kB)
     |████████████████████████████████| 199 kB 117.3 MB/s            
Building wheels for collected packages: pathtools
  Building wheel for pathtools (setup.py) ... [?25ldone
[?25h  Created wheel for pathtools: filename=pathtools-0.1.2-py3-none-any.whl size=8785 sha256=8e42634d87465da2981e42e076b9179b47427216d9f6f407

In [481]:
#!g1.1
import re
from warnings import filterwarnings
import numpy as np
import pandas as pd
import os
import random
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm_notebook as tqdm
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
from torch.utils.data import Dataset 
from transformers import AutoModel, AdamW, AutoTokenizer, get_linear_schedule_with_warmup,AutoConfig
import wandb

filterwarnings('ignore')

In [482]:
#!g1.1


In [483]:
#!g1.1
class cfg:
    data_dir = './'
    num_epochs = 5
    learning_rate = 5e-5
    batch_size = 32
    model_name ="cointegrated/LaBSE-en-ru"# "cointegrated/rubert-tiny2"
    dropout_prob = 0.2
    downsample = False
    warmup_steps = 200
    seed=21
    
    
    
def seed_everything(seed: int):
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(cfg.seed)
    


# Read the data

In [484]:
#!g1.1
from sklearn.model_selection import GroupShuffleSplit 

In [485]:
#!g1.1
# train_df = pd.read_csv(cfg.data_dir + 'train_train.csv')
# test_df = pd.read_csv(cfg.data_dir + 'train_test.csv')


train = pd.read_csv(cfg.data_dir + 'train.csv')

train = train.drop_duplicates(['sentence','1category','2category'])

def split_df(df):
    splitter = GroupShuffleSplit(test_size=0.2, n_splits=2, random_state = cfg.seed)
    split = splitter.split(df, groups=df['sentence'])
    train_inds, test_inds = next(split)

    train = df.iloc[train_inds]
    test = df.iloc[test_inds]
    return train,test


train_df, test_df = split_df(train)

### Handle labels

In [486]:
#!g1.1
def handle_labels(data, multilabel=False):
    mapping = {
        'Communication':'0',
        '?':'4',
        ' ':'5',
        'Quality':'1',
        'Price':'2',
        'Safety':'3'
    }
    data = data.copy()
    data['2category'] = data['2category'].fillna(' ')
    data['1category'] = data['1category'].map(mapping).astype(int)
    data['2category'] = data['2category'].map(mapping).astype(int)
    
    data['cat_0'] = ((data['1category'] == 0) |(data['2category'] == 0)).astype(int)
    data['cat_1'] = ((data['1category'] == 1) |(data['2category'] == 1)).astype(int)
    data['cat_2'] = ((data['1category'] == 2) |(data['2category'] == 2)).astype(int)
    data['cat_3'] = ((data['1category'] == 3) |(data['2category'] == 3)).astype(int)

    data['sentiment'] = data['sentiment'].map({'+':1,'−':2,'?':0})

    return data.reset_index()


In [487]:
#!g1.1
train_df = handle_labels(train_df)

test_df = handle_labels(test_df)

In [488]:
#!g1.1
def process_duplicates(group):
    o = group[["cat_0", "cat_1", "cat_2", "cat_3"]].sum(axis=0).clip(0,1)
    return o
gb = train_df.groupby('sentence').apply(process_duplicates)
train_df = train_df[["sentence", "sentiment"]].merge(gb, on='sentence').drop_duplicates(['sentence']).reset_index()

gb = test_df.groupby('sentence').apply(process_duplicates)
test_df = test_df[["sentence", "sentiment"]].merge(gb, on='sentence').drop_duplicates(['sentence']).reset_index()

print(train_df.shape[0],test_df.shape[0])

5732 1434


### Model and dataset

In [489]:
#!g1.1
class SentimentDataset(Dataset):
    def __init__(self, text, sentiment_targets,category_targets, tokenizer, max_len):
        self.text = text
        self.sentiment_targets = sentiment_targets
        self.category_targets = category_targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        text = str(self.text[item])
        sentiment_target = self.sentiment_targets[item]
        category_target = self.category_targets[item]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'sentiment_targets': torch.tensor(sentiment_target, dtype=torch.long),
            'category_targets': torch.tensor(category_target, dtype=torch.long),
        }

In [490]:
#!g1.1
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        config = AutoConfig.from_pretrained(cfg.model_name)
        config.update({'output_hidden_states':True})
        self.bert = AutoModel.from_pretrained(cfg.model_name, config=config)
        self.drop = nn.Dropout(p=cfg.dropout_prob)
        self.fc = nn.Linear(self.bert.config.hidden_size,self.bert.config.hidden_size)
        self.out = nn.Linear(self.bert.config.hidden_size*4, n_classes)
        self.out_sent = nn.Linear(self.bert.config.hidden_size*4, 3)
        self.act = nn.LeakyReLU()


    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        all_hidden_states = torch.stack(outputs[2])

        concatenate_pooling = torch.cat(
            (all_hidden_states[-1], all_hidden_states[-2], all_hidden_states[-3], all_hidden_states[-4]),-1
        )
        last_hidden_state_cls = concatenate_pooling[:, 0]



        #last_hidden_state_cls = outputs[0][:, 0, :]

        x = self.drop(last_hidden_state_cls)
        out_sent = self.out_sent(x)
        out = self.out(x)

        return out, out_sent
    
    
class SentimentClassifier2(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier2, self).__init__()
        self.bert = AutoModel.from_pretrained(cfg.model_name)
        self.drop = nn.Dropout(p=cfg.dropout_prob)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.out_sent = nn.Linear(self.bert.config.hidden_size, 3)
        self.act = nn.LeakyReLU()


    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
#         all_hidden_states = torch.stack(outputs[2])

#         concatenate_pooling = torch.cat(
#             (all_hidden_states[-1], all_hidden_states[-2], all_hidden_states[-3], all_hidden_states[-4]),-1
#         )
#         last_hidden_state_cls = concatenate_pooling[:, 0]

        last_hidden_state_cls = outputs[0][:, 0, :]
        x = self.drop(last_hidden_state_cls)
        out_sent = self.out_sent(x)
        out = self.out(x)

        return out, out_sent

In [491]:
#!g1.1


In [492]:
#!g1.1
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

train_tokenized = [tokenizer.encode(x, add_special_tokens=True) for x in train_df.sentence]
test_tokenized = [tokenizer.encode(x, add_special_tokens=True) for x in test_df.sentence]

train_max_len = max(map(len, train_tokenized))
test_max_len = max(map(len, test_tokenized))

print(train_max_len)
print(test_max_len)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=49.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=806.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=521414.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=112.0), HTML(value='')))


276
270


In [493]:
#!g1.1
def create_data_loader(df, tokenizer, batch_size, max_len):
    if 'label' in df:
        labels = df.label.values
    else:
        labels = [0] * len(df)
    ds = SentimentDataset(
        text= df.sentence,
        sentiment_targets= df.sentiment,
        category_targets= df[['cat_0','cat_1','cat_2','cat_3']].values,
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
    )

BATCH_SIZE = cfg.batch_size

train_data_loader = create_data_loader(train_df, tokenizer, BATCH_SIZE, train_max_len)
test_data_loader = create_data_loader(test_df, tokenizer, BATCH_SIZE, test_max_len)



In [494]:
#!g1.1
# df = SentimentDataset(
#         text= train_df.sentence,
#         sentiment_targets= train_df.sentiment,
#         category_targets= train_df.category_targets,
#         tokenizer=tokenizer,
#         max_len=230
#     )


### Training

In [495]:
#!g1.1
class CustomTrainer: 
    def __init__(self, model, train_data_loader, val_data_loader, loss_fn, optimizer, device, scheduler, n_train_examples, n_val_examples): 
        self.model = model 
        self.train_data_loader = train_data_loader 
        self.val_data_loader = val_data_loader 
        self.loss_fn = loss_fn 
        self.optimizer = optimizer 
        self.device = device 
        self.scheduler = scheduler 
        self.n_train_examples = n_train_examples 
        self.n_val_examples = n_val_examples 

    
    def calc_loss(self, logits, labels):
            loss = self.loss_fn(logits.view(-1, 4),
                            labels.float().view(-1, 4))
            return loss
 
    def train_epoch(self): 
        self.model = self.model.train() 
        losses = [] 
        correct_predictions = 0 
        for step, batch in enumerate(tqdm(self.train_data_loader, desc='TRAIN')): 
            input_ids = batch['input_ids'].to(self.device) 
            attention_mask = batch['attention_mask'].to(self.device) 
            targets = batch['category_targets'].to(self.device) 
            sentiment_targets = batch['sentiment_targets'].to(self.device) 
 
            outputs, outputs_sent = self.model(input_ids=input_ids, attention_mask=attention_mask) 
            #preds = torch.argmax(outputs, dim=1).detach()
            preds = (outputs > 0.5).float().detach()
            loss_cat = self.calc_loss(outputs, targets) 
            sent_loss = F.cross_entropy(outputs_sent,sentiment_targets)

            loss = (2*loss_cat + sent_loss) / 3


            correct_predictions += torch.sum(preds == targets) / 4
            losses.append(loss.item()) 
            loss.backward() 
            nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=100.0) 
            self.optimizer.step() 
            self.scheduler.step() 
            self.optimizer.zero_grad() 

            

            acc = (preds == targets).float().mean()

            wandb.log({
                'train_loss':loss.detach().cpu(),
                'train_loss_cat':loss_cat.detach().cpu(),
                'train_loss_sent':sent_loss.detach().cpu(),
                'train_acc':acc.detach().cpu(),
                'lr':self.scheduler.get_last_lr()

            })



        return correct_predictions.double() / self.n_train_examples, np.mean(losses)
 
    def evaluate(self): 
        self.model = self.model.eval() 
        losses = [] 
        correct_predictions = 0 
        trg, p = [], []
        trg_sent, p_sent = [], []
        for batch in tqdm(self.val_data_loader, desc='EVALUATION'): 
            input_ids = batch['input_ids'].to(self.device) 
            attention_mask = batch['attention_mask'].to(self.device) 
            targets = batch['category_targets'].to(self.device) 
            sentiment_targets = batch['sentiment_targets'].to(self.device) 
            outputs, outputs_sent = self.model(input_ids=input_ids, attention_mask=attention_mask) 
            preds = (outputs > 0.5).float().detach()
            loss = self.calc_loss(outputs, targets) 
            correct_predictions += torch.sum(preds == targets) / 4
            losses.append(loss.item()) 
            trg.append(targets.detach().cpu())
            p.append(outputs.detach().cpu())
            p_sent.append(outputs_sent.detach().cpu())
            trg_sent.append(sentiment_targets.detach().cpu())

        p = torch.cat(p, dim=0)
        trg = torch.cat(trg, dim=0)
        p_sent = torch.cat(p_sent, dim=0)
        trg_sent = torch.cat(trg_sent, dim=0)

        p = F.sigmoid(p)
        roc_auc = roc_auc_score(trg, p,  multi_class = 'ovr')

        p_sent = F.softmax(p_sent, dim=1)
        roc_auc_sent = roc_auc_score(trg_sent, p_sent,  multi_class = 'ovr')

        return correct_predictions.double() / self.n_val_examples, np.mean(losses), roc_auc, roc_auc_sent

 
    def train(self, n_epochs): 
        wandb_conf = dict(vars(cfg))
        del wandb_conf['__dict__']
        del wandb_conf['__weakref__']
        del wandb_conf['__doc__']
        wandb.init(project = 'hack_category', config=wandb_conf)
        for epoch in range(n_epochs): 
            print(f'Epoch {epoch + 1}/{n_epochs}') 
            print('-' * 10) 
            train_acc, train_loss = self.train_epoch() 
            print(f'Train loss {train_loss} accuracy {train_acc}') 
            val_acc, val_loss, roc_auc,roc_auc_sent = self.evaluate() 
            wandb.log({
                'val_acc':val_acc, 'val_loss':val_loss, 'val_roc_auc':roc_auc, 'val_roc_auc_sentiment':roc_auc_sent}
            )
            print(f'Validation loss {val_loss} accuracy {val_acc} roc_auc {roc_auc}')


In [496]:
#!g1.1

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SentimentClassifier(4)
model = model.to(device)


EPOCHS = cfg.num_epochs
optimizer = AdamW(model.parameters(), lr=cfg.learning_rate, correct_bias=True)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=cfg.warmup_steps,
    num_training_steps=total_steps
)
loss_fn = nn.BCEWithLogitsLoss().to(device)


trainer = CustomTrainer(
    model = model,
    train_data_loader=train_data_loader, 
    val_data_loader=test_data_loader, 
    loss_fn=loss_fn,
    optimizer=optimizer, 
    device=device, 
    scheduler=scheduler, 
    n_train_examples=len(train_data_loader) * cfg.batch_size, 
    n_val_examples=len(test_data_loader) * cfg.batch_size,
)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=516063655.0), HTML(value='')))




Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [423]:
#!g1.1
trainer.train(n_epochs=cfg.num_epochs)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmikezz1[0m. Use [1m`wandb login --relogin`[0m to force relogin


HBox(children=(HTML(value='TRAIN'), FloatProgress(value=0.0, max=180.0), HTML(value='')))

HBox(children=(HTML(value='EVALUATION'), FloatProgress(value=0.0, max=45.0), HTML(value='')))

HBox(children=(HTML(value='TRAIN'), FloatProgress(value=0.0, max=180.0), HTML(value='')))

HBox(children=(HTML(value='EVALUATION'), FloatProgress(value=0.0, max=45.0), HTML(value='')))

HBox(children=(HTML(value='TRAIN'), FloatProgress(value=0.0, max=180.0), HTML(value='')))

HBox(children=(HTML(value='EVALUATION'), FloatProgress(value=0.0, max=45.0), HTML(value='')))

HBox(children=(HTML(value='TRAIN'), FloatProgress(value=0.0, max=180.0), HTML(value='')))

HBox(children=(HTML(value='EVALUATION'), FloatProgress(value=0.0, max=45.0), HTML(value='')))

HBox(children=(HTML(value='TRAIN'), FloatProgress(value=0.0, max=180.0), HTML(value='')))

HBox(children=(HTML(value='EVALUATION'), FloatProgress(value=0.0, max=45.0), HTML(value='')))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 1/5
----------

Train loss 0.5056963149044249 accuracy 0.7678385416666667

Validation loss 0.3407483743296729 accuracy 0.8189236111111111 roc_auc 0.7944189527272
Epoch 2/5
----------

Train loss 0.3210673049920135 accuracy 0.8534722222222223

Validation loss 0.3295520838763979 accuracy 0.8241319444444445 roc_auc 0.8523616675259907
Epoch 3/5
----------

Train loss 0.23592150029208925 accuracy 0.8836805555555556

Validation loss 0.3370925254291958 accuracy 0.8336805555555555 roc_auc 0.8634388345765001
Epoch 4/5
----------

Train loss 0.17056634612381458 accuracy 0.9071180555555556

Validation loss 0.32816669874721105 accuracy 0.851388888888889 roc_auc 0.8666692739696636
Epoch 5/5
----------

Train loss 0

In [424]:
#!g1.1
folder = "model_laBSE-final"
part = 1
torch.save(model.state_dict(),f'model_laBSE-final/model_{part}.pth')
#torch.save(optimizer.state_dict(),'opt_laBSE-2loss-cleaned-4lidden-57.pth')


In [453]:
#!g1.1

def predict(loader, model, model_num=0):
    model.load_state_dict(torch.load(f'model_laBSE-final/model_{model_num}.pth'))
    model.to(device)
    p, p_sent = [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc='EVALUATION'): 
            
            input_ids = batch['input_ids'].to(device) 
            attention_mask = batch['attention_mask'].to(device) 
            outputs, outputs_sent = model(input_ids=input_ids, attention_mask=attention_mask) 
            p.append(outputs.detach().cpu())
            p_sent.append(outputs_sent.detach().cpu())

        p = torch.cat(p, dim=0)
        p_sent = torch.cat(p_sent, dim=0)

        p = F.sigmoid(p)
        p_sent = F.softmax(p_sent, dim=1)
        return p, p_sent
    

# for i in range(3):
#     model = model.load_state_dict(torch.load(f'model_laBSE-final/model_{i}.pth'))
#     model.eval()
#     p, p_sent = [], []
#     for batch in tqdm(self.val_data_loader, desc='EVALUATION'): 
#         input_ids = batch['input_ids'].to(self.device) 
#         attention_mask = batch['attention_mask'].to(self.device) 
#         outputs, outputs_sent = self.model(input_ids=input_ids, attention_mask=attention_mask) 
#         p.append(outputs.detach().cpu())
#         p_sent.append(outputs_sent.detach().cpu())

#     # p - (n, 4), p_sent - (n, 4)
#     p = torch.cat(p, dim=0)
#     p_sent = torch.cat(p_sent, dim=0)

#     p = F.sigmoid(p)
#     p_sent = F.softmax(p_sent, dim=1)




        
    

In [497]:
#!g1.1


In [506]:
#!g1.1
new_df = pd.read_csv('data_participants_with_probs.csv')

In [507]:
#!g1.1

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,sentence,prob_?,prob_+,prob_-
0,0,0,0,15.03.2022 обратился на горячую линию для закр...,0.999298,0.000370,0.000332
1,1,1,1,"Уже который год в ТКБ не решается ""глобальная ...",0.000055,0.000026,0.999918
2,2,2,2,Добрый день,0.998809,0.000988,0.000203
3,3,3,3,"Добрый день Сегодня, зайдя в свой личный кабин...",0.000113,0.000032,0.999855
4,4,4,4,"Обслуживаюсь в Тинькофф пару лет, возникла жес...",0.999036,0.000734,0.000230
...,...,...,...,...,...,...,...
944,944,944,944,Отвратительный сервис и отношение к клиентам! ...,0.000048,0.000025,0.999927
945,945,945,945,28.04.2022 обратилась в банк о возможности пер...,0.999307,0.000377,0.000316
946,946,946,946,В начале 2021 года была акция по выплате 8% ке...,0.999454,0.000320,0.000225
947,947,947,947,Бездействие банка и некомпетентность сотрудников,0.000043,0.000027,0.999930


In [508]:
#!g1.1
new_df = pd.read_csv('data_participants_with_probs.csv')
new_df['category_targets'] = 0
new_df['sentiment'] = 0
new_df['cat_0']=0
new_df['cat_1']=0
new_df['cat_2']=0
new_df['cat_3']=0
new_data = create_data_loader(new_df, tokenizer, BATCH_SIZE, train_max_len)

In [509]:
#!g1.1


In [510]:
#!g1.1


In [520]:
#!g1.1
p0,sent_p0 = predict(new_data, model=SentimentClassifier(4), model_num=0)

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(HTML(value='EVALUATION'), FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [518]:
#!g1.1
p1, sent_p1 = predict(new_data, model=SentimentClassifier(4), model_num=1)

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(HTML(value='EVALUATION'), FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [519]:
#!g1.1
p2, sent_p2 = predict(new_data, model=SentimentClassifier(4), model_num=2)

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(HTML(value='EVALUATION'), FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [522]:
#!g1.1
preds_sent

tensor([[9.8796e-01, 1.8422e-03, 1.0195e-02],
        [2.8869e-03, 4.2184e-04, 9.9669e-01],
        [9.3149e-01, 3.5332e-02, 3.3180e-02],
        ...,
        [9.8275e-01, 1.5016e-02, 2.2339e-03],
        [1.1529e-03, 3.9518e-03, 9.9490e-01],
        [3.9336e-03, 1.5961e-03, 9.9447e-01]])

In [521]:
#!g1.1
preds_sent = torch.stack([sent_p0,sent_p1,sent_p2]).mean(dim=0)

In [541]:
#!g1.1
(train_df[['cat_0', 'cat_1', 'cat_2','cat_3']].sum(axis=1) > 1).mean()

0.31524773203070483

In [572]:
#!g1.1
preds_cat = torch.stack([p0,p1,p2]).mean(dim=0)
new_df['communication'] = preds_cat[:, 0]
new_df['quality'] = preds_cat[:, 1]
new_df['price'] = preds_cat[:, 2]
new_df['safety'] = preds_cat[:, 3]

new_df['my_prob_?'] = preds_sent[:, 0]
new_df['my_prob_+'] = preds_sent[:, 1]
new_df['my_prob_-'] = preds_sent[:, 2]

new_df['?'] = (new_df['my_prob_?'] + new_df['prob_?']) / 2
new_df['+'] = (new_df['my_prob_+'] + new_df['prob_+']) / 2
new_df['-'] = (new_df['my_prob_-'] + new_df['prob_-']) / 2

new_df['second_category'] = ((new_df[['cat_Communication_proba', 'cat_Quality_proba', 'cat_Price_proba','cat_Safety_proba']] > 0.5).sum(axis=1) > 1).astype(int)


new_df[['sentence', 'communication',
        'quality','price',
        'safety','?','-','+','second_category']].to_csv('test_scored_cats_mike_vFF.csv')
