# Install pip

In [2]:
!pip install datasets --upgrade

Collecting datasets
  Downloading datasets-2.14.1-py3-none-any.whl (492 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.4/492.4 kB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, datasets
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.12.0
    Uninstalling huggingface-hub-0.12.0:
      Successfully uninstalled huggingface-hub-0.12.0
  Attempting uninstall: datasets
    Found existing installation: datasets 2.4.0
    Uninstalling datasets-2.4.0:
      Successfully uninstalled datasets-2.4.0
Successfully installed datasets-2.14.1 huggingface-hub-0.16.4
[0m

# download data

In [3]:
import numpy as np 
import pandas as pd 
from datasets import load_dataset

In [4]:

TOKEN = None
from huggingface_hub.hf_api import HfFolder
#HfFolder.save_token(TOKEN)



# Package

In [8]:
import torch
from torch import nn
from torch.nn import functional as F 
from torch.cuda.amp import GradScaler, autocast
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold 
from torch.utils.data import Dataset, DataLoader
import os 
import random
import numpy as np 
from tqdm.notebook import tqdm
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
import os
from transformers import get_linear_schedule_with_warmup

os.environ["TOKENIZERS_PARALLELISM"] = "true"
DEFAULT_RANDOM_SEED = 2021

def set_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
set_seed(DEFAULT_RANDOM_SEED)

# DataLoader

In [9]:
# 2. Create a dataset Class for NER task
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts  
        self.labels = labels  
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]          
        tokens = torch.as_tensor(text).long()
        label = torch.as_tensor(label).long()
        attention_mask = (tokens != self.tokenizer.pad_token_id).long()
         
        return {"input_ids":tokens, "attention_mask":attention_mask, 'label':label}
    
def collater(data, pad_token ):
    input_ids = []
    token_type_ids = []
    attention_masks = []
    labels = []
    #for item in data:
    #    print(item)
    #    break
    max_ = max([len(item["input_ids"]) for item in data])
  
    
    for item in data:
        length = len(item["input_ids"])
        if  length < max_:
            to_add = max_ - length 
            input_id = torch.concat((item["input_ids"], torch.ones((to_add,))*  pad_token))
            label = item["label"]
            attention_mask = torch.concat((item["attention_mask"], torch.zeros((to_add,)) ))
            #token_type_id = torch.concat((item["token_type_ids"], torch.zeros((to_add,)) ))
        else:
            input_id = item["input_ids"] 
            label = item["label"] 
            attention_mask =  item["attention_mask"] 
            #token_type_id =  item["token_type_ids"] 
        input_ids.append(input_id)
        labels.append(label)
        attention_masks.append(attention_mask)
        #token_type_ids.append(token_type_id)
        
    input_ids = torch.stack(input_ids)
    labels = torch.stack(labels)
    attention_masks = torch.stack(attention_masks)
    #token_type_ids = torch.stack(token_type_ids)
    return {"input_ids":input_ids.long(),   "attention_mask":attention_masks.long(), 'label':labels.long()}

In [10]:
class TeacherFreeLoss(torch.nn.Module):
    def __init__(self, loss_fn, virtual_teacher= "fix" , smoothing=0.1, multi_class=False ):
        super(TeacherFreeLoss, self).__init__()
        self.loss_fn = loss_fn 
        self.smoothing = smoothing 
        self.virtual_teacher = "fix"
        assert virtual_teacher in ["fix", "random"]
        self.multi_class = multi_class
        self.uniform = torch.distributions.uniform.Uniform(-smoothing/2, smoothing/2)

    def forward(self, output, target):
        # if multi class return a int only
        if self.multi_class:
            target = torch.nn.functional.one_hot(target, output.shape[1]).to(output.device).float()
            #target = torch.zeros_like(output, device=output.dtype)

            smoothing = self.smoothing/output.shape[1]
            target[target == 1] = 1 - smoothing 
            target[target != 1] = smoothing 
            if self.virtual_teacher == "random":
                target = target + self.uniform.sample(target.shape) 
                target = target/target.sum(-1).unsqueeze(-1)
        else:
            target = torch.abs(target - self.smoothing)
            if self.virtual_teacher == "random":
                target = target + self.uniform.sample(target.shape) 
        if type(self.loss_fn) ==  nn.KLDivLoss:
            output = F.log_softmax(output, dim=-1)
        loss = self.loss_fn(output, target)
        return loss 

# Train & Eval fn

In [11]:
def train_step(model, dataset, dataloader, optimizer, device, loss_fn, num_steps, mixed_precision=False,
               scheduler=None, loss_fn2 = None):
    model.train()
    total_loss = 0
    t = tqdm(total=num_steps)
    scaler = GradScaler()
    c = 0
    while c < num_steps:
        try:
            batch = next(dataloader)
        except:
            dataloader = iter(dataset)
            batch = next(dataloader)
        for k,v in batch.items():
            batch[k] = batch[k].to(device)
        labels = batch["label"]
        del batch["label"]
      

        optimizer.zero_grad()
        with autocast(mixed_precision):
            outputs = model(**batch)
            out = outputs.logits
            
            if loss_fn2 is not None:
                loss = 0.5 * loss_fn(out , labels) + 0.5 * loss_fn2(out , labels)
            else:
                loss =  loss_fn(out , labels)
            #print(loss)
            if torch.isnan(loss).sum()>0:
                raise("NAN")
        if mixed_precision:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()
        if scheduler is not None:
            scheduler.step()
        total_loss += loss.item()
        t.update(1)
        t.set_description(f"cross_entropy_loss_train:{total_loss/(c+1)}")
        t.refresh()
        c += 1
    return total_loss / num_steps

In [13]:
def evaluate(model, dataloader, device, loss_fn ):
    model.eval()
    total_loss = 0
    predictions = []
    y_true = []
    doc_ids = []
    t = tqdm(dataloader)
    ner_bce_loss = nn.CrossEntropyLoss(reduction="none", label_smoothing=0.0)
    with torch.no_grad():
        for c,batch in enumerate(t):
            for k,v in batch.items():
                batch[k] = batch[k].to(device)
            labels = batch["label"] 
            del batch["label"]

            outputs = model(**batch)
            out = outputs.logits
            loss = loss_fn(out, labels)
            total_loss += loss.item()
            
            probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()
            predictions.append(probs )
            
            y_true.append(labels.cpu().numpy() )
            
            t.set_description(f"bce_loss_val:{total_loss/(c+1)}")
            t.refresh()
            
    y_true =  np.concatenate(y_true)
    predictions = np.concatenate(predictions)
    
    return total_loss / len(dataloader), predictions, y_true

# Split training set

In [14]:
df_tr = pd.read_csv('data/movies_train_15fold.csv')
df_test = pd.read_csv('data/movies_test.csv')

# Arch model

In [19]:
import os
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
from transformers import PreTrainedModel
import torch
from torch import Tensor
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers.modeling_outputs import SequenceClassifierOutput

In [20]:
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]



In [21]:
class TransformersForSequenceClassification(nn.Module):
    def __init__(self, model, num_labels, hidden_size=1024):
        super().__init__()
        self.num_labels = num_labels
        self.bert = model
        self.dropout = nn.Dropout(0.3)
        #self.linear = nn.Sequential(nn.Linear(hidden_size, hidden_size), nn.ReLU())
        self.classifier = nn.Linear(hidden_size, num_labels)


    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else True

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        #pooled_output = outputs[1]
        pooled_output = average_pool(outputs[0], attention_mask)
        #pooled_output = self.linear(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

# Train model

In [22]:
df_aug = pd.read_csv("checkpoint/roberta-large-movies-15fold-tf/roberta-large-movies-15fold-tf-movies.csv")

In [23]:
classes_names = list(sorted(list(set(df_tr.genre.values.tolist()))))
class2id = {v:k for k, v in enumerate(classes_names)}
id2class = {k:v for k, v in enumerate(classes_names)}

In [24]:
NUM_FOLD = 15

In [25]:
df_tr["target"] = df_tr.genre.apply(lambda x: class2id[x])
df_aug["target"] = df_aug.genre.apply(lambda x: class2id[x])

In [26]:
mixed_precision = False
model_name = "roberta-large-movies" #/checkpoint-72500"
lr = 5e-6
num_classes= 10
NUM_EPOCH = 4
max_length=100
num_steps = 500
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name_save = 'roberta-large-movies-15fold-tf-PL' #model_name.split("/")[-1]
if not os.path.exists(f'checkpoint/{model_name_save}'):
    os.makedirs(f'checkpoint/{model_name_save}')

In [27]:
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [28]:
loss_fn = nn.CrossEntropyLoss()
loss_fn_2 = TeacherFreeLoss(loss_fn=nn.KLDivLoss(reduction="batchmean", log_target=False), 
                            virtual_teacher= "fix" , smoothing=0.15, multi_class=True )

In [29]:
df_test["concat"] = df_test.apply(lambda x: f"Title:{x.title}\nText:{x.text}", axis=1)
text_encoded_pl = df_test.concat.apply(lambda x: tokenizer.encode(x)).values.tolist()


In [30]:
df_tr["concat"] = df_tr.apply(lambda x: f"Title:{x.title}\nText:{x.text}", axis=1)
text_encoded = df_tr.concat.apply(lambda x: tokenizer.encode(x))
df_tr['length'] = [len(t) for t in text_encoded]

In [31]:
df_test["concat"] = df_test.apply(lambda x: f"Title:{x.title}\nText:{x.text}", axis=1)


In [32]:
df_tr.length.max(), df_tr.length.mean()

(106, 43.73298148148148)

In [33]:
best_scores = []
for n in range(NUM_FOLD):
 
    print("FOLD :", n)
    best_score = 0.
    train_df = df_tr.query(f'fold != {n}') 
    #train_df = pd.concat((df_tr, df_aug), axis=0).reset_index().drop(columns=["index"])
    val_df =  df_tr.query(f'fold == {n}') 
    #print(val_df.index.values)
    dataset_tr = TextDataset(texts=[text_encoded[index] for index in train_df.index.values] + text_encoded_pl, 
                             labels=train_df.target.values.tolist() + df_aug.target.values.tolist(),  
                                tokenizer=tokenizer, max_length=max_length)
    
    dataloader_tr = DataLoader(dataset_tr, batch_size=16, num_workers=4, shuffle=True, 
                               collate_fn=lambda x: collater(x, pad_token=tokenizer.pad_token_id))
                               
    dataset_val = TextDataset(texts=[text_encoded[index] for index in val_df.index.values], 
                              labels=val_df.target.values.tolist(), 
                                tokenizer=tokenizer, max_length=max_length)
    dataloader_val = DataLoader(dataset_val, batch_size=16, num_workers=1, shuffle=False, 
                               collate_fn=lambda x: collater(x, pad_token=tokenizer.pad_token_id))
    
    model = TransformersForSequenceClassification(AutoModel.from_pretrained(model_name),
                                                 num_labels=num_classes, hidden_size=1024)
    #model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
    model=model.to(device)
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    #scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.2, patience=3,
    #                                                   min_lr=0, eps=1e-08, verbose=True)
    scheduler =get_linear_schedule_with_warmup(optimizer=optimizer, 
                                               num_warmup_steps=0,
                                               num_training_steps=len(dataloader_tr)*NUM_EPOCH,
                                               last_epoch = -1 )
    steps = 0
    total_steps = len(dataloader_tr)*NUM_EPOCH
    dataloader_tr_iter = iter(dataloader_tr)
    while steps < total_steps:
        train_step(model, dataloader_tr, dataloader_tr_iter, optimizer, device, loss_fn, num_steps=num_steps,
                    mixed_precision=mixed_precision,
             scheduler=scheduler, loss_fn2= loss_fn_2)
        steps += num_steps
        print("num steps:", steps)
        val_loss, preds, y_val = evaluate(model, dataloader_val, device, loss_fn )
        acc = accuracy_score(y_val, preds.argmax(axis=1))
        
        if best_score < acc:
            best_score = acc
            torch.save(model.state_dict(), f'checkpoint/{model_name_save}/{model_name_save}-fold{n}.pth')
            print("Best Accuracy : ", acc)
        else:
            print("Accuracy : ", acc)
        #scheduler.step(acc)
        if steps >= 6000:
            break
    best_scores.append(best_score)
 

FOLD : 0


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4297222222222222


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4411111111111111


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4436111111111111


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.44083333333333335


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4483333333333333


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4494444444444444


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4494444444444444


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.44583333333333336


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4483333333333333


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.44333333333333336


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.43972222222222224


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 6000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.44555555555555554
FOLD : 1


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4175


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4330555555555556


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.43416666666666665


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.43666666666666665


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.44277777777777777


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.44277777777777777


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4388888888888889


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.44027777777777777


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4436111111111111


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4444444444444444


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4444444444444444


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 6000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.44333333333333336
FOLD : 2


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.43777777777777777


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.44


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.45361111111111113


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.45444444444444443


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.455


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45361111111111113


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.455


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4558333333333333


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4663888888888889


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.46111111111111114


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4597222222222222


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 6000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45611111111111113
FOLD : 3


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.43666666666666665


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4477777777777778


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4488888888888889


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4577777777777778


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4552777777777778


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45361111111111113


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45055555555555554


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.45916666666666667


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45666666666666667


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4558333333333333


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 6000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45444444444444443
FOLD : 4


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.43694444444444447


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.44222222222222224


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4444444444444444


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.44305555555555554


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4436111111111111


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4488888888888889


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4502777777777778


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4575


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4513888888888889


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45111111111111113


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4508333333333333


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 6000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4513888888888889
FOLD : 5


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4280555555555556


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4288888888888889


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4363888888888889


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.43555555555555553


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.44222222222222224


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4388888888888889


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4363888888888889


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4436111111111111


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4411111111111111


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.43972222222222224


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.44277777777777777


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 6000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4386111111111111
FOLD : 6


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.43194444444444446


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.43527777777777776


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4386111111111111


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.44027777777777777


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.45472222222222225


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.45666666666666667


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4602777777777778


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4558333333333333


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4552777777777778


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4513888888888889


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4525


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 6000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45555555555555555
FOLD : 7


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.42527777777777775


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4286111111111111


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4288888888888889


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4338888888888889


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.43777777777777777


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.43722222222222223


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.43916666666666665


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4425


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4425


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4438888888888889


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.43916666666666665


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 6000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.43916666666666665
FOLD : 8


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4361111111111111


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.44333333333333336


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.45055555555555554


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.45444444444444443


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.45555555555555555


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45444444444444443


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.45805555555555555


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4622222222222222


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.455


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4633333333333333


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4597222222222222


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 6000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.46194444444444444
FOLD : 9


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.43222222222222223


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.44083333333333335


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.44583333333333336


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.44055555555555553


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.44472222222222224


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.44055555555555553


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4413888888888889


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.45555555555555555


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4525


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45444444444444443


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 6000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45
FOLD : 10


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.43777777777777777


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4525


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4577777777777778


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45611111111111113


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4602777777777778


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45666666666666667


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4563888888888889


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.46194444444444444


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4633333333333333


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45805555555555555


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.46111111111111114


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 6000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4666666666666667
FOLD : 11


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.43083333333333335


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4413888888888889


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.45222222222222225


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4469444444444444


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.45416666666666666


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4494444444444444


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.44722222222222224


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4502777777777778


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4502777777777778


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4486111111111111


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.45666666666666667


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 6000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4563888888888889
FOLD : 12


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4386111111111111


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.45


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4538888888888889


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4533333333333333


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45111111111111113


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.455


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4477777777777778


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45472222222222225


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.45555555555555555


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4519444444444444


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.44972222222222225


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 6000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45166666666666666
FOLD : 13


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.43


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.44305555555555554


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.44666666666666666


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4527777777777778


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45222222222222225


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.44555555555555554


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.45555555555555555


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4494444444444444


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.45694444444444443


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4519444444444444


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.45944444444444443


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 6000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4527777777777778
FOLD : 14


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4311111111111111


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.43972222222222224


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 1500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.43694444444444447


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.44805555555555554


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 2500


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.45111111111111113


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4558333333333333


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 3500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4533333333333333


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4000


  0%|          | 0/225 [00:00<?, ?it/s]

Best Accuracy :  0.4572222222222222


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 4500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45305555555555554


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.45305555555555554


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 5500


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.4563888888888889


  0%|          | 0/500 [00:00<?, ?it/s]

num steps: 6000


  0%|          | 0/225 [00:00<?, ?it/s]

Accuracy :  0.44333333333333336


In [34]:
'''
best_scores = []
for n in range(NUM_FOLD):
    print("FOLD :", n)
    best_score = 0.
    train_df = df_tr.query(f'fold != {n}') 
    val_df =  df_tr.query(f'fold == {n}') 
    #print(val_df.index.values)
    dataset_tr = TextDataset(texts=[text_encoded[index] for index in train_df.index.values], 
                             labels=train_df.target.values.tolist(), 
                                tokenizer=tokenizer, max_length=max_length)
    
    dataloader_tr = DataLoader(dataset_tr, batch_size=16, num_workers=4, shuffle=True, 
                               collate_fn=lambda x: collater(x, pad_token=tokenizer.pad_token_id))
                               
    dataset_val = TextDataset(texts=[text_encoded[index] for index in val_df.index.values], 
                              labels=val_df.target.values.tolist(), 
                                tokenizer=tokenizer, max_length=max_length)
    dataloader_val = DataLoader(dataset_val, batch_size=16, num_workers=1, shuffle=False, 
                               collate_fn=lambda x: collater(x, pad_token=tokenizer.pad_token_id))
    
    model = TransformersForSequenceClassification(AutoModel.from_pretrained(model_name),
                                                  num_labels=num_classes, hidden_size=1024)#AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
    model=model.to(device)
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    #scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.2, patience=3,
    #                                                   min_lr=0, eps=1e-08, verbose=True)
    scheduler =get_linear_schedule_with_warmup(optimizer=optimizer, 
                                               num_warmup_steps=0,
                                               num_training_steps=len(dataloader_tr)*NUM_EPOCH,
                                               last_epoch = -1 )
    for e in range(NUM_EPOCH):
        train(model, dataloader_tr, optimizer, device, loss_fn, mixed_precision=mixed_precision,
             scheduler=scheduler)
        val_loss, preds, y_val = evaluate(model, dataloader_val, device, loss_fn )
        acc = accuracy_score(y_val, preds.argmax(axis=1))
        
        if best_score < acc:
            best_score = acc
            torch.save(model.state_dict(), f'{model_name_save}-fold{n}.pth')
            print("Best Accuracy : ", acc)
        else:
            print("Accuracy : ", acc)
        #scheduler.step(acc)
    best_scores.append(best_score)
 
Fold [0.425, 0.429, 0.431, 0.429] # epoch + logits
'''

'\nbest_scores = []\nfor n in range(NUM_FOLD):\n    print("FOLD :", n)\n    best_score = 0.\n    train_df = df_tr.query(f\'fold != {n}\') \n    val_df =  df_tr.query(f\'fold == {n}\') \n    #print(val_df.index.values)\n    dataset_tr = TextDataset(texts=[text_encoded[index] for index in train_df.index.values], \n                             labels=train_df.target.values.tolist(), \n                                tokenizer=tokenizer, max_length=max_length)\n    \n    dataloader_tr = DataLoader(dataset_tr, batch_size=16, num_workers=4, shuffle=True, \n                               collate_fn=lambda x: collater(x, pad_token=tokenizer.pad_token_id))\n                               \n    dataset_val = TextDataset(texts=[text_encoded[index] for index in val_df.index.values], \n                              labels=val_df.target.values.tolist(), \n                                tokenizer=tokenizer, max_length=max_length)\n    dataloader_val = DataLoader(dataset_val, batch_size=16, num_worke

In [35]:
best_scores = []
for n in range(NUM_FOLD):
    print("FOLD :", n)
    best_score = 0.
    train_df = df_tr.query(f'fold != {n}') 
    val_df =  df_tr.query(f'fold == {n}') 
    #print(val_df.index.values)
    dataset_tr = TextDataset(texts=[text_encoded[index] for index in train_df.index.values], 
                             labels=train_df.target.values.tolist(), 
                                tokenizer=tokenizer, max_length=max_length)
    
    dataloader_tr = DataLoader(dataset_tr, batch_size=16, num_workers=4, shuffle=True, 
                               collate_fn=lambda x: collater(x, pad_token=tokenizer.pad_token_id))
                               
    dataset_val = TextDataset(texts=[text_encoded[index] for index in val_df.index.values], 
                              labels=val_df.target.values.tolist(), 
                                tokenizer=tokenizer, max_length=max_length)
    dataloader_val = DataLoader(dataset_val, batch_size=16, num_workers=1, shuffle=False, 
                               collate_fn=lambda x: collater(x, pad_token=tokenizer.pad_token_id))
    
    model = TransformersForSequenceClassification(AutoModel.from_pretrained(model_name),
                                                 num_labels=num_classes, hidden_size=1024)
    #model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
    model=model.to(device)
    
    model.load_state_dict(torch.load(f'checkpoint/{model_name_save}/{model_name_save}-fold{n}.pth'))
    val_loss, preds, y_val = evaluate(model, dataloader_val, device, loss_fn )
    acc = accuracy_score(y_val, preds.argmax(axis=1))

    best_scores.append(acc)

FOLD : 0


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/225 [00:00<?, ?it/s]

FOLD : 1


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/225 [00:00<?, ?it/s]

FOLD : 2


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/225 [00:00<?, ?it/s]

FOLD : 3


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/225 [00:00<?, ?it/s]

FOLD : 4


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/225 [00:00<?, ?it/s]

FOLD : 5


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/225 [00:00<?, ?it/s]

FOLD : 6


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/225 [00:00<?, ?it/s]

FOLD : 7


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/225 [00:00<?, ?it/s]

FOLD : 8


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/225 [00:00<?, ?it/s]

FOLD : 9


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/225 [00:00<?, ?it/s]

FOLD : 10


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/225 [00:00<?, ?it/s]

FOLD : 11


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/225 [00:00<?, ?it/s]

FOLD : 12


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/225 [00:00<?, ?it/s]

FOLD : 13


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/225 [00:00<?, ?it/s]

FOLD : 14


Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/225 [00:00<?, ?it/s]

In [36]:
print(best_scores)

[0.4494444444444444, 0.4444444444444444, 0.4663888888888889, 0.45916666666666667, 0.4575, 0.4436111111111111, 0.4602777777777778, 0.4438888888888889, 0.4633333333333333, 0.45555555555555555, 0.4666666666666667, 0.45666666666666667, 0.45555555555555555, 0.45944444444444443, 0.4572222222222222]


In [37]:
with open(f"checkpoint/{model_name_save}/{model_name_save}-{round(np.mean(best_scores),3)}.txt", 'w') as f:
    f.write(f"scores: {best_scores}\n")
    f.write(f"model: {model_name}\n")
    f.write(f'lr: {lr}\n')
    f.write(f'epoch: {NUM_EPOCH}\n')

# submission

In [34]:
model = TransformersForSequenceClassification(AutoModel.from_pretrained(model_name),
                                                  num_labels=num_classes, hidden_size=1024)#AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
model=model.to(device)

Some weights of the model checkpoint at roberta-large-movies were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large-movies and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
#df_test["concat"] = df_test.apply(lambda x: x.text, axis=1)
test_encoded = df_test.concat.apply(lambda x: tokenizer.encode(x))
df_test['length'] = [len(t) for t in test_encoded]

In [36]:
dataset_test = TextDataset(texts= test_encoded , 
                              labels=np.zeros((len(df_test),)), 
                                tokenizer=tokenizer, max_length=max_length)
dataloader_test = DataLoader(dataset_test, batch_size=16, num_workers=1, shuffle=False, 
                           collate_fn=lambda x: collater(x, pad_token=tokenizer.pad_token_id))

In [37]:
alls= []
for n in range(NUM_FOLD):
    model.load_state_dict(torch.load(f'checkpoint/{model_name_save}/{model_name_save}-fold{n}.pth'))
    _, preds_test, _ = evaluate(model, dataloader_test, device, loss_fn )
    alls.append(preds_test)
alls = np.stack(alls).mean(0)


  0%|          | 0/2250 [00:00<?, ?it/s]

  0%|          | 0/2250 [00:00<?, ?it/s]

  0%|          | 0/2250 [00:00<?, ?it/s]

  0%|          | 0/2250 [00:00<?, ?it/s]

  0%|          | 0/2250 [00:00<?, ?it/s]

  0%|          | 0/2250 [00:00<?, ?it/s]

  0%|          | 0/2250 [00:00<?, ?it/s]

  0%|          | 0/2250 [00:00<?, ?it/s]

  0%|          | 0/2250 [00:00<?, ?it/s]

  0%|          | 0/2250 [00:00<?, ?it/s]

  0%|          | 0/2250 [00:00<?, ?it/s]

  0%|          | 0/2250 [00:00<?, ?it/s]

  0%|          | 0/2250 [00:00<?, ?it/s]

  0%|          | 0/2250 [00:00<?, ?it/s]

  0%|          | 0/2250 [00:00<?, ?it/s]

In [38]:
sub = pd.DataFrame()
sub["id"] = df_test.id
sub['genre'] = [id2class[x] for x in alls.argmax(1)]
sub.to_csv(f'checkpoint/{model_name_save}/{model_name_save}-movies.csv', index=False)

In [39]:
sub

Unnamed: 0,id,genre
0,16863,family
1,48456,horror
2,41383,fantasy
3,84007,mystery
4,40269,fantasy
...,...,...
35995,73156,scifi
35996,21242,romance
35997,70135,adventure
35998,85987,thriller
