Testing the model on a target of two variables. Cross validation on folds by prompts_id. Saving the best models for later averaging.

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoConfig, AutoTokenizer, AutoModel
#from transformers import DebertaTokenizer, DebertaModel
import torch.optim as optim
import matplotlib.pyplot as plt
%matplotlib inline
from argparse import Namespace
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneGroupOut

from tqdm import tqdm as tq
from tqdm import notebook



In [2]:
import random
seed = 123
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)

In [3]:
sum_train=pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv')
sum_train

Unnamed: 0,student_id,prompt_id,text,content,wording
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757
...,...,...,...,...,...
7160,ff7c7e70df07,ebad26,They used all sorts of chemical concoctions to...,0.205683,0.380538
7161,ffc34d056498,3b9047,The lowest classes are slaves and farmers slav...,-0.308448,0.048171
7162,ffd1576d2e1b,3b9047,they sorta made people start workin...,-1.408180,-0.493603
7163,ffe4a98093b2,39c16e,An ideal tragety has three elements that make ...,-0.393310,0.627128


In [4]:
sum_train.prompt_id.nunique()

4

In [5]:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
tokenizer =AutoTokenizer.from_pretrained("microsoft/deberta-base")
sep=tokenizer.sep_token

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [6]:
import pickle
with open("bert_tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [7]:
tkn=tokenizer.tokenize('Hello, student 2023!')
tkn

['Hello', ',', 'Ġstudent', 'Ġ20', '23', '!']

In [8]:
sum_train['len_text']=sum_train.text.apply(lambda x: len(tokenizer.tokenize(x)))
max_len=sum_train.len_text.max()
q_97=sum_train.len_text.quantile(q=0.97)
sum_train.head(3)

Token indices sequence length is longer than the specified maximum sequence length for this model (596 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,student_id,prompt_id,text,content,wording,len_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,69
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,56
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,291


In [9]:
max_len, q_97

(843, 259.0799999999999)

In [10]:
prompts_train=pd.read_csv('/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv')
prompts_train['prompt_text_token']=prompts_train.prompt_text.apply(lambda x: tokenizer.tokenize(x)[:1200])
prompts_train['prompt_text_ids']=prompts_train.prompt_text_token.apply(lambda x: [1]+tokenizer.convert_tokens_to_ids(x)+[2])
#prompts_train['prompt_q_token']=prompts_train.prompt_question.apply(lambda x: tokenizer.tokenize(x)[:1200])
#prompts_train['prompt_q_ids']=prompts_train.prompt_q_token.apply(lambda x: [1]+tokenizer.convert_tokens_to_ids(x)+[2])
prompts_train

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,prompt_text_token,prompt_text_ids
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,"[Chapter, Ġ13, Ġ, č, Ċ, As, Ġthe, Ġsequel, Ġto...","[1, 45642, 508, 1437, 50121, 50118, 1620, 5, 1..."
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,"[Egypt, ian, Ġsociety, Ġwas, Ġstructured, Ġlik...","[1, 37552, 811, 2313, 21, 16697, 101, 10, 3334..."
2,814d6b,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,"[Background, Ġ, č, Ċ, The, ĠThird, ĠWave, Ġexp...","[1, 48277, 1437, 50121, 50118, 133, 7470, 2118..."
3,ebad26,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...","[With, Ġone, Ġmember, Ġtrim, ming, Ġbeef, Ġin,...","[1, 3908, 65, 919, 10723, 7059, 6829, 11, 10, ..."


In [11]:
prompts_train['len_prompt']=prompts_train.prompt_text.apply(lambda x: len(tokenizer.tokenize(x)))
prompts_train.len_prompt.max(), prompts_train.len_prompt.min()

(1199, 700)

Let's create embeddings for each of the topic descriptions. To do this, let's average the penultimate layer of each token. https://russianblogs.com/article/60231465736/

In [12]:
#model=BertModel.from_pretrained('bert-base-uncased')
#model=RobertaModel.from_pretrained("roberta-base")
#model = AutoModel.from_pretrained("microsoft/deberta-v3-base")
model = AutoModel.from_pretrained("microsoft/deberta-base")
#model_seq=DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", \
                                                           #num_labels=2)

Downloading pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

In [13]:
model

DebertaModel(
  (embeddings): DebertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=0)
    (LayerNorm): DebertaLayerNorm()
    (dropout): StableDropout()
  )
  (encoder): DebertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x DebertaLayer(
        (attention): DebertaAttention(
          (self): DisentangledSelfAttention(
            (in_proj): Linear(in_features=768, out_features=2304, bias=False)
            (pos_dropout): StableDropout()
            (pos_proj): Linear(in_features=768, out_features=768, bias=False)
            (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
            (dropout): StableDropout()
          )
          (output): DebertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): DebertaLayerNorm()
            (dropout): StableDropout()
          )
        )
        (intermediate): DebertaIntermediate(
          (dense): Linear(in_features=768, out_features=3

In [14]:
with open("deberta_model.pkl", "wb") as f:
    pickle.dump(model, f)

In [15]:
def make_sentence(x):
    #model.eval()
    model.eval
    text=torch.LongTensor(x).unsqueeze(dim=0)
    with torch.no_grad():
        #enc_layer, _=model(text)
        last_layer=model(text).last_hidden_state
        #poll_out=model(text).pooler_output
    #return torch.mean(enc_layer[11], 1).squeeze(dim=0)
    #return(poll_out.squeeze(dim=0))
    return torch.mean(last_layer, 1).squeeze(dim=0)
        #return model_seq(text).logits.squeeze()

In [16]:
prompts_train['prompt_text_sent']=prompts_train['prompt_text_ids'].apply(make_sentence)
#prompts_train['prompt_q_sent']=prompts_train['prompt_q_ids'].apply(make_sentence)

In [17]:
train_content=pd.merge(sum_train, prompts_train, how='left', left_on='prompt_id', right_on='prompt_id')
train_content.isnull().sum()

student_id           0
prompt_id            0
text                 0
content              0
wording              0
len_text             0
prompt_question      0
prompt_title         0
prompt_text          0
prompt_text_token    0
prompt_text_ids      0
len_prompt           0
prompt_text_sent     0
dtype: int64

In [18]:
train_content['len_ratio']=train_content.len_text/train_content.len_prompt

In [19]:
train_c=train_content.drop(['prompt_text', 'prompt_title','prompt_question', 'prompt_text_ids',\
                           'prompt_text_token', 'len_text', 'len_prompt'], axis=1)
train_c.head(3)

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_text_sent,len_ratio
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,"[tensor(-0.1649), tensor(0.2333), tensor(-0.01...",0.098571
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,"[tensor(-0.6038), tensor(0.0009), tensor(-0.00...",0.046706
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,"[tensor(0.0025), tensor(0.3075), tensor(-0.016...",0.392713


In [20]:
SPLIT_SEED=92
from sklearn.model_selection import train_test_split

#data_train, data_val = train_test_split(train_c, test_size=0.20, random_state=SPLIT_SEED)
prompt_list=list(prompts_train['prompt_id'])
data_train=train_c[(train_c.prompt_id==prompt_list[0]) | (train_c.prompt_id==prompt_list[1]) |\
                   (train_c.prompt_id==prompt_list[3])].copy()

data_val=train_c[(train_c.prompt_id==prompt_list[2])].copy()
                   
data_train['split']='train'
data_val['split']='val'
data_with_split=pd.concat([data_train, data_val], ignore_index=True)
data_with_split

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_text_sent,len_ratio,split
0,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,"[tensor(-0.6038), tensor(0.0009), tensor(-0.00...",0.046706,train
1,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,"[tensor(0.0025), tensor(0.3075), tensor(-0.016...",0.392713,train
2,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,"[tensor(0.0025), tensor(0.3075), tensor(-0.016...",0.053981,train
3,0071d51dab6d,ebad26,They would use chemicals and substances to cha...,0.205683,0.380538,"[tensor(-0.6038), tensor(0.0009), tensor(-0.00...",0.039199,train
4,0072b649a88c,3b9047,The Egyptian society is really different from ...,0.205683,0.380538,"[tensor(0.0025), tensor(0.3075), tensor(-0.016...",0.121457,train
...,...,...,...,...,...,...,...,...
7160,fe1e3c528e24,814d6b,The third wave experiment developed quick...,3.020803,2.421200,"[tensor(-0.1649), tensor(0.2333), tensor(-0.01...",0.252857,val
7161,fe6fac61dc49,814d6b,Mr jones started the third wave as a expereme...,1.221089,2.269070,"[tensor(-0.1649), tensor(0.2333), tensor(-0.01...",0.120000,val
7162,fed33a5f383e,814d6b,The Third Wave gained over 200 members by the ...,2.141224,1.123777,"[tensor(-0.1649), tensor(0.2333), tensor(-0.01...",0.238571,val
7163,fefd4f143fbe,814d6b,The Third Wave developed over such a short tim...,-0.782641,-0.245970,"[tensor(-0.1649), tensor(0.2333), tensor(-0.01...",0.044286,val


In [21]:
class TextDataset(Dataset):
    def __init__(self, text_df, max_seq_length):
        
        self.text_df = text_df 
        
        self._max_seq_length = max_seq_length

        self.train_df = self.text_df[self.text_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.text_df[self.text_df.split=='val']
        self.validation_size = len(self.val_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size), 
                             'val': (self.val_df, self.validation_size)}

        self.set_split('train')
       

    def set_split(self, split="train"):
        self._data_split = split
        self._data_df, self._data_size = self._lookup_dict[split]

    def __len__(self):
        return self._data_size

    def __getitem__(self, index):
        
        row = self._data_df.iloc[index]
        text=row['text']
        tokens=tokenizer.tokenize(text)
        text_index=[1]+tokenizer.convert_tokens_to_ids(tokens)+[2]        
        token_index=text_index
                
        if len(token_index)<self._max_seq_length:
            pad=[0]*(self._max_seq_length-len(token_index))
            token_index=token_index+pad
                      
        else:
            token_index=token_index[:self._max_seq_length]
            
        data_vector = torch.LongTensor(token_index) 
        
        target = row[['content', 'wording']]
        
        return {'x_data': data_vector,
                'attention_mask': (data_vector!=0).long(),
                'content_vector': row['prompt_text_sent'],                
                'y_target': torch.squeeze(torch.FloatTensor([target])),                
                'len_ratio': row['len_ratio']}

    def get_num_batches(self, batch_size):
        
        return len(self) // batch_size


In [22]:
data=TextDataset(data_with_split, 350)

In [23]:
t=data.__getitem__(0)
#t

In [24]:
t['x_data'].shape, t['content_vector'].shape, t['y_target'].shape, t['attention_mask'].shape

(torch.Size([350]), torch.Size([768]), torch.Size([2]), torch.Size([350]))

In [25]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
    """
    A generator function which wraps the PyTorch DataLoader. It will 
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
            
        yield out_data_dict

In [26]:
gen=generate_batches(data, 3)

In [27]:
g=next(gen)
g['x_data'].shape, g['y_target'].shape, g['len_ratio'].shape


(torch.Size([3, 350]), torch.Size([3, 2]), torch.Size([3]))

In [28]:
class BertForSequenceRegression(nn.Module):
  
    def __init__(self, num_marks=2):    
        super(BertForSequenceRegression, self).__init__()
        self.num_marks = num_marks        
        with open("/kaggle/working/deberta_model.pkl", "rb") as f:            
            self.bert = pickle.load(f)
                
        self.hidden_1=nn.Linear(2*config.hidden_size, 2*config.hidden_size)
        self.notline_1=nn.ReLU()
        self.dropout_1 = nn.Dropout(config.hidden_dropout_prob)
        self.hidden_2=nn.Linear(2*config.hidden_size, config.hidden_size)
        self.notline_2=nn.ReLU()
        self.dropout_2 = nn.Dropout(config.hidden_dropout_prob)
        self.hidden = nn.Linear(config.hidden_size, 128)
        self.regres = nn.Linear(128, num_marks)     
    
    def forward(self, input_ids, content_vector,\
                token_type_ids=None, attention_mask=None, labels=None):
        output_bert = self.bert(input_ids, token_type_ids, attention_mask).last_hidden_state.mean(1)
        h_concat=torch.cat((content_vector,output_bert), dim=1)        
        hidden_vec_1=self.hidden_1(h_concat)
        hidden_drop_1=self.dropout_1(hidden_vec_1) 
        hidden_vecn_1=self.notline_1(hidden_drop_1) 
        hidden_vec_2=self.hidden_2(hidden_vecn_1)
        hidden_drop_2=self.dropout_2(hidden_vec_2) 
        hidden_vecn_2=self.notline_2(hidden_drop_2) 
        hidden=self.hidden(hidden_vecn_2)        
        marks = self.regres(hidden)
                
        return marks

In [29]:
config=Namespace(
hidden_dropout_prob=0.05,
hidden_size=768
)

In [30]:
model=BertForSequenceRegression()
model

BertForSequenceRegression(
  (bert): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (intermedia

In [31]:
from sklearn.metrics import mean_squared_error as RMSE

In [32]:
args = Namespace(
    # Training hyper parameter
    num_epochs=50,
    learning_rate=1.2e-5,
    batch_size=4,
    seed=99,    
    cuda=True
    )

In [33]:
def make_train_state(args):
    return {            
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': []
            }

In [34]:
make_train_state(args)

{'learning_rate': 1.2e-05,
 'epoch_index': 0,
 'train_loss': [],
 'train_acc': [],
 'val_loss': [],
 'val_acc': []}

In [35]:
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")
    
print("Using CUDA: {}".format(args.cuda))

Using CUDA: True


In [36]:
train_c.head(1)

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_text_sent,len_ratio
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,"[tensor(-0.1649), tensor(0.2333), tensor(-0.01...",0.098571


In [37]:
from sklearn.model_selection import LeaveOneGroupOut
logo = LeaveOneGroupOut()

In [38]:
groups = train_c['prompt_id']
for i, (train_index, test_index) in enumerate(logo.split(train_c, groups=groups)):    
    X_train = train_c.iloc[train_index].copy()
    X_train['split']='train'
    X_val = train_c.iloc[test_index].copy()
    X_val['split']='val'
    X=pd.concat([X_train, X_val], ignore_index=True)
    print(X[X.split=='train'].prompt_id.value_counts())
    print(X[X.split=='val'].prompt_id.value_counts())

prompt_id
3b9047    2009
ebad26    1996
814d6b    1103
Name: count, dtype: int64
prompt_id
39c16e    2057
Name: count, dtype: int64
prompt_id
39c16e    2057
ebad26    1996
814d6b    1103
Name: count, dtype: int64
prompt_id
3b9047    2009
Name: count, dtype: int64
prompt_id
39c16e    2057
3b9047    2009
ebad26    1996
Name: count, dtype: int64
prompt_id
814d6b    1103
Name: count, dtype: int64
prompt_id
39c16e    2057
3b9047    2009
814d6b    1103
Name: count, dtype: int64
prompt_id
ebad26    1996
Name: count, dtype: int64


In [39]:
loss_func = nn.MSELoss()
groups = train_c['prompt_id']
logo = LeaveOneGroupOut()

for i, (train_index, test_index) in enumerate(logo.split(train_c, groups=groups)):
    min_val_loss = np.inf
    train_state = make_train_state(args)
    X_train = train_c.iloc[train_index].copy()
    X_train['split']='train'
    X_val = train_c.iloc[test_index].copy()
    X_val['split']='val'
    X=pd.concat([X_train, X_val], ignore_index=True)
    
    data=TextDataset(X, 260)      
    
            
    running_loss = 0.0
    running_acc = 0.0
    patience = 0
    
    #regr = BertForSequenceRegression().to(args.device)
    regr = BertForSequenceRegression().to(args.device)
    
    lrlast = .001
    lrmain = args.learning_rate
    optimizer = optim.Adam(
    [
        {"params":regr.bert.parameters(),"lr": lrmain},
        {"params":regr.regres.parameters(), "lr": lrlast},
       
   ])  
    
    
    print(f'-------------------- Fold {i+1} --------------------')
    for epoch in range(args.num_epochs):
        regr.train()
        data.set_split('train')
        batch_generator = generate_batches(data, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
    
        running_loss = 0.0
        running_acc = 0.0
        for batch_index, batch_dict in enumerate(batch_generator):
            n_batch=batch_index
            # the training routine is these 5 steps:

            # --------------------------------------    
            # step 1. zero the gradients
                        
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = regr(batch_dict['x_data'], batch_dict['content_vector'], \
                          attention_mask=batch_dict['attention_mask'])
                        
            # step 3. compute the loss
            
            loss = loss_func(y_pred, batch_dict['y_target'].float())
    
            #running_loss += (loss.item() - running_loss) / (batch_index + 1)
            running_loss += loss.item()

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()
            # -----------------------------------------
            # compute the RMSE
            acc_t = RMSE(batch_dict['y_target'].to('cpu').numpy(), y_pred.detach().to('cpu').numpy(), squared=False)
            #running_acc += (acc_t - running_acc) / (batch_index + 1)
            running_acc += acc_t
            
        avg_loss=running_loss/(n_batch+1)
        
        train_state['train_loss'].append(running_loss/(n_batch+1))
        train_state['train_acc'].append(running_acc/(n_batch+1))
        
        # Validation loop
        data.set_split('val')
        batch_generator = generate_batches(data, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        regr.eval()

        for batch_index, batch_dict in enumerate(batch_generator):
            n_batch=batch_index
            # compute the output
            y_pred = regr(batch_dict['x_data'], batch_dict['content_vector'],\
                          attention_mask=batch_dict['attention_mask'])
                        
            # step 3. compute the loss
            loss = loss_func(y_pred, batch_dict['y_target'].float())            
            #running_loss += (loss.item() - running_loss) / (batch_index + 1)
            running_loss += loss.item()
            # compute the accuracy
            acc_t = RMSE(batch_dict['y_target'].to('cpu').numpy(), y_pred.detach().to('cpu').numpy(), squared=False)
            #running_acc += (acc_t - running_acc) / (batch_index + 1)
            running_acc += acc_t
            
        avg_acc = running_acc/(n_batch+1) 
        avg_val_loss = running_loss/(n_batch+1)
        
        train_state['val_loss'].append(running_loss/(n_batch+1))
        train_state['val_acc'].append(running_acc/(n_batch+1))
        
        print(f'Epoch {epoch+1} Loss: {avg_loss:.3f} SCORE:{avg_acc:.3}')

        #if (avg_val_loss < min_val_loss) and (epoch!=0):
        if (avg_val_loss < min_val_loss):
            patience = 0
            min_val_loss = avg_val_loss
            model_name='deb_model' +'_'+ str(i)+'.pt'
            torch.save(regr.state_dict(), model_name)
            print(f'saving model with score: {avg_acc:.3f}')

        patience += 1    
        if patience >=3 :
            print(f'Early Stopping trigerred on epoch: {epoch+1}')            
            break
    


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


-------------------- Fold 1 --------------------
Epoch 1 Loss: 0.434 SCORE:0.488
saving model with score: 0.488
Epoch 2 Loss: 0.281 SCORE:0.483
saving model with score: 0.483
Epoch 3 Loss: 0.232 SCORE:0.528
Epoch 4 Loss: 0.199 SCORE:0.473
saving model with score: 0.473
Epoch 5 Loss: 0.165 SCORE:0.522
Epoch 6 Loss: 0.139 SCORE:0.582
Early Stopping trigerred on epoch: 6
-------------------- Fold 2 --------------------
Epoch 1 Loss: 0.437 SCORE:0.692
saving model with score: 0.692
Epoch 2 Loss: 0.243 SCORE:0.651
saving model with score: 0.651
Epoch 3 Loss: 0.223 SCORE:0.649
Epoch 4 Loss: 0.172 SCORE:0.645
Early Stopping trigerred on epoch: 4
-------------------- Fold 3 --------------------
Epoch 1 Loss: 0.363 SCORE:0.669
saving model with score: 0.669
Epoch 2 Loss: 0.245 SCORE:0.713
Epoch 3 Loss: 0.202 SCORE:0.645
saving model with score: 0.645
Epoch 4 Loss: 0.174 SCORE:0.582
saving model with score: 0.582
Epoch 5 Loss: 0.148 SCORE:0.633
Epoch 6 Loss: 0.123 SCORE:0.592
Early Stopping trig

In [40]:
y_pred.shape

torch.Size([4, 2])