In [1]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import csv
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel

In [5]:
train_all=pd.read_csv('../alldata/si630w22-hw3-train.csv')
data_all=pd.read_csv('../alldata/si630w22-hw3-data.csv')
dev_all=pd.read_csv('../alldata/si630w22-hw3-dev.csv')

In [32]:
# group_ids=['01','02','03','04','05']
# group_ids=['06']
group_ids=['07','08','09','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25']
# '07','08','09','10','11','12','13','14','15',

In [8]:

def process_data(group_id):
    train_df=train_all[train_all['group']!='group_'+group_id]
    train_df_agg=train_df.groupby(['id'])[['id','rating']].mean().reset_index()
    train_df_agg=train_df_agg.rename(columns={'id':'question_id'})
    train_df_agg=pd.merge(train_df_agg,data_all,on='question_id')
    dev_exclude=dev_all[dev_all['group']=='group_'+group_id]
    dev1=dev_all[dev_all['group']!='group_'+group_id]
    dev2=pd.merge(dev_exclude,dev_all,on='id').drop(columns=['annotator_id_x','rating_x','group_x'])
    dev2=dev2.rename(columns={'annotator_id_y':'annotator_id','rating_y':'rating','group_y':'group','id':'question_id'})

    dev3=dev2[dev2['group']!='group_'+group_id]
    dev2=dev2[dev2['group']=='group_'+group_id]
    annotation_list=list(set(dev3['question_id'].tolist()))
    # print(annotation_list)
    dev2=dev2[dev2['question_id'].isin(annotation_list)]
    dev1=dev1.rename(columns={'id':'question_id'})


    dev1=dev1.groupby(['question_id'])[['question_id','rating']].mean().reset_index()
    dev2=dev2.groupby(['question_id'])[['question_id','rating']].mean().reset_index()
    dev3=dev3.groupby(['question_id'])[['question_id','rating']].mean().reset_index()
    
    dev1=pd.merge(dev1,data_all,on='question_id')
    dev2=pd.merge(dev2,data_all,on='question_id')
    dev3=pd.merge(dev3,data_all,on='question_id')
    
    return train_df_agg,dev1,dev2,dev3

In [26]:
train_df_agg,dev1,dev2,dev3=process_data('06')
dev1

Unnamed: 0,question_id,rating,question_text,reply_id,reply_text,rlen
0,t3_n26t3x,5.000000,"Gamer girls, what are your experiences when ga...",gwhl8wk,Most males have found it refreshing that I’m t...,603
1,t3_n27j2t,4.400000,How can I deal with loneliness in a healthier ...,gwhrkod,Journal!! That’s the best thing ever. Not ever...,440
2,t3_n28ch5,3.166667,Offset panda: is it a scam?,gwila0j,"I went to the only website I could find, and i...",350
3,t3_n29tbt,3.250000,How would most people actually react to an apo...,gwi4duw,most would probably all hurry up and buy all t...,108
4,t3_n2awyq,4.200000,How has photography changed the world?,gwibgl5,It's added perspective by allowing people to s...,455
...,...,...,...,...,...,...
806,t3_np84aj,4.750000,What is the best way to hide a body?,h03oc2y,"Warm humid climate, warm water - wrap the body...",437
807,t3_npe9n2,5.000000,ELI5 : Why does the area around the wound feel...,h04l2ng,"Damaged / inflamed tissue releases histamines,...",342
808,t3_npex8i,4.600000,Is it wrong or/and bad to write about the same...,h04o1o3,Authors have their niche and I think readers f...,897
809,t3_npf5lq,2.833333,"Retail worker, what are your horror stories?",h04r6dx,Someone pooped in a shopping cart.,34


In [10]:
class Example:
    def __init__(self,question_id,question,answer,label=None):
        self.question_id=question_id
        self.question=question
        self.answer=answer
        self.label=label

In [11]:
def to_input(df):
    question_list=[]
    input_list=[]
    for idx,row in df.iterrows():
#         user_id=row['annotator_id']
        question_id=row['question_id']
        if 'rating' in df.columns:
#             print(row['rating'])
            if row['rating']==1 or row['rating']==2 or row['rating']==3 or row['rating']==4 or row['rating']==5:
                rating=int(row['rating'])-1
            else:
                continue
        else:
            rating=None
        question=row['question_text']
        answer=row['reply_text']
#         print(question_id,rating,question,answer)
        
        example=Example(question_id,question,answer,rating)
        input_list.append(example)
    return input_list

In [12]:
def to_sequence(input_list,tokenizer,max_length=128):
    X_list=[]
    rating_list=[]
    for example in input_list:
        x=example.question+"['[SEP]']"+example.answer
        label=example.label
        rating_list.append(label)
        X_list.append(x)
#     print("=======================")
#     print(X_list)
#     print("=======================")
    X_train=tokenizer(X_list,padding=True,truncation=True,max_length=max_length)
    X_train=dict(X_train)
    return X_train,rating_list

In [13]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [27]:
def to_dataset(train_df,dev_df1,dev_df2,dev_df3):
    
    tokenizer = AutoTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased")

    input_list=to_input(train_df)
    X_train,Y_train=to_sequence(input_list,tokenizer)
    dev1_list=to_input(dev_df1)
    X_dev1,Y_dev1=to_sequence(dev1_list,tokenizer)
    
    dev2_list=to_input(dev_df2)
    if len(dev2_list)!=0:
        X_dev2,Y_dev2=to_sequence(dev2_list,tokenizer)
    else:
        X_dev2=None
        Y_dev2=[]
        
    dev3_list=to_input(dev_df3)
    if len(dev3_list)!=0:
        X_dev3,Y_dev3=to_sequence(dev3_list,tokenizer)
    else:
        X_dev3=None
        Y_dev3=[]
#     print(X_dev3)
    
    max_length=128
    train_dataset = Dataset(X_train,Y_train)
    val_dataset1 = Dataset(X_dev1,Y_dev1)
    val_dataset2 = Dataset(X_dev2,Y_dev2)
    val_dataset3 = Dataset(X_dev3,Y_dev3)
    
    return train_dataset,[val_dataset1,val_dataset2,val_dataset3],[[X_dev1,Y_dev1],[X_dev2,Y_dev2],[X_dev3,Y_dev3]]

In [18]:
from transformers import TrainingArguments, Trainer

device=torch.device("cuda" if torch.cuda.is_available else "cpu")

class Regression(nn.Module):
    def __init__(self):
        super(Regression, self).__init__()
        self.miniLM=AutoModel.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
        self.regressor=nn.Linear(384,1)
    def forward(self,input_ids,attention_mask,token_type_ids,labels=None):
        input_ids=input_ids.to(device)
        attention_mask=attention_mask.to(device)
        token_type_ids=token_type_ids.to(device)
        output=self.miniLM(input_ids,attention_mask,token_type_ids)
#         print("output",output)
#         print("output type:",type(output))
#         print("output[1]:",output[1])
#         print("output[1].shape:",output[1].shape)
        pred=self.regressor(output[1])
#         print("pred:",pred)
        return pred

class RevisedTrainer(Trainer):
    def compute_loss(self,model,inputs,return_outputs=False):
        label=torch.FloatTensor(list(inputs["labels"])).to(device)
#         print(ratings)
        outputs=model(**inputs)
        loss_func=nn.MSELoss()
        loss=loss_func(outputs.squeeze(1).float(),label)
        return (loss,outputs) if return_outputs else loss


def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy}


In [19]:


def calculate_pearson(input_list,label_list,max_length,trainer):
#     print(input_list['input_ids'])
    data_unlabeled = Dataset(input_list,label_list)
    pred_list=[]
    dataloader=torch.utils.data.DataLoader(data_unlabeled)
    for step,data in enumerate(dataloader):
        output=model1(data['input_ids'],data['token_type_ids'],data['attention_mask'])
        pred_list.append(output.tolist()[0][0])
        
#     pred=trainer.predict(data_unlabeled)

#     print(pred_list)
    print(len(pred_list))
    print(len(label_list))
    
    pr,_=pearsonr(pred_list,label_list)
#     print(pr)
    return pr

In [33]:
all_pr=[]
max_length=128

for group in group_ids:
    train_df,dev_df1,dev_df2,dev_df3=process_data(group)
#     print(train_df.head())
#     break
    train_dataset,eval_dataset_list,all_eval_list=to_dataset(train_df,dev_df1,dev_df2,dev_df3)
    
#     print(all_eval_list[0]==all_eval_list[1])
    
    print("=================group:",group,"===================")
    pr_list=[]
    for i in range(3):
        print("============",i,"=============")
        eval_dataset=eval_dataset_list[i]
        [eval_list,label_list]=all_eval_list[i]

        
        args = TrainingArguments(output_dir="ouput_part3",
        evaluation_strategy="epoch",
                            per_device_train_batch_size=8,
                            per_device_eval_batch_size=8,
                            num_train_epochs=5)

        model1=Regression()
        trainer1 = RevisedTrainer(
            model=model1.to(device),
            args=args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset

        )
        
        # Train pre-trained model
        trainer1.train()
        pr=calculate_pearson(eval_list,label_list,max_length,trainer1)
        print("pearson score:",pr)
        pr_list.append(pr)
    all_pr.append(pr_list)
    
    if group=='05' or group=='10' or group=='15' or group=='25':
        df=pd.DataFrame(all_pr,columns=['A','B','C'])
        filename='group'+group+"checkpoint.csv"
        df.to_csv(filename)
    

loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1152
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 720


Epoch,Training Loss,Validation Loss
1,No log,0.701066
2,No log,0.573868
3,No log,0.572334
4,0.758000,0.635622
5,0.758000,0.573225


***** Running Evaluation *****
  Num examples = 241
  Batch size = 8
***** Running Evaluation *****
  Num examples = 241
  Batch size = 8
***** Running Evaluation *****
  Num examples = 241
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 241
  Batch size = 8
***** Running Evaluation *****
  Num examples = 241
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/hugg

241
241
pearson score: 0.6060139300016087


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1152
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 720


Epoch,Training Loss,Validation Loss
1,No log,0.253884
2,No log,0.693548
3,No log,0.823859
4,0.758000,0.36969
5,0.758000,0.538218


***** Running Evaluation *****
  Num examples = 37
  Batch size = 8
***** Running Evaluation *****
  Num examples = 37
  Batch size = 8
***** Running Evaluation *****
  Num examples = 37
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 37
  Batch size = 8
***** Running Evaluation *****
  Num examples = 37
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingfa

37
37
pearson score: 0.3908110704828902


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1152
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 720


Epoch,Training Loss,Validation Loss
1,No log,0.843224
2,No log,0.971082
3,No log,1.006264
4,0.758000,1.109877
5,0.758000,1.09207


***** Running Evaluation *****
  Num examples = 36
  Batch size = 8
***** Running Evaluation *****
  Num examples = 36
  Batch size = 8
***** Running Evaluation *****
  Num examples = 36
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 36
  Batch size = 8
***** Running Evaluation *****
  Num examples = 36
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




36
36
pearson score: 0.25405900381314084


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1162
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 730


Epoch,Training Loss,Validation Loss
1,No log,0.668844
2,No log,0.585587
3,No log,0.503045
4,0.760100,0.563116
5,0.760100,0.513468


***** Running Evaluation *****
  Num examples = 231
  Batch size = 8
***** Running Evaluation *****
  Num examples = 231
  Batch size = 8
***** Running Evaluation *****
  Num examples = 231
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 231
  Batch size = 8
***** Running Evaluation *****
  Num examples = 231
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/hugg

231
231
pearson score: 0.2943743156710711


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1162
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 730


Epoch,Training Loss,Validation Loss
1,No log,1.267164
2,No log,2.619842
3,No log,2.070941
4,0.760100,2.508174
5,0.760100,2.156802


***** Running Evaluation *****
  Num examples = 33
  Batch size = 8
***** Running Evaluation *****
  Num examples = 33
  Batch size = 8
***** Running Evaluation *****
  Num examples = 33
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 33
  Batch size = 8
***** Running Evaluation *****
  Num examples = 33
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingfa

33
33
pearson score: 0.7097668850126448


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1162
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 730


Epoch,Training Loss,Validation Loss
1,No log,0.570529
2,No log,0.468283
3,No log,0.421962
4,0.760100,0.46983
5,0.760100,0.501786


***** Running Evaluation *****
  Num examples = 30
  Batch size = 8
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




30
30
pearson score: 0.3036221605817847


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1172
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 735


Epoch,Training Loss,Validation Loss
1,No log,0.740828
2,No log,0.541098
3,No log,0.537163
4,0.721300,0.569726
5,0.721300,0.498643


***** Running Evaluation *****
  Num examples = 229
  Batch size = 8
***** Running Evaluation *****
  Num examples = 229
  Batch size = 8
***** Running Evaluation *****
  Num examples = 229
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 229
  Batch size = 8
***** Running Evaluation *****
  Num examples = 229
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/hugg

229
229
pearson score: 0.6147277948826547


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1172
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 735


Epoch,Training Loss,Validation Loss
1,No log,0.694243
2,No log,0.663264
3,No log,0.581472
4,0.721300,0.547431
5,0.721300,0.560322


***** Running Evaluation *****
  Num examples = 35
  Batch size = 8
***** Running Evaluation *****
  Num examples = 35
  Batch size = 8
***** Running Evaluation *****
  Num examples = 35
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 35
  Batch size = 8
***** Running Evaluation *****
  Num examples = 35
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingfa

35
35
pearson score: 0.7278685509301861


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1172
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 735


Epoch,Training Loss,Validation Loss
1,No log,0.905099
2,No log,0.666303
3,No log,0.629472
4,0.721300,0.670586
5,0.721300,0.601007


***** Running Evaluation *****
  Num examples = 34
  Batch size = 8
***** Running Evaluation *****
  Num examples = 34
  Batch size = 8
***** Running Evaluation *****
  Num examples = 34
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 34
  Batch size = 8
***** Running Evaluation *****
  Num examples = 34
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




34
34
pearson score: 0.6427505986770784


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1179
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 740


Epoch,Training Loss,Validation Loss
1,No log,0.588465
2,No log,0.647894
3,No log,0.582393
4,0.780500,0.535058
5,0.780500,0.518812


***** Running Evaluation *****
  Num examples = 230
  Batch size = 8
***** Running Evaluation *****
  Num examples = 230
  Batch size = 8
***** Running Evaluation *****
  Num examples = 230
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 230
  Batch size = 8
***** Running Evaluation *****
  Num examples = 230
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/hugg

230
230
pearson score: 0.5958619607047777


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1179
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 740


Epoch,Training Loss,Validation Loss
1,No log,0.886187
2,No log,1.200458
3,No log,1.164356
4,0.780500,1.151283
5,0.780500,1.342747


***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 20
  Batch size = 8
***** Running Evaluation *****
  Num examples = 20
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingfa

20
20
pearson score: -0.24955702085094802


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1179
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 740


Epoch,Training Loss,Validation Loss
1,No log,0.609421
2,No log,0.64174
3,No log,0.581495
4,0.780500,0.586666
5,0.780500,0.662653


***** Running Evaluation *****
  Num examples = 26
  Batch size = 8
***** Running Evaluation *****
  Num examples = 26
  Batch size = 8
***** Running Evaluation *****
  Num examples = 26
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 26
  Batch size = 8
***** Running Evaluation *****
  Num examples = 26
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




26
26
pearson score: 0.51121831285433


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1185
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 745


Epoch,Training Loss,Validation Loss
1,No log,0.567312
2,No log,0.58275
3,No log,0.519961
4,0.803300,0.609602
5,0.803300,0.561799


***** Running Evaluation *****
  Num examples = 248
  Batch size = 8
***** Running Evaluation *****
  Num examples = 248
  Batch size = 8
***** Running Evaluation *****
  Num examples = 248
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 248
  Batch size = 8
***** Running Evaluation *****
  Num examples = 248
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/hugg

248
248
pearson score: -0.0029732061302449424


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1185
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 745


Epoch,Training Loss,Validation Loss
1,No log,0.83478
2,No log,0.73583
3,No log,0.822339
4,0.803300,1.281989
5,0.803300,1.10141


***** Running Evaluation *****
  Num examples = 26
  Batch size = 8
***** Running Evaluation *****
  Num examples = 26
  Batch size = 8
***** Running Evaluation *****
  Num examples = 26
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 26
  Batch size = 8
***** Running Evaluation *****
  Num examples = 26
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingfa

26
26
pearson score: -0.16928764162770607


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1185
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 745


Epoch,Training Loss,Validation Loss
1,No log,0.841729
2,No log,0.880073
3,No log,0.755932
4,0.803300,0.838323
5,0.803300,0.812339


***** Running Evaluation *****
  Num examples = 42
  Batch size = 8
***** Running Evaluation *****
  Num examples = 42
  Batch size = 8
***** Running Evaluation *****
  Num examples = 42
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 42
  Batch size = 8
***** Running Evaluation *****
  Num examples = 42
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




42
42
pearson score: -0.09211305251829581


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1175
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 735


Epoch,Training Loss,Validation Loss
1,No log,0.704827
2,No log,0.559962
3,No log,0.522298
4,0.776800,0.59317
5,0.776800,0.550917


***** Running Evaluation *****
  Num examples = 232
  Batch size = 8
***** Running Evaluation *****
  Num examples = 232
  Batch size = 8
***** Running Evaluation *****
  Num examples = 232
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 232
  Batch size = 8
***** Running Evaluation *****
  Num examples = 232
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/hugg

232
232
pearson score: 0.6322355233236927


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1175
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 735


Epoch,Training Loss,Validation Loss
1,No log,1.364548
2,No log,1.129993
3,No log,0.986115
4,0.776800,1.070307
5,0.776800,0.958811


***** Running Evaluation *****
  Num examples = 41
  Batch size = 8
***** Running Evaluation *****
  Num examples = 41
  Batch size = 8
***** Running Evaluation *****
  Num examples = 41
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 41
  Batch size = 8
***** Running Evaluation *****
  Num examples = 41
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingfa

41
41
pearson score: 0.634267606702463


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1175
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 735


Epoch,Training Loss,Validation Loss
1,No log,0.745757
2,No log,0.672445
3,No log,0.626897
4,0.776800,0.668829
5,0.776800,0.621456


***** Running Evaluation *****
  Num examples = 35
  Batch size = 8
***** Running Evaluation *****
  Num examples = 35
  Batch size = 8
***** Running Evaluation *****
  Num examples = 35
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 35
  Batch size = 8
***** Running Evaluation *****
  Num examples = 35
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




35
35
pearson score: 0.6056998399542277


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1160
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 725


Epoch,Training Loss,Validation Loss
1,No log,0.947096
2,No log,0.563165
3,No log,0.650951
4,0.745700,0.564575
5,0.745700,0.580464


***** Running Evaluation *****
  Num examples = 238
  Batch size = 8
***** Running Evaluation *****
  Num examples = 238
  Batch size = 8
***** Running Evaluation *****
  Num examples = 238
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 238
  Batch size = 8
***** Running Evaluation *****
  Num examples = 238
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/hugg

238
238
pearson score: 0.013950205963971292


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1160
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 725


Epoch,Training Loss,Validation Loss
1,No log,0.737854
2,No log,0.835182
3,No log,0.800332
4,0.745700,0.903643
5,0.745700,0.938135


***** Running Evaluation *****
  Num examples = 31
  Batch size = 8
***** Running Evaluation *****
  Num examples = 31
  Batch size = 8
***** Running Evaluation *****
  Num examples = 31
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 31
  Batch size = 8
***** Running Evaluation *****
  Num examples = 31
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingfa

31
31
pearson score: 0.14334318439467758


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1160
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 725


Epoch,Training Loss,Validation Loss
1,No log,1.522508
2,No log,0.834135
3,No log,1.086566
4,0.745700,0.920424
5,0.745700,0.866009


***** Running Evaluation *****
  Num examples = 36
  Batch size = 8
***** Running Evaluation *****
  Num examples = 36
  Batch size = 8
***** Running Evaluation *****
  Num examples = 36
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 36
  Batch size = 8
***** Running Evaluation *****
  Num examples = 36
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




36
36
pearson score: 0.1461809072596399


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1140
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 715


Epoch,Training Loss,Validation Loss
1,No log,0.675437
2,No log,0.632173
3,No log,0.622683
4,0.745800,0.599509
5,0.745800,0.579932


***** Running Evaluation *****
  Num examples = 247
  Batch size = 8
***** Running Evaluation *****
  Num examples = 247
  Batch size = 8
***** Running Evaluation *****
  Num examples = 247
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 247
  Batch size = 8
***** Running Evaluation *****
  Num examples = 247
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/hugg

247
247
pearson score: 0.5724926866330147


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1140
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 715


Epoch,Training Loss,Validation Loss
1,No log,0.655822
2,No log,0.602994
3,No log,0.613482
4,0.745800,0.506572
5,0.745800,0.49676


***** Running Evaluation *****
  Num examples = 65
  Batch size = 8
***** Running Evaluation *****
  Num examples = 65
  Batch size = 8
***** Running Evaluation *****
  Num examples = 65
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 65
  Batch size = 8
***** Running Evaluation *****
  Num examples = 65
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingfa

65
65
pearson score: 0.695465409486961


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1140
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 715


Epoch,Training Loss,Validation Loss
1,No log,0.899489
2,No log,0.776484
3,No log,0.757539
4,0.745800,0.752543
5,0.745800,0.73155


***** Running Evaluation *****
  Num examples = 49
  Batch size = 8
***** Running Evaluation *****
  Num examples = 49
  Batch size = 8
***** Running Evaluation *****
  Num examples = 49
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 49
  Batch size = 8
***** Running Evaluation *****
  Num examples = 49
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




49
49
pearson score: 0.5128366894296483


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1171
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 735


Epoch,Training Loss,Validation Loss
1,No log,0.604119
2,No log,0.647978
3,No log,0.559162
4,0.759900,0.592018
5,0.759900,0.571619


***** Running Evaluation *****
  Num examples = 234
  Batch size = 8
***** Running Evaluation *****
  Num examples = 234
  Batch size = 8
***** Running Evaluation *****
  Num examples = 234
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 234
  Batch size = 8
***** Running Evaluation *****
  Num examples = 234
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


234
234
pearson score: 0.6692575252595082


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.b

Epoch,Training Loss,Validation Loss
1,No log,0.389281
2,No log,0.516263
3,No log,0.506359
4,0.759900,0.506656
5,0.759900,0.531276


***** Running Evaluation *****
  Num examples = 44
  Batch size = 8
***** Running Evaluation *****
  Num examples = 44
  Batch size = 8
***** Running Evaluation *****
  Num examples = 44
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 44
  Batch size = 8
***** Running Evaluation *****
  Num examples = 44
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingfa

44
44
pearson score: 0.37995633394496897


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1171
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 735


Epoch,Training Loss,Validation Loss
1,No log,0.817359
2,No log,0.880386
3,No log,0.776674
4,0.759900,0.819942
5,0.759900,0.818522


***** Running Evaluation *****
  Num examples = 29
  Batch size = 8
***** Running Evaluation *****
  Num examples = 29
  Batch size = 8
***** Running Evaluation *****
  Num examples = 29
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 29
  Batch size = 8
***** Running Evaluation *****
  Num examples = 29
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




29
29
pearson score: 0.5467101612730584


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1161
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 730


Epoch,Training Loss,Validation Loss
1,No log,0.597604
2,No log,0.546808
3,No log,0.582188
4,0.766900,0.586731
5,0.766900,0.519584


***** Running Evaluation *****
  Num examples = 235
  Batch size = 8
***** Running Evaluation *****
  Num examples = 235
  Batch size = 8
***** Running Evaluation *****
  Num examples = 235
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 235
  Batch size = 8
***** Running Evaluation *****
  Num examples = 235
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/hugg

235
235
pearson score: 0.4620446632056122


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1161
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 730


Epoch,Training Loss,Validation Loss
1,No log,0.51747
2,No log,0.485494
3,No log,0.537215
4,0.766900,0.654348
5,0.766900,0.519244


***** Running Evaluation *****
  Num examples = 32
  Batch size = 8
***** Running Evaluation *****
  Num examples = 32
  Batch size = 8
***** Running Evaluation *****
  Num examples = 32
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 32
  Batch size = 8
***** Running Evaluation *****
  Num examples = 32
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingfa

32
32
pearson score: 0.6306135736456971


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1161
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 730


Epoch,Training Loss,Validation Loss
1,No log,0.830599
2,No log,0.697393
3,No log,0.747997
4,0.766900,0.700716
5,0.766900,0.627685


***** Running Evaluation *****
  Num examples = 33
  Batch size = 8
***** Running Evaluation *****
  Num examples = 33
  Batch size = 8
***** Running Evaluation *****
  Num examples = 33
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 33
  Batch size = 8
***** Running Evaluation *****
  Num examples = 33
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




33
33
pearson score: 0.6763456341558017


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1168
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 730


Epoch,Training Loss,Validation Loss
1,No log,0.649784
2,No log,0.678756
3,No log,0.616022
4,0.748900,0.52572
5,0.748900,0.581147


***** Running Evaluation *****
  Num examples = 235
  Batch size = 8
***** Running Evaluation *****
  Num examples = 235
  Batch size = 8
***** Running Evaluation *****
  Num examples = 235
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 235
  Batch size = 8
***** Running Evaluation *****
  Num examples = 235
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/hugg

235
235
pearson score: 0.650834536850358


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1168
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 730


Epoch,Training Loss,Validation Loss
1,No log,0.551026
2,No log,0.471869
3,No log,0.579461
4,0.748900,0.658877
5,0.748900,0.550536


***** Running Evaluation *****
  Num examples = 33
  Batch size = 8
***** Running Evaluation *****
  Num examples = 33
  Batch size = 8
***** Running Evaluation *****
  Num examples = 33
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 33
  Batch size = 8
***** Running Evaluation *****
  Num examples = 33
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingfa

33
33
pearson score: 0.6019485072079837


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1168
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 730


Epoch,Training Loss,Validation Loss
1,No log,0.721903
2,No log,0.709513
3,No log,0.51926
4,0.748900,0.495774
5,0.748900,0.494492


***** Running Evaluation *****
  Num examples = 36
  Batch size = 8
***** Running Evaluation *****
  Num examples = 36
  Batch size = 8
***** Running Evaluation *****
  Num examples = 36
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 36
  Batch size = 8
***** Running Evaluation *****
  Num examples = 36
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




36
36
pearson score: 0.7262581964300043


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1155
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 725


Epoch,Training Loss,Validation Loss
1,No log,0.640454
2,No log,0.543084
3,No log,0.492926
4,0.704200,0.493621
5,0.704200,0.499678


***** Running Evaluation *****
  Num examples = 244
  Batch size = 8
***** Running Evaluation *****
  Num examples = 244
  Batch size = 8
***** Running Evaluation *****
  Num examples = 244
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 244
  Batch size = 8
***** Running Evaluation *****
  Num examples = 244
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/hugg

244
244
pearson score: 0.656673457281461


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1155
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 725


Epoch,Training Loss,Validation Loss
1,No log,0.647157
2,No log,0.915989
3,No log,1.094734
4,0.704200,1.190517
5,0.704200,1.218805


***** Running Evaluation *****
  Num examples = 43
  Batch size = 8
***** Running Evaluation *****
  Num examples = 43
  Batch size = 8
***** Running Evaluation *****
  Num examples = 43
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 43
  Batch size = 8
***** Running Evaluation *****
  Num examples = 43
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingfa

43
43
pearson score: 0.3759108188767908


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1155
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 725


Epoch,Training Loss,Validation Loss
1,No log,0.692339
2,No log,0.554032
3,No log,0.476975
4,0.704200,0.451706
5,0.704200,0.435403


***** Running Evaluation *****
  Num examples = 42
  Batch size = 8
***** Running Evaluation *****
  Num examples = 42
  Batch size = 8
***** Running Evaluation *****
  Num examples = 42
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 42
  Batch size = 8
***** Running Evaluation *****
  Num examples = 42
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




42
42
pearson score: 0.749535591221574


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1127
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 705


Epoch,Training Loss,Validation Loss
1,No log,0.732129
2,No log,0.538612
3,No log,0.6322
4,0.745200,0.555394
5,0.745200,0.57989


***** Running Evaluation *****
  Num examples = 232
  Batch size = 8
***** Running Evaluation *****
  Num examples = 232
  Batch size = 8
***** Running Evaluation *****
  Num examples = 232
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 232
  Batch size = 8
***** Running Evaluation *****
  Num examples = 232
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/hugg

232
232
pearson score: 0.6456646304358999


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1127
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 705


Epoch,Training Loss,Validation Loss
1,No log,0.790508
2,No log,0.406509
3,No log,0.493315
4,0.745200,0.347822
5,0.745200,0.338102


***** Running Evaluation *****
  Num examples = 41
  Batch size = 8
***** Running Evaluation *****
  Num examples = 41
  Batch size = 8
***** Running Evaluation *****
  Num examples = 41
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 41
  Batch size = 8
***** Running Evaluation *****
  Num examples = 41
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingfa

41
41
pearson score: 0.8416279638914848


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1127
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 705


Epoch,Training Loss,Validation Loss
1,No log,1.183904
2,No log,0.770881
3,No log,1.065981
4,0.745200,0.825826
5,0.745200,0.786565


***** Running Evaluation *****
  Num examples = 36
  Batch size = 8
***** Running Evaluation *****
  Num examples = 36
  Batch size = 8
***** Running Evaluation *****
  Num examples = 36
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 36
  Batch size = 8
***** Running Evaluation *****
  Num examples = 36
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




36
36
pearson score: 0.6648149507735263


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1200
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Epoch,Training Loss,Validation Loss
1,No log,0.801586
2,No log,0.540449
3,No log,0.531712
4,0.799000,0.493593
5,0.799000,0.510198


***** Running Evaluation *****
  Num examples = 230
  Batch size = 8
***** Running Evaluation *****
  Num examples = 230
  Batch size = 8
***** Running Evaluation *****
  Num examples = 230
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 230
  Batch size = 8
***** Running Evaluation *****
  Num examples = 230
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/hugg

230
230
pearson score: 0.5883400428535791


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1200
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Epoch,Training Loss,Validation Loss
1,No log,2.636016
2,No log,1.210556
3,No log,1.255553
4,0.799000,1.076432
5,0.799000,1.053977


***** Running Evaluation *****
  Num examples = 14
  Batch size = 8
***** Running Evaluation *****
  Num examples = 14
  Batch size = 8
***** Running Evaluation *****
  Num examples = 14
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 14
  Batch size = 8
***** Running Evaluation *****
  Num examples = 14
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingfa

14
14
pearson score: 0.7509137473574425


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1200
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Epoch,Training Loss,Validation Loss
1,No log,0.707245
2,No log,0.531561
3,No log,0.520078
4,0.799000,0.505159
5,0.799000,0.542146


***** Running Evaluation *****
  Num examples = 30
  Batch size = 8
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




30
30
pearson score: 0.3326912388441565


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1135
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 710


Epoch,Training Loss,Validation Loss
1,No log,0.642287
2,No log,0.652141
3,No log,0.544893
4,0.749800,0.538712
5,0.749800,0.557743


***** Running Evaluation *****
  Num examples = 235
  Batch size = 8
***** Running Evaluation *****
  Num examples = 235
  Batch size = 8
***** Running Evaluation *****
  Num examples = 235
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 235
  Batch size = 8
***** Running Evaluation *****
  Num examples = 235
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/hugg

235
235
pearson score: 0.4907001231619872


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1135
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 710


Epoch,Training Loss,Validation Loss
1,No log,0.597286
2,No log,0.662709
3,No log,0.448083
4,0.749800,0.440949
5,0.749800,0.432915


***** Running Evaluation *****
  Num examples = 40
  Batch size = 8
***** Running Evaluation *****
  Num examples = 40
  Batch size = 8
***** Running Evaluation *****
  Num examples = 40
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 40
  Batch size = 8
***** Running Evaluation *****
  Num examples = 40
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingfa

40
40
pearson score: 0.4036584100062458


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1135
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 710


Epoch,Training Loss,Validation Loss
1,No log,0.853211
2,No log,0.963303
3,No log,0.66179
4,0.749800,0.615555
5,0.749800,0.627944


***** Running Evaluation *****
  Num examples = 35
  Batch size = 8
***** Running Evaluation *****
  Num examples = 35
  Batch size = 8
***** Running Evaluation *****
  Num examples = 35
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 35
  Batch size = 8
***** Running Evaluation *****
  Num examples = 35
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




35
35
pearson score: 0.502925173428671


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.b

Epoch,Training Loss,Validation Loss
1,No log,0.644459
2,No log,0.586153
3,No log,0.656028
4,0.748500,0.607885
5,0.748500,0.613171


***** Running Evaluation *****
  Num examples = 240
  Batch size = 8
***** Running Evaluation *****
  Num examples = 240
  Batch size = 8
***** Running Evaluation *****
  Num examples = 240
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 240
  Batch size = 8
***** Running Evaluation *****
  Num examples = 240
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/hugg

240
240
pearson score: 0.5925578490162837


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1190
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 745


Epoch,Training Loss,Validation Loss
1,No log,0.524504
2,No log,0.38372
3,No log,0.697721
4,0.748500,0.493817
5,0.748500,0.453306


***** Running Evaluation *****
  Num examples = 16
  Batch size = 8
***** Running Evaluation *****
  Num examples = 16
  Batch size = 8
***** Running Evaluation *****
  Num examples = 16
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 16
  Batch size = 8
***** Running Evaluation *****
  Num examples = 16
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingfa

16
16
pearson score: 0.8238438535152258


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1190
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 745


Epoch,Training Loss,Validation Loss
1,No log,1.414885
2,No log,1.237396
3,No log,1.24918
4,0.748500,1.130744
5,0.748500,1.100037


***** Running Evaluation *****
  Num examples = 30
  Batch size = 8
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




30
30
pearson score: 0.4299745044486713


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1141
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 715


Epoch,Training Loss,Validation Loss
1,No log,0.643985
2,No log,0.504868
3,No log,0.529153
4,0.706700,0.500448
5,0.706700,0.487224


***** Running Evaluation *****
  Num examples = 232
  Batch size = 8
***** Running Evaluation *****
  Num examples = 232
  Batch size = 8
***** Running Evaluation *****
  Num examples = 232
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 232
  Batch size = 8
***** Running Evaluation *****
  Num examples = 232
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/hugg

232
232
pearson score: 0.06581583126194704


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1141
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 715


Epoch,Training Loss,Validation Loss
1,No log,0.617717
2,No log,0.650191
3,No log,0.737312
4,0.706700,0.850638
5,0.706700,0.735749


***** Running Evaluation *****
  Num examples = 58
  Batch size = 8
***** Running Evaluation *****
  Num examples = 58
  Batch size = 8
***** Running Evaluation *****
  Num examples = 58
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 58
  Batch size = 8
***** Running Evaluation *****
  Num examples = 58
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingfa

58
58
pearson score: 0.03965774439243592


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1141
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 715


Epoch,Training Loss,Validation Loss
1,No log,0.799198
2,No log,0.611
3,No log,0.592378
4,0.706700,0.526526
5,0.706700,0.539778


***** Running Evaluation *****
  Num examples = 29
  Batch size = 8
***** Running Evaluation *****
  Num examples = 29
  Batch size = 8
***** Running Evaluation *****
  Num examples = 29
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 29
  Batch size = 8
***** Running Evaluation *****
  Num examples = 29
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




29
29
pearson score: -0.14862663512275903


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1182
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 740


Epoch,Training Loss,Validation Loss
1,No log,0.594373
2,No log,0.535636
3,No log,0.578867
4,0.763600,0.531254
5,0.763600,0.536487


***** Running Evaluation *****
  Num examples = 243
  Batch size = 8
***** Running Evaluation *****
  Num examples = 243
  Batch size = 8
***** Running Evaluation *****
  Num examples = 243
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 243
  Batch size = 8
***** Running Evaluation *****
  Num examples = 243
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/hugg

243
243
pearson score: 0.15933514625675266


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1182
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 740


Epoch,Training Loss,Validation Loss
1,No log,0.972463
2,No log,0.80434
3,No log,0.675919
4,0.763600,0.663535
5,0.763600,0.752221


***** Running Evaluation *****
  Num examples = 24
  Batch size = 8
***** Running Evaluation *****
  Num examples = 24
  Batch size = 8
***** Running Evaluation *****
  Num examples = 24
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 24
  Batch size = 8
***** Running Evaluation *****
  Num examples = 24
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingfa

24
24
pearson score: 0.1939895574774813


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1182
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 740


Epoch,Training Loss,Validation Loss
1,No log,0.699688
2,No log,0.649652
3,No log,0.763892
4,0.763600,0.649736
5,0.763600,0.636372


***** Running Evaluation *****
  Num examples = 41
  Batch size = 8
***** Running Evaluation *****
  Num examples = 41
  Batch size = 8
***** Running Evaluation *****
  Num examples = 41
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 41
  Batch size = 8
***** Running Evaluation *****
  Num examples = 41
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




41
41
pearson score: 0.011079423207239078


loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1135
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 710


Epoch,Training Loss,Validation Loss
1,No log,1.000431
2,No log,0.654523
3,No log,0.628527
4,0.724900,0.581262
5,0.724900,0.643294


***** Running Evaluation *****
  Num examples = 228
  Batch size = 8
***** Running Evaluation *****
  Num examples = 228
  Batch size = 8
***** Running Evaluation *****
  Num examples = 228
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 228
  Batch size = 8
***** Running Evaluation *****
  Num examples = 228
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/hugg

228
228
pearson score: 0.6205680145358894


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1135
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 710


Epoch,Training Loss,Validation Loss
1,No log,1.057969
2,No log,0.439215
3,No log,0.45883
4,0.724900,0.368754
5,0.724900,0.419993


***** Running Evaluation *****
  Num examples = 52
  Batch size = 8
***** Running Evaluation *****
  Num examples = 52
  Batch size = 8
***** Running Evaluation *****
  Num examples = 52
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 52
  Batch size = 8
***** Running Evaluation *****
  Num examples = 52
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingfa

52
52
pearson score: 0.8228835444856302


loading weights file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/pytorch_model.bin from cache at /home/zihuiliu/.cache/huggingface/transformers/b774244369e464de2c660477b70bae7c3223fa7250aa1c8fc0b0f037ed58418a.087808d17814e241e9352c5ce0fea1a7d05e5b0f020d44b42b5f05922e96c923
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
***** Running training *****
  Num examples = 1135
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 710


Epoch,Training Loss,Validation Loss
1,No log,1.382736
2,No log,0.998098
3,No log,0.938436
4,0.724900,0.840941
5,0.724900,0.854156


***** Running Evaluation *****
  Num examples = 37
  Batch size = 8
***** Running Evaluation *****
  Num examples = 37
  Batch size = 8
***** Running Evaluation *****
  Num examples = 37
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 37
  Batch size = 8
***** Running Evaluation *****
  Num examples = 37
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




37
37
pearson score: 0.5191157849747964


In [None]:
len(dic['input_ids'])

In [63]:
len(dic['token_type_ids'])

128

In [64]:
len(dic['attention_mask'])

128