In [1]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import krippendorff
import csv
from sklearn.metrics import accuracy_score

import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
from transformers import AutoTokenizer

In [2]:
train_all=pd.read_csv('../alldata/si630w22-hw3-train.csv')
data_all=pd.read_csv('../alldata/si630w22-hw3-data.csv')
dev_all=pd.read_csv('../alldata/si630w22-hw3-dev.csv')

In [24]:
# group_ids=['01','02','03','04','05']
group_ids=['06']
# group_ids=['07','08','09','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25']
# '07','08','09','10','11','12','13','14','15',

In [4]:
def process_data(group_id):
    train_df=train_all[train_all['group']!='group_'+group_id]
    train_df_agg=train_df.groupby(['id'])[['id','rating']].mean().reset_index()
    train_df_agg=train_df_agg.rename(columns={'id':'question_id'})
    train_df_agg=pd.merge(train_df_agg,data_all,on='question_id')
    dev_exclude=dev_all[dev_all['group']=='group_'+group_id]
    dev1=dev_all[dev_all['group']!='group_'+group_id]
    dev2=pd.merge(dev_exclude,dev_all,on='id').drop(columns=['annotator_id_x','rating_x','group_x'])
    dev2=dev2.rename(columns={'annotator_id_y':'annotator_id','rating_y':'rating','group_y':'group','id':'question_id'})

    dev3=dev2[dev2['group']!='group_'+group_id]
    dev2=dev2[dev2['group']=='group_'+group_id]
    annotation_list=list(set(dev3['question_id'].tolist()))
    # print(annotation_list)
    dev2=dev2[dev2['question_id'].isin(annotation_list)]
    dev1=dev1.rename(columns={'id':'question_id'})


    dev1=dev1.groupby(['question_id'])[['question_id','rating']].mean().reset_index()
    dev2=dev2.groupby(['question_id'])[['question_id','rating']].mean().reset_index()
    dev3=dev3.groupby(['question_id'])[['question_id','rating']].mean().reset_index()
    
    dev1=pd.merge(dev1,data_all,on='question_id')
    dev2=pd.merge(dev2,data_all,on='question_id')
    dev3=pd.merge(dev3,data_all,on='question_id')
    
    return train_df_agg,dev1,dev2,dev3

In [5]:
class Example:
    def __init__(self,question_id,question,answer,label=None):
        self.question_id=question_id
        self.question=question
        self.answer=answer
        self.label=label

In [6]:
def to_input(df):
    question_list=[]
    input_list=[]
    for idx,row in df.iterrows():
        question_id=row['question_id']
        if 'rating' in df.columns:
#             print(row['rating'])
            if row['rating']==1 or row['rating']==2 or row['rating']==3 or row['rating']==4 or row['rating']==5:
                rating=int(row['rating'])-1
            else:
                continue
        else:
            rating=None
        question=row['question_text']
        answer=row['reply_text']
#         print(question_id,rating,question,answer)
        
        example=Example(question_id,question,answer,rating)
        input_list.append(example)
    return input_list

In [7]:
def to_sequence(input_list,tokenizer,max_length=128):
    X_list=[]
    label_list=[]
    for example in input_list:
        x=example.question+"['[SEP]']"+example.answer
        label=example.label
        label_list.append(label)
        X_list.append(x)
#     print(X_list)
    X_train=tokenizer(X_list,padding=True,truncation=True,max_length=max_length)

    return X_train,label_list

In [8]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [9]:
def to_dataset(train_df,dev_df1,dev_df2,dev_df3):
    
    tokenizer = AutoTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased")

    input_list=to_input(train_df)
    X_train,Y_train=to_sequence(input_list,tokenizer)
    dev1_list=to_input(dev_df1)
    X_dev1,Y_dev1=to_sequence(dev1_list,tokenizer)
    
    dev2_list=to_input(dev_df2)
    if len(dev2_list)!=0:
        X_dev2,Y_dev2=to_sequence(dev2_list,tokenizer)
    else:
        X_dev2=None
        Y_dev2=[]
        
    dev3_list=to_input(dev_df3)
    if len(dev3_list)!=0:
        X_dev3,Y_dev3=to_sequence(dev3_list,tokenizer)
    else:
        X_dev3=None
        Y_dev3=[]
#     print(X_dev3)
    
    max_length=128
    train_dataset = Dataset(X_train,Y_train)
    val_dataset1 = Dataset(X_dev1,Y_dev1)
    val_dataset2 = Dataset(X_dev2,Y_dev2)
    val_dataset3 = Dataset(X_dev3,Y_dev3)
    
    return train_dataset,[val_dataset1,val_dataset2,val_dataset3],[[X_dev1,Y_dev1],[X_dev2,Y_dev2],[X_dev3,Y_dev3]]

In [19]:
def calculate_pearson(input_list,label_list,max_length,trainer):
#     print(input_list['input_ids'])
    data_unlabeled = Dataset(input_list,None)
#     dataloader=torch.utils.data.DataLoader(data_unlabeled)
#     for step,data in enumerate(dataloader):
#         output=model1(data['input_ids'],data['token_type_ids'],data['attention_mask'])
#         pred_list.append(output.tolist()[0][0])
        
    raw_test_result,pred_label,_=trainer.predict(data_unlabeled)
    y_pred = np.argmax(raw_test_result, axis=1)+1
#     print(raw_test_result)
    print(y_pred)
    print(len(y_pred))
#     for i in range(len(pred_label)):
#         pred_label[i]+=1
    
    
    pr,_=pearsonr(y_pred,label_list)
#     print(pr)
    return pr

In [20]:



device=torch.device("cuda" if torch.cuda.is_available else "cpu")

model_path="../model/MiniLM-L12-H384-uncased/"


In [25]:
all_pr=[]
max_length=128

for group in group_ids:
    train_df,dev_df1,dev_df2,dev_df3=process_data(group)
#     print(train_df.head())
#     break
    train_dataset,eval_dataset_list,all_eval_list=to_dataset(train_df,dev_df1,dev_df2,dev_df3)
    
#     print(all_eval_list[0]==all_eval_list[1])
    
    print("=================group:",group,"===================")
    pr_list=[]
    for i in range(3):
        print("============",i,"=============")
        eval_dataset=eval_dataset_list[i]
        [eval_list,label_list]=all_eval_list[i]

        
        args = TrainingArguments(output_dir="ouput_part3",
                            evaluation_strategy="epoch",
                            per_device_train_batch_size=8,
                            per_device_eval_batch_size=8,
                            num_train_epochs=5)
        
        model = BertForSequenceClassification.from_pretrained(model_path, num_labels=5)
        model=model.to(device)
        
        trainer1 = Trainer(
            model=model,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset

        )
        
        # Train pre-trained model
        trainer1.train()
        pr=calculate_pearson(eval_list,label_list,max_length,trainer1)
        print("pearson score:",pr)
        pr_list.append(pr)
    all_pr.append(pr_list)
    
    if group=='05' or group=='10' or group=='15' or group=='25':
        df=pd.DataFrame(all_pr,columns=['A','B','C'])
        filename='group'+group+"checkpoint_classification.csv"
        df.to_csv(filename)
    

loading configuration file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/config.json from cache at /home/zihuiliu/.cache/huggingface/transformers/ceb753d3f27a8c0d09184f35884666cda91b8ae610cd2a54d89793ac7663f1f9.13815020fd994b27db9974c0ce0ec4c47dfac6c8f11bf1a35a0a06d5b165665a
Model config BertConfig {
  "_name_or_path": "microsoft/MiniLM-L12-H384-uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/microsoft/MiniLM-L12-H384-uncased/resolve/main/vocab.txt from cache at



All model checkpoint weights were used when initializing BertForSequenceClassification.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../model/MiniLM-L12-H384-uncased/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
***** Running training *****
  Num examples = 1078
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 675


Epoch,Training Loss,Validation Loss
1,No log,1.077674
2,No log,1.016866
3,No log,1.047409
4,1.010500,1.032114
5,1.010500,1.08147


***** Running Evaluation *****
  Num examples = 217
  Batch size = 8
***** Running Evaluation *****
  Num examples = 217
  Batch size = 8
***** Running Evaluation *****
  Num examples = 217
  Batch size = 8
Saving model checkpoint to ouput_part3/checkpoint-500
Configuration saved in ouput_part3/checkpoint-500/config.json
Model weights saved in ouput_part3/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 217
  Batch size = 8
***** Running Evaluation *****
  Num examples = 217
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 217
  Batch size = 8


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file ../model/MiniLM-L12-H384-uncased/config.json
Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position

[5 5 5 3 5 4 4 5 3 4 5 5 5 3 4 5 4 4 4 4 5 5 5 4 4 5 3 5 5 5 5 4 4 5 5 3 5
 5 5 4 4 4 3 4 5 4 5 3 4 5 4 5 4 5 5 4 5 5 5 5 5 5 4 5 5 3 5 3 5 5 3 4 5 5
 3 5 5 5 5 3 4 5 5 3 4 5 3 5 4 4 5 5 5 4 3 5 4 4 3 4 3 3 5 4 3 3 3 4 5 5 5
 5 4 5 5 5 3 5 4 3 5 5 4 5 3 3 5 5 4 5 4 5 5 4 5 3 4 3 3 4 4 5 3 3 3 5 4 5
 4 5 4 3 3 5 4 5 3 3 5 5 5 3 3 5 5 5 4 3 5 3 3 4 4 5 5 5 5 4 4 3 3 5 5 4 5
 4 4 5 5 5 5 5 5 4 4 4 3 5 5 4 5 5 4 4 4 5 5 3 5 4 4 3 3 5 3 4 5]
217
pearson score: 0.6308644476736225


All model checkpoint weights were used when initializing BertForSequenceClassification.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../model/MiniLM-L12-H384-uncased/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
***** Running training *****
  Num examples = 1078
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 675


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 