# MRPC

- Determine whether the two sentences in each pair are paraphrases (semantically equivalent)
- Fine-tunes a pre-trained model: BERT, RoBERTa, ALBERT, DeBERTa, or ELECTRA

In [2]:
# ---------------------------------------------------------------------------
# Experiment configuration (edit these values, then re-run the notebook).
# ---------------------------------------------------------------------------
model_name = 'bert'      # bert | roberta | albert | deberta | electra
bool_rewrite = 0         # 0 -> original sentence columns, otherwise the "-Rewrite" columns
test_num = 0             # run index, embedded in the checkpoint file name

learning_rate = 2e-5     # passed to the AdamW optimizer
num_epoch = 20           # number of passes over the training split
batch_size = 8           # batch size of the train/validation loaders
rand_seed = 42           # seed for the RNGs and the train/validation split

cuda_NO = 'cuda:0'       # device string used when CUDA is available
use_colab = 0            # 1 -> mount Google Drive and chdir into the project folder

print("____________________________________________________________________________________________________________")
print(f"Pre-trained model: {model_name}")

# String label -> class id; consulted when scoring the saved predictions.
label_mapping = {'not_entailment': 0, 'entailment': 1}

# Predictions always go to the same CSV; only the checkpoint name depends on
# the model, the data variant and the run index.
pred_save_path = 'DS/MRPC/test_pred.csv'

if bool_rewrite != 0:
    print(f"Training on REWRITTEN data, test_num: {test_num}")
    model_save_path = f'model/my-{model_name}-mrpc-weights-rewrite-{test_num}.pth'
else:
    print(f"Training on ORIGINAL data, test_num: {test_num}")
    model_save_path = f'model/my-{model_name}-mrpc-weights-{test_num}.pth'

In [None]:
!nvidia-smi

if use_colab == 1:
    from google.colab import drive
    drive.mount('/content/drive')
    import os
    os.chdir('/content/drive/Othercomputers/我的笔记本电脑/openai')

import os
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
import torch
print(torch.cuda.is_available())
device = f"{cuda_NO}" if torch.cuda.is_available() else "cpu"
print(device)

# Set Random Seed

In [None]:
import numpy as np
import random

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Integer seed handed to every random number generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    # Trade cuDNN autotuning for reproducible kernel selection.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Apply the configured seed before the model and data loaders are created below.
set_seed(rand_seed)

# Load Model and Tokenizer

In [5]:
from transformers import BertForSequenceClassification, RobertaForSequenceClassification, AlbertForSequenceClassification, DebertaForSequenceClassification, ElectraForSequenceClassification
from transformers import BertTokenizer, RobertaTokenizer, AlbertTokenizer, DebertaTokenizer, ElectraTokenizer

# model_name -> (tokenizer class, classification model class, hub checkpoint)
_SUPPORTED_MODELS = {
    "bert": (BertTokenizer, BertForSequenceClassification, "bert-base-uncased"),
    "roberta": (RobertaTokenizer, RobertaForSequenceClassification, "roberta-base"),
    "albert": (AlbertTokenizer, AlbertForSequenceClassification, "albert-base-v2"),
    "deberta": (DebertaTokenizer, DebertaForSequenceClassification, "microsoft/deberta-base"),
    "electra": (ElectraTokenizer, ElectraForSequenceClassification, "google/electra-base-discriminator"),
}

# Reject unknown names up front with the same error as before.
if model_name not in _SUPPORTED_MODELS:
    raise ValueError(f"Model {model_name} not supported.")

_tokenizer_cls, _model_cls, _checkpoint = _SUPPORTED_MODELS[model_name]
tokenizer = _tokenizer_cls.from_pretrained(_checkpoint)
model = _model_cls.from_pretrained(_checkpoint)

# Read Data

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Both CSVs are comma-separated with a header row.
csv_options = {'sep': ',', 'header': 0}
full_train_data = pd.read_csv('DS/MRPC/train_rewrite_taskUnknown.csv', **csv_options)
test_data = pd.read_csv('DS/MRPC/test_rewrite_taskUnknown.csv', **csv_options)

# Hold out 10% of the training rows for validation, reproducibly.
train_data, val_data = train_test_split(
    full_train_data,
    test_size=0.1,
    random_state=rand_seed,
)

# Fill the blanks
# test_data['#1 String'] = test_data['#1 String'].fillna('')
# test_data['#2 String'] = test_data['#2 String'].fillna('')

def _tokenize_pairs(frame, first_col, second_col):
    """Tokenize two text columns of `frame` as sentence pairs.

    Args:
        frame: DataFrame holding the sentence columns.
        first_col: Name of the column with the first sentence of each pair.
        second_col: Name of the column with the second sentence of each pair.

    Returns:
        The tokenizer output as PyTorch tensors, with padding and truncation
        enabled.
    """
    return tokenizer(
        list(frame[first_col]),
        list(frame[second_col]),
        return_tensors='pt', padding=True, truncation=True
    )


# Choose the original or the rewritten sentence columns once, so every split
# is guaranteed to use the same pair of columns.
if bool_rewrite == 0:
    first_col, second_col = 'Sentence1', 'Sentence2'
else:
    first_col, second_col = 'Sentence1-Rewrite', 'Sentence2-Rewrite'

train_tokenized = _tokenize_pairs(train_data, first_col, second_col)
val_tokenized = _tokenize_pairs(val_data, first_col, second_col)
# Fix: the test split previously paired 'Sentence1' with itself instead of
# with 'Sentence2', so the model was scored on identical sentence pairs.
test_tokenized = _tokenize_pairs(test_data, first_col, second_col)

# Dataset and Dataloader

In [5]:
from torch.utils.data import Dataset, DataLoader

class MRPC_Dataset(Dataset):
    """Map-style dataset pairing pre-tokenized inputs with their labels.

    Args:
        tokenized_data: Mapping with 'input_ids' and 'attention_mask' entries
            that can be indexed by example position.
        labels: Sequence of labels, used as-is (no string-to-id mapping).
    """

    # Tokenizer fields forwarded to the model for every example.
    _FIELDS = ('input_ids', 'attention_mask')

    def __init__(self, tokenized_data, labels):
        self.labels = labels
        self.data = tokenized_data

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {field: self.data[field][idx] for field in self._FIELDS}
        sample['label'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits together with their labels.
train_labels = list(train_data['Original_Label'])
val_labels = list(val_data['Original_Label'])
train_dataset = MRPC_Dataset(train_tokenized, train_labels)
val_dataset = MRPC_Dataset(val_tokenized, val_labels)

# Same batching for both loaders; only the training loader is shuffled.
loader_options = dict(batch_size=batch_size, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

# Fine-Tune

In [1]:
import torch, gc
from tqdm import tqdm
from transformers import get_scheduler, SchedulerType

# Release memory left over from earlier runs, then place the model on the
# selected device.
gc.collect()
torch.cuda.empty_cache()
model.to(device)

lr = learning_rate
validation = True                            # evaluate on the validation split every epoch
total_step = len(train_loader) * num_epoch   # one optimizer step per training batch
weight_decay = 0.01
best_acc = 0                                 # best validation accuracy seen so far

early_stop = 100                             # epochs without improvement before stopping
early_stop_count = 0

optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

# Linear warmup over the first 10% of the steps, then linear decay.
num_warmup_steps = int(total_step * 0.1)
scheduler = get_scheduler(
    name=SchedulerType.LINEAR,
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    # Fix: this must be the TOTAL number of optimizer steps (warmup included).
    # Passing `total_step - num_warmup_steps` made the learning rate reach
    # zero early, so the last ~10% of the steps trained with lr == 0.
    num_training_steps=total_step,
)

def _run_batch(batch):
    """Move one collated batch to `device` and run a labelled forward pass.

    Returns:
        A `(model_outputs, labels_on_device)` tuple.
    """
    batch_labels = batch['label'].to(device)
    batch_outputs = model(
        input_ids=batch['input_ids'].to(device),
        attention_mask=batch['attention_mask'].to(device),
        labels=batch_labels,
    )
    return batch_outputs, batch_labels


for epoch in range(num_epoch):
    # ---- Training pass ----
    model.train()
    total_train_loss = 0
    for train_batch in tqdm(train_loader):
        train_outputs, _ = _run_batch(train_batch)
        loss = train_outputs.loss
        total_train_loss += loss.item()

        # One optimizer step and one scheduler step per batch.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epoch}, Average Training Loss: {avg_train_loss}")

    if validation:
        # ---- Validation pass ----
        model.eval()
        total_val_loss = 0
        correct_predictions = 0
        with torch.no_grad():
            for val_batch in tqdm(val_loader):
                val_outputs, val_labels = _run_batch(val_batch)
                total_val_loss += val_outputs.loss.item()

                predicted = val_outputs.logits.argmax(dim=-1)
                correct_predictions += (predicted == val_labels).int().sum()

        avg_val_loss = total_val_loss / len(val_loader)
        val_acc = correct_predictions.double() / len(val_data)
        print(f"Average Validation Loss: {avg_val_loss}, Validation Accuracy: {val_acc}")

        # Checkpoint only on a strict improvement; otherwise count towards
        # the early-stopping budget.
        if val_acc > best_acc:
            best_acc = val_acc
            early_stop_count = 0
            torch.save(model.state_dict(), model_save_path)
            print('Saving model with val_acc {:.3f}...'.format(best_acc))
        else:
            early_stop_count += 1

        if early_stop_count >= early_stop:
            print('\nModel is not improving, so we halt the training session.')
            break
    else:
        # Without validation there is no "best" epoch: overwrite the
        # checkpoint after every epoch.
        torch.save(model.state_dict(), model_save_path)

# Prediction

In [12]:
# Rebuild a fresh classifier of the selected architecture, then overwrite its
# weights with the fine-tuned checkpoint saved during training.
_PREDICTION_MODELS = {
    "bert": (BertForSequenceClassification, "bert-base-uncased"),
    "roberta": (RobertaForSequenceClassification, "roberta-base"),
    "albert": (AlbertForSequenceClassification, "albert-base-v2"),
    "deberta": (DebertaForSequenceClassification, "microsoft/deberta-base"),
    "electra": (ElectraForSequenceClassification, "google/electra-base-discriminator"),
}
if model_name not in _PREDICTION_MODELS:
    raise ValueError(f"Model {model_name} not supported.")
_prediction_cls, _prediction_checkpoint = _PREDICTION_MODELS[model_name]
model = _prediction_cls.from_pretrained(_prediction_checkpoint)

# The model stays on the CPU in this cell, so load the checkpoint tensors onto
# the CPU as well; this also works when the checkpoint was written from a GPU
# run and no GPU is available now.
model.load_state_dict(
    torch.load(model_save_path, map_location='cpu', weights_only=True)
)
model.eval()
with torch.no_grad():
    # Fix: feed exactly the inputs the model was fine-tuned and validated on
    # (input_ids + attention_mask). `**test_tokenized` additionally passed
    # `token_type_ids` for tokenizers that emit them, which the training loop
    # never supplied, so test inputs differed from training inputs.
    outputs = model(
        input_ids=test_tokenized['input_ids'],
        attention_mask=test_tokenized['attention_mask'],
    )

# Save the pred

In [5]:
# Highest-scoring class per test row, moved to the CPU.
predictions = torch.argmax(outputs.logits, dim=-1).cpu()
# Hand pandas a NumPy array instead of a torch tensor so the column is stored
# as plain integers regardless of how the installed pandas version treats
# tensor inputs (the CSV is read back and compared numerically later).
test_data['pred_label'] = predictions.numpy()
test_data.to_csv(pred_save_path, sep=',', header=True, index=True)

# Calculate Acc

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Reload the saved predictions and score them against the gold labels.
df = pd.read_csv(pred_save_path, sep=',', header=0)

# Fix: translate only labels that are keys of `label_mapping` (string labels)
# and leave everything else untouched. The unconditional `.map(label_mapping)`
# turned labels that are already numeric into NaN; the training cell feeds this
# same column straight into `torch.tensor`, so numeric labels are expected.
df['Original_Label'] = df['Original_Label'].map(
    lambda label: label_mapping.get(label, label)
)
correct_predictions = (df['Original_Label'] == df['pred_label']).sum()
total_predictions = len(df)

# Macro averaging weights both classes equally.
accuracy = accuracy_score(df['Original_Label'], df['pred_label'])
precision = precision_score(df['Original_Label'], df['pred_label'], average='macro')
recall = recall_score(df['Original_Label'], df['pred_label'], average='macro')
f1 = f1_score(df['Original_Label'], df['pred_label'], average='macro')

# Printed as: accuracy / precision / recall / F1, in percent.
print(f'{accuracy * 100:.2f} / {precision * 100:.2f} / {recall * 100:.2f} / {f1 * 100:.2f}')