In [1]:
import itertools

from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Model, AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the 'cbt' dataset with the 'CN' configuration through Hugging Face `datasets`.
# NOTE(review): 'CN' is presumably the common-noun subset of the Children's Book Test;
# confirm against the dataset card.
cbt_dataset = load_dataset('cbt','CN')

In [None]:
# Pull the three predefined splits out of the loaded dataset by name.
train_set = cbt_dataset['train']
val_set = cbt_dataset['validation']
test_set = cbt_dataset['test']

In [None]:
# Display the training split's summary repr as the cell output.
train_set

In [None]:
# Load the pretrained 'gpt2' backbone (no LM head) and its matching tokenizer.
# `model` is re-created further down the notebook when it is moved to the compute device.
model = GPT2Model.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# Inspect one training example: unpack its fields, build a single text out of
# context + question + all candidate options, and show the tokenizer output.
example = train_set[1]
context, question, answer, options = example['sentences'], example['question'], example['answer'], example['options']
# NOTE(review): `context` looks like a list of sentences (it is passed to ''.join(...)
# in the preprocessing cell); if so, this f-string embeds the list's repr, brackets and
# quotes included. Confirm whether ' '.join(context) was intended.
input_text = f"{context} {question} {' '.join(options)}"
tokenizer(input_text, add_special_tokens=True)

In [None]:
# The tokenizer returns a dict-like encoding. Tuple-unpacking it directly iterates over
# its *keys*, which would bind the strings 'input_ids' and 'attention_mask' instead of
# the token ids and the mask. Index the fields explicitly.
encoding = tokenizer(input_text, add_special_tokens=True)
input_ids, attention_mask = encoding['input_ids'], encoding['attention_mask']

In [None]:
# Display whatever the previous cell bound to `input_ids`.
input_ids

Task: write a preprocessing function that takes one example (context, question, answer, options) and returns it in a format that can be fed to a DataLoader.
Plan: build `input_ids` and `labels`, then pad `input_ids` and `labels` to a common length; that should be sufficient.

In [None]:
train_set_lst = []
label_lst = []
attn_lst = []
token_type_id_lst = []
padding = 50256          # GPT-2 has no dedicated pad token; the end-of-text id is reused
max_length = 1024        # longest sequence the fixed-size tensors will hold
SUBSAMPLE_DIVISOR = 500  # only the first len(train_set) // 500 examples are encoded


def preprocess_example(example, tokenizer, pad_id, max_len):
    """Expand one example into one padded binary-classification row per option.

    Each candidate in ``example['options']`` is substituted for the ``XXXXX``
    placeholder in the question and appended to the context.

    Args:
        example: mapping with the keys 'sentences', 'question', 'answer', 'options'.
        tokenizer: callable returning a mapping that contains 'input_ids'.
        pad_id: id used to right-pad ``input_ids`` and the segment ids.
        max_len: fixed output length; longer candidates are skipped.

    Returns:
        A list of ``(input_ids, attention_mask, token_type_ids, label)`` tuples.
        The three sequences all have length ``max_len`` and ``label`` is 1 for the
        correct option, otherwise 0.
    """
    sentences = example['sentences']
    # Join sentences with a space so words do not fuse at sentence boundaries;
    # a plain string context is used as-is.
    context_text = sentences if isinstance(sentences, str) else ' '.join(sentences)
    # The context is identical for every option, so tokenize it once.
    context_ids = tokenizer(context_text, add_special_tokens=True)['input_ids']

    rows = []
    for candidate in example['options']:
        filled_question = example['question'].replace('XXXXX', candidate)
        # Tokenize the question separately so the segment boundary is measured in
        # tokens; the previous code used the question's character count.
        question_ids = tokenizer(' ' + filled_question, add_special_tokens=True)['input_ids']

        token_ids = context_ids + question_ids
        if len(token_ids) > max_len:
            # Too long for the fixed-size tensors: skipped, as before.
            continue

        n_pad = max_len - len(token_ids)
        # 1 marks a real token and 0 marks padding (the mask used to be padded with pad_id).
        mask = [1] * len(token_ids) + [0] * n_pad
        # Segment 0 = context, segment 1 = question; padded positions keep pad_id.
        segments = [0] * len(context_ids) + [1] * len(question_ids) + [pad_id] * n_pad
        target = 1 if candidate == example['answer'] else 0
        rows.append((token_ids + [pad_id] * n_pad, mask, segments, target))
    return rows


for example_idx in tqdm(range(len(train_set) // SUBSAMPLE_DIVISOR)):
    for row_ids, row_mask, row_segments, row_label in preprocess_example(
            train_set[example_idx], tokenizer, padding, max_length):
        train_set_lst.append(row_ids)
        attn_lst.append(row_mask)
        token_type_id_lst.append(row_segments)
        label_lst.append(row_label)


In [None]:
# Convert the four parallel Python lists into tensors, in the order
# (input ids, attention mask, token type ids, label), and wrap them for batching.
columns = (train_set_lst, attn_lst, token_type_id_lst, label_lst)
dataset = torch.utils.data.TensorDataset(*[torch.tensor(column) for column in columns])
# One sequence per batch, reshuffled every epoch.
train_dataloader = DataLoader(dataset, shuffle=True, batch_size=1)
print('finish')

In [None]:
import torch
from torch import nn

class GPT2ForMultipleChoice(nn.Module):
    def __init__(self, gpt2_model):
        super().__init__()
        self.gpt2 = gpt2_model
        self.classifier = nn.Linear(self.gpt2.config.n_embd, 1)

    def forward(self, input_ids, token_type_ids):
        #input_shape = input_ids.size()
        #input_ids = input_ids.view(-1, input_shape[-1])
        #attention_mask = attention_mask.view(-1, input_shape[-1]) if attention_mask is not None else None
        #token_type_ids = token_type_ids.view(-1, input_shape[-1]) if token_type_ids is not None else None

        outputs = self.gpt2(input_ids=input_ids,  token_type_ids=token_type_ids)
        outputs = outputs.last_hidden_state
        pooled_output = outputs[0]
        logits = self.classifier(pooled_output)
    
        return logits

In [None]:
model = GPT2Model.from_pretrained('gpt2').to(device)
input_ids = torch.tensor([train_set_lst[0]]).view(-1,1).to(device)
print(input_ids.shape)
attention_mask = torch.tensor([attn_lst[0]]).to(device)
token_type_id = torch.tensor([token_type_id_lst[0]]).view(-1,1).to(device)
label = torch.tensor([label_lst[0]]).to(device)
gpt_output = model(input_ids = input_ids,  token_type_ids = token_type_id)
gpt_output.last_hidden_state

In [None]:

# Define model and optimizer
#model = GPT2LMHeadModel.from_pretrained('gpt2')
optimizer = AdamW(model.parameters(), lr=2e-5)

# Define training loop
def train(model, train_dataloader, optimizer, scheduler=None, num_epochs=1):
    model.train()
    for epoch in tqdm(range(num_epochs)):
        total_loss = 0
        for batch in train_dataloader:

            input_ids, attention_mask,token_type_ids, labels = batch[0].to(device),batch[1].to(device), batch[2].to(device),batch[3].to(device)
            optimizer.zero_grad()
            logits = model(input_ids = input_ids.view(-1,1), token_type_ids = token_type_ids.view(-1,1))
            criterion = torch.nn.BCEWithLogitsLoss().to(device)
            loss = criterion(logits,labels.view(-1,1).float())
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1} Loss: {total_loss/len(train_dataloader)}")

# Load dataset and create dataloader

model2 = GPT2ForMultipleChoice(model)
# Fine-tune model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model2.to(device)
num_training_steps = len(train_dataloader) * 3
num_warmup_steps = int(num_training_steps * 0.1)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)
train(model2, train_dataloader, optimizer, scheduler=scheduler, num_epochs=100)

In [None]:
# Sanity check of the loss on a single hand-written example:
# one raw score of shape (1, 1) against a positive float target of the same shape.
criterion = torch.nn.BCEWithLogitsLoss()
logit = torch.tensor([[0.5585]])
label = torch.tensor([[1.0]])
criterion(logit, label)


In [None]:
# Show the shape of the hand-written logit tensor from the previous cell.
logit.shape

In [17]:
import torch

# Choose the compute device once and report the choice.
gpu_available = torch.cuda.is_available()
device = torch.device('cuda' if gpu_available else 'cpu')
if gpu_available:
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("GPU not available, using CPU instead.")


Using GPU: Quadro RTX 8000


In [6]:
# Scratch cell: trivial expression, useful only to confirm the kernel responds.
1+1

2

Each question is expanded into 10 samples (one per candidate option), and each sample is treated as a binary classification problem.