In [1]:
import csv

import torch
from torch.utils.data import Dataset, DataLoader

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

### Read training, dev and unlabeled test data

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
DATA_ADDRESS = '/content/drive/MyDrive/UniversityOfSouthernCalifornia/Term2-Spring2022/CSCI544-AppliedNLP/CodingAssignments/4_PreconditionInference/HW4_upload/data/'
TRAIN_DATA_ADDRESS = '/content/drive/MyDrive/UniversityOfSouthernCalifornia/Term2-Spring2022/CSCI544-AppliedNLP/CodingAssignments/4_PreconditionInference/HW4_upload/data/pnli_train.csv'
DEV_DATA_ADDRESS = '/content/drive/MyDrive/UniversityOfSouthernCalifornia/Term2-Spring2022/CSCI544-AppliedNLP/CodingAssignments/4_PreconditionInference/HW4_upload/data/pnli_dev.csv'
TEST_DATA_ADDRESS = '/content/drive/MyDrive/UniversityOfSouthernCalifornia/Term2-Spring2022/CSCI544-AppliedNLP/CodingAssignments/4_PreconditionInference/HW4_upload/data/pnli_test_unlabeled.csv'
PREDICTION_ADDRESS = '/content/drive/MyDrive/UniversityOfSouthernCalifornia/Term2-Spring2022/CSCI544-AppliedNLP/CodingAssignments/4_PreconditionInference/HW4_upload/'

The following provides a starting code (Python 3) of how to read the labeled training and dev sentence pairs, and unlabeled test sentence pairs, into lists.

In [5]:
train, dev, test = [], [], []

In [6]:
with open(f'{DATA_ADDRESS}pnli_train.csv', encoding='utf-8') as fp:
  csvreader = csv.reader(fp)
  for x in csvreader:
    # x[2] will be the label (0 or 1). x[0] and x[1] will be the sentence pairs.
    train.append(x)
print(len(train))
print(train[:3])

5983
[['Sometimes do exercise.', 'A person typically desire healthy life.', '1'], ['Who eats junk foods.', 'A person typically desire healthy life.', '0'], ['A person is sick.', 'A person typically desire healthy life.', '1']]


In [7]:
with open(f'{DATA_ADDRESS}pnli_dev.csv', encoding='utf-8') as fp:
  csvreader = csv.reader(fp)
  for x in csvreader:
    # x[2] will be the label (0 or 1). x[0] and x[1] will be the sentence pairs.
    dev.append(x)
print(len(dev))
print(dev[:3])

1055
[['A person is looking for accuracy.', 'A person typically desires accurate results.', '1'], ['A person does not care for accuracy.', 'A person typically desires accurate results.', '0'], ['The person double checks their data.', 'A person typically desires accurate results.', '1']]


In [8]:
with open(f'{DATA_ADDRESS}pnli_test_unlabeled.csv', encoding='utf-8') as fp:
  csvreader = csv.reader(fp)
  for x in csvreader:
    # x[0] and x[1] will be the sentence pairs.
    test.append(x)
print(len(test))
print(test[:3])

4850
[['The people want to have a romantic and pleasant feel.', 'People typically does desire to smell violets.'], ['The contract is to buy products from you.', 'Getting contract typically cause to make money or spend money.'], ['Train station is closed.', 'Line can typically be used to move train along tracks.']]


#### Creating PyTorch dataset and dataloader

In [9]:
!pip install transformers



In [10]:
from transformers import RobertaTokenizer

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset

In [11]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [12]:
BATCH_SIZE = 8

In [13]:
class PITrainDevDataset(Dataset):
  def __init__(self, data_dir, verbose=False):
    self.preconditions = []
    self.statements = []
    self.labels = []

    with open(data_dir, encoding='utf-8') as fp:
      csvreader = csv.reader(fp)
      for x in csvreader:
        self.preconditions.append(x[0])
        self.statements.append(x[1])
        self.labels.append(int(x[2]))
  
  def __len__(self):
    return len(self.preconditions)

  def __getitem__(self, index):
    return (self.preconditions[index], self.statements[index], self.labels[index])

In [14]:
class PITestDataset(Dataset):
  def __init__(self, data_dir, verbose=False):
    self.preconditions = []
    self.statements = []

    with open(data_dir, encoding='utf-8') as fp:
      csvreader = csv.reader(fp)
      for x in csvreader:
        self.preconditions.append(x[0])
        self.statements.append(x[1])
  
  def __len__(self):
    return len(self.preconditions)

  def __getitem__(self, index):
    return (self.preconditions[index], self.statements[index])

In [15]:
def collate_fn(batch):
  token_ids_list = []
  mask_ids_list = []
  segment_ids_list = []
  labels_list = []

  verbose = False

  for index in range(len(batch)):
    tokenized_precondition = tokenizer.encode(batch[index][0], add_special_tokens=False)
    tokenized_statement = tokenizer.encode(batch[index][1], add_special_tokens=False)
    tokenized_concatenated = [*[tokenizer.cls_token_id], *tokenized_precondition, *[tokenizer.sep_token_id], *tokenized_statement, *[tokenizer.sep_token_id]]
    mask_ids = [1] * len(tokenized_concatenated)
    segment_ids = [*([0] * (len(tokenized_precondition) + 2)), *([1] * (len(tokenized_statement) + 1))]

    assert len(tokenized_concatenated) == len(tokenized_precondition) + len(tokenized_statement) + 3
    assert len(mask_ids) == len(segment_ids) and len(mask_ids) == len(tokenized_concatenated)

    if verbose == True:
      print(f'Precondition: {batch[index][0]}')
      print(f'Tokenized precondition: {tokenized_precondition}')
      print(f'Statement: {batch[index][1]}')
      print(f'Tokenized statement: {tokenized_statement}')
      print(f'Tokenized concatenated: {tokenized_concatenated}')
      print(f'Mask ids: {mask_ids}')
      print(f'Segment ids: {segment_ids}')

    tokenized_concatenated = torch.tensor(tokenized_concatenated)
    mask_ids = torch.tensor(mask_ids)
    segment_ids = torch.tensor(segment_ids)

    token_ids_list.append(tokenized_concatenated)
    mask_ids_list.append(mask_ids)
    segment_ids_list.append(segment_ids)

    labels_list.append(batch[index][2])

  token_ids_list = pad_sequence(token_ids_list, batch_first=True, padding_value=tokenizer.pad_token_id)
  mask_ids_list = pad_sequence(mask_ids_list, batch_first=True, padding_value=tokenizer.pad_token_id)
  segment_ids_list = pad_sequence(segment_ids_list, batch_first=True, padding_value=tokenizer.pad_token_id)
  labels_list = torch.tensor(labels_list)

  return token_ids_list, mask_ids_list, segment_ids_list, labels_list

In [16]:
def collate_fn_test(batch):
  token_ids_list = []
  mask_ids_list = []
  segment_ids_list = []

  verbose = False

  for index in range(len(batch)):
    tokenized_precondition = tokenizer.encode(batch[index][0], add_special_tokens=False)
    tokenized_statement = tokenizer.encode(batch[index][1], add_special_tokens=False)
    tokenized_concatenated = [*[tokenizer.cls_token_id], *tokenized_precondition, *[tokenizer.sep_token_id], *tokenized_statement, *[tokenizer.sep_token_id]]
    mask_ids = [1] * len(tokenized_concatenated)
    segment_ids = [*([0] * (len(tokenized_precondition) + 2)), *([1] * (len(tokenized_statement) + 1))]

    assert len(tokenized_concatenated) == len(tokenized_precondition) + len(tokenized_statement) + 3
    assert len(mask_ids) == len(segment_ids) and len(mask_ids) == len(tokenized_concatenated)

    if verbose == True:
      print(f'Precondition: {batch[index][0]}')
      print(f'Tokenized precondition: {tokenized_precondition}')
      print(f'Statement: {batch[index][1]}')
      print(f'Tokenized statement: {tokenized_statement}')
      print(f'Tokenized concatenated: {tokenized_concatenated}')
      print(f'Mask ids: {mask_ids}')
      print(f'Segment ids: {segment_ids}')

    tokenized_concatenated = torch.tensor(tokenized_concatenated)
    mask_ids = torch.tensor(mask_ids)
    segment_ids = torch.tensor(segment_ids)

    token_ids_list.append(tokenized_concatenated)
    mask_ids_list.append(mask_ids)
    segment_ids_list.append(segment_ids)

  token_ids_list = pad_sequence(token_ids_list, batch_first=True, padding_value=tokenizer.pad_token_id)
  mask_ids_list = pad_sequence(mask_ids_list, batch_first=True, padding_value=tokenizer.pad_token_id)
  segment_ids_list = pad_sequence(segment_ids_list, batch_first=True, padding_value=tokenizer.pad_token_id)

  return token_ids_list, mask_ids_list, segment_ids_list

In [17]:
train_dataset = PITrainDevDataset(TRAIN_DATA_ADDRESS)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

dev_dataset = PITrainDevDataset(DEV_DATA_ADDRESS)
dev_dataloader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

test_dataset = PITestDataset(TEST_DATA_ADDRESS)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn_test)

In [18]:
train_token_ids_list, train_mask_ids_list, train_segment_ids_list, train_labels_list = next(iter(train_dataloader))
print(f"Token ids batch shape: {train_token_ids_list.size()}")
print(f"Mask ids batch shape: {train_mask_ids_list.size()}")
print(f"segment ids batch shape: {train_segment_ids_list.size()}")
print(f"Labels batch shape: {train_labels_list.size()}")
label = train_labels_list[0]
print(f"Label: {label}")

Token ids batch shape: torch.Size([8, 22])
Mask ids batch shape: torch.Size([8, 22])
segment ids batch shape: torch.Size([8, 22])
Labels batch shape: torch.Size([8])
Label: 1


In [19]:
index = 1
print(train_token_ids_list[index])
print(train_mask_ids_list[index])
print(train_segment_ids_list[index])
print(train[index])

tensor([    0, 12375, 24923, 15163,  6592,     4,     2,   250,   621,  3700,
         4724,  2245,   301,     4,     2,     1,     1,     1,     1,     1,
            1,     1])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
tensor([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
['Who eats junk foods.', 'A person typically desire healthy life.', '0']


In [20]:
dev_token_ids_list, dev_mask_ids_list, dev_segment_ids_list, dev_labels_list = next(iter(dev_dataloader))
print(f"Token ids batch shape: {dev_token_ids_list.size()}")
print(f"Mask ids batch shape: {dev_mask_ids_list.size()}")
print(f"segment ids batch shape: {dev_segment_ids_list.size()}")
print(f"Labels batch shape: {dev_labels_list.size()}")
label = dev_labels_list[0]
print(f"Label: {label}")

Token ids batch shape: torch.Size([8, 19])
Mask ids batch shape: torch.Size([8, 19])
segment ids batch shape: torch.Size([8, 19])
Labels batch shape: torch.Size([8])
Label: 1


### Main Code Body

You may choose to experiment with different methods using your program. However, you need to embed the training and inference processes at here. We will use your prediction on the unlabeled test data to grade, while checking this part to understand how your method has produced the predictions.

#### Load model

In [21]:
from transformers import AutoModelForSequenceClassification, get_scheduler
from torch.optim import AdamW

In [22]:
from tqdm.notebook import tqdm

In [23]:
LEARNING_RATE = 1e-5
EPOCHS = 20
NUM_TRAINING_STEPS = EPOCHS * len(train_dataloader)

In [24]:
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model.to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [25]:
model.roberta.config.type_vocab_size = 2
single_emb = model.roberta.embeddings.token_type_embeddings
model.roberta.embeddings.token_type_embeddings = torch.nn.Embedding(2, single_emb.embedding_dim)
model.roberta.embeddings.token_type_embeddings.weight = torch.nn.Parameter(single_emb.weight.repeat([2, 1]))

In [26]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

In [27]:
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=NUM_TRAINING_STEPS
)

#### Train

In [28]:
progress_bar = tqdm(range(NUM_TRAINING_STEPS))

for epoch in tqdm(range(EPOCHS)):
  model.train()
  total_train_loss = 0
  total_train_acc  = 0
  for batch_idx, (train_token_ids_list, train_mask_ids_list, train_segment_ids_list, train_labels_list) in enumerate(train_dataloader):
    optimizer.zero_grad()
    train_token_ids_list = train_token_ids_list.to(device)
    train_mask_ids_list = train_mask_ids_list.to(device)
    train_segment_ids_list = train_segment_ids_list.to(device)
    train_labels_list = train_labels_list.to(device)
    # print(train_token_ids_list)
    # print(train_mask_ids_list)
    # print(train_segment_ids_list)
    # print(train_labels_list)

    prediction = model(input_ids=train_token_ids_list,
                       token_type_ids=train_segment_ids_list,
                       attention_mask=train_mask_ids_list,
                       labels=train_labels_list)
    
    # print(f'Prediction: {prediction}')
    
    loss = prediction.loss

    loss.backward()
    optimizer.step()
    lr_scheduler.step()
    progress_bar.update(1)

    # print(torch.log_softmax(prediction.logits, dim=1))
    # print(torch.log_softmax(prediction.logits, dim=1).argmax(dim=1))
    # print(f'train_labels_list: {train_labels_list}')
    # print((torch.log_softmax(prediction.logits, dim=1).argmax(dim=1) == train_labels_list).sum().float() / float(BATCH_SIZE))
    
    total_train_loss += loss.item()
    total_train_acc  += ((torch.log_softmax(prediction.logits, dim=1).argmax(dim=1) == train_labels_list).sum().float() / float(BATCH_SIZE)).item()

  train_acc  = total_train_acc/len(train_dataloader)
  train_loss = total_train_loss/len(train_dataloader)

  model.eval()
  total_val_acc  = 0
  total_val_loss = 0
  with torch.no_grad():
    for batch_idx, (dev_token_ids_list, dev_mask_ids_list, dev_segment_ids_list, dev_labels_list) in enumerate(dev_dataloader):
      dev_token_ids_list = dev_token_ids_list.to(device)
      dev_mask_ids_list = dev_mask_ids_list.to(device)
      dev_segment_ids_list = dev_segment_ids_list.to(device)
      dev_labels_list = dev_labels_list.to(device)

      prediction = model(dev_token_ids_list, token_type_ids=dev_segment_ids_list,
                               attention_mask=dev_mask_ids_list,
                               labels=dev_labels_list)
      
      loss = prediction.loss

      total_val_loss += loss.item()
      total_val_acc  += ((torch.log_softmax(prediction.logits, dim=1).argmax(dim=1) == dev_labels_list).sum().float() / float(BATCH_SIZE)).item()

  val_acc  = total_val_acc/len(dev_dataloader)
  val_loss = total_val_loss/len(dev_dataloader)

  print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')

  0%|          | 0/14960 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch 1: train_loss: 0.4762 train_acc: 0.7569 | val_loss: 0.3520 val_acc: 0.8419
Epoch 2: train_loss: 0.3102 train_acc: 0.8720 | val_loss: 0.3282 val_acc: 0.8712
Epoch 3: train_loss: 0.2286 train_acc: 0.9116 | val_loss: 0.3592 val_acc: 0.8741
Epoch 4: train_loss: 0.1777 train_acc: 0.9327 | val_loss: 0.3981 val_acc: 0.8807
Epoch 5: train_loss: 0.1321 train_acc: 0.9512 | val_loss: 0.4451 val_acc: 0.8646
Epoch 6: train_loss: 0.1078 train_acc: 0.9631 | val_loss: 0.4325 val_acc: 0.8845
Epoch 7: train_loss: 0.0797 train_acc: 0.9748 | val_loss: 0.4548 val_acc: 0.8826
Epoch 8: train_loss: 0.0644 train_acc: 0.9803 | val_loss: 0.4266 val_acc: 0.8750
Epoch 9: train_loss: 0.0574 train_acc: 0.9820 | val_loss: 0.4296 val_acc: 0.8864
Epoch 10: train_loss: 0.0439 train_acc: 0.9836 | val_loss: 0.5221 val_acc: 0.8911
Epoch 11: train_loss: 0.0357 train_acc: 0.9898 | val_loss: 0.5081 val_acc: 0.8845
Epoch 12: train_loss: 0.0314 train_acc: 0.9906 | val_loss: 0.5226 val_acc: 0.8873
Epoch 13: train_loss: 0.0

In [29]:
# Eventually, results need to be a list of 2028 0 or 1's
results = []

model.eval()
with torch.no_grad():
  for batch_idx, (test_token_ids_list, test_mask_ids_list, test_segment_ids_list) in enumerate(test_dataloader):
    test_token_ids_list = test_token_ids_list.to(device)
    test_mask_ids_list = test_mask_ids_list.to(device)
    test_segment_ids_list = test_segment_ids_list.to(device)

    prediction = model(test_token_ids_list,
                       token_type_ids=test_segment_ids_list,
                       attention_mask=test_mask_ids_list)

    logits = prediction.logits
    predictions = torch.argmax(logits, dim=-1)
    results = results + predictions.tolist()
    
    # print(f'Prediction: {prediction}')
    # print(f'Final Predictions: {predictions}')
    # print(f'Results list: {results}')

### Output Prediction Result File

You will need to submit a prediction result file. It should have 2028 lines, every line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [30]:
# suppose you had your model's predictions on the 2028 test cases read from test_enc_unlabeled.tsv, and 
#those results are in the list called 'results'
assert (len(results) == 4850)

In [31]:
# make sure the results are not float numbers, but intergers 0 and 1
results = [int(x) for x in results]

In [34]:
# write your prediction results to 'upload_predictions.txt' and upload that later
with open(f'{PREDICTION_ADDRESS}upload_predictions_2_89_02.txt', 'w', encoding = 'utf-8') as fp:
  for x in results:
    fp.write(str(x) + '\n')