### Read training, dev and unlabeled test data

The following starter code (Python 3) shows how to read the labeled training and dev sentence pairs, and the unlabeled test sentence pairs, into lists.

In [1]:
import csv

In [2]:
# Start each split as an empty list of rows; the loading cells below populate them.
train = []
dev = []
test = []

In [3]:
# # needed for Colab
# # uncomment and run; select "data.zip" that contains the three CSVs

# from google.colab import files
# uploaded = files.upload()

In [4]:
# # needed for Colab
# # uncomment and run

# from zipfile import ZipFile
# zipF = ZipFile('data.zip')
# zipF.extractall()

In [5]:
# Load the labeled training pairs.
# Each row is [sentence_1, sentence_2, label]; the label is read as the string '0' or '1'.
#
# newline='' is what the csv module asks for when it is handed a file object, so
# that quoted fields containing line breaks are parsed correctly.
# `train` is rebound (not appended to) so that re-running this cell does not
# duplicate rows.
with open('./data/pnli_train.csv', encoding='utf-8', newline='') as fp:
    train = list(csv.reader(fp))
print (len(train))
print (train[:3])

5983
[['Sometimes do exercise.', 'A person typically desire healthy life.', '1'], ['Who eats junk foods.', 'A person typically desire healthy life.', '0'], ['A person is sick.', 'A person typically desire healthy life.', '1']]


In [6]:
# Load the labeled dev pairs.
# Each row is [sentence_1, sentence_2, label]; the label is read as the string '0' or '1'.
#
# newline='' is what the csv module asks for when it is handed a file object, so
# that quoted fields containing line breaks are parsed correctly.
# `dev` is rebound (not appended to) so that re-running this cell does not
# duplicate rows.
with open('./data/pnli_dev.csv', encoding='utf-8', newline='') as fp:
    dev = list(csv.reader(fp))
print (len(dev))
print (dev[:3])

1055
[['A person is looking for accuracy.', 'A person typically desires accurate results.', '1'], ['A person does not care for accuracy.', 'A person typically desires accurate results.', '0'], ['The person double checks their data.', 'A person typically desires accurate results.', '1']]


In [7]:
# Load the unlabeled test pairs.
# Each row is [sentence_1, sentence_2]; there is no label column.
#
# newline='' is what the csv module asks for when it is handed a file object, so
# that quoted fields containing line breaks are parsed correctly.
# `test` is rebound (not appended to) so that re-running this cell does not
# duplicate rows.
with open('./data/pnli_test_unlabeled.csv', encoding='utf-8', newline='') as fp:
    test = list(csv.reader(fp))
print (len(test))
print (test[:3])

4850
[['The people want to have a romantic and pleasant feel.', 'People typically does desire to smell violets.'], ['The contract is to buy products from you.', 'Getting contract typically cause to make money or spend money.'], ['Train station is closed.', 'Line can typically be used to move train along tracks.']]


### Main Code Body

You may choose to experiment with different methods using your program. However, you need to embed the training and inference processes here. We will use your predictions on the unlabeled test data for grading, and we will check this part to understand how your method produced the predictions.

In [None]:
# I created/tuned this program (model) using Google Colab -- for speed
# Although I am not sure why there appears to be variation between the Colab and Jupyter predictions
# ???


In [8]:
# # needed for Colab
# # uncomment and run

# !pip install simpletransformers

In [9]:
import pandas as pd
import random

from simpletransformers.classification import ClassificationModel 
from simpletransformers.classification import ClassificationArgs

from warnings import filterwarnings
filterwarnings('ignore')

# import numpy as np
# from statistics import mean
# from random import shuffle


In [10]:
# The CSV reader yields the 0/1 labels as strings, which can cause problems for
# some models, so every labeled row is rebuilt with an integer label.

def _with_int_labels(rows):
    """Return new [sentence_1, sentence_2, label] rows with the label cast to int."""
    return [[row[0], row[1], int(row[2])] for row in rows]

train = _with_int_labels(train)
dev = _with_int_labels(dev)



In [11]:
# X holds the sentence pairs, Y the binary precondition classes.

def _pairs_and_labels(rows):
    """Return (pairs, labels): every row without its last item, and every row's last item."""
    pairs = [row[:-1] for row in rows]
    labels = [row[-1] for row in rows]
    return pairs, labels

trainL_X, trainL_Y = _pairs_and_labels(train)
devL_X, devL_Y = _pairs_and_labels(dev)

# Test rows carry no label, so each row already is a sentence pair.
testL_X = list(test)


In [12]:
# The final model is trained on the train set and the dev set together,
# then used to predict the test set.
# Unpacking builds a new list, so `train` and `dev` themselves are left untouched.
train_dev = [*train, *dev]


In [13]:
# # accuracy when guessing

# guess_errors = 0

# for label in devL_Y:
#     # our labels are 0 and 1
#     random_guess = random.randint(0, 1)
#     if label != random_guess:
#         guess_errors += 1

# accuracy = 1-guess_errors/len(devL_Y)
# print(accuracy)


In [16]:
# Classifier: RoBERTa-Large, starting from a checkpoint whose name lists
# SNLI, MNLI, FEVER and ANLI (R1-R3):
# https://huggingface.co/ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli
#
# simpletransformers references:
# https://simpletransformers.ai/docs/classification-models/#binary-classification
# https://simpletransformers.ai/docs/usage/#configuring-a-simple-transformers-model
# https://simpletransformers.ai/docs/classification-data-formats/
# https://simpletransformers.ai/docs/sentence-pair-classification/
# https://simpletransformers.ai/docs/classification-specifics/#supported-model-types
#
# Tuning notes:
# - more epochs could be tried, though 2 is already giving good results
# - batch size could be adjusted; a larger one may fail by hitting the GPU
#   capacity limit, or otherwise just be very slow

import torch

# Current best configuration.
model_args = ClassificationArgs(
    num_train_epochs=2,
    train_batch_size=32,
    overwrite_output_dir=True,
    reprocess_input_data=True,
    manual_seed=3,
    learning_rate=2.5e-5,
)

# Pick the device automatically instead of hard-coding use_cuda=False:
# the GPU is used whenever one is available (e.g. on Colab) and the CPU otherwise,
# so the flag no longer has to be edited by hand when switching environments.
model = ClassificationModel(
    "roberta",
    "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli",
    args=model_args,
    use_cuda=torch.cuda.is_available(),
)



Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [17]:
# # training on the training set


# # the simpletransformers library appears to use pandas dataframes as the inputs
# # When performing sentence-pair tasks (e.g. sentence similarity), both the training and evaluation dataframes must contain a header row. 
# # The dataframes must also have at least 3 columns, text_a, text_b, and labels.
# # precondition inference is a sentence-pair classification task

# train_df = pd.DataFrame(train, columns=["text_a", "text_b", "labels"])

# # model training
# model.train_model(train_df)

# # very slow; is there a way to make it faster



In [18]:
# # getting dev set predictions


# # The predict() method is used to make predictions with the model. 
# # Parameters: to_predict - A python list of text (str) to be sent to the model for prediction.
# # Returns: preds (list) - A python list of the predictions (0 or 1) for each text. 
# #          model_outputs (list) - A python list of the raw model outputs for each text.

# # Since the predict() input is a list, we will be using devL_X and devL_Y for parameter tuning 

# predsL, model_outputsL = model.predict( devL_X )



In [19]:
# print(list(predsL))

In [20]:
# # evaluation of dev set predictions -- using train set for training

# total = len(devL_Y)
# errors = 0

# for label, pred in zip(devL_Y, predsL):
#     if label != pred:
#         errors += 1

# accuracy = 1-(errors/total)
# print(accuracy)

# predictionsL = predsL.copy()


In [21]:
# various recorded dev set evaluation results


# standard BERT

# approximately
# 0.81 with 1 epoch
# with 2 epochs: 0.856, 0.85
# 0.854 with 5 epochs
# 0.8597156398104265 11


# got .899 with roberta large; no tuning
# could be lower when rerun; how to run multiple times without getting the cache error
# other times .895, 0.8957, 0.8995, .90, .887, .9071


# with 3 epochs: .895

# with adam_epsilon=0.00000005: 0.893
# with adam_epsilon=0.000000005: 0.896
# with adam_epsilon=0.000000001: 0.877
# weight_decay=0.00001: 0.893

# roberta-large-mnli
# 0.9118483412322275

# ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli
  # learning_rate=4e-5
# 0.9137440758293839, 0.9118483412322275, 0.9165876777251185 (seed 3)
      # learning_rate=3e-5 : 0.91848
          # 0.90995 with 7 epochs
      # learning_rate=2.7e-5 : 0.9137
      # learning_rate=2.6e-5 : 0.91
      # learning_rate=2.5e-5 : 0.92227
      # learning_rate=2.4e-5 : 0.9147
      # learning_rate=2.3e-5 : 0.9175
      # learning_rate=2.1e-5 : 0.9156
      # learning_rate=2e-5 : 0.9213 ???
        # Which is more consistent on random runs
  # 0.90995 with 6 epochs
  # 0.90805, 0.9109, 0.9033 with 3 epochs

# use seed 3 or
# find a better one
# or find 2 other good ones and maj vote


In [22]:
# training on the train+dev set combinations

# simpletransformers takes a DataFrame as training input; the column names used
# here for the sentence pair and its label are text_a, text_b and labels.
SENTENCE_PAIR_COLUMNS = ["text_a", "text_b", "labels"]
train_dev_df = pd.DataFrame(data=train_dev, columns=SENTENCE_PAIR_COLUMNS)

# Final fit on train + dev; left as the last expression so its return value is displayed.
model.train_model(train_dev_df)


  0%|          | 0/7038 [00:00<?, ?it/s]



Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/220 [00:00<?, ?it/s]



Running Epoch 1 of 2:   0%|          | 0/220 [00:00<?, ?it/s]

(440, 0.2945946195162833)

In [23]:
# Predict the unlabeled test pairs; predict() hands back two values, unpacked
# here as the predictions and the model outputs.
predsL, model_outputsL = model.predict(testL_X)

  0%|          | 0/4850 [00:00<?, ?it/s]

  0%|          | 0/607 [00:00<?, ?it/s]

In [24]:
# The submission needs exactly 4850 entries, each of them 0 or 1.
# Copy the predictions into a plain Python list.
results = [pred for pred in predsL]

In [27]:
# Display how many predictions were produced (4850 are required for the submission).
len(results)

4850

### Output Prediction Result File

You will need to submit a prediction result file. It should have 4850 lines (one per test instance); every line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [33]:
# `results` holds the model's predictions for the 4850 unlabeled test pairs read
# from pnli_test_unlabeled.csv; the submission must contain exactly one per pair.
# The message reports the actual count so a mismatch can be diagnosed directly.
assert len(results) == 4850, "expected 4850 predictions, got %d" % len(results)

In [34]:
# Make sure every prediction is a plain integer (0 or 1), not a float.
results = list(map(int, results))

In [35]:
# Write one prediction per line to 'upload_predictions.txt'; this is the file to upload.
with open('upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
    fp.writelines(str(pred) + '\n' for pred in results)