In [3]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.63.6-py3-none-any.whl (249 kB)
[K     |████████████████████████████████| 249 kB 4.8 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 49.9 MB/s 
[?25hCollecting transformers>=4.6.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 38.5 MB/s 
[?25hCollecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 12.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 40.6 MB/s 
Collecting streamlit
  Downloading streamlit-1.8.1-py2.py3-none-any.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 34.5 MB/s 
[?25hCollecting wandb>=

#Loading Dataset 

In [4]:
import csv
import pandas as pd
from sklearn import preprocessing
import torch

train, dev, test = [], [], []

from google.colab import drive
drive.mount('/content/drive')

with open('/content/drive/My Drive/HW4_NLP/data/pnli_train.csv', encoding='utf-8') as fp:
    csvreader = csv.reader(fp)
    for x in csvreader:
        # x[2] will be the label (0 or 1). x[0] and x[1] will be the sentence pairs.
        train.append(x)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

df_train = pd.DataFrame(train, columns = ['sentence1', 'sentence2', 'label'])
le = preprocessing.LabelEncoder()
df_train['label'] = le.fit_transform(df_train['label'])
df_train

with open('/content/drive/My Drive/HW4_NLP/data/pnli_dev.csv', encoding='utf-8') as fp:
    csvreader = csv.reader(fp)
    for x in csvreader:
        # x[2] will be the label (0 or 1). x[0] and x[1] will be the sentence pairs.
        dev.append(x)

with open('/content/drive/My Drive/HW4_NLP/data/pnli_test_unlabeled.csv', encoding='utf-8') as fp:
    csvreader = csv.reader(fp)
    for x in csvreader:
        test.append(x)

df_val = pd.DataFrame(dev, columns = ['sentence1', 'sentence2', 'label'])
df_test = pd.DataFrame(test, columns = ['sentence1', 'sentence2'])

df_val['label'] = le.transform(df_val['label'])

print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

df_train.head()

Mounted at /content/drive
cpu
(5983, 3)
(1055, 3)
(4850, 2)


Unnamed: 0,sentence1,sentence2,label
0,Sometimes do exercise.,A person typically desire healthy life.,1
1,Who eats junk foods.,A person typically desire healthy life.,0
2,A person is sick.,A person typically desire healthy life.,1
3,A person is dead.,A person typically desire healthy life.,0
4,A person eats properly and do exercise regularly.,A person typically desire healthy life.,1


###Converting dataset format to match Simpletransformers requirements

In [5]:
df_train = df_train.rename(columns={'sentence1': 'text_a', 'sentence2': 'text_b', 'label': 'labels'}).dropna()
df_val = df_val.rename(columns={'sentence1': 'text_a', 'sentence2': 'text_b', 'label': 'labels'}).dropna()

#Fine-tuning Roberta model with our data and changing the classification head

In [33]:
from simpletransformers.classification import (
    ClassificationModel, ClassificationArgs
)
import pandas as pd
import logging


logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Optional model configuration
model_args = ClassificationArgs(num_train_epochs=5, overwrite_output_dir=True, train_batch_size=32)

# Create a ClassificationModel
model = ClassificationModel("roberta", "roberta-base", num_labels=2, args=model_args, use_cuda=True)

# Train the model
model.train_model(df_train)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

  0%|          | 0/5983 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_roberta_128_2_3


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/187 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/187 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/187 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/187 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/187 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


(935, 0.2660618359312654)

In [34]:
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(df_val)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/1055 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_roberta_128_2_3


Running Evaluation:   0%|          | 0/132 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'mcc': 0.7663140657607381, 'tp': 491, 'tn': 441, 'fp': 60, 'fn': 63, 'auroc': 0.9406421813412886, 'auprc': 0.9363246535073607, 'eval_loss': 0.513909937305884}


###Best Validation error achieved (0.883)

In [35]:
import numpy as np
from sklearn.metrics import accuracy_score

lst = []
for arr in model_outputs:
    lst.append(np.argmax(arr))
true = df_val['labels'].tolist()
predicted = lst
accuracy_score(true,predicted)

0.8834123222748815

In [14]:
frames = [df_train, df_val]
final_train = result = pd.concat(frames, ignore_index=True)

In [23]:
print(final_train['text_a'][1060])
print(final_train['text_b'][1060])
print(final_train['labels'][1060])

The numbers are too big.
Adding up numbers are typically used for totalling sum.
0


In [None]:
predictions, raw_outputs = model.predict(
    [
        [
            "The numbers are too big.",
            "Adding up numbers are typically used for totalling sum.",
        ]
    ]
)

#Training with Train + Val set

In [6]:
from simpletransformers.classification import (
    ClassificationModel, ClassificationArgs
)
import pandas as pd
import logging

In [7]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Optional model configuration
model_args = ClassificationArgs(num_train_epochs=4, overwrite_output_dir=True, train_batch_size=32)

# Create a ClassificationModel
model = ClassificationModel("roberta", "roberta-base", num_labels=2, args=model_args, use_cuda=True)

# Train the model
model.train_model(final_train)


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

ValueError: ignored

In [None]:
predictions, raw_outputs = model.predict(df_test)

In [None]:
results = predictions
# suppose you had your model's predictions on the 2028 test cases read from test_enc_unlabeled.tsv, and 
#those results are in the list called 'results'
assert (len(results) == 4850)

results = [int(x) for x in results]

In [None]:
# write your prediction results to 'upload_predictions.txt' and upload that later
with open('upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
    for x in results:
        fp.write(str(x) + '\n')