In [None]:
# Mount Google Drive so that artifacts written under /content/drive persist
# beyond the lifetime of this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers[sentencepiece] datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[sentencepiece]
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 30.2 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 67.0 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 76.2 MB/s 
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 82.5 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 63.5 MB/s 
Collecting urllib3!=1.25.0,!=1.25

In [None]:
# Name of the pretrained checkpoint loaded by the tokenizer and the model below.
model_checkpoint = "roberta-large"
# Per-device batch size used for both training and evaluation.
batch_size = 16

In [None]:
from datasets import Dataset, load_metric
import xml.etree.ElementTree as ET

In [None]:
def load_data(path2_x='/content/subtask2-homographic-test.xml',
              path2_y='/content/subtask2-homographic-test.gold'):
  """Load pun-location data and build token-level training examples.

  Args:
    path2_x: Path to the XML file. Each child of the root element must carry
      an ``id`` attribute and contain one sub-element per token.
    path2_y: Path to the tab-separated gold file. Column 0 is the instance id
      and column 1 is an underscore-separated identifier whose third field is
      the 1-based position of the pun word.

  Returns:
    A tuple ``(all_data, human_data)`` where ``all_data`` is a list of
    ``{"tokens": [...], "labels": [...]}`` dicts (label 1 marks the pun word,
    0 everything else) and ``human_data`` is a list of
    ``{"sentence": str, "pun_word": str}`` dicts for inspection.

  Raises:
    KeyError: If an instance in the XML file has no entry in the gold file.
    IndexError: If a gold position falls outside the instance's token list.
  """
  pun_instances = {}
  locations = {}
  human_data = []
  all_data = []

  root = ET.parse(path2_x).getroot()
  for child in root:
    pun_instances[child.attrib["id"]] = [kid.text for kid in child]

  with open(path2_y, encoding="utf-8") as gold:
    for line in gold:
      line = line.strip()
      if not line:
        # Tolerate blank / trailing lines instead of failing on token[1].
        continue
      token = line.split("\t")
      sub_tokens = token[1].split("_")
      locations[token[0]] = int(sub_tokens[2])

  for idx, tokens in pun_instances.items():
    # Gold positions are 1-based; convert to a 0-based list index.
    pun_location = locations[idx] - 1
    if not 0 <= pun_location < len(tokens):
      # A position of 0 would otherwise silently wrap around to the last token.
      raise IndexError(
          f"pun position {locations[idx]} is out of range for instance {idx!r} "
          f"with {len(tokens)} tokens")
    labels = [0] * len(tokens)
    labels[pun_location] = 1
    all_data.append({ "tokens": tokens, "labels": labels })
    human_data.append({ "sentence": " ".join(tokens), "pun_word": tokens[pun_location] })

  print('[INFO] Data loaded successfully.')
  return all_data, human_data

# `data` holds the token/label examples used for training; `human_data` holds
# the readable sentence / pun-word pairs used only for inspection and export.
data, human_data = load_data()
# Wrap the list of example dicts in a Hugging Face `Dataset`.
dataset = Dataset.from_list(data)

[INFO] Data loaded successfully.


In [None]:
import pandas as pd

In [None]:
# Tabular view of the sentence / pun-word pairs for a quick sanity check.
df = pd.DataFrame(human_data)
df.head()

Unnamed: 0,sentence,pun_word
0,They hid from the gunman in a sauna where they...,sweat
1,Wal - Mart isn ' t the only saving place !,saving
2,Can honeybee abuse lead to a sting operation ?,sting
3,A ditch digger was entrenched in his career .,entrenched
4,"She was only a Blacksmith ' s daughter , but s...",forge


In [None]:
# Export the sentence / pun-word pairs as a JSON array of records
# (written relative to the current working directory).
df.to_json('semeval-task2-homo.json', orient='records')

In [None]:
# Hold out 20% of the examples for evaluation. The seed is fixed so that the
# shuffled split (and therefore every reported metric) is reproducible across
# kernel restarts.
train_test_data = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
train_test_data

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 1016
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 255
    })
})

In [None]:
from transformers import AutoTokenizer
# `add_prefix_space=True` is passed because the tokenizer is later called on
# pre-split word lists (`is_split_into_words=True`).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
label_all_tokens = True
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and expand word-level labels to token level.

    `examples` is a batch with a "tokens" column (one list of words per
    example) and a "labels" column (one label per word). Returns the tokenizer
    output with its "labels" entry replaced by per-token labels:

    * special tokens (word id ``None``) get -100 so the loss ignores them;
    * the first sub-token of each word gets that word's label;
    * later sub-tokens of the same word get the word's label when
      `label_all_tokens` is true, otherwise -100.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_labels in enumerate(examples["labels"]):
        token_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            if word is None:
                # Special token: masked out of the loss.
                token_labels.append(-100)
            elif word == last_word and not label_all_tokens:
                # Continuation sub-token that should not be labelled.
                token_labels.append(-100)
            else:
                # First sub-token of a word, or a continuation when labelling all tokens.
                token_labels.append(word_labels[word])
            last_word = word
        aligned.append(token_labels)

    encoded["labels"] = aligned
    return encoded

In [None]:
# Apply tokenization and label alignment to both splits, one batch at a time.
tokenized_dataset = train_test_data.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
# Two labels per token: 0 = not the pun word, 1 = pun word (see `load_data`).
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=2)

Downloading:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be ab

In [None]:
# Strip any "organisation/" prefix from the checkpoint name for the output dir.
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-semeval",
    evaluation_strategy = "epoch",
    # Log the training loss once per epoch as well. With the default step-based
    # logging interval, a run this short never reaches a logging step and the
    # results table shows "No log" for the training loss.
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
from transformers import DataCollatorForTokenClassification
# Collator used by the Trainer to assemble batches of token-classification examples.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Wire the model, training arguments, tokenized splits and collator together.
# NOTE(review): no `compute_metrics` is passed, so evaluation reports loss only.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Fine-tune the model; evaluation runs at the end of every epoch (see `args`).
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1016
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 192
  Number of trainable parameters = 354312194


Epoch,Training Loss,Validation Loss
1,No log,0.035083
2,No log,0.049384
3,No log,0.051663


The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 255
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 255
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 255
  Batch size = 16


Tr

TrainOutput(global_step=192, training_loss=0.003134489990770817, metrics={'train_runtime': 98.6193, 'train_samples_per_second': 30.907, 'train_steps_per_second': 1.947, 'total_flos': 188237298466944.0, 'train_loss': 0.003134489990770817, 'epoch': 3.0})

In [None]:
# Evaluate the final model on the held-out split.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 255
  Batch size = 16


{'eval_loss': 0.05166316404938698,
 'eval_runtime': 1.9286,
 'eval_samples_per_second': 132.222,
 'eval_steps_per_second': 8.296,
 'epoch': 3.0}

In [None]:
# Inference
import numpy as np
# Run the model over the held-out split.
# NOTE(review): `predictions` is presumably (num_examples, seq_len, num_labels) logits — confirm.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
# Collapse axis 2 to the index of the highest-scoring label for each token position.
predictions = np.argmax(predictions, axis=2)

The following columns in the test set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tokens. If tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 255
  Batch size = 16


In [None]:
# Show the words predicted as the pun for one held-out example.
idx = 8
example_tokens = tokenized_dataset["test"][idx]['tokens']
print(example_tokens)
# Predictions are per sub-token, not per word, so map every flagged position
# back to its source word via the tokenizer's word ids. Subtracting 1 from the
# token position is only correct while no earlier word was split into several
# sub-tokens (e.g. "celling" -> "cell" + "ing" used to also print "their").
word_ids = tokenizer(example_tokens, truncation=True, is_split_into_words=True).word_ids()
# zip() stops at the real sequence length, ignoring padded positions; special
# tokens (word id None) are skipped; the set removes duplicate sub-token hits.
locs = sorted({w for w, e in zip(word_ids, predictions[idx]) if e == 1 and w is not None})
for loc in locs:
  print(example_tokens[loc], end=" ")

['Bees', 'make', 'money', 'by', 'celling', 'their', 'honey', '.']
celling their 

In [None]:
# Persist the fine-tuned model (and tokenizer files) to the mounted Drive folder.
trainer.save_model('/content/drive/MyDrive/semeval')

Saving model checkpoint to /content/drive/MyDrive/semeval
Configuration saved in /content/drive/MyDrive/semeval/config.json
Model weights saved in /content/drive/MyDrive/semeval/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/semeval/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/semeval/special_tokens_map.json


In [None]:
!zip -r '/content/model.zip' '/content/model'

  adding: content/model/ (stored 0%)
  adding: content/model/vocab.json (deflated 59%)
  adding: content/model/special_tokens_map.json (deflated 52%)
  adding: content/model/pytorch_model.bin (deflated 11%)
  adding: content/model/tokenizer.json (deflated 72%)
  adding: content/model/merges.txt (deflated 53%)
  adding: content/model/config.json (deflated 50%)
  adding: content/model/training_args.bin (deflated 47%)
  adding: content/model/tokenizer_config.json (deflated 48%)


In [None]:
# Tokenize a single raw sentence to inspect the resulting input ids.
# Words that are split into several sub-tokens occupy more than one position here.
tokenized_input_eg = tokenizer("Bees make money by celling their money .", truncation=True)
tokenized_input_eg

{'input_ids': [0, 35589, 146, 418, 30, 3551, 154, 49, 418, 479, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Example sentence for single-sentence inference; tokens are separated by single spaces.
sent = "Bees make money by celling their money ."

In [None]:
import numpy as np
# Tokenize the whitespace-split words so that each sub-token's word id indexes
# directly into `words`.
words = sent.split(" ")
encoded_words = tokenizer(words, truncation=True, is_split_into_words=True)
predictions, labels, _ = trainer.predict([encoded_words])
predictions = np.argmax(predictions, axis=2)
print(predictions[0])
# Map flagged sub-token positions back to words. Using `position - 1` as the
# word index breaks as soon as a word is split into several sub-tokens
# ("celling" -> "cell" + "ing" made the old code also print "their").
word_ids = encoded_words.word_ids()
# Special tokens have word id None and are skipped; the set removes duplicates.
locs = sorted({w for w, e in zip(word_ids, predictions[0]) if e == 1 and w is not None})
for loc in locs:
  print(words[loc], end=" ")

***** Running Prediction *****
  Num examples = 1
  Batch size = 16


[0 0 0 0 0 1 1 0 0 0 0]
celling their 