# Environment Setup

1.   Transformers Model Installation
2.   Colab Setup

In [None]:
# Install Tranformers & related libraries
! pip install transformers
! pip install datasets

# Parameter Setup

In [2]:
import os
import json
import pandas as pd
import numpy as np
from datasets import Dataset, ClassLabel
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import TrainingArguments
import torch

In [3]:
# Name of the entry in `task_to_keys` that decides which columns are tokenized.
task = "response_context_rev_sep"
# Checkpoint identifier passed to both the tokenizer and the model loaders.
model_checkpoint = "roberta-base"
# Trainer configuration; "test" is the output directory.
args = TrainingArguments(output_dir="test", evaluation_strategy="epoch")
# Different structural combinations of the data that can be given as input to
# the transformer. Each task name maps to a (first column, second column) pair;
# a second column of None means the first column is tokenized on its own.
# (The "response_only" entry used to be listed twice; it is now defined once.)
task_to_keys = {
    "response_last_response_sep": ("response", "last_response"),
    "response_context_sep": ("response", "context"),
    "response_context_rev_sep": ("response", "context_rev"),
    "response_only": ("response", None),
    "all_only": ("all", None),
    "response_last_response_only": ("response_last_response", None),
}
# Column names for the selected task; read by preprocess_function at call time.
sentence1_key, sentence2_key = task_to_keys[task]

def preprocess_function(examples):
    """Tokenize the configured column(s) of a batch of examples.

    Reads the module-level ``tokenizer``, ``sentence1_key`` and
    ``sentence2_key`` when called. The column named by ``sentence1_key`` is
    always passed to the tokenizer; the column named by ``sentence2_key`` is
    passed as a second positional argument only when that key is not None.

    Args:
        examples: Mapping from column name to that column's values.

    Returns:
        Whatever ``tokenizer`` returns when called with ``truncation=True``.
    """
    text_columns = [examples[sentence1_key]]
    if sentence2_key is not None:
        text_columns.append(examples[sentence2_key])
    return tokenizer(*text_columns, truncation=True)


In [4]:
# Load the tokenizer that belongs to `model_checkpoint`, asking for the fast variant.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

# Data Load
Load the test dataset and tokenize it.

In [5]:
# Read the JSON Lines test file (one JSON record per line) into a DataFrame.
test_pddf = pd.read_json(path_or_buf='test.jsonl', lines=True)

In [6]:
# Join the elements of each row's `context` in reverse order into one
# space-separated string. Mapping over the single column avoids the per-row
# Series that DataFrame.apply(axis=1) would construct.
# assumes every `context` value is a sequence of strings — TODO confirm
test_pddf['context_rev'] = test_pddf['context'].map(lambda turns: " ".join(turns[::-1]))
# Convert the frame to a datasets.Dataset and tokenize it in batches.
final_ds = Dataset.from_pandas(test_pddf)
final_encoded_ds = final_ds.map(preprocess_function, batched=True)


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




# Import the model and predict on test data


In [7]:
# Instantiate the sequence-classification model from `model_checkpoint`
# with two output labels.
model2 = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [8]:
# Optional (Colab): uncomment to mount Google Drive and read the checkpoint from there.
#from google.colab import drive
#drive.mount('/content/drive', force_remount=True)
#path_model = os.path.join(r'/content/drive/My Drive/CS410/data','check_point.pth' )
path_model = 'check_point.pth'
# map_location="cpu" lets a checkpoint that was saved from a GPU load on a
# CPU-only machine; load_state_dict then copies the tensors into model2's
# own parameters.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted source.
model2.load_state_dict(torch.load(path_model, map_location="cpu"))

<All keys matched successfully>

In [9]:
# Wrap the loaded model, the shared arguments and the tokenizer in a Trainer.
trainer2 = Trainer(model=model2, args=args, tokenizer=tokenizer)

In [10]:
# Run prediction over the tokenized test set.
final_result = trainer2.predict(final_encoded_ds)

# Output the data to answer.txt


In [11]:
# Pick, for every example, the index of the largest value along the last axis
# of the prediction array.
final_preds = np.argmax(final_result.predictions, axis=-1)

In [12]:
# Store the predicted class indices as a new "preds" column on the test frame.
test_pddf["preds"] = final_preds

In [13]:
# Translate class indices into label strings: 1 -> "SARCASM", anything else -> "NOT_SARCASM".
test_pddf['label'] = np.where(test_pddf['preds'].eq(1), "SARCASM", "NOT_SARCASM")

In [14]:
# Write the id and label columns as comma-separated rows, without index or header.
test_pddf.to_csv('answer.txt', columns=['id', 'label'], index=False, header=False)