In [1]:
import torch.nn as nn
from transformers import AutoTokenizer, TrainingArguments, Trainer
from modeling_cus_roberta import RobertaForContrastiveClassification
import os, json, torch
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
from datasets import Dataset, DatasetDict

2023-06-12 18:44:39.807947: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-12 18:44:39.857096: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data_directory = "data/"
home_directory = "../../"

NUM_EPOCHS = 10
BATCH_SIZE = 64
MODEL_NAME = "roberta-base"

In [3]:
def get_articles(raw_data):
    data = []
    for each_row in raw_data['articles']:
        data.append(each_row['content'])
    return data

def get_split_data(split_data):
    data = []
    labels = []
    for i, news_src in enumerate(split_data['json_file_path']):
        raw_data = json.loads(open(os.path.join(home_directory, data_directory, news_src)).read())
        cur_data = get_articles(raw_data)
        data.extend(cur_data)
        labels.extend([split_data['label'][i] for _ in range(len(cur_data))])
    final_data = [data, labels]
    return final_data

### Factuality

In [4]:
fact_train_file_path = "data/task_4/task_4_news_media_factuality_train.tsv"
fact_dev_file_path = "data/task_4/task_4_news_media_factuality_dev.tsv"
fact_test_gold_file_path = "task_4_news_media_factuality_test.tsv"

In [5]:
fact_train_raw_data = pd.read_csv(os.path.join(home_directory, data_directory, fact_train_file_path), sep = "\t")
fact_dev_raw_data = pd.read_csv(os.path.join(home_directory, data_directory, fact_dev_file_path), sep = "\t")
fact_test_raw_data = pd.read_csv(os.path.join(home_directory, data_directory, fact_test_gold_file_path), sep = "\t")

In [6]:
fact_train_raw_data.shape

(947, 3)

In [7]:
fact_train_data = get_split_data(fact_train_raw_data)
fact_dev_data  = get_split_data(fact_dev_raw_data)
fact_test_data = get_split_data(fact_test_raw_data)

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = RobertaForContrastiveClassification.from_pretrained(MODEL_NAME, num_labels = 3)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForContrastiveClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForContrastiveClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForContrastiveClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForContrastiveClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['neg.out_proj.bias', 'neg.dense.bias', 'neg.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.wei

In [9]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [10]:
model.to(device)

RobertaForContrastiveClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Lay

In [11]:
def tokenize(examples):
    return tokenizer(examples["text"], truncation=True, padding=True, return_tensors="pt")

In [12]:
df_train = pd.DataFrame({
    'text': fact_train_data[0],
    'label': fact_train_data[1]
})

df_dev = pd.DataFrame({
    'text': fact_dev_data[0],
    'label': fact_dev_data[1]
})

df_test = pd.DataFrame({
    'text': fact_test_data[0],
    'label': fact_test_data[1]
})

In [13]:
train_ds = Dataset.from_pandas(df_train)
dev_ds = Dataset.from_pandas(df_dev)
test_ds = Dataset.from_pandas(df_test)

In [14]:
tokenized_train_ds = train_ds.map(tokenize, batched=True, remove_columns=["text"])
tokenized_dev_ds = dev_ds.map(tokenize, batched=True, remove_columns=["text"])
tokenized_test_ds = test_ds.map(tokenize, batched=True, remove_columns=["text"])

Map:   0%|          | 0/7948 [00:00<?, ? examples/s]

Map:   0%|          | 0/1049 [00:00<?, ? examples/s]

Map:   0%|          | 0/1054 [00:00<?, ? examples/s]

In [15]:
import evaluate

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = torch.argmax(torch.from_numpy(predictions), dim = -1)

    return {'mae_error': nn.L1Loss()(predictions.to(torch.float64), torch.FloatTensor(labels))}

In [16]:
# import os

# os.environ["WANDB_PROJECT"] = "ugrip-nlp1"

In [17]:
def count_trainable_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def freeze_params(model):
    for param in model.base_model.parameters():
        param.requires_grad = False
    return model

In [18]:
import copy
frozen_model = copy.deepcopy(model)

In [19]:
frozen_model = freeze_params(frozen_model)

In [20]:
count_trainable_params(frozen_model), count_trainable_params(model)

(1778697, 125833737)

In [21]:
training_args = TrainingArguments(
    output_dir='kl-roberta-frozen-factuality',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
#     warmup_steps=500,
#     weight_decay=0.01,
    logging_dir='kl-roberta-frozen-factuality-logs',
    learning_rate=1e-4,
    load_best_model_at_end=True,
    save_total_limit=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to = "none"
)

trainer = Trainer(
    model=frozen_model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_dev_ds,
    compute_metrics=compute_metrics,
)

In [22]:
trainer.train()



Epoch,Training Loss,Validation Loss,Mae Error
1,No log,-12.402655,0.523356


In [None]:
preds = trainer.predict(tokenized_test_ds)

In [None]:
preds = preds.predictions

In [None]:
pred_labs = torch.argmax(torch.from_numpy(preds), dim = -1)

In [None]:
pred_labs = pred_labs.to(torch.float64)

In [None]:
print("Accuracy: ", sum(torch.FloatTensor(tokenized_test_ds['label']) == pred_labs)/pred_labs.shape[0])

In [None]:
print("MAE: ", nn.L1Loss()(pred_labs, torch.FloatTensor(tokenized_test_ds['label'])).item())

### Political Bias

In [None]:
# bias_A_train_file_path = "data/task_3A/task_3A_news_article_bias_train.tsv"
# bias_A_dev_file_path = "data/task_3A/task_3A_news_news_article_bias_dev.tsv"
# bias_A_test_gold_file_path = "task_3A_news_news_article_bias_test.tsv"

In [None]:
# bias_A_train_raw_data = pd.read_csv(os.path.join(home_directory, data_directory, bias_A_train_file_path), sep = "\t")
# bias_A_dev_raw_data = pd.read_csv(os.path.join(home_directory, data_directory, bias_A_dev_file_path), sep = "\t")
# bias_A_test_raw_data = pd.read_csv(os.path.join(home_directory, data_directory, bias_A_test_gold_file_path), sep = "\t")