In [1]:
import torch
import torch.nn as nn
from transformers import DebertaV2ForSequenceClassification, DebertaV2Tokenizer, TrainingArguments, Trainer
import os
import pandas as pd
import json
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import numpy as np
from datasets import Dataset, DatasetDict

2023-06-07 17:15:04.943749: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-07 17:15:04.993301: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Path pieces: files are located with
# os.path.join(home_directory, data_directory, <relative path>).
data_directory = "data/"
home_directory = "../"

# Training hyperparameters consumed by TrainingArguments
# (epoch count and per-device train/eval batch size).
NUM_EPOCHS = 10
BATCH_SIZE = 4

In [3]:
def get_articles(raw_data):
    """Collect the text of every article in one parsed news-source record.

    Args:
        raw_data: Mapping with an ``'articles'`` entry that iterates over
            mappings, each of which has a ``'content'`` entry.

    Returns:
        A new list holding the ``'content'`` value of each article, in the
        order the articles appear.
    """
    return [article['content'] for article in raw_data['articles']]

def get_split_data(split_data):
    """Load every article of a split and pair each one with its source's label.

    Args:
        split_data: Table-like object (e.g. a pandas DataFrame or a dict of
            sequences) exposing equally long ``'json_file_path'`` and
            ``'label'`` columns. Each path is resolved relative to
            ``os.path.join(home_directory, data_directory)``.

    Returns:
        A two-element list ``[texts, labels]`` where ``texts`` holds the
        content of every article of every source and ``labels`` holds, at the
        same position, the label of the source the article came from.

    Raises:
        OSError: If a JSON file cannot be opened.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    texts = []
    labels = []
    # Walk both columns in lockstep. The previous `split_data['label'][i]` was
    # a label-based lookup fed a positional counter, which only works while
    # the frame carries a default 0..n-1 index (it breaks after filtering or
    # shuffling the frame).
    for news_src, label in zip(split_data['json_file_path'], split_data['label']):
        json_path = os.path.join(home_directory, data_directory, news_src)
        # The context manager closes the handle deterministically instead of
        # leaking one descriptor per source; JSON is decoded as UTF-8 rather
        # than with the platform's default encoding.
        with open(json_path, encoding="utf-8") as json_file:
            raw_data = json.load(json_file)
        articles = get_articles(raw_data)
        texts.extend(articles)
        # Every article inherits the label of its news source.
        labels.extend([label] * len(articles))
    return [texts, labels]

### Factuality

In [4]:
# Split tables for the factuality task, given relative to
# os.path.join(home_directory, data_directory).
# NOTE(review): the test path has no "data/task_4/" prefix, unlike train/dev —
# confirm the gold test file really lives one level up from the other two.
fact_train_file_path = "data/task_4/task_4_news_media_factuality_train.tsv"
fact_dev_file_path = "data/task_4/task_4_news_media_factuality_dev.tsv"
fact_test_gold_file_path = "task_4_news_media_factuality_test.tsv"

In [5]:
# Read the train / dev / test split tables (tab-separated) for the factuality task.
fact_train_raw_data, fact_dev_raw_data, fact_test_raw_data = (
    pd.read_csv(os.path.join(home_directory, data_directory, split_path), sep="\t")
    for split_path in (fact_train_file_path, fact_dev_file_path, fact_test_gold_file_path)
)

In [6]:
# Preview the first rows of the test split table to check its columns.
fact_test_raw_data.head()

Unnamed: 0,source,json_file_path,label
0,beinglibertarian.com,data/task_4/test_json/beinglibertarian.com.json,1
1,usareally.com,data/task_4/test_json/usareally.com.json,0
2,nzherald.co.nz,data/task_4/test_json/nzherald.co.nz.json,2
3,lancasteronline.com,data/task_4/test_json/lancasteronline.com.json,2
4,slate.com,data/task_4/test_json/slate.com.json,2


In [7]:
# Expand each split from one row per news source to one entry per article.
# Each result is the two-element list [texts, labels] returned by get_split_data.
fact_train_data = get_split_data(fact_train_raw_data)
fact_dev_data  = get_split_data(fact_dev_raw_data)
fact_test_data = get_split_data(fact_test_raw_data)

In [8]:
# Load the tokenizer and a sequence-classification model from the same
# "microsoft/mdeberta-v3-base" checkpoint. The classifier is configured with
# three output labels.
# NOTE(review): presumably the three labels are the factuality classes 0/1/2
# seen in the split tables — confirm against the task definition.
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/mdeberta-v3-base")
model = DebertaV2ForSequenceClassification.from_pretrained("microsoft/mdeberta-v3-base", num_labels = 3)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/mdeberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'deberta.embeddings.word_embeddings._weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing

In [9]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [10]:
# Move the model's parameters to the selected device. As the last expression
# of the cell, the returned module is also displayed as the cell output.
model.to(device)

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(251000, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0): DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
 

In [11]:
def tokenize(examples):
    """Tokenize a batch of examples for the sequence-classification model.

    Args:
        examples: Batch mapping with a ``"text"`` entry holding the article
            strings (the form ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The tokenizer output for the batch, with every sequence truncated and
        padded to exactly 1024 tokens and returned as PyTorch tensors.
    """
    # Pad to the fixed `max_length` instead of `padding=True`. With a batched
    # `map`, `padding=True` pads each map batch only to that batch's longest
    # sequence, so rows from different map batches can end up with different
    # lengths. The Trainer in this notebook is created without a tokenizer or
    # a padding collator, so such rows could not be stacked into one training
    # batch. A fixed length keeps every row the same size.
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=1024,
        return_tensors="pt",
    )

In [12]:
# Build one DataFrame per split with a 'text' and a 'label' column.
# Each fact_*_data value is the two-element [texts, labels] list produced by
# get_split_data, so it unpacks directly into the two columns.
df_train, df_dev, df_test = (
    pd.DataFrame({'text': texts, 'label': labels})
    for texts, labels in (fact_train_data, fact_dev_data, fact_test_data)
)

In [13]:
# Wrap each split's DataFrame in a `datasets.Dataset`.
train_ds, dev_ds, test_ds = (
    Dataset.from_pandas(split_frame)
    for split_frame in (df_train, df_dev, df_test)
)

In [None]:
# Tokenize every split in batches. The raw "text" column is dropped from the
# result; the remaining columns (including "label") are kept alongside the
# tokenizer outputs.
tokenized_train_ds = train_ds.map(tokenize, batched=True, remove_columns=["text"])
tokenized_dev_ds = dev_ds.map(tokenize, batched=True, remove_columns=["text"])
tokenized_test_ds = test_ds.map(tokenize, batched=True, remove_columns=["text"])

Map:   0%|          | 0/7948 [00:00<?, ? examples/s]

In [None]:
import evaluate

def compute_metrics(eval_pred):
    """Compute the mean absolute error between predicted and gold class ids.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is an
            array-like of per-class scores with the classes on the last axis;
            ``labels`` is an array-like of integer class ids with one entry
            per example.

    Returns:
        A dict with the single key ``'mse_error'`` mapped to a Python float.
        Despite the key's name (kept so existing consumers keep working), the
        value is the mean *absolute* error (L1), not a squared error.
    """
    predictions, labels = eval_pred
    # The previous version read an undefined name `preds` here, so the
    # function raised NameError on its first call.
    predicted_ids = torch.argmax(torch.as_tensor(predictions), dim=-1)
    # Cast both operands to the same dtype so the loss never sees a
    # float64/float32 mix.
    targets = torch.as_tensor(labels).to(torch.float64)
    error = nn.L1Loss()(predicted_ids.to(torch.float64), targets)
    # Return a plain float rather than a 0-d tensor so the metric can be
    # logged and serialized.
    return {'mse_error': error.item()}

In [None]:
import os

# Set the WANDB_PROJECT environment variable for this process.
# NOTE(review): the TrainingArguments in this notebook use report_to="none",
# so this presumably has no effect on the run as configured — confirm.
os.environ["WANDB_PROJECT"] = "ugrip-nlp1"

In [None]:
# Training configuration: checkpoints go to 'mdebertav3', logs to
# 'mdebertav3-logs'. Evaluation and checkpoint saving both happen once per
# epoch, and the best checkpoint is reloaded when training finishes.
# NOTE(review): no metric_for_best_model is set, so "best" presumably falls
# back to the evaluation loss — confirm against the TrainingArguments docs.
training_args = TrainingArguments(
    output_dir='mdebertav3',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    disable_tqdm = False,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='mdebertav3-logs',
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to = "none"
)

# Train on the tokenized train split and evaluate on the tokenized dev split.
# The custom compute_metrics hook is currently disabled (commented out below),
# so no custom metric is passed to the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_dev_ds,
    # compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured above (NUM_EPOCHS epochs).
trainer.train()

### Political Bias

In [None]:
# Split tables for the political-bias task (3A).
# NOTE(review): the dev and test names contain "news_news_article" while the
# train name contains "news_article", and the test path has no "data/task_3A/"
# prefix — confirm these match the files on disk.
bias_A_train_file_path = "data/task_3A/task_3A_news_article_bias_train.tsv"
bias_A_dev_file_path = "data/task_3A/task_3A_news_news_article_bias_dev.tsv"
bias_A_test_gold_file_path = "task_3A_news_news_article_bias_test.tsv"

In [None]:
# bias_A_train_raw_data = pd.read_csv(os.path.join(home_directory, data_directory, bias_A_train_file_path), sep = "\t")
# bias_A_dev_raw_data = pd.read_csv(os.path.join(home_directory, data_directory, bias_A_dev_file_path), sep = "\t")
# bias_A_test_raw_data = pd.read_csv(os.path.join(home_directory, data_directory, bias_A_test_gold_file_path), sep = "\t")