In [None]:
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import os
import pandas as pd
import json
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import numpy as np
from datasets import Dataset, DatasetDict

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
# Path fragments: data files below are located by joining home_directory, data_directory
# and a per-file relative path with os.path.join.
data_directory = "data/"
home_directory = "../../"

# Training settings consumed by TrainingArguments further down.
NUM_EPOCHS = 10
BATCH_SIZE = 64
# Checkpoint name passed to AutoTokenizer / AutoModelForSequenceClassification.from_pretrained.
MODEL_NAME = "roberta-base"

In [None]:
def get_articles(raw_data):
    """Return the 'content' value of every entry under raw_data['articles'], in order."""
    return [article['content'] for article in raw_data['articles']]

def get_split_data(split_data):
    """Load every article of a split and repeat the source label for each one.

    Args:
        split_data: table-like object (here a DataFrame read from a split TSV)
            with a 'json_file_path' column of JSON paths relative to
            home_directory/data_directory and a parallel 'label' column.

    Returns:
        A two-element list ``[texts, labels]``: ``texts`` holds the content of
        every article from every JSON file, and ``labels`` holds, at the same
        position, the label of the row the article came from.
    """
    data = []
    labels = []
    # zip pairs each path with its label by position, so a non-default index
    # (e.g. after filtering or shuffling the frame) cannot misalign the two
    # columns or raise a KeyError the way a label-based [i] lookup can.
    for news_src, label in zip(split_data['json_file_path'], split_data['label']):
        json_path = os.path.join(home_directory, data_directory, news_src)
        # Context manager closes the handle immediately; explicit UTF-8 keeps
        # the read independent of the platform's default locale encoding.
        with open(json_path, encoding="utf-8") as json_file:
            raw_data = json.load(json_file)
        cur_data = get_articles(raw_data)
        data.extend(cur_data)
        labels.extend([label] * len(cur_data))
    return [data, labels]

### Factuality

In [None]:
# Split files for the factuality task; each is joined onto home_directory/data_directory when read.
fact_train_file_path = "data/task_4/task_4_news_media_factuality_train.tsv"
fact_dev_file_path = "data/task_4/task_4_news_media_factuality_dev.tsv"
# NOTE(review): unlike train/dev, this path has no "data/task_4/" prefix — confirm the gold
# test file really sits directly under home_directory/data_directory.
fact_test_gold_file_path = "task_4_news_media_factuality_test.tsv"

In [None]:
# Read the three tab-separated split tables (train, dev, gold test) with one shared call.
fact_train_raw_data, fact_dev_raw_data, fact_test_raw_data = (
    pd.read_csv(os.path.join(home_directory, data_directory, split_path), sep = "\t")
    for split_path in (fact_train_file_path, fact_dev_file_path, fact_test_gold_file_path)
)

In [None]:
# Sanity check: (rows, columns) of the training split table.
fact_train_raw_data.shape

In [None]:
# Expand each split table into its [texts, labels] pair, one entry per article.
fact_train_data, fact_dev_data, fact_test_data = map(
    get_split_data,
    (fact_train_raw_data, fact_dev_raw_data, fact_test_raw_data),
)

In [None]:
# Load the tokenizer and a sequence-classification model from the same MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# num_labels = 3 sizes the classification head for three classes
# (presumably the three factuality levels in the 'label' column — confirm against the data).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels = 3)

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Move the model onto the selected device; as the cell's last expression, the result is also displayed.
model.to(device)

In [None]:
def tokenize(examples):
    """Tokenise a batch of examples for use with ``Dataset.map(..., batched=True)``.

    Args:
        examples: batch mapping whose "text" entry holds the raw article texts.

    Returns:
        The tokenizer output for the batch, with every sequence truncated and
        padded to the same fixed length.
    """
    # padding="max_length" gives every row the same length regardless of which
    # map() batch it was processed in. padding=True only pads to the longest
    # text of the current batch, so rows from different batches could end up
    # with different lengths and then fail to stack into one training batch.
    # return_tensors is left out: map() stores plain lists either way.
    return tokenizer(examples["text"], truncation=True, padding="max_length")

In [None]:
# Wrap each split's [texts, labels] pair in a two-column frame ('text', 'label').
df_train, df_dev, df_test = (
    pd.DataFrame({'text': texts, 'label': labels})
    for texts, labels in (fact_train_data, fact_dev_data, fact_test_data)
)

In [None]:
# Convert the three pandas frames into datasets.Dataset objects.
train_ds, dev_ds, test_ds = map(Dataset.from_pandas, (df_train, df_dev, df_test))

In [None]:
# Tokenise every split in batches and drop the raw "text" column afterwards.
tokenized_train_ds, tokenized_dev_ds, tokenized_test_ds = (
    split.map(tokenize, batched=True, remove_columns=["text"])
    for split in (train_ds, dev_ds, test_ds)
)

In [None]:
import evaluate

def compute_metrics(eval_pred):
    """Compute the mean absolute error between predicted and gold class ids.

    Args:
        eval_pred: pair ``(predictions, labels)`` where ``predictions`` is an
            array of per-class scores (the class is taken as the argmax over
            the last axis) and ``labels`` holds the gold class ids.

    Returns:
        ``{'mae': value}`` with ``value`` a plain Python float.
    """
    predictions, labels = eval_pred
    predicted_ids = np.argmax(predictions, axis=-1)

    # The quantity is an L1 / mean-absolute error, so it is reported under
    # 'mae' (it was previously labelled 'mse_error' although no squared error
    # was ever computed). Both sides are cast to float64 to avoid mixing
    # precisions, and a float is returned so the value logs/serialises cleanly.
    gold_ids = np.asarray(labels, dtype=np.float64)
    mae = np.abs(predicted_ids.astype(np.float64) - gold_ids).mean()
    return {'mae': float(mae)}

In [None]:
# import os

# os.environ["WANDB_PROJECT"] = "ugrip-nlp1"

In [None]:
def count_trainable_params(model):
    """Return the total element count of all parameters that have requires_grad set."""
    total = 0
    for weight in model.parameters():
        if weight.requires_grad:
            total += weight.numel()
    return total

def freeze_params(model):
    """Disable gradients for every parameter under model.base_model and return the same model.

    The model is modified in place; parameters outside base_model are left as they are.
    """
    model.base_model.requires_grad_(False)
    return model

In [None]:
import copy
# Work on an independent copy so that freezing parameters below leaves `model` itself unchanged.
frozen_model = copy.deepcopy(model)

In [None]:
# Freeze the base_model parameters of the copy; freeze_params returns the same (mutated) object.
frozen_model = freeze_params(frozen_model)

In [None]:
# Compare trainable parameter counts: frozen copy first, original model second.
count_trainable_params(frozen_model), count_trainable_params(model)

In [None]:
# Training configuration for the frozen-encoder run; epoch count and batch sizes come from
# the NUM_EPOCHS / BATCH_SIZE constants defined near the top of the notebook.
training_args = TrainingArguments(
    output_dir='roberta-frozen-factuality',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
#     warmup_steps=500,
#     weight_decay=0.01,
    logging_dir='roberta-frozen-factuality-logs',
    learning_rate=1e-4,
    # No metric_for_best_model is given, so "best" is chosen by the Trainer's default criterion
    # (presumably the evaluation loss — confirm against the installed transformers version).
    load_best_model_at_end=True,
    save_total_limit=1,
    # Evaluation and checkpointing are both configured with the same "epoch" setting.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to = "none"
)

# The trainer optimises frozen_model (the copy with frozen base_model parameters), evaluates on
# the tokenised dev split, and reports whatever compute_metrics returns.
trainer = Trainer(
    model=frozen_model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_dev_ds,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the configuration above; the call's return value is displayed as the cell output.
trainer.train()

In [None]:
# Run the trained model over the tokenised test split; the result object carries a .predictions field.
preds = trainer.predict(tokenized_test_ds)

In [None]:
# Keep only the raw prediction scores. getattr makes the cell safe to re-run: once `preds`
# has already been replaced by the scores array (which has no .predictions attribute),
# it is left as it is instead of raising AttributeError.
preds = getattr(preds, "predictions", preds)

In [None]:
# Predicted class id per example: index of the largest score along the last axis.
pred_labs = torch.argmax(torch.from_numpy(preds), dim = -1)

In [None]:
# Cast the class ids to float64 so they can be passed to nn.L1Loss in the MAE cell below.
pred_labs = pred_labs.to(torch.float64)

In [None]:
# Fraction of test examples whose predicted class id equals the gold label.
# The comparison is reduced with tensor ops and .item() so a plain Python float is printed
# (matching the MAE cell), instead of looping over the tensor with the builtin sum()
# and printing a tensor repr.
print("Accuracy: ", (torch.FloatTensor(tokenized_test_ds['label']) == pred_labs).float().mean().item())

In [None]:
# Mean absolute error between predicted and gold class ids, treating the ids as numeric values.
print("MAE: ", nn.L1Loss()(pred_labs, torch.FloatTensor(tokenized_test_ds['label'])).item())

### Political Bias

In [None]:
# bias_A_train_file_path = "data/task_3A/task_3A_news_article_bias_train.tsv"
# bias_A_dev_file_path = "data/task_3A/task_3A_news_news_article_bias_dev.tsv"
# bias_A_test_gold_file_path = "task_3A_news_news_article_bias_test.tsv"

In [None]:
# bias_A_train_raw_data = pd.read_csv(os.path.join(home_directory, data_directory, bias_A_train_file_path), sep = "\t")
# bias_A_dev_raw_data = pd.read_csv(os.path.join(home_directory, data_directory, bias_A_dev_file_path), sep = "\t")
# bias_A_test_raw_data = pd.read_csv(os.path.join(home_directory, data_directory, bias_A_test_gold_file_path), sep = "\t")

In [31]:
from nela_features.nela_features import NELAFeatureExtractor

# Sample headline used to try out the NELA feature extractor on a single piece of text.
newsarticle = "Breaking News: Ireland Expected To Become World's First Country To Divest From Fossil Fuels ..."

nela = NELAFeatureExtractor()

# Extract all feature groups at once; the call returns the feature values and their names.
feature_vector, feature_names = nela.extract_all(newsarticle)

In [32]:
import numpy as np

# Convert the extracted feature values to a NumPy array so .mean() / .std() can be used below.
feature_vector = np.array(feature_vector)

In [33]:
# Standardise the vector with the mean and standard deviation taken over all of its own entries.
# NOTE(review): this scales across the different features of a single article, and the division
# is by zero if every entry is equal (std of 0) — confirm this is the intended normalisation.
x = (feature_vector - feature_vector.mean())/feature_vector.std()

In [34]:
# Shape of the standardised feature vector.
x.shape

(87,)