### Загружаем данные

In [None]:
!pip install datasets

In [None]:
import json
from collections import defaultdict

# Load the Arabic stance corpus. The file carries Arabic and Russian text, so
# decode it as UTF-8 explicitly instead of relying on the platform default.
with open('arabic_stance_trans_grandmaster.json', encoding='utf-8') as f:
    d = json.load(f)

# Turn every positional record into a dict with named fields. Positions 0-3
# are stored as "arb", "arb_clean", "eng_tr" and "rus"; position 4 is the
# label, shifted by +1.
# NOTE(review): presumably the raw labels are -1/0/1 and the shift maps them
# to 0/1/2 for a 3-class head — confirm against the source data.
arabic_stance_reformatted = [
    {
        "arb": record[0],
        "arb_clean": record[1],
        "eng_tr": record[2],
        "rus": record[3],
        "label": record[4] + 1,
    }
    for record in d.values()
]

In [None]:
import csv

def dict_of_lists(data, corpus=None):
    """Convert positional rows into dicts with "eng_tr", "rus" and "label" keys.

    Args:
        data: Iterable of indexable rows with at least three columns.
        corpus: When equal to 'ruarg', column 0 is taken as the Russian text
            and column 2 as the English text; for any other value the two
            columns are read the other way round.

    Returns:
        A list with one dict per row. "label" is ``int(row[1]) + 1``.
    """
    # Only the positions of the two text columns depend on the corpus.
    if corpus == 'ruarg':
        eng_col, rus_col = 2, 0
    else:
        eng_col, rus_col = 0, 2

    return [
        {"eng_tr": row[eng_col], "rus": row[rus_col], "label": int(row[1]) + 1}
        for row in data
    ]


# Read both translation tables fully into memory.
# newline='' lets the csv module handle line endings itself (needed for quoted
# fields that contain line breaks); the explicit UTF-8 encoding keeps the
# Russian text readable regardless of the platform's default encoding.
# NOTE(review): every row is passed to dict_of_lists, which calls int(row[1]),
# so the files are assumed to have no header row — confirm.
with open('RuArg Translation.csv', 'r', newline='', encoding='utf-8') as csvfile:
    ra = list(csv.reader(csvfile))
with open('TweetStance Translation.csv', 'r', newline='', encoding='utf-8') as csvfile:
    ts = list(csv.reader(csvfile))

# The two files use opposite column orders for the Russian and English text;
# the corpus argument selects which layout dict_of_lists reads.
ra_formatted = dict_of_lists(ra, corpus='ruarg')
ts_formatted = dict_of_lists(ts, corpus='tweetstance')

In [None]:
import torch

# Split each corpus into train/validation/test parts with fractions
# 0.8/0.1/0.1. Every call gets its own generator seeded with 121, so the three
# partitions are reproducible and independent of one another.
(
    (arabic_train, arabic_val, arabic_test),
    (ruarg_train, ruarg_val, ruarg_test),
    (tweetstance_train, tweetstance_val, tweetstance_test),
) = (
    torch.utils.data.random_split(
        corpus,
        [0.8, 0.1, 0.1],
        generator=torch.Generator().manual_seed(121),
    )
    for corpus in (arabic_stance_reformatted, ra_formatted, ts_formatted)
)

In [None]:
# Write every split to its own JSON file.
# ensure_ascii=False emits the Arabic/Russian characters as-is, so the files
# must be opened as UTF-8; with a non-UTF-8 platform default the write would
# otherwise fail with UnicodeEncodeError.
for filename, subset in (
    ('arabic_stance_train.json', arabic_train),
    ('arabic_stance_val.json', arabic_val),
    ('arabic_stance_test.json', arabic_test),
    ('ruarg_train.json', ruarg_train),
    ('ruarg_val.json', ruarg_val),
    ('ruarg_test.json', ruarg_test),
    ('tweetstance_train.json', tweetstance_train),
    ('tweetstance_val.json', tweetstance_val),
    ('tweetstance_test.json', tweetstance_test),
):
    with open(filename, 'w', encoding='utf-8') as f:
        # list(...) materializes the split into a plain list of records that
        # json can serialize.
        json.dump(list(subset), f, indent=4, ensure_ascii=False)

In [None]:
import random


# Reload the per-corpus train/validation splits. The files contain non-ASCII
# text, so they are read as UTF-8 explicitly rather than with the platform
# default encoding.
with open('ruarg_train.json', encoding='utf-8') as f:
    ra_t = json.load(f)
with open('ruarg_val.json', encoding='utf-8') as f:
    ra_v = json.load(f)
with open('tweetstance_train.json', encoding='utf-8') as f:
    ts_t = json.load(f)
with open('tweetstance_val.json', encoding='utf-8') as f:
    ts_v = json.load(f)
with open('arabic_stance_train.json', encoding='utf-8') as f:
    arb_t = json.load(f)
with open('arabic_stance_val.json', encoding='utf-8') as f:
    arb_v = json.load(f)

# Pool the three corpora and shuffle so that examples from different corpora
# are interleaved.
# NOTE(review): random.shuffle uses the global RNG, which is not seeded in
# this cell, so the order differs between runs — confirm that is intended.
combined_train = ra_t + ts_t + arb_t
combined_val = ra_v + ts_v + arb_v
random.shuffle(combined_train)
random.shuffle(combined_val)

# ensure_ascii=False writes raw non-ASCII characters, hence the UTF-8 handles.
with open('combined_train.json', 'w', encoding='utf-8') as f:
    json.dump(combined_train, f, indent=4, ensure_ascii=False)
with open('combined_val.json', 'w', encoding='utf-8') as f:
    json.dump(combined_val, f, indent=4, ensure_ascii=False)

In [None]:
from datasets import load_dataset

# Load the pooled JSON files as the "train" and "validation" splits.
combined_files = {
    "train": "combined_train.json",
    "validation": "combined_val.json",
}
dataset = load_dataset("json", data_files=combined_files)

### Дообучение

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics_fixed(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 as percentages.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        A dict with the keys "Accuracy", "Precision", "Recall" and "F1", each
        scaled to the 0-100 range. Precision, recall and F1 use
        ``average='weighted'``.
    """
    logits, gold = eval_pred
    predicted = np.argmax(logits, axis=-1)

    acc = accuracy_score(gold, predicted)
    # The fourth element returned by sklearn is not needed here.
    weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(
        gold, predicted, average='weighted'
    )

    # Report every score as a percentage.
    return {
        "Accuracy": acc * 100,
        "Precision": weighted_p * 100,
        "Recall": weighted_r * 100,
        "F1": weighted_f1 * 100,
    }

In [None]:
from transformers import AutoTokenizer

def make_dataset(data, tokenizer, label_field='label', language=None):
    """Tokenize one text column of ``data`` and attach the labels.

    Args:
        data: Column-indexable container (``data[column]`` yields the values
            of that column), e.g. a dataset split or a dict of lists.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=511, truncation=True)``; its result
            must support item assignment.
        label_field: Name of the column holding the labels.
        language: "Russian" selects the "rus" column; any other value selects
            the "eng_tr" column.

    Returns:
        A list with one tokenizer result per example, each extended with a
        "labels" entry holding that example's label.
    """
    text_field = "rus" if language == "Russian" else "eng_tr"
    answer = []
    # Read labels from the requested column instead of a hard-coded "label",
    # so the label_field argument actually takes effect.
    for text, label in zip(data[text_field], data[label_field]):
        encoded = tokenizer(text, max_length=511, truncation=True)
        encoded["labels"] = label
        answer.append(encoded)
    return answer

# Tokenizer used for all downstream encoding in this notebook.
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")

# Encode the Russian text of the train and validation splits.
train_dataset, dev_dataset = (
    make_dataset(dataset[split_name], tokenizer, language="Russian")
    for split_name in ("train", "validation")
)

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer
from torch.optim import AdamW

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     pred_labels = np.argmax(logits, axis=-1)
#     are_equal = (pred_labels == labels)
#     TP = np.sum(are_equal * labels)
#     FP = np.sum((1.0 - are_equal) * (1-labels))
#     FN = np.sum((1.0 - are_equal) * (1-pred_labels))
#     correct, total = np.sum(are_equal), len(labels)
#     return {"Accuracy": 100 * correct / total, "P": 100 * TP / max(TP+FP, 1.0), "R": 100 * TP / max(TP+FN, 1.0), "F1": 100 * TP / max(TP+0.5*FN+0.5*FP, 1.0)}

In [None]:
# The classifier must use the same vocabulary as the tokenizer that produced
# the input ids. Loading "google-bert/bert-base-cased" here while tokenizing
# with a different checkpoint feeds the model token ids from another
# vocabulary, so the model is loaded from the tokenizer's own checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(tokenizer.name_or_path, num_labels=3)
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
# Evaluate and checkpoint once per epoch and restore the checkpoint with the
# best "Accuracy" (the key returned by compute_metrics_fixed) at the end.
# Effective train batch per device: 4 samples x 4 accumulation steps = 16.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(output_dir="trainer_logs",
                                  evaluation_strategy="epoch", save_strategy='epoch', num_train_epochs=5,
                                  load_best_model_at_end=True, disable_tqdm=False,
                                  per_device_train_batch_size=4, warmup_ratio=0.1,
                                  gradient_accumulation_steps=4,
                                  metric_for_best_model="Accuracy", report_to="none")
trainer = Trainer(
    model=model,
    # Custom optimizer; passing None for the scheduler leaves its creation to
    # the Trainer.
    optimizers=(optimizer, None),
    args=training_args,
    # Pads each batch dynamically with the same tokenizer used for encoding.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics_fixed)
trainer.train()

In [None]:
# Load the held-out test split of each corpus. A single unnamed data file is
# exposed under the default "train" split name, hence the ['train'] lookups.
test_dataset_arb = load_dataset("json", data_files="arabic_stance_test.json")
test_dataset_ra = load_dataset("json", data_files="ruarg_test.json")
test_dataset_ts = load_dataset("json", data_files="tweetstance_test.json")

# Encode the same text column that was used for training and validation
# (language="Russian"); without it make_dataset falls back to the English
# "eng_tr" column and the model is scored on a language it was not tuned on.
test_arb = make_dataset(test_dataset_arb['train'], tokenizer, language="Russian")
test_ra = make_dataset(test_dataset_ra['train'], tokenizer, language="Russian")
test_ts = make_dataset(test_dataset_ts['train'], tokenizer, language="Russian")

In [None]:
import numpy as np
import scipy

# Run the trained model on each test set in turn and print its metrics right
# after the corresponding prediction pass.
test_suites = (
    ('Arabic stance:\n', test_arb),
    ('RuArg:\n', test_ra),
    ('English Tweet Stance:\n', test_ts),
)
collected = []
for heading, test_split in test_suites:
    outcome = trainer.predict(test_split)
    print(heading, outcome.metrics)
    collected.append(outcome)

# Keep the individual prediction outputs available under their usual names.
predictions_arb, predictions_ra, predictions_ts = collected

In [None]:
import numpy as np
import scipy

def predict_with_trainer(trainer, dataset, classes):
    """Predict a class name and class probabilities for every example.

    Args:
        trainer: Object whose ``predict(dataset)`` returns a result with a
            ``predictions`` array of per-example scores.
        dataset: Examples forwarded unchanged to ``trainer.predict``.
        classes: Sequence mapping a class index to its name.

    Returns:
        A list with one dict per example: "label" is the name of the class
        with the highest probability, "probs" is the softmax of that
        example's scores over the last axis.
    """
    scores = trainer.predict(dataset).predictions
    distributions = scipy.special.softmax(scores, axis=-1)

    results = []
    for distribution in distributions:
        best = np.argmax(distribution)
        results.append({"label": classes[best], "probs": distribution})
    return results