### Изначальное форматирование корпусов, использованное на первых этапах проекта:

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Fetch the three stance corpora by their Hugging Face dataset identifiers.
# The cells below only read the 'train' split of each returned object.
arabic_stance = load_dataset("NoraAlt/Mawqif_Stance-Detection")
synt_english_stance = load_dataset("webimmunization/COVID-19-conspiracy-theories-tweets")
english_tweet_stance = load_dataset("Supakrit65/stance-general-json")

**Приведение корпусов к единому формату**

In [None]:
import re
def label_format(stance):
  """Map a corpus-specific stance value onto the shared numeric scale.

  Returns -1 for 'Against' / 'deny' / 'AGAINST', 1 for 'Favor' / 'support' /
  'FAVOR', and 0 for any other value. Matching is exact (case-sensitive).
  """
  negative = ('Against', 'deny', 'AGAINST')
  positive = ('Favor', 'support', 'FAVOR')
  if stance in negative:
    return -1
  if stance in positive:
    return 1
  # Everything else (including unknown spellings) is treated as neutral.
  return 0


def stance_format(ds):
  """Convert a corpus into a common ``{id: (text, label)}`` mapping.

  Each row of ``ds`` is inspected for three independent layouts:

  * a ``'target'`` key: the row is kept only when the target is
    ``'Covid Vaccine'``; text comes from ``'text'``, label from ``'stance'``;
  * a ``'tweet'`` key: text comes from ``'tweet'``, label from ``'label'``;
  * an ``'input'`` key: the text is the first fragment enclosed in backticks
    inside ``'input'``, the label comes from ``'output'``.

  The checks are not mutually exclusive, so a row matching several layouts
  contributes one entry per layout. Labels are normalised with
  ``label_format``. Ids are consecutive integers starting at 0.

  Rows of the ``'input'`` layout that contain no backtick-quoted fragment are
  skipped (previously they raised ``AttributeError`` on ``None.group``).

  Args:
    ds: iterable of dict-like rows.

  Returns:
    dict mapping int id -> ``(text, label)`` tuple.
  """
  quoted = re.compile(r"`(.*?)`")
  stance_f = {}
  for x in ds:
    # len(stance_f) is always the next free consecutive id.
    if 'target' in x and x['target'] == 'Covid Vaccine':
      stance_f[len(stance_f)] = (x['text'], label_format(x['stance']))
    if 'tweet' in x:
      stance_f[len(stance_f)] = (x['tweet'], label_format(x['label']))
    if 'input' in x:
      match = quoted.search(x['input'])
      # No quoted statement to extract -> nothing usable in this row.
      if match is not None:
        stance_f[len(stance_f)] = (match.group(1), label_format(x['output']))

  return stance_f

In [None]:
# Convert the 'train' split of every corpus to the common {id: (text, label)}
# mapping and print the entry stored under key 0 as a quick sanity check.
arabic_stance_new = stance_format(arabic_stance['train'])
print(arabic_stance_new[0])

synt_english_stance_new = stance_format(synt_english_stance['train'])
print(synt_english_stance_new[0])

english_tweet_stance_new = stance_format(english_tweet_stance['train'])
print(english_tweet_stance_new[0])

In [None]:
import json
# Persist the two English corpora. json.dump runs with its default
# ensure_ascii=True here, so any non-ASCII character is written as a \uXXXX
# escape. In the resulting JSON the integer ids become string keys and the
# (text, label) tuples become two-element arrays.
with open('synt_english_stance.json', 'w') as f:
    json.dump(synt_english_stance_new, f, indent=4)
with open('english_tweet_stance.json', 'w') as f:
    json.dump(english_tweet_stance_new, f, indent=4)

Подгружаем RuArg. Оставляем только те тексты, которые содержат позицию ровно по одной из тем (то есть у которых ровно четыре метки из шести равны -1), чтобы у нас не было неоднозначных данных.

In [None]:
import pandas as pd

df = pd.read_csv('train.tsv', sep='\t')
label_columns = [
    'text_id', 'text',
    'masks_stance', 'masks_argument',
    'quarantine_stance', 'quarantine_argument',
    'vaccines_stance', 'vaccines_argument',
]


def filter_texts(row):
    """Return True when exactly four of the row's ``label_columns`` values equal -1."""
    return row[label_columns].tolist().count(-1) == 4


# Keep only the rows for which the predicate holds.
filtered_df = df[df.apply(filter_texts, axis=1)]

In [None]:
# Show the first rows of the filtered frame to eyeball the result of the filter.
print(filtered_df.head())

In [None]:
# Build {id: (text, label)} from the filtered frame. For every row the label
# columns (everything after 'text_id' and 'text') are scanned left to right and
# the first value equal to 2, 0 or 1 decides the label: 2 -> 1, 0 -> -1, 1 -> 0.
# The id advances once per row, whether or not a label was found.
ruarg_new = {}
ID = 0
for _, row in filtered_df.iterrows():
  text = row['text']
  for label_column in label_columns[2:]:
    raw = row[label_column]
    # (raw value, common-scale label) pairs, tested in the same order as before
    mapped = next((new for old, new in ((2, 1), (0, -1), (1, 0)) if raw == old), None)
    if mapped is not None:
      ruarg_new[ID] = (text, mapped)
      break
  ID += 1

print(ruarg_new)

In [None]:
# ensure_ascii=False writes non-ASCII characters verbatim instead of \uXXXX
# escapes, so the files must be opened with an explicit UTF-8 encoding:
# the platform default (e.g. a legacy code page on Windows) may be unable to
# encode the text and would raise UnicodeEncodeError.
with open('ruarg_cyr.json', 'w', encoding='utf-8') as f:
    json.dump(ruarg_new, f, indent=4, ensure_ascii=False)
with open('arabic_stance_ar.json', 'w', encoding='utf-8') as f:
    json.dump(arabic_stance_new, f, indent=4, ensure_ascii=False)

### Промптинг YandexGPT

In [None]:
!pip install yandex-chain

In [None]:
from yandex_chain import YandexLLM

import os

# Credentials are taken from the environment so that no token has to be pasted
# into (and accidentally committed with) the notebook. When the variables are
# unset, the values previously hard-coded here are used as fallbacks.
LLM = YandexLLM(
    folder_id=os.environ.get("YC_FOLDER_ID", "b1g697l3fh7u10rq35fd"),
    iam_token=os.environ.get("YC_IAM_TOKEN", ""),
)
# Smoke test: one request to confirm that the client is configured correctly.
print(LLM("How are you today?"))

In [None]:
# English classification prompt. Placeholders:
#   {suf}       - optional block of few-shot examples (empty string for zero-shot);
#                 note it is inserted right after the last instruction sentence,
#                 with no separator in between;
#   {statement} - the text whose stance is to be classified.
# Requested answer scale: 2 = positive towards the measures, 1 = neutral,
# 0 = critical of the measures.
template_en = """You are a knowledgeable AI model who is an expert on COVID-19.
Please examine the statement in the context below after the word "STATEMENT:".
Output "STANCE: 2" if the author of the statement has a positive stance towards measures put
in place by governments to combat COVID-19, such as lockdowns, mask mandates, and vaccination campaigns.
If this statement is neutral in sentiment, output "STANCE: 1". Output "STANCE: 0" if the statement is critical of these measures.
Output only "STANCE: " and then a number. Do NOT output anything else.{suf}
STATEMENT: {statement}"""

# Few-shot examples for the English prompt: one statement per answer value.
ex_en = """Some examples:
STATEMENT: [USER], doctors work in masks all their lives and everything is fine, but we will have problems with our organs from wearing them for a short time. More problems will come from complications after covid, influenza and acute respiratory viral infections.
STANCE: 2
STATEMENT: [USER], who can argue, the mask is not even from infection, but from the stupid bureaucrats who ordered not to be allowed into the store without it.
STANCE: 0
STATEMENT: [USER], the vaccine contains only dead coronavirus cells and nothing else!!!
STANCE: 1"""

# Russian classification prompt with the same {suf} / {statement} placeholders
# and the same 2 / 1 / 0 answer scale as template_en.
template_ru = """Ты — модель ИИ, которая является экспертом по теме COVID-19.
Пожалуйста, изучи утверждение в контексте ниже. Выведи "ПОЗИЦИЯ: 2", если автор утверждения положительно
относится к мерам, принимаемым правительствами для борьбы с COVID-19, будь то карантин,
требование к ношению масок и вакцинация. Если данное утверждение является нейтральным,
выведи "ПОЗИЦИЯ: 1". Если же в утверждении содержится критическая по отношении к этом мерам позиция,
выведи "ПОЗИЦИЯ: 0". Выводи только "ПОЗИЦИЯ:" и число. НЕ выводи ничего другого.{suf}
УТВЕРЖДЕНИЕ: {statement}"""

# Few-shot examples for the Russian prompt (they appear to be the same three
# statements as in ex_en, with the same answer values).
ex_ru = """Некоторые примеры:
УТВЕРЖДЕНИЕ: [USER], врачи всю жизнь в масках работают и все нормально, а у нас от кратковременного ношения, прямо, будут проблемы с органами  Больше проблем будет от осложнений после ковида, гриппа и ОРВ.
ПОЗИЦИЯ: 2
УТВЕРЖДЕНИЕ: [USER], кто спорит, маска очень даже  только не от инфекции, а от тупоголовых чинуш, которые приказали не пускать без неё в магазин.
ПОЗИЦИЯ: 0
УТВЕРЖДЕНИЕ: [USER], в вакцине только исключительно мертвые клетки короновируса и ничего другого там нет!!!
ПОЗИЦИЯ: 1"""

# Index 0 holds the English variant, index 1 the Russian one; the prediction
# function selects by index.
templates = (template_en, template_ru)
exs = (ex_en, ex_ru)

In [None]:
!pip install -U scikit-learn

In [None]:
import numpy
from sklearn.metrics import f1_score
import re

def YandexLLM_predict(data, language="English", model=LLM, few_shot=True, templates=templates, exs=exs):
  """Classify every item of ``data`` with the LLM and return the macro F1 score.

  Args:
    data: iterable of dicts providing ``'eng_tr'`` (used when ``language`` is
      "English"), ``'rus'`` (used otherwise) and ``'label'`` (gold label,
      convertible to int).
    language: "English" selects ``templates[0]`` and the ``'eng_tr'`` text;
      any other value selects ``templates[1]`` and the ``'rus'`` text.
    model: callable that takes the prompt string and returns the answer string.
    few_shot: when True, the examples block is inserted into the prompt
      (``exs[0]`` for "English", ``exs[1]`` for "Russian"; for any other
      language no examples are inserted).
    templates: ``(english_template, russian_template)``, each with ``{suf}``
      and ``{statement}`` placeholders.
    exs: ``(english_examples, russian_examples)``.

  Returns:
    Macro-averaged F1 over the items for which both a predicted and a gold
    label could be parsed. Items whose answer contains no digit, or whose gold
    label is missing or not convertible to int, are skipped.
  """
  # NOTE: the first digit anywhere in the answer is taken as the prediction.
  pattern = r'\d'
  actual_labels = []
  predicted_labels = []
  is_english = language == "English"
  curr_template = templates[0] if is_english else templates[1]
  if few_shot and is_english:
    curr_ex = exs[0]
  elif few_shot and language == "Russian":
    curr_ex = exs[1]
  else:
    curr_ex = ''
  for elem in data:
    statement = elem['eng_tr'] if is_english else elem['rus']
    answer = model(curr_template.format(suf=curr_ex, statement=statement))
    try:
      # Parse both values before touching the result lists: appending the
      # prediction first and failing on the gold label afterwards would leave
      # the two lists with different lengths.
      predicted = int(re.findall(pattern, answer)[0])
      actual = int(elem['label'])
    except (IndexError, KeyError, TypeError, ValueError):
      # Unparsable answer or gold label: skip the item, but let unrelated
      # errors (and KeyboardInterrupt) propagate.
      continue
    predicted_labels.append(predicted)
    actual_labels.append(actual)
  return f1_score(actual_labels, predicted_labels, average='macro')

  #
  # for (text, l) in random_slice:
  #   answer = LLM(template_ru.format(text))
    # print(answer)

In [None]:
import json

# Load the three test sets. The files are opened explicitly as UTF-8 (the
# standard JSON encoding) so that reading does not depend on the platform's
# default text encoding.
with open('arabic_stance_test.json', encoding='utf-8') as f:
    arabic_stance_test = json.load(f)
with open('ruarg_test.json', encoding='utf-8') as f:
    ruarg_test = json.load(f)
with open('tweetstance_test.json', encoding='utf-8') as f:
    tweetstance_test = json.load(f)

In [None]:
# Full evaluation grid: 3 test sets x {zero-shot, few-shot} x {English, Russian
# prompt}. Every call sends one request per test item to the model.

# English prompt, zero-shot (no examples in the prompt).
arb_zero_shot_eng = YandexLLM_predict(arabic_stance_test, few_shot=False)
print("Arabic stance zero-shot English F1-score:", arb_zero_shot_eng)
ruarg_zero_shot_eng = YandexLLM_predict(ruarg_test, few_shot=False)
print("RuArg zero-shot English F1-score:", ruarg_zero_shot_eng)
ts_zero_shot_eng = YandexLLM_predict(tweetstance_test, few_shot=False)
print("English tweet stance zero-shot English F1-score:", ts_zero_shot_eng)

# English prompt, few-shot (defaults: language="English", few_shot=True).
arb_few_shot_eng = YandexLLM_predict(arabic_stance_test)
print("Arabic stance few-shot English F1-score:", arb_few_shot_eng)
ruarg_few_shot_eng = YandexLLM_predict(ruarg_test)
print("RuArg few-shot English F1-score:", ruarg_few_shot_eng)
ts_few_shot_eng = YandexLLM_predict(tweetstance_test)
print("English tweet stance few-shot English F1-score:", ts_few_shot_eng)

# Russian prompt, zero-shot.
arb_zero_shot_ru = YandexLLM_predict(arabic_stance_test, language="Russian", few_shot=False)
print("Arabic stance zero-shot Russian F1-score:", arb_zero_shot_ru)
ruarg_zero_shot_ru = YandexLLM_predict(ruarg_test, language="Russian", few_shot=False)
print("RuArg zero-shot Russian F1-score:", ruarg_zero_shot_ru)
ts_zero_shot_ru = YandexLLM_predict(tweetstance_test, language="Russian", few_shot=False)
print("English tweet stance zero-shot Russian F1-score:", ts_zero_shot_ru)

# Russian prompt, few-shot.
arb_few_shot_ru = YandexLLM_predict(arabic_stance_test, language="Russian")
print("Arabic stance few-shot Russian F1-score:", arb_few_shot_ru)
ruarg_few_shot_ru = YandexLLM_predict(ruarg_test, language="Russian")
print("RuArg few-shot Russian F1-score:", ruarg_few_shot_ru)
ts_few_shot_ru = YandexLLM_predict(tweetstance_test, language="Russian")
print("English tweet stance few-shot Russian F1-score:", ts_few_shot_ru)

### Ниже — код, с помощью которого были получены промежуточные результаты:

In [None]:
import random

# Zero-shot evaluation of the English prompt on a random sample of the English
# tweet corpus. The sample is capped at the corpus size and drawn from a seeded
# generator so that the reported score can be reproduced.
pool_1 = list(english_tweet_stance_new.values())
random_slice_1 = random.Random(42).sample(pool_1, min(100, len(pool_1)))
actual_labels_1 = []
predicted_labels_1 = []
pattern = r'-?\d'
for (text, l) in random_slice_1:
  # template_en has the named fields {suf} and {statement}; a positional
  # .format(text) raises KeyError, so both are filled explicitly
  # (empty suf = no few-shot examples).
  answer = LLM(template_en.format(suf='', statement=text))
  # print(answer)
  match = re.search(pattern, answer)
  if match is None:
    # No number in the answer: skip the item.
    continue
  predicted_labels_1.append(int(match.group(0)))
  # Gold labels produced by label_format are -1/0/1, while the prompt asks the
  # model to answer 0/1/2; shift the gold label onto the prompt's scale.
  actual_labels_1.append(int(l) + 1)

In [None]:
# Macro-averaged F1 over the sampled tweets for which an answer could be parsed.
print(f1_score(actual_labels_1, predicted_labels_1, average='macro'))

In [None]:
import random

# Zero-shot evaluation of the English prompt on a random sample of the
# synthetic English corpus. The sample is capped at the corpus size and drawn
# from a seeded generator so that the reported score can be reproduced.
pool_2 = list(synt_english_stance_new.values())
random_slice_2 = random.Random(42).sample(pool_2, min(100, len(pool_2)))
actual_labels_2 = []
predicted_labels_2 = []
pattern = r'-?\d'
for (text, l) in random_slice_2:
  # template_en has the named fields {suf} and {statement}; a positional
  # .format(text) raises KeyError, so both are filled explicitly
  # (empty suf = no few-shot examples).
  answer = LLM(template_en.format(suf='', statement=text))
  # print(answer)
  match = re.search(pattern, answer)
  if match is None:
    # No number in the answer: skip the item.
    continue
  predicted_labels_2.append(int(match.group(0)))
  # Gold labels produced by label_format are -1/0/1, while the prompt asks the
  # model to answer 0/1/2; shift the gold label onto the prompt's scale.
  actual_labels_2.append(int(l) + 1)

In [None]:
# Macro-averaged F1 over the sampled items for which an answer could be parsed.
print(f1_score(actual_labels_2, predicted_labels_2, average='macro'))

In [None]:
# Read the augmented corpus explicitly as UTF-8 so that loading does not depend
# on the platform's default text encoding.
with open('arabic_stance_aug.json', 'r', encoding='utf-8') as file:
    arabic_stance_translated_dict = json.load(file)

# Every value is a three-element record; the first element is dropped and the
# remaining (text, label) pair is kept.
# NOTE(review): presumably the record is (original text, English translation,
# label) - confirm against the script that produced the file.
arabic_stance_only_en = [
    (text, l) for _, text, l in arabic_stance_translated_dict.values()
]

In [None]:
# Zero-shot evaluation of the English prompt on the whole translated corpus
# (no sampling here, despite the variable name).
random_slice_3 = arabic_stance_only_en
actual_labels_3 = []
predicted_labels_3 = []
pattern = r'-?\d'
for (text, l) in random_slice_3:
  # template_en has the named fields {suf} and {statement}; a positional
  # .format(text) raises KeyError, so both are filled explicitly
  # (empty suf = no few-shot examples).
  answer = LLM(template_en.format(suf='', statement=text))
  # print(answer)
  match = re.search(pattern, answer)
  if match is None:
    # No number in the answer: skip the item.
    continue
  try:
    # Convert the gold label before appending anything, so that a bad label
    # cannot leave the two lists with different lengths.
    actual = int(l)
  except (TypeError, ValueError):
    continue
  predicted_labels_3.append(int(match.group(0)))
  # NOTE(review): the gold label is used exactly as stored in the file, while
  # the prompt answers on a 0/1/2 scale - confirm the file uses the same scale.
  actual_labels_3.append(actual)

In [None]:
# Macro-averaged F1 over the items for which an answer could be parsed.
print(f1_score(actual_labels_3, predicted_labels_3, average='macro'))