In [None]:
!pip install -q -U langchain transformers bitsandbytes accelerate langchain-community

In [None]:
import torch
from transformers import BitsAndBytesConfig
from langchain import HuggingFacePipeline, PromptTemplate, LLMChain
from langchain_core.output_parsers import StrOutputParser
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [None]:
!pip install huggingface_hub

In [None]:
from huggingface_hub import login
from google.colab import userdata
# Fetch the access token stored under the Colab secret named 'HF_TOKEN' and
# authenticate this session with it, so the token itself never appears in the notebook.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token)

In [None]:
# Checkpoint shared by the model and its tokenizer.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# 4-bit NF4 quantization with double quantization enabled; float16 is the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized model onto the GPU, then the matching tokenizer.
model_4bit = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="cuda",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Generation settings forwarded through the pipeline. Sampling is enabled
# (do_sample with top_k=5), and the EOS token id doubles as the padding id.
generation_kwargs = dict(
    use_cache=True,
    max_length=2500,
    do_sample=True,
    top_k=5,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
)

# Text-generation pipeline around the already-loaded 4-bit model and its tokenizer.
pipeline_inst = pipeline(
    "text-generation",
    model=model_4bit,
    tokenizer=tokenizer,
    device_map="auto",
    **generation_kwargs,
)

In [None]:
# Wrap the transformers pipeline so it can serve as the `llm` step of the
# LangChain chain built in generate_response.
llm = HuggingFacePipeline(pipeline=pipeline_inst)

In [None]:
# Shell command: print the GPU status report (presumably to check memory use
# after the model has been loaded).
!nvidia-smi

In [None]:
# English instruction prompt. `{suf}` receives the optional few-shot block and
# `{statement}` the text to classify.
# The prompt deliberately contains no literal "<s>" / "</s>": the tokenizer adds
# the BOS token on its own, and in the Mistral instruct format the EOS token
# belongs after the model's answer, not directly after "[/INST]".
# The trailing newline is kept so the model's answer starts on its own line,
# which is what generate_response relies on when it picks the last line.
template_en = """[INST]You are a knowledgeable AI model who is an expert on COVID-19. Please examine the statement in the context below after the word "STATEMENT:". Output "STANCE: 2" if the author of the statement has a positive stance towards measures put in place by governments to combat COVID-19, such as lockdowns, mask mandates, and vaccination campaigns. If this statement is neutral in sentiment, output "STANCE: 1". Output "STANCE: 0" if the statement is critical of these measures. Output only "STANCE: " and then a number. Do NOT output anything else.{suf}
STATEMENT: {statement}[/INST]
"""

# English few-shot block, substituted for `{suf}` in template_en.
# It starts with a newline (so it begins on its own line after the instructions)
# and has no trailing newline.
ex_en = "\n".join([
    "",
    "Some examples:",
    "STATEMENT: [USER], doctors work in masks all their lives and everything is fine, but we will have problems with our organs from wearing them for a short time. More problems will come from complications after covid, influenza and acute respiratory viral infections.",
    "STANCE: 2",
    "STATEMENT: [USER], who can argue, the mask is not even from infection, but from the stupid bureaucrats who ordered not to be allowed into the store without it.",
    "STANCE: 0",
    "STATEMENT: [USER], the vaccine contains only dead coronavirus cells and nothing else!!!",
    "STANCE: 1",
])

# Russian instruction prompt. `{suf}` receives the optional few-shot block and
# `{statement}` the text to classify.
# The prompt deliberately contains no literal "<s>" / "</s>": the tokenizer adds
# the BOS token on its own, and in the Mistral instruct format the EOS token
# belongs after the model's answer, not directly after "[/INST]".
# The trailing newline is kept so the model's answer starts on its own line,
# which is what generate_response relies on when it picks the last line.
template_ru = """[INST]Ты — модель ИИ, которая является экспертом по теме COVID-19. Пожалуйста, изучи утверждение в контексте ниже. Выведи "ПОЗИЦИЯ: 2", если автор утверждения положительно относится к мерам, принимаемым правительствами для борьбы с COVID-19, будь то карантин, требование к ношению масок и вакцинация. Если данное утверждение является нейтральным, выведи "ПОЗИЦИЯ: 1". Если же в утверждении содержится критическая по отношении к этом мерам позиция, выведи "ПОЗИЦИЯ: 0". Выводи только "ПОЗИЦИЯ:" и число. НЕ выводи ничего другого.{suf}
УТВЕРЖДЕНИЕ: {statement}[/INST]
"""

# Russian few-shot block, substituted for `{suf}` in template_ru.
# It starts with a newline and, unlike ex_en, also ends with one.
ex_ru = "\n".join([
    "",
    "Некоторые примеры:",
    "УТВЕРЖДЕНИЕ: [USER], врачи всю жизнь в масках работают и все нормально, а у нас от кратковременного ношения, прямо, будут проблемы с органами  Больше проблем будет от осложнений после ковида, гриппа и ОРВ.",
    "ПОЗИЦИЯ: 2",
    "УТВЕРЖДЕНИЕ: [USER], кто спорит, маска очень даже  только не от инфекции, а от тупоголовых чинуш, которые приказали не пускать без неё в магазин.",
    "ПОЗИЦИЯ: 0",
    "УТВЕРЖДЕНИЕ: [USER], в вакцине только исключительно мертвые клетки короновируса и ничего другого там нет!!!",
    "ПОЗИЦИЯ: 1",
    "",
])

In [None]:
def generate_response(statement, template, suf=''):
  """Classify one statement with the LLM and return the last line of its output.

  Args:
    statement: Text substituted for `{statement}` in the template.
    template: Prompt template containing `{statement}` and `{suf}` placeholders.
    suf: Optional text substituted for `{suf}` (e.g. few-shot examples).

  Returns:
    The last non-empty line of the chain's output with surrounding whitespace
    removed, or '' if the output contains no non-blank line.
  """
  prompt = PromptTemplate(template=template, input_variables=["statement", "suf"])
  chain = prompt | llm | StrOutputParser()
  response = chain.invoke({"statement": statement, "suf": suf})
  # Scan from the end and skip blank lines: a generation that finishes with a
  # newline would otherwise yield an empty string instead of the answer.
  for line in reversed(response.splitlines()):
    candidate = line.strip()
    if candidate:
      return candidate
  return ''

In [None]:
import json

# Test-set files to run through the model; each one is read as JSON by the
# inference loop. To evaluate a subset, enable one of the commented-out
# alternatives below instead.
filenames = [
    'tweetstance_test.json',
    'ruarg_test.json',
    'arabic_stance_test.json',
]
# filenames = ['ruarg_test.json']
# filenames = ['arabic_stance_test.json']
# filenames = ['tweetstance_test.json', 'ruarg_test.json']

In [None]:
# Accumulator for model outputs: the inference loop appends one entry per
# processed file, and the export cell pairs entries with `filenames` by position.
# Re-run this cell before re-running the loop, otherwise results are appended twice.
all_res = []

In [None]:
import re
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm

# Pattern for pulling the numeric label out of an English "STANCE: N" answer.
# It is not applied in this cell (answers are stored raw for later scoring).
# NOTE: it would not match the Russian "ПОЗИЦИЯ: N" answers requested by template_ru.
stance_prog = re.compile(r'STANCE: (-?\d)')

# One entry per prompting setup: (template, record field holding the text, few-shot suffix).
# Order matters: English zero-shot, English few-shot, Russian zero-shot, Russian few-shot,
# because the export cell names the result lists by position.
prompt_runs = [
    (template_en, 'eng_tr', ''),
    (template_en, 'eng_tr', ex_en),
    (template_ru, 'rus', ''),
    (template_ru, 'rus', ex_ru),
]


def _collect_responses(records, template, text_key, suf=''):
  """Query the model once per record and return its raw answers in input order.

  Args:
    records: Sequence of dicts loaded from a dataset file.
    template: Prompt template passed to generate_response.
    text_key: Key of the record field holding the statement text.
    suf: Few-shot suffix passed to generate_response ('' for zero-shot).

  Returns:
    List with one generate_response result per record.
  """
  return [
      generate_response(template=template, statement=rec[text_key], suf=suf)
      for rec in tqdm(records)
  ]


for filename in filenames:
  print(filename)
  # The records carry non-ASCII text (e.g. the Russian 'rus' field), so decode
  # explicitly as UTF-8 instead of relying on the platform default encoding.
  with open(filename, 'r', encoding='utf-8') as fin:
    data = json.load(fin)
  # Labels from the data file, parsed up front so a malformed label fails
  # before any (slow) inference starts. Not used for scoring in this cell.
  true = [int(x['label']) for x in data]
  cur_res = [
      _collect_responses(data, template, text_key, suf)
      for template, text_key, suf in prompt_runs
  ]
  all_res.append(cur_res)

In [None]:
!pip install datasets

In [None]:
import csv

# File-name suffixes for the four result lists stored per input file, in the
# same order the inference loop appends them.
exts = ['en_zs', 'en_fs', 'ru_zs', 'ru_fs']

# zip stops at the shorter sequence, so only files that already have results
# in all_res are exported (e.g. after a partially completed run).
for source_name, file_results in zip(filenames, all_res):
  for ext, responses in zip(exts, file_results):
    csv_filename = source_name.replace('.json', '_' + ext + '.csv')
    # newline='' lets the csv module control line endings (avoids blank rows on
    # Windows); UTF-8 because the answers may contain Cyrillic text.
    with open(csv_filename, 'w', newline='', encoding='utf-8') as fout:
      # One single-column row per model answer.
      csv.writer(fout).writerows([resp] for resp in responses)