# Импортированные библиотеки

In [None]:
!pip install transformers auto-gptq

In [None]:
import torch

In [None]:
import pandas as pd

In [None]:
%%capture
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
!pip install flash-attn

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import AutoModelForMaskedLM, AutoTokenizer

In [None]:
from openai import AsyncOpenAI

In [None]:
import numpy as np
import asyncio
import os

from tqdm.auto import tqdm
from copy import deepcopy

In [None]:
!pip install pymorphy3

In [None]:
!pip install nltk

In [None]:
import pymorphy3

In [None]:
from pymystem3 import Mystem
# Shared Mystem analyzer instance; get_pos and get_lemma call mystem.analyze on it.
mystem = Mystem()

In [None]:
import string

# Установка моделей

In [None]:
def install_gpt():
    """Load the GPTQ-quantized ruGPT-3.5-13B checkpoint and its tokenizer.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    # Imported inside the function, so auto_gptq is only required when this
    # loader is actually called.
    from auto_gptq import AutoGPTQForCausalLM

    repo_id = 'fffrrt/ruGPT-3.5-13B-GPTQ'
    quantized_kwargs = dict(
        model_basename='gptq_model-4bit-128g',
        use_safetensors=True,
        trust_remote_code=True,
        device_map='auto',
        use_triton=False,
        quantize_config=None,
    )

    # The tokenizer is fetched first, then the quantized weights.
    gpt_tokenizer = AutoTokenizer.from_pretrained(repo_id, use_fast=True)
    gpt_model = AutoGPTQForCausalLM.from_quantized(repo_id, **quantized_kwargs)
    return gpt_model, gpt_tokenizer


In [None]:
def install_llama():
    """Load Llama-3.1-8B through unsloth in 4-bit mode.

    The checkpoint is loaded with a 2048-token maximum sequence length,
    ``dtype=None`` and ``load_in_4bit=True``.

    Returns:
        tuple: ``(model, tokenizer)`` as returned by
        ``FastLanguageModel.from_pretrained``.
    """
    # Imported inside the function, so unsloth is only required when this
    # loader is actually called.
    from unsloth import FastLanguageModel

    # Pre-quantized checkpoints published under the same namespace (for
    # example "unsloth/Meta-Llama-3.1-8B-bnb-4bit" or
    # "unsloth/mistral-7b-v0.3-bnb-4bit") can be substituted for model_name.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Meta-Llama-3.1-8B",
        max_seq_length=2048,
        dtype=None,
        load_in_4bit=True,
    )
    return model, tokenizer

In [None]:
def install_bert_small():
    """Load the deepvk/RuModernBERT-small masked-LM in eval mode.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    checkpoint = "deepvk/RuModernBERT-small"
    bert_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # .eval() returns the module itself, so the call can be chained.
    bert_model = AutoModelForMaskedLM.from_pretrained(checkpoint).eval()
    return bert_model, bert_tokenizer

In [None]:
def install_bert_big():
    """Load the deepvk/RuModernBERT-base masked-LM in eval mode.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    checkpoint = "deepvk/RuModernBERT-base"
    bert_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # .eval() returns the module itself, so the call can be chained.
    bert_model = AutoModelForMaskedLM.from_pretrained(checkpoint).eval()
    return bert_model, bert_tokenizer

In [None]:
# Async client pointed at the OpenRouter endpoint.
local_client = AsyncOpenAI(
    base_url="https://openrouter.ai/api/v1",
    # Take the key from the environment so the secret does not have to be
    # pasted into the notebook; "___" stays as the placeholder fallback.
    api_key=os.environ.get("OPENROUTER_API_KEY", "___"),
    timeout=30,
)

# Short alias -> full model identifier. Kept as one literal so an alias and
# its model cannot drift out of step.
models_dict = {
    'gpt-3.5': 'gpt-3.5-turbo-0125',
    'gpt-4': 'gpt-4-0125-preview',
    'gpt-4t': 'gpt-4-turbo-2024-04-09',
    'gpt-4o': 'gpt-4o-2024-05-13',
    'gpt-4om': 'gpt-4o-mini-2024-07-18',
    'mistral': 'mistralai/Mistral-Large-Instruct-2407',
    'mistral-q': 'qeternity/Mistral-Large-Instruct-2407-w8a8',
    'llama': 'meta-llama/Meta-Llama-3.1-70B-Instruct',
    'llama-q': 'neuralmagic/Meta-Llama-3.1-70B-Instruct-quantized.w8a8',
    'llama405': 'neuralmagic/Meta-Llama-3.1-405B-Instruct-quantized.w4a16',
    'qwen': 'Qwen/Qwen2.5-72B-Instruct',
    'phi': 'microsoft/Phi-3-medium-128k-instruct',
    'nemotron': 'nvidia/Llama-3.1-Nemotron-70B-Instruct-HF',
    'qwen-reason': 'qwen/qwq-32b',
}
# The parallel lists are still exposed under their original names.
models_key = list(models_dict)
models = list(models_dict.values())

# Base request timeouts, positionally matched to the first aliases above.
# The list is shorter than models_key: zip() stops at the shorter input, so
# the remaining aliases have no entry here and LLM.chat falls back to its
# own default of 1 for them.
timeout = [32, 512, 256, 128, 64, 1, 1, 1, 1]
timeout_dict = dict(zip(models_key, timeout))

class LLM:
    """Async chat-completion wrapper with an in-memory cache for deterministic calls.

    Attributes are set by ``reset``:
        cache: dict mapping a request key to the list of reply strings.
        use_cache: whether cached replies may be stored and served.
        base_model: alias looked up in ``models_dict`` (used verbatim when
            the alias is not registered).
    """

    def __init__(self, use_cache=True, base_model='llama'):
        self.reset(use_cache, base_model)

    def reset(self, use_cache=True, base_model='llama'):
        """Drop all cached replies and (re)configure the wrapper."""
        self.cache = {}
        self.use_cache = use_cache
        self.base_model = base_model

    async def chat(self, messages, temperature=0, n=1, top_p=1, max_tokens=4096, is_opt=False):
        """Send a chat request and return the reply texts.

        Args:
            messages: Either a prompt string (wrapped into a single user
                message) or a ready list of chat messages.
            temperature, n, top_p, max_tokens: Forwarded to the
                chat-completions endpoint.
            is_opt: Accepted for interface compatibility; not used here.

        Returns:
            list: One content string per returned choice (a fresh copy, so
            callers can mutate it without touching the cache).

        Failed requests are printed and retried indefinitely with a random
        exponential back-off.
        """
        if isinstance(messages, str):
            messages = [{"role": "user", "content": messages}]
        model_key = self.base_model
        model = models_dict.get(model_key, model_key)
        timeout = timeout_dict.get(model_key, 1)
        is_gpt = 'gpt' in model
        # NOTE(review): openAI_client is not defined in the visible part of
        # this notebook - confirm it exists before selecting a 'gpt' model.
        client = openAI_client if is_gpt else local_client

        # Only deterministic requests are cacheable. Storing sampled replies
        # would let a later temperature=0 call return a random sample, and
        # the key must cover every parameter that changes the reply.
        cacheable = self.use_cache and temperature == 0
        cache_key = (str(messages), model, n, top_p, max_tokens)
        if cacheable and cache_key in self.cache:
            return deepcopy(self.cache[cache_key])

        retries = 0
        while True:
            try:
                r = await client.chat.completions.create(
                    model=model,
                    messages=messages,
                    temperature=temperature,
                    n=n,
                    top_p=top_p,
                    max_tokens=max_tokens,
                    # GPT models get a randomized timeout that grows with
                    # the retry count; everything else effectively waits
                    # without a limit.
                    timeout=(
                        np.random.exponential(timeout) * min(10, retries) + timeout
                        if is_gpt
                        else 100000
                    ),
                )
                break
            except Exception as e:
                print(e)
                retries += 1
                await asyncio.sleep(np.random.exponential(3))

        replies = [choice.message.content for choice in r.choices]
        if cacheable:
            self.cache[cache_key] = replies
        return deepcopy(replies)


llm = LLM(base_model='qwen-reason')

In [None]:
import re
def extract_reasoning(s):
    """Return the stripped text of the first ``<reason>...</reason>`` span, or None."""
    found = re.search(r"<reason>(.*?)</reason>", s, flags=re.DOTALL)
    if found is None:
        return None
    return found.group(1).strip()

def extract_answer(s):
    """Return the stripped text of the first ``<answer>...</answer>`` span, or None."""
    found = re.search(r"<answer>(.*?)</answer>", s, flags=re.DOTALL)
    return found.group(1).strip() if found else None

def extract_justification(s):
    """Return the stripped text of the first ``<justification>...</justification>`` span, or None."""
    found = re.search(r"<justification>(.*?)</justification>", s, flags=re.DOTALL)
    if not found:
        return None
    inner = found.group(1)
    return inner.strip()

# Предсказание таргетного слова

In [None]:
# Load the masked sentences; the prediction cells below read the
# 'Sentences_with_mask' column of this frame.
df_mask = pd.read_csv('sentences_mask.csv')

## Gpt и Llama

In [None]:
model, tokenizer = install_gpt()  # install_llama can be used here instead

In [None]:
#    outputs = model.generate(
#        **inputs,
#        max_new_tokens=10,
#        do_sample=True,
#        top_k=100 / 50,
#        top_p=0.95 / 0.9,
#        temperature=0.7 / 0.4,
#        num_return_sequences=3 / 5,
#        eos_token_id=tokenizer.eos_token_id,
#        pad_token_id=tokenizer.eos_token_id,
#    )

In [None]:
def predict_masked_word(sentence, tokenizer, model, max_length=1):
    """Sample a continuation of the text before ``<mask>`` and return its first word.

    Only the left context is used: everything from the first ``<mask>``
    onwards is discarded before generation.

    Args:
        sentence: Text containing a ``<mask>`` placeholder.
        tokenizer: Tokenizer matching ``model``.
        model: Causal language model exposing ``generate`` and ``device``.
        max_length: Number of new tokens to generate.

    Returns:
        str: First whitespace-separated word of the generated continuation,
        or ``""`` when there is no ``<mask>`` or nothing usable was generated.
    """
    if "<mask>" not in sentence:
        return ""

    # Keep the left context only.
    prefix, _, _ = sentence.partition("<mask>")
    prompt = prefix.strip()
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        do_sample=True,
        top_k=20,
        temperature=0.7,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # The returned sequence starts with the prompt tokens. Decode only the
    # newly generated ids instead of cutting the decoded text at
    # len(prompt): decoding does not always reproduce the prompt
    # character-for-character, and then a character-based slice is misaligned.
    prompt_length = inputs["input_ids"].shape[1]
    continuation = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    pieces = continuation.split()
    return pieces[0] if pieces else ""


In [None]:
# Collected predictions of the causal model, one per masked sentence.
predictions = []

if __name__ == "__main__":
    test_sentences = df_mask['Sentences_with_mask']
    separator = "-" * 50
    for sentence in test_sentences:
        prediction = predict_masked_word(sentence, tokenizer, model)
        predictions.append(prediction)
        # One print call with a newline separator emits the same three lines.
        print(
            f"Оригинальное предложение: {sentence}",
            f"Предсказанное слово: {prediction}",
            separator,
            sep="\n",
        )

## Bert

In [None]:
model, tokenizer = install_bert_small()  # install_bert_big can be used here instead

In [None]:
def predict_word(model, tokenizer, sentence):
    """Return the masked-LM's top prediction for the mask token in ``sentence``.

    Args:
        model: Masked language model; called as ``model(**inputs)`` and
            expected to expose ``.logits`` on its output.
        tokenizer: Matching tokenizer providing ``mask_token_id`` and
            ``unk_token``.
        sentence: Text that contains the tokenizer's mask token.

    Returns:
        str | None: The decoded, whitespace-stripped token, or ``None`` when
        the sentence has no mask token, the prediction is the unknown token,
        or a ``ValueError`` is raised along the way.
    """
    try:
        inputs = tokenizer(sentence, return_tensors="pt")
        # list.index raises ValueError when the mask token is absent.
        masked_index = inputs["input_ids"][0].tolist().index(tokenizer.mask_token_id)

        # Pure inference: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            logits = model(**inputs).logits

        predicted_index = int(logits[0, masked_index].argmax(dim=-1))
        predicted_word = tokenizer.decode([predicted_index]).strip()
        if predicted_word == tokenizer.unk_token:
            return None
        return predicted_word
    except ValueError:
        return None

In [None]:
# Collected BERT predictions, one entry per sentence (None when nothing was predicted).
predictions = []
for masked_sentence in df_mask['Sentences_with_mask']:
    # The BERT tokenizer expects its own mask token rather than "<mask>".
    sentence = masked_sentence.replace("<mask>", "[MASK]")
    predicted_word = predict_word(model, tokenizer, sentence)

    if not predicted_word:
        print("Предсказанное слово: не удалось предсказать")
        predictions.append(None)
        continue

    # Echo the prediction for every row.
    print(f"Предсказанное слово: {predicted_word}")
    predictions.append(predicted_word)

## Qwen

In [None]:
# Preview the first rows of the masked-sentence frame.
df_mask.head()

Unnamed: 0,Sentences_with_mask
0,"На болотах оставался ещё <mask>, но на берегах..."
1,Он ловко поддел концом <mask> замочки и они от...
2,"Ваня раскрыл было <mask>, но понял, что что-то..."
3,"Сделав мне знак помолчать, он приложил <mask> ..."
4,"Я сделала <mask> навстречу: приехала к ней, по..."


In [None]:
async def fill_mask(sentence_with_mask):
    """Ask the chat model for one word that continues the text before ``<mask>``.

    Only the left context (everything before the first ``<mask>``) is sent.
    The reply is cleaned of an echoed "Ответ:" label and surrounding
    punctuation, and the first remaining word is returned.

    Returns:
        str: The predicted word, or ``""`` when there is no mask, the model
        call fails, or the reply holds no usable word.
    """
    if "<mask>" not in sentence_with_mask:
        return ""

    prefix = sentence_with_mask.partition("<mask>")[0].strip()
    prompt = f"Предложение нельзя оставить без предсказания. Напиши одно слово, которое может продолжить предложение: '{prefix}'\nОтвет:"

    try:
        response = await llm.chat(prompt)
        generated_text = response[0].strip()
    except Exception as e:
        print(f"Ошибка от модели: {e}")
        return ""

    answer = generated_text.replace("Ответ:", "").strip()

    # Tokens that are never a real answer once punctuation is stripped.
    ignored = {"", "ответ", ":"}
    cleaned = (token.strip(".,:;!?—-\"'") for token in answer.split())
    # First meaningful word, or "" when every token was noise.
    return next((word for word in cleaned if word.lower() not in ignored), "")


In [None]:
async def process_dataframe(df):
    """Predict a word for every masked sentence and store it on the frame.

    Sentences from ``df['Sentences_with_mask']`` are processed one after
    another; the results are written to a new ``'Predicted_Words'`` column
    of the same frame, which is also returned.
    """
    progress = tqdm(df['Sentences_with_mask'], desc="Processing sentences", unit="sentence")
    # Awaiting inside the comprehension keeps the calls strictly sequential.
    df['Predicted_Words'] = [await fill_mask(item) for item in progress]
    return df

In [None]:
async def main():
    """Fill the masks of the global ``df_mask`` and print the resulting frame."""
    filled = await process_dataframe(df_mask)
    print(filled)

In [None]:
# Top-level await: valid in a notebook/IPython cell, not in a plain Python script.
await main()

In [None]:
# Persist the frame, including the 'Predicted_Words' column added by
# process_dataframe. The row index is written as well (to_csv default).
df_mask.to_csv('qwen_result.csv')

# Обработка предсказаний

In [None]:
def get_pos(word):
    """Return the part-of-speech tag Mystem assigns to ``word``, or None.

    The tag is the leading component of the first analysis' ``gr`` string
    (the text before the first ``,`` or ``=``). Any error during analysis
    is printed and turned into ``None``.
    """
    try:
        tokens = mystem.analyze(word)
        if not tokens:
            return None
        first = tokens[0]
        if 'analysis' not in first or not first['analysis']:
            return None
        grammar = first['analysis'][0]['gr']
        # Keep only the part of speech: cut at the first ',' and then at '='.
        leading = grammar.partition(',')[0]
        return leading.partition('=')[0]
    except Exception as e:
        print(f"Ошибка при анализе слова '{word}': {e}")
        return None

In [None]:
def get_lemma(word):
    """Return the lemma Mystem assigns to ``word``, or None.

    The lemma is the ``lex`` field of the first analysis of the first
    token. Any error during analysis is printed and turned into ``None``.
    """
    try:
        tokens = mystem.analyze(word)
        if not tokens:
            return None
        first = tokens[0]
        if 'analysis' not in first or not first['analysis']:
            return None
        return first['analysis'][0]['lex']
    except Exception as e:
        print(f"Ошибка при лемматизации слова '{word}': {e}")
        return None

In [None]:
def get_length_category(word):
    """Classify ``word`` by its length.

    Returns:
        'short' for 1-4 characters, 'medium' for 5-7, 'long' for 8 or more,
        and ``None`` for an empty word.
    """
    size = len(word)
    if size < 1:
        return None
    if size < 5:
        return 'short'
    if size < 8:
        return 'medium'
    return 'long'

In [None]:
# Derive per-row features of each prediction (length, length category, part of
# speech, POS agreement with the target word) and write them back to df_mask.
# Rows without a usable prediction get the error label in every feature column.
for index, row in df_mask.iterrows():
    predicted_word_dirty = row['Predicted_Words']

    # A missing prediction (None, or NaN after a CSV round-trip) is not a
    # string; treat it as a failed prediction instead of crashing on .rstrip().
    is_text = isinstance(predicted_word_dirty, str)
    predicted_word = predicted_word_dirty.rstrip(string.punctuation) if is_text else ''

    prediction_failed = (
        not is_text
        or predicted_word_dirty == '___'
        or predicted_word_dirty.startswith(
            ('Предсказанное слово: не удалось предсказать', 'Вывод')
        )
        or not predicted_word.strip()
    )

    if prediction_failed:
        # Column names are kept exactly as they were (including 'lengt_pred'),
        # because they are part of the resulting table.
        for column in (
            'lengt_pred',
            'char_pred',
            'POS_pred',
            'POS_hitting',
            'Semantic_similarity',
            'Error_type_pred',
        ):
            df_mask.loc[index, column] = 'Ошибка предсказания'
        continue

    # Number of characters in the cleaned prediction.
    df_mask.loc[index, 'char_pred'] = len(predicted_word)

    # Length bucket of the cleaned prediction.
    df_mask.loc[index, 'lengt_pred'] = get_length_category(predicted_word)

    # Part of speech of the prediction.
    pos_pred = get_pos(predicted_word)
    df_mask.loc[index, 'POS_pred'] = pos_pred

    # 1 only when both tags are known and equal, otherwise 0.
    pos_target = get_pos(row['target'])
    if pos_pred and pos_target and pos_pred == pos_target:
        df_mask.loc[index, 'POS_hitting'] = 1
    else:
        df_mask.loc[index, 'POS_hitting'] = 0

# Обработка результатов людей

In [None]:
# Translation table applied to the human-results 'POS' column so its tags can
# be compared with the POS tags produced by morph.parse in analyze_word.
# NOTE(review): the keys look like Mystem-style tags - confirm against the data.
pos_mapping = {
    # nouns, verbs, adjectives, adverbs
    "S": "NOUN",
    "V": "VERB",
    "A": "ADJF",
    "ADV": "ADVB",
    # function words
    "PR": "PREP",
    "CONJ": "CONJ",
    "PART": "PRCL",
    "INTJ": "INTJ",
    # numerals (both source tags collapse to one target tag)
    "NUM": "NUMR",
    "ANUM": "NUMR",
    # pronouns (both source tags collapse to one target tag)
    "SPRO": "NPRO",
    "APRO": "NPRO",
    # predicatives
    "PRED": "PRED",
}


In [None]:
from collections import Counter

# Load the human answers and translate their 'POS' tags through pos_mapping;
# tags missing from the mapping become NaN in 'mapped_POS'.
df_human = pd.read_csv('people_pred.csv')
df_human['mapped_POS'] = df_human['POS'].map(pos_mapping)


def _morph_analyzer():
    """Return the notebook-wide ``morph`` analyzer, creating it on first use."""
    global morph
    try:
        return morph
    except NameError:
        # `morph` is not defined earlier in the notebook; build it once from
        # the already-imported pymorphy3 and reuse it on later calls.
        morph = pymorphy3.MorphAnalyzer()
        return morph


def analyze_word(word):
    """Describe ``word`` morphologically using the first pymorphy parse.

    Args:
        word: The answer to analyse; it is converted with ``str`` first.

    Returns:
        pandas.Series indexed by ``answer_lemma`` (normal form),
        ``answer_pos`` (POS tag of the parse), ``answer_form`` (full tag as a
        string), ``answer_len`` (length of the word), ``lemma_len`` (length
        of the lemma) and ``length_category`` ("short" for a lemma of up to
        4 characters, "medium" for up to 7, otherwise "long").
    """
    text = str(word)
    parsed = _morph_analyzer().parse(text)[0]
    lemma = parsed.normal_form
    lemma_len = len(lemma)

    # The category is based on the lemma length, not on the surface form.
    if lemma_len <= 4:
        length_cat = "short"
    elif lemma_len <= 7:
        length_cat = "medium"
    else:
        length_cat = "long"

    return pd.Series(
        [lemma, parsed.tag.POS, str(parsed.tag), len(text), lemma_len, length_cat],
        index=['answer_lemma', 'answer_pos', 'answer_form', 'answer_len', 'lemma_len', 'length_category'],
    )


# Attach the morphological features of every answer as new columns.
answer_features = df_human['answer'].apply(analyze_word)
df_human = df_human.join(answer_features)


def _lemma_matches_target(row):
    """True when the answer's lemma equals the lower-cased 'word.id' value."""
    return row['answer_lemma'] == str(row['word.id']).lower()


def _pos_matches_target(row):
    """True when the mapped POS tag equals the POS tag of the answer."""
    return row['mapped_POS'] == row['answer_pos']


# Word-level and part-of-speech-level agreement, computed row by row.
df_human['match_word'] = df_human.apply(_lemma_matches_target, axis=1)
df_human['match_pos'] = df_human.apply(_pos_matches_target, axis=1)