### Загрузка Mistral-7B-v0.2

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
import torch

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

device = torch.device('cuda')
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, quantization_config=quantization_config)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
model.config.use_cache = False

### Отключение логирования ошибок

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
from models import prompts
import importlib
importlib.reload(prompts)

<module 'models.prompts' from '/home/user/Hacks/DigitalBreakthrough-AIAssistant/models/prompts.py'>

In [7]:
from models import catboosty
import importlib
importlib.reload(catboosty)

In [13]:
catboosty.filter_question(['налоговый вычет можно получить?'], catboosty.model_filter_model)

array([[ True,  True]])

### Инференс с Streaming'ом

In [16]:
def load_obscene_words(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        obscene_words = [word.strip().lower() for word in file.readlines()]
    return obscene_words


In [28]:
def check_for_obscene(text, obscene_words):
    text_lower = text.lower()
    for word in obscene_words:
        if word in text_lower:
            return True
    return False

In [29]:
obscene_words = load_obscene_words('data/words.txt')

In [45]:
from transformers import TextIteratorStreamer
from threading import Thread

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, timeout=120)

def process(message: str, context: str):
    if check_for_obscene(message, obscene_words):
        return False
    variants = catboosty.pipeline_predict2([message], catboosty.model_cl_category, catboosty.model_cl_answer, catboosty.tfidf_vectorizer)
    encodeds = tokenizer.encode(prompts.top_3_prompt(message, variants), return_tensors='pt')
    model_inputs = encodeds.to(device)
    gen_ids = model.generate(model_inputs, max_new_tokens=100, do_sample=True)
    decoded = tokenizer.batch_decode(gen_ids)

    ans = decoded[0].split('[/INST]')[-1]


    encodeds = tokenizer.encode(prompts.get_prompt(message, ans, context), return_tensors="pt")

    model_inputs = encodeds.to(device)

    kwargs = dict(input_ids=model_inputs, streamer=streamer, max_new_tokens=500)

    thread = Thread(target=model.generate, kwargs=kwargs)

    thread.start()
    return True

In [46]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [47]:
import time
import gradio as gr

messages = ""
loyalty = 100
def slow_echo(message, history):
    global messages, current_state
    messages += f"User: {message}\n"
    result = process(message, messages)
    if result:
        history = ""
        for char in streamer:
            history += char
            yield history
        messages += f"Assistant: {message}\n"
    else:
        return 'Перевожу на оператора'

gr.ChatInterface(slow_echo, css="footer {visibility: hidden}").launch(share=True)

Running on local URL:  http://127.0.0.1:7870
Running on public URL: https://83d918f3c02ce8d11c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [43]:
import telebot


bot = telebot.TeleBot('SECRET')

@bot.message_handler(commands=['start', 'help'])
def greet(message):
    bot.send_message(message.chat.id, 'Привет! Я бот техподдержки GeekBrains, чем могу помочь?')

@bot.message_handler(func=lambda message: True)
def echo_all(message):
    bot.send_chat_action(message.chat.id, 'typing')
    result = process(prompts.get_prompt(message.text), 'user').split('[/INST]')[-1].strip()
    bot.reply_to(message, result)

bot.infinity_polling()

2024-04-28 04:49:01,082 (__init__.py:1086 MainThread) ERROR - TeleBot: "Infinity polling exception: A request to the Telegram API was unsuccessful. Error code: 404. Description: Not Found"
2024-04-28 04:49:01,083 (__init__.py:1088 MainThread) ERROR - TeleBot: "Exception traceback:
Traceback (most recent call last):
  File "/home/user/.local/lib/python3.10/site-packages/telebot/__init__.py", line 1081, in infinity_polling
    self.polling(non_stop=True, timeout=timeout, long_polling_timeout=long_polling_timeout,
  File "/home/user/.local/lib/python3.10/site-packages/telebot/__init__.py", line 1166, in polling
    logger.info('Starting your bot with username: [@%s]', self.user.username)
  File "/home/user/.local/lib/python3.10/site-packages/telebot/__init__.py", line 293, in user
    self._user = self.get_me()
  File "/home/user/.local/lib/python3.10/site-packages/telebot/__init__.py", line 1353, in get_me
    apihelper.get_me(self.token)
  File "/home/user/.local/lib/python3.10/site-pac

KeyboardInterrupt: 