In [None]:
!pip install --upgrade huggingface_hub 

In [None]:
!pip install --upgrade tokenizers

In [None]:
!pip install peft -q
!pip install -i https://pypi.org/simple/ bitsandbytes -q
!pip install accelerate -q
!pip install datasets -q
!pip install zstandard -q
!pip install jsonlines -q
!pip install sentence_transformers -q
!pip install catboost -q

In [None]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [1]:
# from sentence_transformers import SentenceTransformer
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import torch
# from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from tqdm import tqdm
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate
from sklearn.metrics import classification_report, f1_score, roc_auc_score, accuracy_score, recall_score, PrecisionRecallDisplay

In [2]:


MODEL_NAME = "IlyaGusev/saiga_7b_lora"
DEFAULT_MESSAGE_TEMPLATE = "<s>{role}\n{content}</s>\n"
DEFAULT_SYSTEM_PROMPT = "Ты — переводчик. Ты переводишь текст с русского, на текст, как будто он был переведён с китайского. Избегай дублирования перевода."

class Conversation:
    def __init__(
        self,
        message_template=DEFAULT_MESSAGE_TEMPLATE,
        system_prompt=DEFAULT_SYSTEM_PROMPT,
        start_token_id=1,
        bot_token_id=9225
    ):
        self.message_template = message_template
        self.start_token_id = start_token_id
        self.bot_token_id = bot_token_id
        self.messages = [{
            "role": "system",
            "content": system_prompt
        }]

    def get_start_token_id(self):
        return self.start_token_id

    def get_bot_token_id(self):
        return self.bot_token_id

    def add_user_message(self, message):
        self.messages.append({
            "role": "user",
            "content": message
        })

    def add_bot_message(self, message):
        self.messages.append({
            "role": "bot",
            "content": message
        })

    def get_prompt(self, tokenizer):
        final_text = ""
        for message in self.messages:
            message_text = self.message_template.format(**message)
            final_text += message_text
        final_text += tokenizer.decode([self.start_token_id, self.bot_token_id])
        return final_text.strip()


def generate(model, tokenizer, prompt, generation_config):
    data = tokenizer(prompt, return_tensors="pt")
    data = {k: v.to(model.device) for k, v in data.items()}
    output_ids = model.generate(
        **data,
        generation_config=generation_config
    )[0]
    output_ids = output_ids[len(data["input_ids"][0]):]
    output = tokenizer.decode(output_ids, skip_special_tokens=True)
    return output.strip()



In [3]:
config = PeftConfig.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
model.resize_token_embeddings(len(tokenizer))

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


Embedding(32000, 4096, padding_idx=0)

In [4]:
print("model loaded")

model loaded


In [5]:
model = PeftModel.from_pretrained(
    model,
    MODEL_NAME,
    torch_dtype=torch.float16,
    is_trainable = True,
)
model.eval()

generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
print(generation_config)

GenerationConfig {
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "max_new_tokens": 1536,
  "no_repeat_ngram_size": 15,
  "pad_token_id": 0,
  "repetition_penalty": 1.1,
  "temperature": 0.2,
  "top_k": 40,
  "top_p": 0.9
}



In [6]:
import pandas as pd

In [7]:
df = pd.read_csv("translate.csv", sep="|")

In [8]:
print(df.iloc[0])

translated    Я простой русский рабочий Иван, работать шахта...
original      Меня зовут Иван, живу в России и я работаю в ш...
Name: 0, dtype: object


In [9]:
def get_prompt(question):
    return f"""
    Пример перевода: 'Меня зовут Иван, живу в России и я работаю в шахте. Читал труды китайской партии, и мне понравилось.' -> 'Я простой русский рабочий Иван, работать шахта, жить Россия. Читать книга Китай партия, много нравиться.
    Текст, который нужно перевести в квадратных скобках: [{question}]
    Переведи с русского так, как будто этот текст был переведён с китайского в переводчике."""

# def get_promt_lev(summary, question, answer):
#     return f"""Высказывание 1: {summary}
#     Высказывание 2: {question + answer}
#     Сравни Высказывание 1 и Высказывание 2 и скажи правдиво ли второе, написав 1 если правдиво и 0 если наоборот"""

In [10]:

def answer(question, translate):
    inputs = get_prompt(question)

    conversation = Conversation()
    conversation.add_user_message(inputs)
    prompt = conversation.get_prompt(tokenizer)

    output = generate(model, tokenizer, prompt, generation_config)
    print(inputs)
    print(output)
    print(output.lower().split(" "))
    print("Translated: ", translate)
    print()
    print("==============================")
    print()
    return  output

In [11]:
import pandas as pd
import json
import os

def dataset_to_json(dataset, filename):
    json_objects = []
    
    with open(filename, 'w', encoding="utf-8") as file:
        file.write("")
    

    for index, row in dataset.iterrows():        
        system_message = "Ты — переводчик. Ты переводишь текст с русского, на текст, как будто он был переведён с китайского. Избегай дублирования перевода."
        user_message = get_prompt(f"{row['original']}")
        bot_message = row['translated']

        json_object = {
            "system": system_message,
            "user": user_message,
            "bot": bot_message
        }

        json_objects.append(json_object)
        with open(filename, 'a', encoding='utf-8') as file:
          file.write(json.dumps(json_object, ensure_ascii=False) + "\n")

    return json_objects


In [12]:
train_size = 1
test_df = df.iloc[0:2]
df = df.iloc[2:]
print(df.iloc[0])



translated    Я простой рабочий Иван из город Тверь. Я устат...
original      Я простой рабочий по имени Иван из города Твер...
Name: 2, dtype: object


In [13]:
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=test_df.sample(frac=1,random_state=200).reset_index(drop=True)
# test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))
dataset_to_json(train_dataset, "train.json")
dataset_to_json(test_dataset, "test.json")

FULL Dataset: (39, 2)
TRAIN Dataset: (39, 2)
TEST Dataset: (2, 2)


[{'system': 'Ты — переводчик. Ты переводишь текст с русского, на текст, как будто он был переведён с китайского. Избегай дублирования перевода.',
  'user': "\n    Пример перевода: 'Меня зовут Иван, живу в России и я работаю в шахте. Читал труды китайской партии, и мне понравилось.' -> 'Я простой русский рабочий Иван, работать шахта, жить Россия. Читать книга Китай партия, много нравиться.\n    Текст, который нужно перевести в квадратных скобках: [Президент]\n    Переведи с русского так, как будто этот текст был переведён с китайского в переводчике.",
  'bot': 'Стержень Xi'},
 {'system': 'Ты — переводчик. Ты переводишь текст с русского, на текст, как будто он был переведён с китайского. Избегай дублирования перевода.',
  'user': "\n    Пример перевода: 'Меня зовут Иван, живу в России и я работаю в шахте. Читал труды китайской партии, и мне понравилось.' -> 'Я простой русский рабочий Иван, работать шахта, жить Россия. Читать книга Китай партия, много нравиться.\n    Текст, который нужно 

In [14]:
from datasets import load_dataset
data = load_dataset(
    "json",
    data_files={
                'train' : 'train.json' ,
                'validation' : 'test.json'
    }
)
data

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['system', 'user', 'bot'],
        num_rows: 39
    })
    validation: Dataset({
        features: ['system', 'user', 'bot'],
        num_rows: 2
    })
})

In [15]:
CUTOFF_LEN = 3584

def generate_prompt(data_point):
    promt = f"""<s>system
{data_point['system']}</s><s>user
{data_point['user']}</s><s>bot
{data_point['bot']}</s>"""
    #     print(promt)
    return promt


def tokenize (prompt, add_eos_token=True):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=False,
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id and len(result["input_ids"]) < CUTOFF_LEN
        and add_eos_token
    ):

        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)



    result["labels"] = result["input_ids"].copy()

    return result

def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenize(full_prompt)
#     print(tokenized_full_prompt)
    return tokenized_full_prompt

In [16]:
train_data = (
    data["train"].map(generate_and_tokenize_prompt)
)

val_data = (
    data["validation"].map(generate_and_tokenize_prompt)
)

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [17]:
import torch
import pandas as pd
from peft import PeftModel, PeftConfig
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForTokenClassification, AutoConfig, GenerationConfig
from transformers import Trainer, TrainingArguments, logging, TrainerCallback, TrainerState, TrainerControl, BitsAndBytesConfig
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
import torch.nn.functional as F
from datasets import load_dataset
import time
from typing import Any, List, Mapping, Optional
import transformers
import os
from pathlib import Path
BATCH_SIZE = 2
MICRO_BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
LEARNING_RATE = 3e-4
TRAIN_STEPS = 100
OUTPUT_DIR = "model"

training_arguments = transformers.TrainingArguments(
            per_device_train_batch_size=MICRO_BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
#             warmup_steps=200,
            max_steps=TRAIN_STEPS,
            learning_rate=LEARNING_RATE,
            fp16=True,
            logging_steps=10,
            optim="adamw_torch",
            evaluation_strategy="steps",
            save_strategy="steps",
            eval_steps=10,
            save_steps=10,
            output_dir=OUTPUT_DIR,
            save_total_limit=10,
            load_best_model_at_end=True,
            report_to=None,
            overwrite_output_dir=True, # Overwrite the content of the output dir
)



In [18]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [19]:
# !pip install accelerate==0.27.2
# !pip install torch==2.2.0

In [20]:
#!pip install flash-attn --no-build-isolation

In [21]:
from transformers import DataCollatorForSeq2Seq, Trainer, TrainingArguments
import transformers
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
)

model = torch.compile(model)
trainer.train()


max_steps is given, it will override any value given in num_train_epochs
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss,Validation Loss
10,1.2749,0.136129
20,0.4312,0.098412
30,0.2334,0.083676
40,0.3992,0.076068
50,0.2079,0.069728
60,0.2117,0.083577
70,0.0831,0.080489
80,0.1594,0.076329
90,0.063,0.080258
100,0.1768,0.080543


TrainOutput(global_step=100, training_loss=0.3240712159872055, metrics={'train_runtime': 1259.9764, 'train_samples_per_second': 0.159, 'train_steps_per_second': 0.079, 'total_flos': 2721040793862144.0, 'train_loss': 0.3240712159872055, 'epoch': 5.128205128205128})

In [None]:
df["answer"] =  df.apply(lambda x: answer(x.summary, x.question, x.answer, x.is_hallucination), axis=1)

In [None]:
from sklearn import metrics
accuracy = metrics.accuracy_score(df["is_hallucination"],df["answer"])
f1_score_micro = metrics.f1_score(df["is_hallucination"],df["answer"], average='micro')
f1_score_macro = metrics.f1_score(df["is_hallucination"],df["answer"], average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

In [None]:
model.save_pretrained(OUTPUT_DIR)

# GGUF


In [1]:
self_instruct_dir = 'rulm/self_instruct'
checkpoint = "../../model/checkpoint-100/"
merged_model_name = 'merged_test_model.pt'


In [2]:
%cd {self_instruct_dir}

E:\PyCharm Community Edition 2021.1.1\projects\LLM_LoRa\LLM_LoRa\rulm\self_instruct


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [None]:
%cd ..

In [None]:
tokenizer.save_pretrained('tokenizer')

In [3]:
!python -m src.tools.convert_to_native {checkpoint} {merged_model_name} --device=cuda --enable_offloading

Saving state_dict...



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards:  50%|#####     | 1/2 [00:13<00:13, 13.36s/it]
Loading checkpoint shards: 100%|##########| 2/2 [00:14<00:00,  6.32s/it]
Loading checkpoint shards: 100%|##########| 2/2 [00:14<00:00,  7.37s/it]

  0%|          | 0/291 [00:00<?, ?it/s]
  0%|          | 1/291 [00:00<01:48,  2.67it/s]
  2%|2         | 6/291 [00:00<00:20, 13.58it/s]
  3%|2         | 8/291 [00:00<00:19, 14.62it/s]
  5%|5         | 15/291 [00:00<00:10, 25.83it/s]
  7%|6         | 19/291 [00:00<00:09, 28.80it/s]
  8%|8         | 24/291 [00:01<00:08, 31.76it/s]
 10%|9         | 28/291 [00:01<00:07, 33.72it/s]
 11%|#1        | 33/291 [00:01<00:07, 34.39it/s]
 13%|#2        | 37/291 [00:01<00:07, 35.18it/s]
 14%|#4        | 42/291 [00:01<00:06, 35.87it/s]
 16%|#5        | 46/291 [00:01<00:06, 35.93it/s]
 18%|#7        | 51/291 [00:01<00:06, 35.82it/s]
 19%|#8        | 55/291 [00:01<00:06, 35.28it/s]
 21%|##        | 60/291 [00:02<00:06, 36

In [4]:
model_dir = 'merged_test_model.pt'
checkpoint = "../model/checkpoint-100/"
output_model = "model-100step_new_prompt.gguf"

In [None]:
!ls

In [5]:
%cd ../../llama.cpp

E:\PyCharm Community Edition 2021.1.1\projects\LLM_LoRa\LLM_LoRa\llama.cpp


In [9]:
!python examples/convert-legacy-llama.py {model_dir} --vocab-dir {checkpoint} --outfile {output_model} --outtype f16 --ctx 4096

usage: convert-legacy-llama.py [-h] [--dump] [--dump-single] [--vocab-only]
                               [--no-vocab] [--outtype {f32,f16,q8_0}]
                               [--vocab-dir VOCAB_DIR]
                               [--vocab-type VOCAB_TYPE] [--outfile OUTFILE]
                               [--ctx CTX] [--concurrency CONCURRENCY]
                               [--big-endian] [--pad-vocab] [--skip-unknown]
                               [--verbose] [--metadata METADATA]
                               [--get-outfile]
                               model
convert-legacy-llama.py: error: argument --outtype: invalid choice: 'q4_0' (choose from 'f32', 'f16', 'q8_0')


In [None]:
quant = "model-100step_new_prompt.gguf"

In [11]:
!make

I ccache not found. Consider installing it for faster compilation.
I llama.cpp build info: 
I UNAME_S:   
I UNAME_P:   
I UNAME_M:   
I CFLAGS:    -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -march=native -mtune=native -fopenmp -Wdouble-promotion 
I CXXFLAGS:  -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -fopenmp  -march=native -mtune=native -Wno-array-bounds -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE 
I NVCCFLAGS: -std=c++11 -O3 
I LDFLAGS:    
I CC:        
I CXX:       

c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmiss

process_begin: CreateProcess(NULL, uname -s, ...) failed.
Makefile:146: pipe: No error
process_begin: CreateProcess(NULL, uname -p, ...) failed.
Makefile:150: pipe: No error
process_begin: CreateProcess(NULL, uname -m, ...) failed.
Makefile:154: pipe: No error
process_begin: CreateProcess(NULL, which ccache, ...) failed.
Makefile:248: pipe: No error
process_begin: CreateProcess(NULL, cc -dumpmachine, ...) failed.
Makefile:438: pipe: Bad file descriptor
process_begin: CreateProcess(NULL, cc --version, ...) failed.
scripts/get-flags.mk:1: pipe: No error
‘ЁбвҐ¬Ґ ­Ґ г¤ Ґвбп ­ ©вЁ гЄ § ­­л© Їгвм.
process_begin: CreateProcess(NULL, expr >= 070100, ...) failed.
scripts/get-flags.mk:32: pipe: Bad file descriptor
process_begin: CreateProcess(NULL, expr >= 080100, ...) failed.
scripts/get-flags.mk:35: pipe: Bad file descriptor
"cc" ­Ґ пў«пҐвбп ў­гваҐ­­Ґ© Ё«Ё ў­Ґи­Ґ©
Є®¬ ­¤®©, ЁбЇ®«­пҐ¬®© Їа®Ја ¬¬®© Ё«Ё Ї ЄҐв­л¬ д ©«®¬.
"c++" ­Ґ пў«пҐвбп ў­гваҐ­­Ґ© Ё«Ё ў­Ґи­Ґ©
Є®¬ ­¤®©, ЁбЇ®«­пҐ¬®© Їа®Ја ¬¬®© Ё«Ё

In [10]:
!quantize {quant} q4_0

"quantize" ­Ґ пў«пҐвбп ў­гваҐ­­Ґ© Ё«Ё ў­Ґи­Ґ©
Є®¬ ­¤®©, ЁбЇ®«­пҐ¬®© Їа®Ја ¬¬®© Ё«Ё Ї ЄҐв­л¬ д ©«®¬.


In [None]:
!python convert-hf-to-gguf.py {checkpoint} --outfile {output_model} --outtype f16 

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("IlyaGusev/saiga2_7b_lora", use_fast=False)
tokenizer.save_pretrained('tokenizer')

In [None]:
!make quantize

In [None]:
model_gguf = output_model 
quant_model = "quant_gguf_llama.gguf"
quantization_type = "q4_0"

In [None]:
! ./quantize {model_gguf} {quant_model} {quantization_type}

In [None]:
!./llama-quantize {model_gguf} {quant_model} Q4_K_M

In [None]:
%cd ~
!git clone --recursive https://github.com/ggerganov/llama.cpp.git
%cd llama.cpp
!make LLAMA_CUBLAS=1 -j libllama.so

# HACK: Use custom compiled libllama.so
%cp ~/llama.cpp/libllama.so /opt/conda/lib/python3.10/site-packages/llama_cpp/libllama.so

In [None]:
!pip install llama-cpp-python huggingface_hub

In [None]:
!cd E:\PyCharm Community Edition 2021.1.1\projects\LLM_LoRa\LLM_LoRa
!git clone --recursive https://github.com/ggerganov/llama.cpp.git
!cd llama.cpp


In [None]:
!make GGML_CUDA=1 -j libllama.so

# HACK: Use custom compiled libllama.so
!cp ~/llama.cpp/libllama.so /opt/conda/lib/python3.10/site-packages/llama_cpp/libllama.so

In [None]:
!cd E:\PyCharm Community Edition 2021.1.1\projects\LLM_LoRa\LLM_LoRa

In [None]:
!cd ..

In [None]:
import os

# Получаем список всех элементов в текущем рабочем каталоге
items = os.listdir('.')

# Фильтруем только директории
directories = [item for item in items if os.path.isdir(item)]

# Печатаем список директорий
for directory in directories:
    print(directory)


In [None]:
from llama_cpp import Llama
from tqdm import tqdm
import os

SYSTEM_PROMPT = "Ты — переводчик. Ты переводишь текст с русского, на текст, как будто он был переведён с китайского. Избегай дублирования перевода."
                
                # "Ты — переводчик. Ты переводишь текст с русского, на текст, как будто он был переведён с китайского. Пример: 'Меня зовут Иван, живу в России и я работаю в шахте. Читал труды китайской партии, и мне понравилось.' -> 'Я простой русский рабочий Иван, работать шахта, жить Россия. Читать книга Китай партия, много нравиться. "
SYSTEM_TOKEN = 1788
USER_TOKEN = 1404
BOT_TOKEN = 9225
LINEBREAK_TOKEN = 13

top_k=40
top_p=0.5
temperature=0.01
repeat_penalty=1.1


ROLE_TOKENS = {
    "user": USER_TOKEN,
    "bot": BOT_TOKEN,
    "system": SYSTEM_TOKEN
}


def get_message_tokens(model, role, content):
    message_tokens = model.tokenize(content.encode("utf-8"))
    message_tokens.insert(1, ROLE_TOKENS[role])
    message_tokens.insert(2, LINEBREAK_TOKEN)
    message_tokens.append(model.token_eos())
    return message_tokens


def get_system_tokens(model):
    system_message = {
        "role": "system",
        "content": SYSTEM_PROMPT
    }
    return get_message_tokens(model, **system_message)

def chat_saiga(message, model):
    message = get_prompt(message)
    system_tokens = get_system_tokens(model)
    tokens = system_tokens
    # model.eval(tokens)
    
    message_tokens = get_message_tokens(model=model, role="user", content=message)
    role_tokens = [model.token_bos(), BOT_TOKEN, LINEBREAK_TOKEN]
    tokens += message_tokens + role_tokens
    # print(tokens)
    # detokenize = model.detokenize(tokens)
    # print(model.tokenize(full_prompt))
    generator = model.generate(
        tokens,
        top_k = top_k,
        top_p = top_p,
        temp = temperature,
        repeat_penalty = repeat_penalty,
        reset = True
    )
    # print(len([token for token in generator]))
    
    result_list = []
    
    for token in generator:
        token_str = model.detokenize([token]).decode("utf-8", errors="ignore")
        tokens.append(token)
        if token == model.token_eos():
            break
        print(token_str, end="", flush=True)
        result_list.append(token_str)
    return ''.join(result_list)

try:
    del model
except:
    pass

# model_path = '/kaggle/working/model-q4_0.gguf'
base_path = "E:\PyCharm Community Edition 2021.1.1\projects\LLM_LoRa\LLM_LoRa"
model_path = 'model-100step.gguf'
full_path = os.path.join(base_path, model_path)
n_ctx = 3096 #

model = Llama(
        model_path = full_path,
        n_ctx = n_ctx,
        n_gpu_layers=-1
)

In [None]:
import torch
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU")
else:
    device = torch.device("cpu")
    print("Using CPU")


In [None]:
message = "Я обучил модель искусственного интеллекта, чтобы она имитировала перевод с китайского на русский и обратно. Локальный мем"
response = chat_saiga(message, model)
# print(response)

In [None]:
%cd ..

In [None]:
%cd "E:\PyCharm Community Edition 2021.1.1\projects\LLM_LoRa\LLM_LoRa\model\checkpoint-20"

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

# Путь к вашему адаптеру
adapter_path = "adapter_config.json"

# 1. Загрузка конфигурации адаптера
peft_config = PeftConfig.from_pretrained(adapter_path)

# 2. Настройка 4-битного квантования
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# 3. Загрузка базовой модели с 4-битным квантованием
base_model = AutoModelForCausalLM.from_pretrained(
    peft_config.base_model_name_or_path,
    quantization_config=quantization_config,
    device_map="auto"
)

# 4. Применение адаптера к квантованной базовой модели
model = PeftModel.from_pretrained(base_model, adapter_path)

# 5. Загрузка токенизатора
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)

# 6. Объединение базовой модели и адаптера (опционально)
merged_model = model.merge_and_unload()

# 7. Сохранение квантованной модели с адаптером
output_dir = "path_to_save_quantized_model"
merged_model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("Квантованная модель с адаптером сохранена в:", output_dir)

# 8. Пример использования квантованной модели
input_text = "Ваш текст для тестирования здесь"
inputs = tokenizer(input_text, return_tensors="pt").to(merged_model.device)

with torch.no_grad():
    outputs = merged_model.generate(**inputs, max_new_tokens=50)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Сгенерированный текст:", generated_text)