# Install libs

In [None]:
!pip install jsonlines
!pip install wandb
!pip install sentencepiece

!sudo apt-get install git-lfs
!pip install git+https://github.com/huggingface/transformers peft bitsandbytes

# Download weights

In [None]:
# Mount Google Drive under /content/drive; the fine-tuned model is copied
# there after training and loaded from there in the Test section.
from google.colab import drive

drive.mount("/content/drive")

In [None]:
# English base model: clone the huggyllama/llama-7b repository from the Hugging Face Hub.
!git clone https://huggingface.co/huggyllama/llama-7b # en llama

In [None]:
# Russian base model: the same repository id is used as BASE_MODEL in the Test section.
!git clone https://huggingface.co/IlyaGusev/llama_7b_ru_turbo_alpaca_lora_merged  # ru llama

# Clone repo

In [None]:
# Training code: the Train section runs `src.train` from rulm/self_instruct.
!git clone https://github.com/IlyaGusev/rulm

# Data

In [None]:
import pandas as pd
import jsonlines
import json

In [None]:
# Russian pick-up lines dataset; show the first rows as a quick sanity check.
data_ru = pd.read_csv("pick_up_ru.csv")
data_ru.head()

In [None]:
# Turn the Russian dataset (`data_ru`, the only frame loaded in this notebook)
# into instruction-tuning records and split them into train / validation.
# The previous version iterated over `data_en`, which is never defined.
split_at = int(len(data_ru) * 0.9)

train_data = []
val_data = []

for i in range(len(data_ru)):
    record = {'instruction': data_ru['prompt'][i], 'input': '', 'output': data_ru['pick_up'][i]}
    # `<=` keeps the boundary row in the training split, exactly as before.
    if i <= split_at:
        train_data.append(record)
    else:
        val_data.append(record)

In [None]:
with jsonlines.open('./rulm/self_instruct/data/pick_up_en_train.jsonl', 'w') as jsonl_output:
    for item in train_data:
        jsonl_output.write(item)

with jsonlines.open('./rulm/self_instruct/data/pick_up_en_val.jsonl', 'w') as jsonl_output:
    for item in val_data:
        jsonl_output.write(item)

In [None]:
jsonl_file = open("./rulm/self_instruct/data/pick_up_en_val.jsonl", "r")
print(jsonl_file.read())

# Train

In [None]:
%cd rulm/self_instruct

1) Создать папку models в rulm/self_instruct

2) Перетащить туда модель

3) Указать новую модель в файле rulm/self_instruct/configs/llama_7b_lora.json

4) Загрузить данные в rulm/self_instruct/data

5) Добавить на 204-й строке файла rulm/self_instruct/src/train.py:

max_source_tokens_count = 256

max_target_tokens_count = 512

max_tokens_count = max_source_tokens_count + max_target_tokens_count + 1

6) Задать нужные параметры обучения в файле rulm/self_instruct/configs/llama_7b_lora.json

In [None]:
# Run the rulm training module (the working directory must be rulm/self_instruct).
# It is given the config configs/llama_7b_lora.json, the train/val JSONL files
# from data/, and models/llama_7b_lora as the output directory.
!python3 -m src.train --config-file configs/llama_7b_lora.json --train-file data/pick_up_ru_train.jsonl --val-file data/pick_up_ru_val.jsonl  --output-dir models/llama_7b_lora

# Save model

In [None]:
!cp -r /content/rulm/self_instruct/models/llama_7b_lora /content/drive/MyDrive/LLaMA

# Test

In [None]:
%cd ./drive/MyDrive/LLaMA

In [None]:
from peft import PeftModel, PeftConfig
from transformers import LlamaForCausalLM, AutoModelForCausalLM, AutoTokenizer, GenerationConfig

# Base checkpoint that the model and tokenizer are loaded from.
BASE_MODEL = "IlyaGusev/llama_7b_ru_turbo_alpaca_lora_merged"
# Adapter checkpoint directory, relative to the current working directory.
MODEL_NAME = "llama_7b_lora/checkpoint-75"

In [None]:
# Base model, loaded in 8-bit; device_map={'': 0} places the whole model on device 0.
model_base = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_8bit=True,
    device_map={'': 0},
)

# Tokenizer for the same checkpoint. No device_map here: a tokenizer holds no
# weights to place on a device, so the argument that used to be passed did nothing useful.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

In [None]:
# Wrap the base model with the adapter stored under MODEL_NAME.
model = PeftModel.from_pretrained(
    model=model_base,
    model_id=MODEL_NAME,
    device_map='auto',
)

In [None]:
def predict(inp, model, tokenizer,**kwargs):
    """Run ``model.generate`` on a single prompt and return the decoded text.

    Args:
        inp: Prompt string; it is tokenized as a batch of one.
        model: Object exposing ``.device`` and ``.generate(**kwargs)``.
        tokenizer: Callable tokenizer that also provides ``.decode``.
        **kwargs: Generation arguments; they override the defaults below and
            are forwarded to ``model.generate`` unchanged.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Defaults first, caller overrides second, so explicit kwargs always win.
    generation_args = dict(
        max_length=64,
        length_penalty=1,
        top_p=0.95,
        top_k=40,
        temperature=1.0,
        repetition_penalty=1.2,
        num_beams=3,
        do_sample=True,
    )
    generation_args.update(kwargs)

    encoded = tokenizer([inp], return_tensors="pt")

    # Forward only the tensors generate() is given here, moved to the model's device.
    model_inputs = {}
    for key, tensor in encoded.items():
        if key == "input_ids" or key == "attention_mask":
            model_inputs[key] = tensor.to(model.device)

    generated = model.generate(**model_inputs, **generation_args)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def generate(TEXT_TO_PARAPHRASE,**kwargs):
    """Generate one pick-up line for the given input text.

    Builds the instruction prompt, runs it through ``predict`` with the
    module-level ``model`` (the PEFT-wrapped, fine-tuned model) and
    ``tokenizer``, and returns only the text after the prompt.

    Args:
        TEXT_TO_PARAPHRASE: Text substituted into the prompt's input slot
            (may be an empty string).
        **kwargs: Generation arguments forwarded to ``predict``.

    Returns:
        The decoded output with the first ``len(prompt)`` characters removed.
    """
    para_prompt = 'Инструкция: сгенерируй подкат. Вход: {}. Выход:'
    inp = para_prompt.format(TEXT_TO_PARAPHRASE)
    # Use the adapter-wrapped `model`, not `model_base`: the fine-tuned model
    # was loaded above but never actually passed to predict().
    pred = predict(
        inp,
        model,
        tokenizer,
        **kwargs
    )
    # Assumes the decoded text starts with the prompt verbatim; the slice
    # drops that prefix by character count.
    fin = pred[len(inp):]
    return fin

In [None]:
from tqdm import tqdm

# Sample 100 generations for an empty input.
n = 100
inp = ''
pick_up_empty = [generate(inp) for _ in tqdm(range(n))]

In [None]:
# One generation per line. The encoding is explicit because the text is
# Cyrillic and the platform default is not guaranteed to be UTF-8.
with open("pick_up_empty.txt", "w", encoding="utf-8") as f:
    f.writelines(str(row) + '\n' for row in pick_up_empty)

In [None]:
# Sample 20 generations for the input 'глаза'.
n = 20
inp = 'глаза'
pick_up_eyes = [generate(inp) for _ in tqdm(range(n))]

In [None]:
# One generation per line. The encoding is explicit because the text is
# Cyrillic and the platform default is not guaranteed to be UTF-8.
with open("pick_up_eyes.txt", "w", encoding="utf-8") as f:
    f.writelines(str(row) + '\n' for row in pick_up_eyes)

In [None]:
# Sample 20 generations for the input 'красивая'.
n = 20
inp = 'красивая'
pick_up_beuty = [generate(inp) for _ in tqdm(range(n))]

In [None]:
# One generation per line. The encoding is explicit because the text is
# Cyrillic and the platform default is not guaranteed to be UTF-8.
with open("pick_up_beuty.txt", "w", encoding="utf-8") as f:
    f.writelines(str(row) + '\n' for row in pick_up_beuty)

In [None]:
# Sample 20 generations for the input 'улыбка'.
n = 20
inp = 'улыбка'
pick_up_smile = [generate(inp) for _ in tqdm(range(n))]

In [None]:
# One generation per line. The encoding is explicit because the text is
# Cyrillic and the platform default is not guaranteed to be UTF-8.
with open("pick_up_smile.txt", "w", encoding="utf-8") as f:
    f.writelines(str(row) + '\n' for row in pick_up_smile)

In [None]:
# Sample 20 generations for the input 'мама'.
n = 20
inp = 'мама'
pick_up_mam = [generate(inp) for _ in tqdm(range(n))]

In [None]:
# One generation per line. The encoding is explicit because the text is
# Cyrillic and the platform default is not guaranteed to be UTF-8.
with open("pick_up_mam.txt", "w", encoding="utf-8") as f:
    f.writelines(str(row) + '\n' for row in pick_up_mam)