In [1]:
!pip install -q transformers datasets peft adapters rouge-score

In [2]:
!pip install torch



In [3]:
!pip install --upgrade wandb

Collecting wandb
  Downloading wandb-0.17.0-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading wandb-0.17.0-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: wandb
  Attempting uninstall: wandb
    Found existing installation: wandb 0.16.6
    Uninstalling wandb-0.16.6:
      Successfully uninstalled wandb-0.16.6
Successfully installed wandb-0.17.0


In [4]:
!pip install -U -q git+https://github.com/huggingface/trl

In [5]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW
from trl import DPOTrainer, DPOConfig
from peft import PeftModel
from datasets import Dataset
from peft import get_peft_config
from sklearn.model_selection import train_test_split
from trl import DPOTrainer
from peft import PeftModel
from safetensors.torch import save_model

2024-05-23 21:22:06.691456: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-23 21:22:06.691551: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-23 21:22:06.835611: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
def set_random_seed(seed):
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

set_random_seed(69)

In [7]:
def calc_token_len(example):
    return len(tokenizer(example).input_ids)

In [8]:
torch.cuda.empty_cache()

In [9]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:256'

In [26]:
model = AutoModelForCausalLM.from_pretrained('ai-forever/rugpt3medium_based_on_gpt2', device_map="auto")
tokenizer = AutoTokenizer.from_pretrained('ai-forever/rugpt3medium_based_on_gpt2')

model.config.use_cache = False

ref_model = AutoModelForCausalLM.from_pretrained('ai-forever/rugpt3medium_based_on_gpt2')

for param in ref_model.parameters():
    param.requires_grad = False

In [27]:
config = {
    "peft_type": "PREFIX_TUNING",  # PEFT type (Prefix Tuning)
    "task_type": "SEQ2SEQ_LM",  # task type (Sequence to Sequence Language Model)
    "inference_mode": False,  # inference (False for training)
    "num_virtual_tokens": 20,  
    "token_dim": 128,
    "num_transformer_submodules": 1,
    "num_attention_heads": 12,
    "num_layers": 12, 
    "encoder_hidden_size": 128, 
    "prefix_projection": False,
}


peft_config = get_peft_config(config)

model = PeftModel(model, peft_config)
for param in model.parameters():
    param.requires_grad = True

In [28]:
data = pd.read_csv('/kaggle/input/sentences-preprocessed/sentences_preprocessed.csv')
data = data[['text', 'corrected']].dropna()
data['input_token_len'] = data['text'].apply(calc_token_len)
data['input_token_len'].quantile(0.95), data['input_token_len'].quantile(0.05)

(37.0, 4.0)

In [29]:
data = data[(data['input_token_len'] <= 37) & (data['input_token_len'] > 4)]
data['chosen'] = 'Исправьте все грамматические и орфографические ошибки в этом предложении: "' + data['text'] + '" Исправленное предложение: ' + data['corrected']
data['rejected'] = 'Исправьте все грамматические и орфографические ошибки в этом предложении: "' + data['text'] + '" Исправленное предложение: ' + "ничего"
data['text'] = 'Исправьте все грамматические и орфографические ошибки в этом предложении: "' + data['text'] + '" Исправленное предложение: '
train_df, test_df = train_test_split(data, test_size=0.3, shuffle=False)
val_df, test_df = train_test_split(test_df, test_size=0.5, shuffle=False)

In [32]:
data['chosen'][0]

'Исправьте все грамматические и орфографические ошибки в этом предложении: "Загрязнение тяжелыми металлами Дальнозоркого района ." Исправленное предложение: Загрязнение тяжелыми металлами Дальнегорского района .'

In [33]:
train_df.shape, val_df.shape, test_df.shape

((18907, 5), (4051, 5), (4052, 5))

In [34]:
training_args = DPOConfig(num_train_epochs=1, output_dir="./output-dir", max_length=84)

train_dpo_dataset = {
    "prompt": train_df['text'].tolist(),
    "chosen": train_df['chosen'].tolist(),
    "rejected": train_df['rejected'].tolist()
}

eval_dpo_dataset = {
    "prompt": val_df['text'].tolist(),
    "chosen": val_df['chosen'].tolist(),
    "rejected": val_df['rejected'].tolist()
}

train_dataset = Dataset.from_dict(train_dpo_dataset)
eval_dataset = Dataset.from_dict(eval_dpo_dataset)

save_model(model, "model.safetensors")


dpo_trainer = DPOTrainer(
    model,
    ref_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)


dpo_trainer.train()

Map:   0%|          | 0/18907 [00:00<?, ? examples/s]

Map:   0%|          | 0/4051 [00:00<?, ? examples/s]

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
500,0.2852
1000,0.2741
1500,0.2636
2000,0.2748


TrainOutput(global_step=2364, training_loss=0.2735280877644237, metrics={'train_runtime': 1992.2864, 'train_samples_per_second': 9.49, 'train_steps_per_second': 1.187, 'total_flos': 0.0, 'train_loss': 0.2735280877644237, 'epoch': 1.0})

In [35]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [72]:
model.eval()

def generate_correction(sentence, max_length=85, temperature=1.5):
    inputs = tokenizer.encode(sentence, truncation=True, max_length=max_length, return_tensors='pt')
    if torch.cuda.is_available():
        inputs = inputs.cuda()
    outputs = model.generate(inputs, num_beams=4, max_length=max_length, num_return_sequences=1, temperature=temperature)
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_text

In [73]:
test_df['corrected_by_dpo'] = test_df['text'].progress_apply(generate_correction)

  0%|          | 0/4052 [00:00<?, ?it/s]

In [74]:
test_df.to_csv('/kaggle/working/dpo_rugpt3_results.csv')