## Импорты

In [2]:
!pip install transformers datasets peft -q
!pip install -U bitsandbytes -q
!pip uninstall wandb -q -y
import json
import os
import kagglehub
import torch
from IPython.display import clear_output
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer
import zipfile
clear_output()

## Загрузка датасета

In [3]:
! kaggle datasets download -d "stanfordu/stanford-question-answering-dataset"
# Распакуем архив
with zipfile.ZipFile("stanford-question-answering-dataset.zip", 'r') as zip_ref:
  zip_ref.extractall("stanford-question-answering-dataset")

  pid, fd = os.forkpty()


Dataset URL: https://www.kaggle.com/datasets/stanfordu/stanford-question-answering-dataset
License(s): CC-BY-SA-4.0
Downloading stanford-question-answering-dataset.zip to /kaggle/working
  0%|                                               | 0.00/8.73M [00:00<?, ?B/s]
100%|███████████████████████████████████████| 8.73M/8.73M [00:00<00:00, 168MB/s]


In [4]:
def load_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

data = load_data(f"stanford-question-answering-dataset/train-v1.1.json")

In [5]:
'''Что тут вообще есть'''
print(data.keys())
print(data['data'][0].keys())
print(data['data'][0]['paragraphs'][0])
print(data['data'][0]['paragraphs'][0]['context'])

print(data['data'][0]['paragraphs'][0]['qas'])
print(data['data'][0]['paragraphs'][0]['qas'][0]['question'])

print(data['data'][0]['paragraphs'][0]['qas'][0]['answers'])

dict_keys(['data', 'version'])
dict_keys(['title', 'paragraphs'])
{'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'qas': [{'answers': [{'answer_start': 515, 'text': 'Saint Bernadette Soubirous'}], 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'id': '5733be284776f41900661182'}, {'answers': [{'an

In [6]:
'''Подготовим данные к виду inputs - outputs'''
def data_preprocessing(init_data):
    inputs, outputs = [], []
    for article in init_data['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                answer = qa['answers'][0]['text']
                inputs.append(f"question: {question} context: {context}")
                outputs.append(answer)
    return {"input": inputs, "output": outputs}

dataset = Dataset.from_dict(data_preprocessing(data))
dataset = dataset.train_test_split(test_size=0.1)

## Загрузка модели и токенизатора

In [7]:
model_name = "distilbert/distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype='float16'
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto"
)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [8]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Linear4bit(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear4bit(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Linear4bit(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear4bit(in_features=3072, out_features=768, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_aff

In [9]:
peft_config = LoraConfig(
    r=64,  
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

lora_model = get_peft_model(model, peft_config)
lora_model.print_trainable_parameters()

trainable params: 3,244,032 || all params: 85,156,608 || trainable%: 3.8095


In [10]:
# eos_token как pad_token, иначе проблемы были
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  


def tokenize_function(examples):
    return tokenizer(
        examples['input'],
        text_target=examples['output'],
        padding="max_length",
        truncation=True,
        max_length=512)

In [11]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

train_val_test_split = tokenized_datasets['train'].train_test_split(test_size=0.2)
val_test_split = train_val_test_split['test'].train_test_split(test_size=0.5)

train_dataset = train_val_test_split['train']
val_dataset = val_test_split['train']
test_dataset = val_test_split['test']

Map:   0%|          | 0/78839 [00:00<?, ? examples/s]

Map:   0%|          | 0/8760 [00:00<?, ? examples/s]

## Обучение

In [12]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    num_train_epochs=3,
    learning_rate=1e-5,
    warmup_steps=500,
    weight_decay=0.01,
    fp16=True,
    save_total_limit=2,
    gradient_accumulation_steps=4,
    report_to="none",
    dataloader_num_workers=4
)




In [13]:
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(
  self.pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disab

Epoch,Training Loss,Validation Loss
0,0.0773,0.076842
2,0.0728,0.073241


  self.pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to a

TrainOutput(global_step=2955, training_loss=0.6957765161285142, metrics={'train_runtime': 7924.2281, 'train_samples_per_second': 23.878, 'train_steps_per_second': 0.373, 'total_flos': 2.65926463193088e+16, 'train_loss': 0.6957765161285142, 'epoch': 2.9984779299847792})

In [14]:
lora_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-5): 6 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=768, out_features=2304, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora

# Пример

In [15]:
test = test_dataset[2]

lora_model.eval()

with torch.no_grad():
    device = lora_model.device
    input = tokenizer(
        test['input'], return_tensors='pt', truncation=True
    ).to(device)
    result = lora_model.generate(
        **input,
        max_new_tokens=100,
        temperature=0.001,
        do_sample=True
    )

output = tokenizer.decode(result[0, :])
print(f"На входе: {test['input']}")
print(f"На выходе: {output[len(test['input']):]}")
print(f"Что должно было быть: {test['output']}")


На входе: question: What is one reason American courts may follow a post Revolutionary Commonwealth ruling under what circumstances? context: However, it is important to understand that despite the presence of reception statutes, much of contemporary American common law has diverged significantly from English common law. The reason is that although the courts of the various Commonwealth nations are often influenced by each other's rulings, American courts rarely follow post-Revolution Commonwealth rulings unless there is no American ruling on point, the facts and law at issue are nearly identical, and the reasoning is strongly persuasive.
На выходе: <|endoftext|>
Что должно было быть: there is no American ruling on point
