In [3]:
import urllib.request
import zipfile
import os
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
import torch

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Используется устройство:", device)

Используется устройство: cpu


In [5]:
# 2. Скачиваем архив, если его ещё нет
url = "https://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip"
zip_path = "cornell_movie_dialogs_corpus.zip"

if not os.path.exists(zip_path):
    print("Скачиваю корпус диалогов...")
    urllib.request.urlretrieve(url, zip_path)
    print("Скачивание завершено.")
else:
    print("Архив уже скачан, пропускаю скачивание.")

# 3. Распаковываем архив, если папка ещё не распакована
extract_dir = "cornell movie-dialogs corpus"  # именно так называется папка внутри архива

if not os.path.exists(extract_dir):
    print("Распаковываю архив...")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(".")
    print("Распаковка завершена.")
else:
    print("Папка уже распакована, пропускаю распаковку.")

Скачиваю корпус диалогов...
Скачивание завершено.
Распаковываю архив...
Распаковка завершена.


In [6]:
path = 'cornell movie-dialogs corpus'
with open(f'{path}/movie_lines.txt', 'r', encoding='iso-8859-1') as f:
    lines = f.readlines()

line_id_to_text = {}
for line in lines:
    parts = line.split(' +++$+++ ')
    line_id = parts[0]
    text = parts[-1].strip()
    line_id_to_text[line_id] = text

with open(f'{path}/movie_conversations.txt', 'r', encoding='iso-8859-1') as f:
    conversations = f.readlines()

qa_pairs = []

for convo in conversations:
    parts = convo.split(' +++$+++ ')
    line_ids = eval(parts[-1])

    for i in range(len(line_ids) - 1):
        q_id = line_ids[i]
        a_id = line_ids[i + 1]
        if q_id in line_id_to_text and a_id in line_id_to_text:
            question = line_id_to_text[q_id]
            answer = line_id_to_text[a_id]
            qa_pairs.append({
                'input': question,
                'target': answer
            })

# В DataFrame
df = pd.DataFrame(qa_pairs)
print(f"Всего пар реплик: {len(df)}")
print("\nПримеры:")
print(df.head(3))

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

Всего пар реплик: 221616

Примеры:
                                               input  \
0  Can we make this quick?  Roxanne Korrine and A...   
1  Well, I thought we'd start with pronunciation,...   
2  Not the hacking and gagging and spitting part....   

                                              target  
0  Well, I thought we'd start with pronunciation,...  
1  Not the hacking and gagging and spitting part....  
2  Okay... then how 'bout we try out some French ...  


In [7]:
def format_conversation(ex):
    return f"Question: {ex['input']} Answer: {ex['target']}<|endoftext|>"

df['text'] = df.apply(format_conversation, axis=1)
dataset = Dataset.from_pandas(df[['text']])

def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors=None
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['text'])
split_dataset = tokenized_dataset.train_test_split(test_size=0.1)
print("Размер обучающей выборки:", len(split_dataset['train']))
print("Размер тестовой:", len(split_dataset['test']))

Map: 100%|██████████| 221616/221616 [00:18<00:00, 11851.77 examples/s]

Размер обучающей выборки: 199454
Размер тестовой: 22162





In [8]:
model = GPT2LMHeadModel.from_pretrained('gpt2')
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

training_args = TrainingArguments(
    output_dir='./gpt2-dialogue',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    logging_steps=100,
    learning_rate=5e-5,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.999,
    warmup_steps=200,
    lr_scheduler_type='linear',
    fp16=True,  # если GPU поддерживает
    logging_dir='./logs',
    report_to='none',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    data_collator=data_collator,
    tokenizer=tokenizer
)
print("Запуск дообучения...")
trainer.train()
model.save_pretrained('./gpt2-dialogue-finetuned')
tokenizer.save_pretrained('./gpt2-dialogue-finetuned')
print("Модель сохранена!")

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Запуск дообучения...


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
500,0.0,
1000,0.0,
1500,0.0,
2000,0.0,
2500,0.0,




KeyboardInterrupt: 

In [9]:
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

questions = [
    # "Hi, how are you doing?",
    # "Who are you?",
    # "Are you a robot?",
    "What is the capital of Russia?",
]

print("ДЕМО-ДИАЛОГ С ДООБУЧЕННЫМ GPT-2\n")
for q in questions:
    response = generate_response(q, model, tokenizer)
    print(f"Вы: {q}")
    print(f"Бот: {response}")
    print("-" * 50)

ДЕМО-ДИАЛОГ С ДООБУЧЕННЫМ GPT-2



NameError: name 'generate_response' is not defined