# In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, get_linear_schedule_with_warmup, AutoTokenizer
import torch.optim as optim
import json
import os

# Read the training data from the JSON file.
# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding (a legacy code page on Windows), which can raise
# UnicodeDecodeError or silently mis-decode a UTF-8 JSON file.
with open('C:\\Users\\dev\\Documents\\squad\\train-v2.0.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Create the tokenizer and reuse the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Convert the data into per-example input_ids, attention_mask and labels.
#
# A causal LM head compares the logits at position i with the label at
# position i + 1 of the SAME sequence, so the labels must line up with
# input_ids token for token. Each example is therefore encoded as a single
# sequence "question answer<eos>", and the labels are a copy of that sequence
# in which every position that should not contribute to the loss (the question
# and the padding) is replaced by -100, the index the loss ignores.
max_length = 128
ignore_index = -100
pad_id = tokenizer.pad_token_id
eos_id = tokenizer.eos_token_id

input_ids = []
attention_mask = []
labels = []

for article in data['data']:
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            if qa['is_impossible']:
                answer_text = "[NO_ANSWER]"
            else:
                answer_text = qa['answers'][0]['text']

            # Cap the answer at half of the window (keeping room for EOS) so
            # that at least one supervised token always survives truncation.
            answer_ids = tokenizer.encode(' ' + answer_text)[:max_length // 2 - 1] + [eos_id]
            # The question gets whatever room is left in the window.
            question_ids = tokenizer.encode(qa['question'])[:max_length - len(answer_ids)]

            token_ids = question_ids + answer_ids
            pad_len = max_length - len(token_ids)

            input_ids.append(
                torch.tensor(token_ids + [pad_id] * pad_len, dtype=torch.long)
            )
            # Build the mask from the known lengths instead of comparing with
            # pad_token_id: pad and EOS share one id, so a comparison would
            # also hide the genuine EOS that terminates the answer.
            attention_mask.append(
                torch.tensor([1] * len(token_ids) + [0] * pad_len, dtype=torch.long)
            )
            labels.append(
                torch.tensor(
                    [ignore_index] * len(question_ids) + answer_ids + [ignore_index] * pad_len,
                    dtype=torch.long,
                )
            )

# Build the dataset with TensorDataset and DataLoader.
# Each per-example list is stacked into a single tensor first; the three
# names are rebound to the stacked tensors.
input_ids, attention_mask, labels = (
    torch.stack(rows) for rows in (input_ids, attention_mask, labels)
)

train_dataset = TensorDataset(input_ids, attention_mask, labels)
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)

# Model and optimizer setup.
# Use the GPU when one is available and fall back to the CPU otherwise. The
# model is moved to the device before the optimizer is constructed so the
# optimizer is built over the parameters that will actually be trained.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Number of training epochs.
num_epochs = 5

# Linear learning-rate schedule with no warmup, spanning every optimizer step
# of the run (batches per epoch * epochs).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * num_epochs,
)

# Train the model.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        # The DataLoader yields CPU tensors; they must live on the same device
        # as the model before the forward pass.
        batch_input_ids, batch_attention_mask, batch_labels = (
            tensor.to(device) for tensor in batch
        )

        optimizer.zero_grad()
        outputs = model(
            batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )
        loss = outputs.loss
        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_dataloader)}')

# Save the model (and its tokenizer) once training has finished.
model_save_path = 'C:\\Users\\dev\\Documents\\GitHub\\ChatBot\\ver3'
os.makedirs(model_save_path, exist_ok=True)

# Both objects expose save_pretrained(); write the model first, then the
# tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_save_path)

print(f"모델과 토크나이저가 {model_save_path}에 저장되었습니다.")