In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer, GPT2Config, AdamW, get_linear_schedule_with_warmup

In [2]:
import torch
from transformers import GPT2Tokenizer, GPT2Config, AdamW, get_linear_schedule_with_warmup, GPT2LMHeadModel
from transformers.models.gpt2.modeling_gpt2 import GPT2Attention

class LoRALayer(torch.nn.Module):
    def __init__(self, num_heads, head_dim, rank, alpha):
        super(LoRALayer, self).__init__()
        self.rank = rank
        self.alpha = alpha
        self.lora_A = torch.nn.Parameter(torch.Tensor(num_heads, head_dim, rank))
        self.lora_B = torch.nn.Parameter(torch.Tensor(num_heads, rank, head_dim))
        self.reset_parameters()

    def reset_parameters(self):
        torch.nn.init.xavier_normal_(self.lora_A)
        torch.nn.init.xavier_normal_(self.lora_B)

    def forward(self, x):
        lora_adjustment = self.alpha * torch.einsum('hmr,hrn->hmn', self.lora_A, self.lora_B)
        return x + lora_adjustment

class GPT2AttentionWithLoRA(GPT2Attention):
    def __init__(self, config, is_cross_attention=False):
        super().__init__(config, is_cross_attention)
        self.lora = LoRALayer(config.num_attention_heads, config.hidden_size // config.num_attention_heads, config.lora_rank, config.lora_alpha)

    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
        query = query + self.lora(query)
        return super()._attn(query, key, value, attention_mask, head_mask)

class GPT2LMHeadModelWithLoRA(GPT2LMHeadModel):
    def __init__(self, config):
        super().__init__(config)
        for i, block in enumerate(self.transformer.h):
            block.attn.c_attn = GPT2AttentionWithLoRA(config, is_cross_attention=False).c_attn
            block.attn.c_proj = GPT2AttentionWithLoRA(config, is_cross_attention=False).c_proj


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2Config

class TextDataset(Dataset):
    def __init__(self, tokenizer, file_path, block_size):
        self.tokenizer = tokenizer
        self.file_path = file_path
        self.block_size = block_size
        self.examples = []

        # 计算文件中的行数
        with open(file_path, 'r', encoding='utf-8') as file:
            self.num_lines = sum(1 for line in file)

    def __len__(self):
        return self.num_lines

    def __getitem__(self, i):
        # 读取指定行的数据
        with open(self.file_path, 'r', encoding='utf-8') as file:
            for _ in range(i):
                next(file)
            line = next(file).strip()

        # 编码并截断
        tokens = self.tokenizer.encode(line, add_special_tokens=True)
        tokens = tokens[:self.block_size]

        return torch.tensor(tokens, dtype=torch.long)

# 加载分词器和配置
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
config = GPT2Config.from_pretrained("gpt2")
config.lora_rank = 4
config.lora_alpha = 32

# 创建数据集
file_path = 'D:\\wet_files\\reduced_train_part_0.txt'
block_size = 128
dataset = TextDataset(tokenizer, file_path, block_size)

# 创建数据加载器
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)


In [6]:
class TextDataset(Dataset):
    def __init__(self, tokenizer, file_path, block_size):
        self.tokenizer = tokenizer
        self.file_path = file_path
        self.block_size = block_size
        self.examples = []

        # 计算文件中的行数
        with open(file_path, 'r', encoding='utf-8') as file:
            self.num_lines = sum(1 for line in file)

    def __len__(self):
        return self.num_lines

    def __getitem__(self, i):
        # 读取指定行的数据
        with open(self.file_path, 'r', encoding='utf-8') as file:
            for _ in range(i):
                next(file)
            line = next(file).strip()

        # 编码并截断
        tokens = self.tokenizer.encode(line, add_special_tokens=True)
        tokens = tokens[:self.block_size]

        return tokens


In [7]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # 对批次中的所有句子进行填充
    padded_batch = pad_sequence([torch.tensor(item, dtype=torch.long) for item in batch], batch_first=True, padding_value=0)
    return padded_batch

# 创建数据加载器
data_loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)


In [8]:
# 创建带 LoRA 的 GPT-2 模型
model = GPT2LMHeadModelWithLoRA(config)

# 设置模型的设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 设置优化器和训练参数
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(data_loader) * 3  # 假设训练 3 个 epoch
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# 训练模型
model.train()
for epoch in range(3):
    total_loss = 0
    for batch in data_loader:
        optimizer.zero_grad()
        inputs = {'input_ids': batch.to(device), 'labels': batch.to(device)}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(data_loader)
    print(f"Epoch: {epoch}, Average Loss: {avg_loss}")


  padded_batch = pad_sequence([torch.tensor(item, dtype=torch.long) for item in batch], batch_first=True, padding_value=0)
  padded_batch = pad_sequence([torch.tensor(item, dtype=torch.long) for item in batch], batch_first=True, padding_value=0)


Epoch: 0, Average Loss: 4.410923948986102
Epoch: 1, Average Loss: 3.6028256418558424
Epoch: 2, Average Loss: 3.486058124406189


In [9]:
def reduce_data_size(input_file_path, output_file_path, target_size_mb=1):
    # 计算目标大小（以字节为单位）
    target_size_bytes = target_size_mb * 1024 * 1024

    # 读取原始数据
    with open(input_file_path, 'r', encoding='utf-8') as input_file:
        data = input_file.readlines()

    # 计算需要保留的行数
    total_bytes = 0
    num_lines_to_keep = 0
    for line in data:
        total_bytes += len(line.encode('utf-8'))
        if total_bytes <= target_size_bytes:
            num_lines_to_keep += 1
        else:
            break

    # 截取数据以匹配目标大小
    reduced_data = data[:num_lines_to_keep]

    # 保存截取后的数据到新文件
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.writelines(reduced_data)

    print(f"Data has been reduced to {len(reduced_data)} lines and saved to {output_file_path}")

# 调用函数
input_file_path = 'D:\\wet_files\\valid_part_0.txt'
output_file_path = 'D:\\wet_files\\reduced_valid_part_0.txt'
reduce_data_size(input_file_path, output_file_path, target_size_mb=1)


Data has been reduced to 4239 lines and saved to D:\wet_files\reduced_valid_part_0.txt


In [11]:
train_file_path = 'D:\\wet_files\\reduced_train_part_0.txt'
model_save_path = train_file_path.replace('reduced_train_part_0.txt', 'trained_model.pt')
torch.save(model.state_dict(), model_save_path)

print(f"Model saved to {model_save_path}")


Model saved to D:\wet_files\trained_model.pt


In [12]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from torch.nn.utils.rnn import pad_sequence

# 定义 TextDataset 类和 collate_fn 函数
# 省略了之前定义的 TextDataset 类和 collate_fn 函数...

# 加载分词器和配置
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
config = GPT2Config.from_pretrained("gpt2")
config.lora_rank = 4
config.lora_alpha = 32

# 创建验证数据集和数据加载器
valid_file_path = 'D:\\wet_files\\reduced_valid_part_0.txt'
block_size = 128
valid_dataset = TextDataset(tokenizer, valid_file_path, block_size)
valid_loader = DataLoader(valid_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

# 加载训练好的模型
model_save_path = 'D:\\wet_files\\trained_model.pt'
model = GPT2LMHeadModel(config)
model.load_state_dict(torch.load(model_save_path))

# 设置模型的设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 评估模型
model.eval()
total_loss = 0
total_batches = len(valid_loader)
for i, batch in enumerate(valid_loader, start=1):
    inputs = {'input_ids': batch.to(device), 'labels': batch.to(device)}
    outputs = model(**inputs)
    loss = outputs.loss
    total_loss += loss.item()
    print(f"Progress: {i}/{total_batches}, Current Batch Loss: {loss.item()}", end='\r')
avg_loss = total_loss / total_batches
print(f"\nValidation Loss: {avg_loss}")


Progress: 2120/2120, Current Batch Loss: 3.5115964412689217
Validation Loss: 3.53309985174323


In [14]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from torch.nn.utils.rnn import pad_sequence

# 定义 TextDataset 类和 collate_fn 函数
# 省略了之前定义的 TextDataset 类和 collate_fn 函数...

# 定义评估困惑度的函数
def evaluate_perplexity(model, data_loader, device):
    model.eval()
    total_loss = 0
    total_batches = len(data_loader)
    for i, batch in enumerate(data_loader, start=1):
        inputs = {'input_ids': batch.to(device), 'labels': batch.to(device)}
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        print(f"Evaluating: {i}/{total_batches} batches processed.", end='\r')
    avg_loss = total_loss / total_batches
    perplexity = torch.exp(torch.tensor(avg_loss))
    return perplexity.item()

# 加载分词器和配置
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
config = GPT2Config.from_pretrained("gpt2")
config.lora_rank = 4
config.lora_alpha = 32

# 创建验证数据集和数据加载器
valid_file_path = 'D:\\wet_files\\reduced_valid_part_0.txt'
block_size = 128
valid_dataset = TextDataset(tokenizer, valid_file_path, block_size)
valid_loader = DataLoader(valid_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

# 设置模型的设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 加载训练好的模型
model_save_path = 'D:\\wet_files\\trained_model.pt'
trained_model = GPT2LMHeadModel(config)
trained_model.load_state_dict(torch.load(model_save_path))
trained_model.to(device)

# 加载原始的 GPT-2 模型
original_model = GPT2LMHeadModel.from_pretrained("gpt2")
original_model.to(device)

# 评估两个模型的困惑度
trained_model_perplexity = evaluate_perplexity(trained_model, valid_loader, device)
original_model_perplexity = evaluate_perplexity(original_model, valid_loader, device)

# 比较两个模型的困惑度
if trained_model_perplexity < original_model_perplexity:
    improvement_percentage = (1 - trained_model_perplexity / original_model_perplexity) * 100
    print(f"\nTrained model has lower perplexity ({trained_model_perplexity:.2f}) compared to the original GPT-2 model ({original_model_perplexity:.2f}), which is an improvement of {improvement_percentage:.2f}%.")
else:
    improvement_percentage = (1 - original_model_perplexity / trained_model_perplexity) * 100
    print(f"\nOriginal GPT-2 model has lower perplexity ({original_model_perplexity:.2f}) compared to the trained model ({trained_model_perplexity:.2f}), which is an improvement of {improvement_percentage:.2f}%.")


Evaluating: 2120/2120 batches processed.
Trained model has lower perplexity (34.23) compared to the original GPT-2 model (411.32), which is an improvement of 91.68%.


In [15]:
# 获取训练前的 LM Head 参数的副本
lm_head_params_before = {name: param.clone() for name, param in model.named_parameters() if 'lm_head' in name}



# 获取训练后的 LM Head 参数
lm_head_params_after = {name: param for name, param in model.named_parameters() if 'lm_head' in name}

# 检查参数是否有变化
for name, param_before in lm_head_params_before.items():
    param_after = lm_head_params_after[name]
    if not torch.equal(param_before, param_after):
        print(f"Parameter {name} has changed during training.")
    else:
        print(f"Parameter {name} has not changed during training.")


In [16]:
for name, param in model.named_parameters():
    if 'lm_head' in name:
        print(f"{name}: requires_grad = {param.requires_grad}")



In [17]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

# 定义 TextDataset 类和 collate_fn 函数
# 省略了之前定义的 TextDataset 类和 collate_fn 函数...

# 定义评估困惑度的函数
def evaluate_perplexity(model, data_loader, device):
    model.eval()
    total_loss = 0
    with tqdm(data_loader, desc="Evaluating", leave=False) as progress_bar:
        for batch in progress_bar:
            inputs = {'input_ids': batch.to(device), 'labels': batch.to(device)}
            outputs = model(**inputs)
            loss = outputs.loss
            total_loss += loss.item()
            progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})
    avg_loss = total_loss / len(data_loader)
    perplexity = torch.exp(torch.tensor(avg_loss))
    return perplexity.item()

# 加载分词器和配置
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
config = GPT2Config.from_pretrained("gpt2")
config.lora_rank = 4
config.lora_alpha = 32

# 创建验证数据集和数据加载器
valid_file_path = 'D:\\wet_files\\reduced_valid_part_0.txt'
block_size = 128
valid_dataset = TextDataset(tokenizer, valid_file_path, block_size)
valid_loader = DataLoader(valid_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

# 设置模型的设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 加载训练好的模型
model_save_path = 'D:\\wet_files\\trained_model.pt'
trained_model = GPT2LMHeadModel(config)
trained_model.load_state_dict(torch.load(model_save_path))
trained_model.to(device)

# 加载原始的 GPT-2 模型
original_model = GPT2LMHeadModel.from_pretrained("gpt2")
original_model.to(device)

# 评估两个模型的困惑度
trained_model_perplexity = evaluate_perplexity(trained_model, valid_loader, device)
print(f"Trained Model Perplexity: {trained_model_perplexity:.2f}")

original_model_perplexity = evaluate_perplexity(original_model, valid_loader, device)
print(f"Original GPT-2 Model Perplexity: {original_model_perplexity:.2f}")



                                                                            

Trained Model Perplexity: 34.23


                                                                            

Original GPT-2 Model Perplexity: 411.32


