In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertModel
import torch
from torch import nn
import os
from tqdm import tqdm

In [None]:
def load_csv(train_path, test_path):
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    train_reviews = train_df[train_df['Product Class'] != 'Else']['review'].tolist()
    train_ratings = train_df[train_df['Product Class'] != 'Else']['rating'].tolist()

    test_reviews = test_df[test_df['Product Class'] != 'Else']['review'].tolist()
    test_ratings = test_df[test_df['Product Class'] != 'Else']['rating'].tolist()

    return train_reviews,train_ratings, test_reviews, test_ratings

train_reviews,train_ratings, test_reviews, test_ratings = load_csv('./data/drugsComTrain_raw_addclass.csv', './data/drugsComTest_raw_addclass.csv')

In [None]:
print(torch.unique(torch.tensor(train_ratings)))

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertModel.from_pretrained("bert-base-uncased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')

print(encoded_input)
# output = model(**encoded_input)

tokens = tokenizer.convert_ids_to_tokens([101, 5672, 1012, 102])
print(tokens)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [47]:
def tokenize(tokenizer, train_reviews, test_reviews):
    train_reviews_token = [tokenizer.encode_plus(
    text,
    truncation=True,
    add_special_tokens=True,
    max_length=1024,            
    pad_to_max_length=True,   
    return_attention_mask=True,  
    return_tensors='pt',      
    ) for text in train_reviews]

    test_reviews_token = [tokenizer.encode_plus(
    text,
    truncation=True,
    add_special_tokens=True,
    max_length=1024,            
    pad_to_max_length=True,   
    return_attention_mask=True,  
    return_tensors='pt',      
    ) for text in test_reviews]

    return train_reviews_token, test_reviews_token


train_reviews_token, test_reviews_token = tokenize(tokenizer, train_reviews, test_reviews)



In [37]:
print(train_reviews_token[0]['input_ids'].shape)
print(train_ratings)

torch.Size([1, 1024])
[9, 9, 9, 9, 10, 1, 7, 8, 10, 8, 10, 1, 10, 8, 5, 4, 8, 7, 8, 1, 10, 10, 7, 10, 8, 9, 1, 8, 8, 8, 9, 8, 8, 1, 8, 6, 3, 10, 10, 5, 1, 1, 1, 1, 2, 10, 3, 2, 5, 10, 7, 1, 7, 10, 8, 8, 10, 1, 4, 10, 5, 8, 10, 10, 7, 1, 10, 6, 10, 10, 8, 8, 9, 2, 9, 10, 8, 9, 10, 6, 10, 5, 9, 7, 6, 9, 10, 10, 8, 9, 6, 9, 10, 1, 2, 8, 10, 10, 10, 9, 8, 3, 8, 1, 9, 10, 10, 10, 9, 10, 9, 10, 10, 1, 10, 1, 7, 9, 1, 9, 9, 8, 7, 10, 9, 1, 8, 10, 10, 9, 10, 9, 9, 10, 10, 3, 3, 1, 7, 9, 1, 10, 10, 10, 8, 10, 9, 6, 1, 3, 10, 2, 10, 9, 10, 6, 1, 10, 9, 8, 10, 9, 1, 1, 9, 7, 7, 1, 9, 9, 10, 10, 9, 6, 8, 10, 10, 1, 9, 5, 3, 10, 10, 10, 1, 10, 10, 9, 10, 3, 10, 8, 1, 7, 1, 8, 10, 1, 9, 10, 9, 8, 10, 10, 10, 2, 8, 8, 10, 10, 6, 10, 2, 8, 9, 1, 8, 9, 6, 9, 10, 9, 9, 8, 9, 1, 9, 10, 10, 10, 2, 1, 10, 7, 10, 10, 10, 3, 9, 7, 9, 10, 1, 2, 1, 9, 1, 10, 9, 4, 10, 9, 2, 8, 10, 8, 8, 8, 8, 5, 6, 2, 10, 1, 4, 1, 3, 10, 10, 8, 2, 10, 10, 9, 10, 10, 9, 10, 8, 8, 10, 9, 1, 10, 10, 10, 9, 8, 10, 10, 9, 9, 1, 8, 

In [43]:
class Review_Rating_Dataset(torch.utils.data.Dataset):
    def __init__(self, reviews_token, rating):
        self.review = reviews_token
        self.rating = rating
 
    def __getitem__(self, idx):
        item = {k: v.squeeze(dim=0) for k, v in self.review[idx].items()}
        item["rating"] = torch.tensor(self.rating[idx] - 1)
        return item
 
    def __len__(self):
        return len(self.rating)


train_dataset = Review_Rating_Dataset(train_reviews_token, train_ratings)
test_dataset = Review_Rating_Dataset(test_reviews_token, test_ratings)

In [44]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
class BertWithMLP(nn.Module):
    def __init__(self, bert, hidden_size=768, mlp_hidden_size1=1024, mlp_hidden_size2 =256, num_classes=10):
        super(BertWithMLP, self).__init__()
        self.bert = bert
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, mlp_hidden_size1),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(mlp_hidden_size1, mlp_hidden_size2),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(mlp_hidden_size2, num_classes)
        )
    
    def forward(self, input_ids, attention_mask):

        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        
        cls = outputs.last_hidden_state[:, 0, :]
        
        logits = self.mlp(cls)
        
        return logits

In [50]:
def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    correct_predictions = 0
    
    progress_bar = tqdm(dataloader, desc="Training", leave=False)
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['rating'].to(device)

        
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        _, preds = torch.max(outputs, dim=1)
        correct_predictions += torch.sum(preds == labels)
        total_loss += loss.item()
        
        # 更新进度条显示
        progress_bar.set_postfix({
            'loss': loss.item(),
            'acc': torch.sum(preds == labels).item()/len(labels)
        })
    
    avg_loss = total_loss / len(dataloader)
    accuracy = correct_predictions.double() / len(dataloader.dataset)
    return avg_loss, accuracy

def eval_model(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct_predictions = 0
    
    with torch.no_grad():
        progress_bar = tqdm(dataloader, desc="Evaluating", leave=False)
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['rating'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            
            _, preds = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels)
            total_loss += loss.item()
            
            progress_bar.set_postfix({
                'loss': loss.item(),
                'acc': torch.sum(preds == labels).item()/len(labels)
            })
    
    avg_loss = total_loss / len(dataloader)
    accuracy = correct_predictions.double() / len(dataloader.dataset)
    return avg_loss, accuracy

# 4. 主训练循环
def train_and_evaluate(
    model, 
    train_loader, 
    val_loader, 
    optimizer, 
    criterion, 
    device, 
    epochs, 
    model_save_path,
    eval_every=5  # 每多少轮评估一次
):
    best_val_acc = 0.0
    history = {
        'train_loss': [],
        'train_acc': [],
        'val_loss': [],
        'val_acc': []
    }
    
    for epoch in range(1, epochs+1):
        print(f"\nEpoch {epoch}/{epochs}")
        
        # 训练阶段
        train_loss, train_acc = train_epoch(
            model, train_loader, optimizer, criterion, device)
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc.item())
        
        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
        
        # 验证阶段
        if epoch % eval_every == 0 and val_loader is not None:
            val_loss, val_acc = eval_model(
                model, val_loader, criterion, device)
            history['val_loss'].append(val_loss)
            history['val_acc'].append(val_acc.item())
            
            print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
            
            # 保存最佳模型
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                torch.save(model.state_dict(), model_save_path)
                print(f"New best model saved to {model_save_path} with val_acc: {val_acc:.4f}")
    
    return history

# 5. 主函数
def main():
    # 初始化
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    BERT = BertModel.from_pretrained("bert-base-uncased")
    # for param in BERT.parameters():
    #     param.requires_grad = False

    # layers_to_unfreeze = ['encoder.layer.11', 'encoder.layer.10']  # 解冻最后两层
    # for name, param in model.named_parameters():
    #     if any(layer in name for layer in layers_to_unfreeze):
    #         param.requires_grad = True
    
    # 加载模型
    model = BertWithMLP(BERT, hidden_size=768, mlp_hidden_size1=1024, mlp_hidden_size2=256, num_classes=10)
    model.to(device)

    # for name, param in model.named_parameters():
    #     print(name, param.requires_grad)
    
    # 训练参数
    optimizer = torch.optim.Adam([
    {'params': model.bert.encoder.layer[-1].parameters(), 'lr': 5e-5},
    {'params': model.bert.encoder.layer[-2].parameters(), 'lr': 5e-5},
    {'params': model.bert.encoder.layer[-3].parameters(), 'lr': 5e-5},
    {'params': model.bert.encoder.layer[-4].parameters(), 'lr': 5e-5},
    {'params': model.bert.encoder.layer[-5].parameters(), 'lr': 5e-5},
    {'params': model.bert.encoder.layer[-6].parameters(), 'lr': 5e-5},
    {'params': model.bert.encoder.layer[-7].parameters(), 'lr': 5e-5},
    {'params': model.bert.encoder.layer[-8].parameters(), 'lr': 5e-5},
    {'params': model.bert.encoder.layer[-9].parameters(), 'lr': 5e-5},
    {'params': model.bert.encoder.layer[-10].parameters(), 'lr': 5e-5},
    {'params': model.bert.encoder.layer[-11].parameters(), 'lr': 5e-5},
    {'params': model.bert.encoder.layer[-12].parameters(), 'lr': 5e-5},
    {'params': model.mlp.parameters(), 'lr': 1e-4}
    ])

    criterion = nn.CrossEntropyLoss()
    epochs = 20
    model_save_path = "./best_model.pth"
    
    # 创建保存目录
    os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
    
    # 训练和验证
    history = train_and_evaluate(
        model=model,
        train_loader=train_loader,
        val_loader=test_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
        epochs=epochs,
        model_save_path=model_save_path,
        eval_every=5  # 每轮都验证
    )
    
    print("\nTraining complete!")
    print(f"Best validation accuracy: {max(history['val_acc']):.4f}")

if __name__ == "__main__":
    main()


Epoch 1/20


                                                  

OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB. GPU 0 has a total capacity of 11.99 GiB of which 0 bytes is free. Of the allocated memory 25.58 GiB is allocated by PyTorch, and 395.46 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)