In [None]:
!pip install transformers

In [None]:
import gc 
def report_gpu():
    """Print the GPU process listing, then release unused cached memory.

    The listing is printed first; afterwards Python garbage is collected and
    the CUDA caching allocator is asked to free its unused blocks.
    Returns None.
    """
    process_listing = torch.cuda.list_gpu_processes()
    print(process_listing)

    # Collect unreachable Python objects before emptying the CUDA cache.
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch.nn as nn
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
# Kaggle input locations of the preprocessed review CSVs (train and test files).
train_file_path = "/kaggle/input/review/preprocessed_train_data.csv"
test_file_path = "/kaggle/input/review/preprocessed_test_data.csv"

In [None]:
# Read both CSVs into DataFrames; later cells use their 'text' and 'label' columns.
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

In [None]:
# Hold out 10% of the training rows for validation. The split is seeded so that
# a Restart & Run All reproduces the same train/validation partition.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_data['text'],
    train_data['label'],
    test_size=0.1,
    random_state=42,
)

In [None]:
# Load the WordPiece tokenizer that matches the 'bert-base-uncased' checkpoint.
# `download=True` was dropped: it is not a `from_pretrained` option and was only
# being forwarded into the tokenizer's stored init kwargs.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes review texts for sequence classification.

    Rows whose text or label is missing are dropped *together*, so the text at
    position ``i`` always belongs to the label at position ``i``.

    Reviews longer than ``max_len`` tokens (special tokens included) are
    shortened with a head+tail strategy: the first ``head_len`` token ids and
    the last ``max_len - head_len`` token ids are kept, which preserves both
    the opening and the closing special token. Shorter reviews are
    right-padded with the tokenizer's pad id.

    Args:
        texts: Sequence of review strings (pandas Series or any list-like).
        labels: Sequence of class ids, same length as ``texts``.
        tokenizer: Callable tokenizer returning a mapping with an
            ``"input_ids"`` list and exposing ``pad_token_id``.
        max_len: Length of every returned ``input_ids`` / ``attention_mask``.
        head_len: Number of leading token ids kept for over-long reviews.
            Defaults to the 201/311 head/tail proportion scaled to
            ``max_len`` (exactly 201 when ``max_len`` is 512).

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length, or if
            ``head_len`` does not satisfy ``0 < head_len < max_len``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512, head_len=None):
        if max_len < 2:
            raise ValueError("max_len must be at least 2")
        if head_len is None:
            # Scale the 201-of-512 head share to the requested max_len.
            head_len = max(1, (max_len * 201) // 512)
        if not 0 < head_len < max_len:
            raise ValueError("head_len must satisfy 0 < head_len < max_len")

        # Work positionally: the inputs may carry a shuffled index (e.g. after
        # train_test_split), and items are looked up with .iloc below.
        texts = pd.Series(texts).reset_index(drop=True)
        labels = pd.Series(labels).reset_index(drop=True)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )

        # Drop a row if either side is missing, so pairs never shift apart.
        keep = texts.notna() & labels.notna()
        self.texts = texts[keep].reset_index(drop=True)
        self.labels = labels[keep].reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.head_len = head_len
        self.tail_len = max_len - head_len

    def __len__(self):
        """Return the number of (text, label) pairs left after dropping missing rows."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tensors for one example.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (both 1-D long
            tensors of length ``max_len``) and ``labels`` (0-D long tensor).

        Raises:
            IndexError: If ``idx`` is out of range.
        """
        text = str(self.texts.iloc[idx])
        label = int(self.labels.iloc[idx])

        # Tokenize WITHOUT truncation so the tail of a long review is still
        # available; the head+tail cut below enforces the length limit.
        # (Hugging Face tokenizers may log a length warning here; the ids are
        # shortened before they are returned.)
        encoded = self.tokenizer(text, add_special_tokens=True, truncation=False)
        token_ids = list(encoded["input_ids"])

        if len(token_ids) > self.max_len:
            token_ids = token_ids[:self.head_len] + token_ids[-self.tail_len:]

        # Right-pad to a uniform length; padded positions get attention 0.
        n_tokens = len(token_ids)
        n_pad = self.max_len - n_tokens
        pad_id = self.tokenizer.pad_token_id
        input_ids = torch.tensor(token_ids + [pad_id] * n_pad, dtype=torch.long)
        attention_mask = torch.tensor([1] * n_tokens + [0] * n_pad, dtype=torch.long)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Wrap each split in a SentimentDataset; all three share the same tokenizer
# and the dataset's default maximum length.
train_dataset = SentimentDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
)
val_dataset = SentimentDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
)
test_dataset = SentimentDataset(
    texts=test_data['text'],
    labels=test_data['label'],
    tokenizer=tokenizer,
)

In [None]:
# Pretrained BERT encoder with a 5-class sequence-classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)
#model.gradient_checkpointing_enable()

# Replace the head with a freshly constructed linear layer (hidden size -> 5 classes).
model.classifier = nn.Linear(model.config.hidden_size, 5)

# Freeze the whole network first ...
model.requires_grad_(False)

# ... then turn gradients back on for the classifier head only.
model.classifier.requires_grad_(True)

In [None]:
# Cross-entropy criterion for the classification task.
# NOTE(review): `loss_fn` is not referenced by any later cell visible in this
# notebook — confirm whether it is still needed.
loss_fn = nn.CrossEntropyLoss()

In [None]:
# AdamW over the classifier head's parameters only.
# NOTE(review): this optimizer is handed to save_checkpoint/load_checkpoint
# later, but it is not passed to the Trainer constructor — confirm which
# optimizer actually drives training.
optimizer = AdamW(model.classifier.parameters(), lr=2.5e-5)

In [None]:
import os
# Directories used for Trainer output (checkpoints) and for logs.
output_dir = "/kaggle/working/results"
logging_dir = "/kaggle/working/logs"
# exist_ok=True makes this cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(logging_dir, exist_ok=True)

In [None]:
import shutil

# Start from empty directories: delete whatever a previous run left behind
# (errors such as "directory missing" are ignored), then recreate each one.
# Note that this also removes any checkpoints stored under `output_dir`.
shutil.rmtree(output_dir, ignore_errors=True)
os.makedirs(output_dir, exist_ok=True)

shutil.rmtree(logging_dir, ignore_errors=True)
os.makedirs(logging_dir, exist_ok=True)

In [None]:
# Configuration for the Hugging Face Trainer: 3 epochs, batches of 8 per device
# for both training and evaluation, 500 warm-up steps, and an evaluation
# strategy of "epoch". Checkpoints go to `output_dir`, logs to `logging_dir`.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=logging_dir,
    learning_rate=2.5e-5,
    evaluation_strategy="epoch",
    save_total_limit=1,  # Keep only the last checkpoint
    report_to="none", 
)

In [None]:
# Wire the model, the training arguments and the train/validation datasets into
# a Trainer. No optimizer, data collator or metrics function is passed here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
def save_checkpoint(model, optimizer, epoch, filename='checkpoint.pth'):
    """Write the model state, optimizer state and epoch counter to disk.

    Args:
        model: Object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
        optimizer: Object exposing ``state_dict()`` (e.g. a torch optimizer).
        epoch: Epoch counter stored alongside the states.
        filename: Target file. If it names a directory (it already exists as
            one, or ends with a path separator), the checkpoint is written to
            ``checkpoint.pth`` inside that directory instead of failing.

    Missing parent directories are created. The path actually written is
    printed.
    """
    target = os.fspath(filename)

    # A directory-style target cannot be opened as a file; fall back to the
    # default file name inside it.
    if target.endswith(("/", os.sep)) or os.path.isdir(target):
        target = os.path.join(target, 'checkpoint.pth')

    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)

    state = {
        'epoch': epoch,
        'model_state': model.state_dict(),
        'optimizer_state': optimizer.state_dict(),
    }
    torch.save(state, target)
    print(f"Checkpoint saved to {target}")

In [None]:
def load_checkpoint(filename, model, optimizer):
    """Restore model and optimizer state from a checkpoint file, if present.

    Args:
        filename: Checkpoint file, or a directory that contains a
            ``checkpoint.pth`` file.
        model: Object whose ``load_state_dict`` receives the ``'model_state'`` entry.
        optimizer: Object whose ``load_state_dict`` receives the
            ``'optimizer_state'`` entry.

    Returns:
        The ``'epoch'`` value stored in the checkpoint, or 0 when no
        checkpoint file exists at the given location.
    """
    source = os.fspath(filename)

    # Accept a directory and look for the default checkpoint file inside it.
    if os.path.isdir(source):
        source = os.path.join(source, 'checkpoint.pth')

    if not os.path.isfile(source):
        print(f"No checkpoint found at {filename}")
        return 0

    print(f"Loading checkpoint {source}")
    # Load onto CPU first so a checkpoint saved on a GPU machine can still be
    # read without CUDA; load_state_dict copies values into the existing tensors.
    checkpoint = torch.load(source, map_location="cpu")
    model.load_state_dict(checkpoint['model_state'])
    optimizer.load_state_dict(checkpoint['optimizer_state'])
    epoch = checkpoint['epoch']
    print(f"Checkpoint loaded: start from epoch {epoch}")
    return epoch

In [None]:
# Location used for the manual checkpoint; `start_epoch` is whatever epoch
# load_checkpoint reports (0 when nothing is found).
# NOTE(review): this is a directory-style path (trailing '/'), while
# load_checkpoint is written around a checkpoint *file* — confirm the loader
# supports that, or point this at the .pth file itself.
checkpoint_path = "/kaggle/working/results/checkpoint-31000/"
start_epoch = load_checkpoint(checkpoint_path, model, optimizer)

In [None]:
# Trainer.train() already runs every epoch configured in `training_args`, so it
# must be called once. Calling it inside a per-epoch loop repeated the whole
# schedule `num_train_epochs` times.
total_epochs = int(training_args.num_train_epochs)

if start_epoch < total_epochs:
    trainer.train()

    # `checkpoint_path` may be a directory-style path; torch.save needs a file,
    # so store the manual checkpoint as `checkpoint.pth` inside it in that case.
    if checkpoint_path.endswith(("/", os.sep)) or os.path.isdir(checkpoint_path):
        manual_checkpoint = os.path.join(checkpoint_path, "checkpoint.pth")
    else:
        manual_checkpoint = checkpoint_path
    os.makedirs(os.path.dirname(manual_checkpoint) or ".", exist_ok=True)

    # Record the number of completed epochs so a later resume has nothing left to do.
    save_checkpoint(model, optimizer, total_epochs, manual_checkpoint)

# Evaluate the model on the held-out test set
trainer.evaluate(test_dataset)

In [None]:
import torch
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers.trainer_utils import get_last_checkpoint

In [None]:
# Checkpoint directory that the reload cell below reads config.json,
# model.safetensors, optimizer.pt and scheduler.pt from.
checkpoint_path = "/kaggle/working/results/checkpoint-31000/"

In [None]:
# Rebuild the model from the checkpoint directory. `from_pretrained` reads the
# safetensors weights itself; `torch.load` cannot parse a .safetensors file.
config = BertConfig.from_pretrained(checkpoint_path + "config.json")
model = BertForSequenceClassification.from_pretrained(checkpoint_path, config=config)

# Restore the optimizer state (loaded onto CPU so no GPU is required).
# NOTE(review): optimizer.pt presumably comes from the optimizer the Trainer
# built for itself; its parameter groups may not match a single-group AdamW over
# all model parameters — confirm before relying on this restore.
optimizer = AdamW(model.parameters(), lr=5e-5)
optimizer.load_state_dict(torch.load(checkpoint_path + "optimizer.pt", map_location="cpu"))

# NOTE(review): scheduler.pt presumably holds a scheduler *state dict*, not a
# scheduler object; it would need to be fed to a freshly built scheduler's
# load_state_dict() before use — verify.
scheduler = torch.load(checkpoint_path + "scheduler.pt", map_location="cpu")