# In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from transformers import AdamW, get_linear_schedule_with_warmup
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Debugging aid: ask CUDA for blocking launches so that error messages point
# at the call that actually failed.
os.environ.update({"CUDA_LAUNCH_BLOCKING": "1"})

# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

class CustomDataset(Dataset):
    """Map-style dataset that yields tokenized sentences with integer labels.

    Text is read from the ``'sentence1'`` column and the class from the
    ``'label'`` column of the wrapped dataframe.  A missing label, or one
    outside ``[0, num_labels)``, is reported on stdout and replaced by ``0``.
    """

    def __init__(self, dataframe, tokenizer, max_length, num_labels):
        """Keep references to the data source and the tokenization settings.

        Args:
            dataframe: Table supporting ``len()`` and ``.iloc[i][column]``.
            tokenizer: Object exposing an ``encode_plus`` method.
            max_length: Length every sequence is padded or truncated to.
            num_labels: Number of classes; valid labels are
                ``0 .. num_labels - 1``.
        """
        self.num_labels = num_labels
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.data = dataframe

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the encoded sample stored at positional row ``index``.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors and
            a scalar ``torch.long`` ``'label'`` tensor.
        """
        row = self.data.iloc[index]
        text = str(row['sentence1'])  # Assuming 'sentence1' is the column name
        raw_label = row['label']

        # A missing label falls back to class 0; anything else is cast to int.
        if pd.isnull(raw_label):
            print(f"NaN value found in 'label' column at index: {index}")
            label = 0
        else:
            label = int(raw_label)

        # Labels outside the valid class range also fall back to class 0.
        if not 0 <= label < self.num_labels:
            print(f"Invalid label {label} at index: {index}")
            label = 0

        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )

        # The tokenizer returns tensors with a leading batch axis; flatten it
        # away so the DataLoader can stack samples itself.
        sample = {
            'input_ids': torch.flatten(encoded['input_ids']),
            'attention_mask': torch.flatten(encoded['attention_mask']),
            'label': torch.tensor(label, dtype=torch.long),
        }
        return sample

# Number of target classes; handed to the model as `num_labels` below.
num_labels = 4

# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint, then move the model onto the selected device.
tokenizer = AutoTokenizer.from_pretrained("aplycaebous/tb-BERT-fpt")
model = AutoModelForSequenceClassification.from_pretrained(
    "aplycaebous/tb-BERT-fpt",
    num_labels=num_labels,
)
model = model.to(device)

# Read the three CSV splits (train / validation / test), in that order.
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/sentiment01/train_data1.csv',  # Update with your file path
        '/kaggle/input/sentiment01/validation_data1.csv',  # Update with your file path
        '/kaggle/input/sentiment01/test_data1.csv',  # Update with your file path
    )
)

# Wrap each split in a CustomDataset with identical tokenization settings.
train_dataset, val_dataset, test_dataset = (
    CustomDataset(frame, tokenizer, max_length=64, num_labels=num_labels)  # Update max_length as needed
    for frame in (train_data, val_data, test_data)
)

# Only the training loader shuffles; validation and test keep file order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader, test_loader = (
    DataLoader(split, batch_size=32) for split in (val_dataset, test_dataset)
)

# Define optimizer and scheduler.
# torch.optim.AdamW is used instead of the AdamW class shipped with
# transformers, which that library deprecated in favour of the PyTorch
# implementation.  `eps` and `weight_decay` are pinned to the values the
# transformers class used by default (PyTorch's own defaults are 1e-8 and
# 0.01), so the optimisation settings stay as they were.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1.25e-6,
    eps=1e-6,
    weight_decay=0.0,
)
epochs = 15
# The scheduler is stepped once per batch, so its horizon is
# (batches per epoch) * (number of epochs).
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
# NOTE: the training loop reads the loss returned by the model itself
# (`outputs.loss`); `loss_fn` is kept for any code that wants an explicit
# criterion.
loss_fn = torch.nn.CrossEntropyLoss()

# Per-epoch accuracy history, consumed by the accuracy plot after training.
train_accuracies = []
val_accuracies = []

# Training loop: each epoch runs one optimisation pass over `train_loader`
# followed by one evaluation pass over `val_loader`.
for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    correct_train_preds = 0
    total_train_preds = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        # Passing `labels` makes the model return its own loss.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        # Running training accuracy, taken from the same forward pass that
        # drives the update (i.e. measured in train mode).
        preds = torch.argmax(outputs.logits, dim=1)
        correct_train_preds += (preds == labels).sum().item()
        total_train_preds += labels.size(0)

        loss.backward()
        optimizer.step()
        scheduler.step()  # learning-rate schedule advances once per batch

    avg_train_loss = total_train_loss / len(train_loader)
    train_accuracy = correct_train_preds / total_train_preds
    train_accuracies.append(train_accuracy)

    # Evaluation on validation set
    model.eval()
    val_preds = []
    val_labels = []
    # One no_grad context for the whole pass instead of one per batch.
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            val_preds.extend(preds.tolist())
            val_labels.extend(labels.tolist())

    val_accuracy = accuracy_score(val_labels, val_preds)
    val_accuracies.append(val_accuracy)

    # zero_division=0 yields the same numbers as the default but without an
    # UndefinedMetricWarning each epoch in which some class is never
    # predicted (or never present).
    precision, recall, f1, _ = precision_recall_fscore_support(
        val_labels, val_preds, average='weighted', zero_division=0
    )

    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Val Accuracy: {val_accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

# Plot training vs validation accuracy.
# The x-axes follow the number of accuracies actually recorded rather than the
# configured `epochs`, so the figure still renders (instead of failing on an
# x/y length mismatch) when training was stopped before the last epoch.
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(train_accuracies) + 1), train_accuracies, label='Train Accuracy')
plt.plot(range(1, len(val_accuracies) + 1), val_accuracies, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Train Accuracy vs Validation Accuracy')
plt.legend()
plt.grid()
plt.show()

# Evaluation on test set
# Switch to eval mode explicitly rather than relying on the mode left behind
# by the last validation pass of the training loop.
model.eval()
test_preds = []
test_labels = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        test_preds.extend(preds.tolist())
        test_labels.extend(labels.tolist())

test_accuracy = accuracy_score(test_labels, test_preds)
# zero_division=0 keeps the default's numeric result while avoiding an
# UndefinedMetricWarning when some class is never predicted.
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels, test_preds, average='weighted', zero_division=0
)
# Fix the label set so the matrix is always num_labels x num_labels, even if a
# class is absent from both the test labels and the predictions.
conf_matrix = confusion_matrix(test_labels, test_preds, labels=list(range(num_labels)))

print(f"Testing Accuracy: {test_accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

# Draw the test-set confusion matrix as an annotated heatmap
# (rows: true class, columns: predicted class).
plt.figure(figsize=(10, 7))
heatmap_ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('True')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

# Save model weights (the state_dict only, not the full model object) to a
# .pth file.
output_dir = '/kaggle/working/model/'
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# replacing the separate exists-then-create check.
os.makedirs(output_dir, exist_ok=True)
weights_path = os.path.join(output_dir, 'fp_bert_model.pth')
torch.save(model.state_dict(), weights_path)
print("Model weights saved to:", weights_path)
