In [None]:
!pip install transformers numpy pandas scikit-learn tqdm

In [None]:
import os
import zipfile
import pandas as pd

In [None]:
# Read the CSV and pull out the two columns used below:
# 'x' holds the input texts, 'y' holds the raw labels.
data = pd.read_csv('dataset.csv')
texts, labels = data['x'], data['y']

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the raw label column as integers.
# The encoder is fitted on data['y'] (not on `labels`) so that re-running this
# cell never re-encodes already-encoded integers, which would overwrite
# encoder.classes_ and lose the original class names.
encoder = LabelEncoder()
labels = encoder.fit_transform(data['y'])

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 70/30 split. random_state is fixed so the split (and therefore
# the reported validation metrics) is the same on every run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, stratify=labels, test_size=0.3, random_state=42
)

In [None]:
from transformers import BertTokenizer

# Load the tokenizer for the 'bert-base-uncased' checkpoint; the same checkpoint
# name is used when the classification model is created further down.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def convert_to_token(texts, max_length=32):
    """Tokenize a collection of texts with the module-level `tokenizer`.

    Args:
        texts: The texts to tokenize. Objects exposing ``.tolist()`` (such as
            a pandas Series) are converted with it; any other iterable is
            materialised with ``list()``.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 32, the value this notebook used originally.

    Returns:
        Whatever `tokenizer` returns for the batch, requested as PyTorch
        tensors (``return_tensors='pt'``).
    """
    # Accept plain lists/tuples too, not only objects with .tolist().
    text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
    return tokenizer(
        text_list,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

# Tokenize the training split first, then the validation split, with the same settings.
train_encodings, val_encodings = (
    convert_to_token(split) for split in (train_texts, val_texts)
)

In [None]:
def create_attention_masks(encodings):
    """Return one mask row per sequence in ``encodings['input_ids']``.

    Each mask entry is 1.0 where the corresponding token id is greater
    than zero and 0.0 otherwise. The result is a list of lists of floats.
    """
    masks = []
    for token_ids in encodings['input_ids']:
        row = []
        for token_id in token_ids:
            row.append(float(token_id > 0))
        masks.append(row)
    return masks

# Build the attention masks for both splits with the helper defined above.
# (The original call used the misspelled name `create_attention_masksmasks`,
# which raises NameError.)
train_masks = create_attention_masks(train_encodings)
val_masks = create_attention_masks(val_encodings)


In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_dataset = TensorDataset(
    train_encodings['input_ids'], torch.tensor(train_masks), torch.tensor(train_labels)
)
val_dataset = TensorDataset(
    val_encodings['input_ids'], torch.tensor(val_masks), torch.tensor(val_labels)
)

# Only the training batches are shuffled; validation keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

In [None]:
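# Optional: uncomment to save the fine-tuned model to disk. Only run this after
# the model has been created and trained in the cells further down.
#model.save_pretrained("/kaggle/working/mybertmodel")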

In [None]:
from transformers import BertForSequenceClassification

# Load the pretrained checkpoint with a fresh classification head.
# The head is sized from the fitted label encoder instead of a hard-coded 5,
# so it always matches the number of classes actually present in the data.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(encoder.classes_),
    output_attentions=False,
    output_hidden_states=False,
)

In [None]:
# Training hyperparameters.
# learning_rate and num_epochs are read by the optimizer/scheduler and the
# training loop below.
# NOTE(review): batch_size is not read by the DataLoader calls in this notebook,
# which pass batch_size=16 literally.
learning_rate, batch_size, num_epochs = 1e-5, 128, 4

In [None]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# Use PyTorch's AdamW: the AdamW class shipped with transformers was deprecated
# and is no longer importable from recent releases.
# eps and weight_decay are passed explicitly to keep the defaults of the old
# transformers implementation (torch's own defaults are 1e-8 and 0.01).
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

# Linear decay of the learning rate to zero over all optimizer steps, with no warmup.
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [None]:
import torch

# Train on the GPU when one is available. The model and every batch must live
# on the same device; previously neither was moved before training, so the
# loop ran on the CPU even when a GPU was present.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        # Each batch is (input_ids, attention_mask, labels), as built by TensorDataset.
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        model.zero_grad()

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        # The scheduler is stepped once per batch (total_steps counts batches).
        scheduler.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}, Loss: {avg_train_loss}')

In [33]:
# Use CUDA when PyTorch can see a GPU, otherwise the CPU, and place the model there.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [34]:
import numpy as np

# Put the model in evaluation mode before running validation.
model.eval()

# Per-batch accumulators filled by the validation loop.
predictions = []
true_labels = []

In [35]:
# Start from empty accumulators so that re-running this cell does not append a
# second copy of the validation results.
predictions, true_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        # Move the batch to the device selected earlier instead of a hard-coded
        # 'cuda', which fails on machines without a GPU.
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        # No labels are passed here, so only the logits are used.
        outputs = model(b_input_ids, attention_mask=b_input_mask)
        logits = outputs.logits

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)

In [36]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Join the per-batch logits into one array and take the highest-scoring class per row.
flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1).flatten()

# Join the per-batch label arrays the same way.
flat_true_labels = np.concatenate(true_labels, axis=0)

# Accuracy, plus precision / recall / F1 using the 'weighted' average.
accuracy = accuracy_score(flat_true_labels, flat_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(
    flat_true_labels, flat_predictions, average='weighted'
)

print(
    f'Validation Accuracy: {accuracy}',
    f'Validation Precision: {precision}',
    f'Validation Recall: {recall}',
    f'Validation F1-Score: {f1}',
    sep='\n',
)

Validation Accuracy: 0.36972193614830073
Validation Precision: 0.377629353764366
Validation Recall: 0.36972193614830073
Validation F1-Score: 0.3623859158368761
