In [14]:
!pip install transformers numpy pandas scikit-learn tqdm chardet



In [15]:
# import os
# os.chdir('/content/drive/MyDrive/ML_Midterm')

In [16]:
import pandas as pd

# Load the raw dataset: column 'x' is used as the input text, 'y' as the label.
data = pd.read_csv('dataset.csv')

# Rows with a missing text or label cannot be tokenized / encoded later on,
# so drop them up front and renumber the index.
data = data.dropna(subset=['x', 'y']).reset_index(drop=True)

# The tokenizer only accepts strings, but pandas may parse purely numeric
# cells as numbers; coerce the text column explicitly.
texts = data['x'].astype(str)
labels = data['y']

In [17]:
from sklearn.preprocessing import LabelEncoder

# Learn the label vocabulary first, then map every label to its integer id.
# `labels` is rebound from the raw column to the encoded array.
encoder = LabelEncoder()
encoder.fit(labels)
labels = encoder.transform(labels)

In [18]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation/test partition is identical on every run;
# without it every kernel restart evaluates on a different held-out set.
SPLIT_SEED = 42

# 70 % of the samples go to training; the remaining 30 % are held out.
# Stratifying keeps the class proportions equal in both parts.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts, labels, stratify=labels, test_size=0.3, random_state=SPLIT_SEED
)

# Split the held-out part in half: validation and test (15 % each overall).
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, stratify=temp_labels, test_size=0.5, random_state=SPLIT_SEED
)

In [19]:
from transformers import BertTokenizer

# Tokenizer loaded from the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_texts(texts, max_length=32):
    """Tokenize a collection of texts into fixed-length PyTorch tensors.

    Args:
        texts: A pandas Series, list, or any other iterable of strings.
        max_length: Length (special tokens included) that every sequence is
            padded or truncated to. Defaults to 32, the value this notebook
            has always used.

    Returns:
        The tokenizer output for the whole collection, with tensors returned
        in PyTorch format ('pt'); its 'input_ids' entry is what the rest of
        the notebook consumes.
    """
    return tokenizer(
        # list() accepts Series, lists and generators alike, whereas
        # .tolist() only exists on pandas/NumPy objects.
        list(texts),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

# Tokenize the three splits; the generator yields them in the same order as
# the unpacking targets on the left-hand side.
train_encodings, val_encodings, test_encodings = (
    encode_texts(split) for split in (train_texts, val_texts, test_texts)
)

In [20]:
def create_attention_masks(encodings):
    """Build attention masks from token ids.

    A position gets 1.0 when its token id is greater than 0 and 0.0
    otherwise (this assumes id 0 is the padding token -- confirm against
    the tokenizer in use).

    Args:
        encodings: Mapping with an 'input_ids' entry that is either a 2-D
            torch tensor or a (possibly ragged) sequence of sequences of ints.

    Returns:
        A list of lists of floats with the same layout as 'input_ids'.
    """
    # Local import: this cell runs before the notebook-level `import torch`.
    import torch

    input_ids = encodings['input_ids']

    if isinstance(input_ids, torch.Tensor):
        # One vectorized comparison instead of a Python-level loop over
        # every single tensor element.
        return (input_ids > 0).float().tolist()

    # Plain Python input (may be ragged, so it cannot be tensorised blindly).
    return [[float(token_id > 0) for token_id in seq] for seq in input_ids]

# One mask set per split, produced in the same order as the unpacking targets.
train_masks, val_masks, test_masks = (
    create_attention_masks(split_encodings)
    for split_encodings in (train_encodings, val_encodings, test_encodings)
)

In [21]:
import torch
from torch.utils.data import TensorDataset, DataLoader

def _build_dataset(encodings, masks, labels):
    """Bundle one split into a TensorDataset of (input_ids, mask, label).

    Args:
        encodings: Tokenizer output holding an 'input_ids' tensor.
        masks: Nested list of attention-mask values for the same samples.
        labels: Integer class ids for the same samples.

    Returns:
        A TensorDataset whose items are (input_ids, attention_mask, label).
    """
    return TensorDataset(
        encodings['input_ids'],
        torch.tensor(masks),
        # Force int64 targets: classification losses expect long labels, and
        # the platform's default NumPy integer may be 32-bit.
        torch.tensor(labels, dtype=torch.long),
    )


train_dataset = _build_dataset(train_encodings, train_masks, train_labels)
val_dataset = _build_dataset(val_encodings, val_masks, val_labels)
test_dataset = _build_dataset(test_encodings, test_masks, test_labels)

# Only the training data is shuffled; validation/test keep their order so
# predictions stay aligned with the labels.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [22]:
from transformers import BertForSequenceClassification

# Size the classification head from the fitted label encoder instead of a
# hard-coded count, so the model always matches the classes in the dataset.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(encoder.classes_),
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Fine-tuning hyperparameters.
# 1e-3 is far too aggressive for updating pre-trained BERT weights; the range
# commonly recommended for fine-tuning is 2e-5 to 5e-5.
learning_rate = 2e-5
# Kept in sync with the batch size the DataLoaders are built with (16).
batch_size = 16
num_epochs = 3

In [24]:
from transformers import BertForSequenceClassification

# NOTE(review): this re-creates the classifier that an earlier cell already
# instantiated; it is kept so the cell still yields a fresh `model`, but one of
# the two constructions can be dropped.
# The head size is taken from the fitted label encoder rather than hard-coded.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(encoder.classes_),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# AdamW comes from torch: the implementation that used to ship with
# transformers is deprecated and no longer importable in recent releases.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# eps and weight_decay are pinned to the defaults of the former transformers
# AdamW so the optimization behaves as it did with the old class.
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

# One scheduler step is taken per batch, so the schedule spans
# (batches per epoch) * (number of epochs) steps, with no warm-up phase.
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)



In [26]:
import torch

# Train on the GPU when one is present, otherwise stay on the CPU. The model
# and every batch must live on the same device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    print("Starting epoch ",epoch)
    model.train()
    total_loss = 0

    for batch in train_loader:
        # Each batch is (input_ids, attention_mask, labels); move all three
        # to the model's device before the forward pass.
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Passing `labels` makes the model compute the loss itself.
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

    # Mean loss over all batches of this epoch.
    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}, Loss: {avg_train_loss}')

Starting epoch  0


In [None]:
import numpy as np

# Put the model into evaluation mode before running inference.
model.eval()

# Accumulators filled batch by batch during evaluation.
predictions = []
true_labels = []

In [None]:
# Make the cell self-contained and safe to re-run: evaluation mode is set and
# the accumulators are reset here, so repeated runs never append duplicates.
model.eval()
predictions, true_labels = [], []

# Follow the model instead of assuming a GPU: batches are sent to whatever
# device the model's parameters currently live on.
eval_device = next(model.parameters()).device

with torch.no_grad():
    for batch in val_loader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(eval_device) for t in batch)

        # No labels are passed, so the model returns logits only.
        outputs = model(b_input_ids, attention_mask=b_input_mask)
        logits = outputs.logits

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Stack the per-batch logits, then take the highest-scoring class per sample.
flat_predictions = np.concatenate(predictions, axis=0).argmax(axis=1).flatten()

# Stack the per-batch ground-truth labels the same way.
flat_true_labels = np.concatenate(true_labels, axis=0)

# Overall fraction of correctly classified samples.
accuracy = accuracy_score(flat_true_labels, flat_predictions)

# Support-weighted averages across classes; the fourth return value is unused.
precision, recall, f1, _ = precision_recall_fscore_support(
    flat_true_labels,
    flat_predictions,
    average='weighted',
)

print(f'Validation Accuracy: {accuracy}')
print(f'Validation Precision: {precision}')
print(f'Validation Recall: {recall}')
print(f'Validation F1-Score: {f1}')