In [3]:
!pip install transformers
!pip install datasets



In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Raw CSV headers -> column names used by the rest of the notebook.
COLUMN_RENAMES = {"Data": "text", "Label": "label"}
REQUIRED_COLUMNS = ("text", "label")


def load_split(csv_path):
    """Read one CSV split and normalise its columns to 'text' and 'label'.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A pandas DataFrame in which 'Data'/'Label' headers (if present) have
        been renamed to 'text'/'label'.

    Raises:
        KeyError: If the 'text' or 'label' column is still missing after the
            rename. `DataFrame.rename` ignores unknown headers, so without
            this check a wrongly-labelled CSV would only fail later, during
            tokenization, with a much less helpful message.
    """
    frame = pd.read_csv(csv_path).rename(columns=COLUMN_RENAMES)
    missing = [name for name in REQUIRED_COLUMNS if name not in frame.columns]
    if missing:
        raise KeyError(
            f"{csv_path} is missing required column(s) {missing}; "
            f"found columns {list(frame.columns)}"
        )
    return frame


# Load both splits through the same code path so they cannot drift apart.
train_data = load_split("Train.csv")
test_data = load_split("Test.csv")

# Convert pandas DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

# Print to verify the feature names and row counts of each split
print(train_dataset)
print(test_dataset)


Dataset({
    features: ['text', 'label'],
    num_rows: 12575
})
Dataset({
    features: ['text', 'label'],
    num_rows: 1586
})


In [5]:
from transformers import AutoTokenizer

# Tokenizer matching the "bert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )


# Tokenize both splits in batched mode (train first, then test)
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Expose only the tensors consumed downstream, as PyTorch tensors
for split in (train_dataset, test_dataset):
    split.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/12575 [00:00<?, ? examples/s]

Map:   0%|          | 0/1586 [00:00<?, ? examples/s]

In [6]:
from torch.utils.data import DataLoader

# Batch both splits; only the training split is shuffled.
BATCH_SIZE = 16
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


In [7]:
from transformers import AutoModelForSequenceClassification

# Pre-trained BERT encoder with a sequence-classification head on top.
# Change num_labels based on your dataset.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# `transformers.AdamW` was deprecated and has been removed from recent
# transformers releases, so importing it fails on a fresh (unpinned) install.
# PyTorch's own implementation is the supported replacement.
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR

# Optimizer.
# eps and weight_decay are set explicitly to the defaults of the removed
# transformers implementation (eps=1e-6, weight_decay=0.0); torch's defaults
# differ (eps=1e-8, weight_decay=0.01) and would change the training run.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Learning rate scheduler: multiply the LR by 0.1 every 2 scheduler steps.
# NOTE(review): the training loop steps this scheduler once per epoch, so the
# LR falls from 5e-5 to 5e-8 over 8 epochs -- confirm this steep decay is intended.
scheduler = StepLR(optimizer, step_size=2, gamma=0.1)




In [9]:
import torch
from torch.nn import CrossEntropyLoss
from sklearn.metrics import accuracy_score, f1_score, recall_score

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Fine-tune for a fixed number of passes over the training set.
epochs = 8
for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for train_batch in train_dataloader:
        # Clear gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Inputs are moved to the device as they are handed to the model;
        # because labels are supplied, the model output carries the loss.
        outputs = model(
            input_ids=train_batch["input_ids"].to(device),
            attention_mask=train_batch["attention_mask"].to(device),
            labels=train_batch["label"].to(device),
        )
        loss = outputs.loss
        running_loss += loss.item()

        # Back-propagate and apply the parameter update.
        loss.backward()
        optimizer.step()

    # The learning-rate schedule advances once per epoch, not per batch.
    scheduler.step()

    # Report the mean per-batch loss for this epoch.
    mean_loss = running_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1} - Loss: {mean_loss}")


Epoch 1 - Loss: 1.0213277366476812
Epoch 2 - Loss: 0.9220760799470445
Epoch 3 - Loss: 0.824389477950016
Epoch 4 - Loss: 0.7867299586382834
Epoch 5 - Loss: 0.7593776813824364
Epoch 6 - Loss: 0.7551745777670057
Epoch 7 - Loss: 0.7505524290426997
Epoch 8 - Loss: 0.7489676909668148


In [10]:
# Put the model in evaluation mode before scoring the test split.
model.eval()

all_labels = []
all_preds = []

# Gradients are not needed for inference.
with torch.no_grad():
    for batch in test_dataloader:
        # Only the model inputs have to live on the device. The labels are
        # consumed by scikit-learn on the host, so they are not copied to the
        # device and back.
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"]

        # Forward pass without labels: only the logits are needed here.
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit along the class dimension.
        preds = torch.argmax(logits, dim=-1)

        # tolist() yields plain Python ints (copying from the device if needed).
        all_labels.extend(labels.tolist())
        all_preds.extend(preds.tolist())

# Metrics
accuracy = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds, average="weighted")
# NOTE(review): support-weighted recall is mathematically equal to accuracy for
# single-label classification, so this line always repeats the accuracy value.
# Consider average="macro" if a per-class view is wanted.
recall = recall_score(all_labels, all_preds, average="weighted")

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Recall: {recall}")


Accuracy: 0.6122320302648171
F1 Score: 0.6052023540606403
Recall: 0.6122320302648171


In [11]:
# Write the fine-tuned model and its tokenizer into the same directory.
output_dir = "saved_bert_model"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


('saved_bert_model/tokenizer_config.json',
 'saved_bert_model/special_tokens_map.json',
 'saved_bert_model/vocab.txt',
 'saved_bert_model/added_tokens.json',
 'saved_bert_model/tokenizer.json')