<a href="https://colab.research.google.com/github/PuchToTalk/FinBERT/blob/fine-tuning/Fine_Tuning_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers


Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m61.2 MB/s[0m eta [36m0:00:0

In [39]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Define your own dataset for text classification
# In this example, let's assume you have training_data and validation_data
# training_data should be a list of (text, label) pairs
# validation_data should be a list of (text, label) pairs

# Define the training data as a list of (text, label) pairs
training_data = [
    ("I love this person.", 1),             # Positive sentiment (label 1)
    ("I hate reading.", 0),                # Negative sentiment (label 0)
    ("The weather is beautiful today.", 1), # Positive sentiment (label 1)
    ("I'm feeling great.", 1),             # Positive sentiment (label 1)
    ("I don't like this movie.", 0),        # Negative sentiment (label 0)
    ("I enjoy spending time with my family.", 1),  # Positive sentiment (label 1)
    ("Studying can be boring.", 0),              # Negative sentiment (label 0)
    ("The food at that restaurant was delicious.", 1),  # Positive sentiment (label 1)
    ("I had a terrible day at work.", 0),      # Negative sentiment (label 0)
    ("I am a horrible person", 0)      # Negative sentiment (label 0)
    # Add more training data examples here...
]

# Define the validation data as a list of (text, label) pairs
validation_data = [
    ("I love this dog.", 1),               # Positive sentiment (label 1)
    ("I hate doing my homework.", 0),      # Negative sentiment (label 0)
    ("The sunset was breathtaking.", 1),   # Positive sentiment (label 1)
    ("I'm not in a good mood.", 0),        # Negative sentiment (label 0)
    ("This book is amazing!", 1),          # Positive sentiment (label 1)
    ("I enjoyed the concert last night.", 1),  # Positive sentiment (label 1)
    ("I can't stand the traffic in this city.", 0),  # Negative sentiment (label 0)
    ("The vacation was relaxing and fun.", 1),  # Positive sentiment (label 1)
    ("I'm frustrated with my computer.", 0),    # Negative sentiment (label 0)
    ("I am a awful person", 0)      # Negative sentiment (label 0)
    # Add more validation data examples here...
]


# Define the BERT model and tokenizer
model_name = "bert-base-uncased"  # You can choose other pre-trained models
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Change num_labels to match your classification task

# Tokenize and preprocess the data
def preprocess_data(data):
    inputs = [tokenizer.encode(text, add_special_tokens=True, max_length=128, pad_to_max_length=True) for text, _ in data]
    labels = [label for _, label in data]
    inputs = torch.tensor(inputs)
    labels = torch.tensor(labels)
    return inputs, labels

train_inputs, train_labels = preprocess_data(training_data)
val_inputs, val_labels = preprocess_data(validation_data)

# Create data loaders
batch_size = 32
train_dataset = TensorDataset(train_inputs, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = TensorDataset(val_inputs, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Define training parameters
learning_rate = 2e-5
num_epochs = 50  # You can adjust the number of epochs

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Fine-Tune the BERT model
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)  # Move the model to the specified device
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for inputs, labels in train_dataloader:
        inputs = inputs.to(device)  # Move inputs to the same device as the model
        labels = labels.to(device)  # Move labels to the same device as the model
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss:.4f}")

# Evaluate the model on the validation set
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in val_dataloader:
        inputs = inputs.to("cuda" if torch.cuda.is_available() else "cpu")
        labels = labels.to("cuda" if torch.cuda.is_available() else "cpu")
        outputs = model(inputs)
        _, predicted = torch.max(outputs.logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Validation Accuracy: {accuracy:.4f}")

# Save the fine-tuned model
model.save_pretrained("fine_tuned_bert")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/50, Loss: 0.7307
Epoch 2/50, Loss: 0.6575
Epoch 3/50, Loss: 0.7248
Epoch 4/50, Loss: 0.6236
Epoch 5/50, Loss: 0.6837
Epoch 6/50, Loss: 0.6357
Epoch 7/50, Loss: 0.5782
Epoch 8/50, Loss: 0.6480
Epoch 9/50, Loss: 0.5875
Epoch 10/50, Loss: 0.5165
Epoch 11/50, Loss: 0.4752
Epoch 12/50, Loss: 0.4949
Epoch 13/50, Loss: 0.4396
Epoch 14/50, Loss: 0.4192
Epoch 15/50, Loss: 0.3974
Epoch 16/50, Loss: 0.3918
Epoch 17/50, Loss: 0.3716
Epoch 18/50, Loss: 0.3469
Epoch 19/50, Loss: 0.3157
Epoch 20/50, Loss: 0.2757
Epoch 21/50, Loss: 0.2462
Epoch 22/50, Loss: 0.2451
Epoch 23/50, Loss: 0.2750
Epoch 24/50, Loss: 0.2466
Epoch 25/50, Loss: 0.2245
Epoch 26/50, Loss: 0.2063
Epoch 27/50, Loss: 0.2097
Epoch 28/50, Loss: 0.1817
Epoch 29/50, Loss: 0.1755
Epoch 30/50, Loss: 0.1847
Epoch 31/50, Loss: 0.1468
Epoch 32/50, Loss: 0.1522
Epoch 33/50, Loss: 0.1538
Epoch 34/50, Loss: 0.1467
Epoch 35/50, Loss: 0.1134
Epoch 36/50, Loss: 0.1115
Epoch 37/50, Loss: 0.1140
Epoch 38/50, Loss: 0.1300
Epoch 39/50, Loss: 0.

The output shows the training and validation progress for a BERT-based text classification model. Here's how to interpret the results:

**Epoch 1/3, Loss: 0.7588:**


> This indicates that the model has completed the first training epoch.
The loss of approximately 0.7588 is the average loss calculated over all batches in the training data during this epoch.
Loss measures how well the model is performing; a lower loss is better. It represents the error between the model's predictions and the actual labels.



**Epoch 2/3, Loss: 0.7083:**

> This shows the results after the second training epoch.
The loss has decreased to approximately 0.7083, which is expected during training as the model learns to make better predictions.

**Epoch 3/3, Loss: 0.6144:**

> This is the result after the third and final training epoch.
The loss has decreased further to approximately 0.6144, indicating that the model continues to improve.
Validation Accuracy: 0.5000:

The validation accuracy of 0.5000 means that, when evaluating the model on the validation dataset, it correctly predicted the labels for 50% of the examples.
Validation accuracy is a common metric used to evaluate classification models. In this case, it indicates that the model is performing at a random or chance level, as it's correctly classifying roughly half of the examples.



**Interpretation:**

The decreasing training loss across epochs is a positive sign, suggesting that the model is learning and improving its predictions on the training data.
However, the low validation accuracy of 0.5000 indicates that the model's performance on unseen data (validation data) is no better than random guessing. This suggests that the model might be underfitting or that the data and model architecture may require further tuning.


In [40]:
# Prepare input data
test_data = [
    ("I am happy.", 1),          # Positive sentiment
    ("I am sad.", 0),            # Negative sentiment
    ("The movie was great.", 1), # Positive sentiment
    ("This person was awful.", 0),  # Negative sentiment
    # Add more test data examples here...
]

test_inputs, test_labels = preprocess_data(test_data)

# Create a data loader for testing
test_dataset = TensorDataset(test_inputs, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Test the model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_dataloader:
        inputs = inputs.to(device)  # Move inputs to the same device as the model
        labels = labels.to(device)  # Move labels to the same device as the model
        outputs = model(inputs)
        _, predicted = torch.max(outputs.logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_accuracy = correct / total
print(f"Test Accuracy: {test_accuracy:.4f}")


Test Accuracy: 1.0000


In [41]:
print(predicted)

tensor([1, 0, 1, 0], device='cuda:0')


Load the fine-tuned model

In [34]:
model_path = "fine_tuned_bert"  # Path to the directory where the model is saved
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)


OSError: ignored