<a href="https://colab.research.google.com/github/PuchToTalk/FinBERT/blob/fine-tuning/Fine_Tuning_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Toy dataset for binary text classification.
# Each split is a list of (text, label) pairs; swap these placeholder
# examples for your own training and validation data.
_NEGATIVE, _POSITIVE = 0, 1

training_data = [
    ("This is a positive sentence.", _POSITIVE),
    ("This is a negative sentence.", _NEGATIVE),
    # Add more data...
]

validation_data = [
    ("Another positive example.", _POSITIVE),
    ("Another negative example.", _NEGATIVE),
    # Add more data...
]

# Load the pre-trained checkpoint: its tokenizer plus a BERT encoder topped
# with a sequence-classification head.
model_name = "bert-base-uncased"  # other pre-trained checkpoints can be used here
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,  # set this to the number of classes in your task
)

# Tokenize and preprocess the data
def preprocess_data(data, max_length=128):
    """Convert (text, label) pairs into padded token-id and label tensors.

    Args:
        data: Re-iterable collection of ``(text, label)`` pairs.
        max_length: Fixed sequence length. Shorter texts are padded up to it
            and longer texts are truncated down to it.

    Returns:
        A tuple ``(inputs, labels)`` of ``torch.long`` tensors. For non-empty
        ``data`` ``inputs`` has one row of ``max_length`` token ids per
        example and ``labels`` holds the matching class ids.
    """
    inputs = [
        tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=max_length,
            # `padding="max_length"` replaces the deprecated
            # `pad_to_max_length=True`.
            padding="max_length",
            # Without truncation an over-long text yields a longer row and
            # the rows can no longer be stacked into one tensor.
            truncation=True,
        )
        for text, _ in data
    ]
    labels = [label for _, label in data]
    # Pin the dtype so ids and labels are integer tensors even when `data`
    # is empty (an empty list would otherwise be inferred as float32).
    inputs = torch.tensor(inputs, dtype=torch.long)
    labels = torch.tensor(labels, dtype=torch.long)
    return inputs, labels

# Tokenize both splits into (token ids, labels) tensors.
train_inputs, train_labels = preprocess_data(training_data)
val_inputs, val_labels = preprocess_data(validation_data)

# Create data loaders
batch_size = 32
train_dataset = TensorDataset(train_inputs, train_labels)
# Only the training split is shuffled; validation order does not matter.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = TensorDataset(val_inputs, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Define training parameters
learning_rate = 2e-5
num_epochs = 3  # You can adjust the number of epochs

# Define loss and optimizer
# NOTE: `criterion` is not used by the training loop below, which reads the
# loss from the model output when `labels` are passed; it is kept for any
# code that computes the loss from raw logits.
criterion = nn.CrossEntropyLoss()
# Use PyTorch's AdamW: the `AdamW` class shipped with transformers is
# deprecated in favour of it. `eps` and `weight_decay` are set explicitly to
# the deprecated class's defaults so the optimisation behaves as before.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)

# Fine-tune the BERT model
# Resolve the target device once instead of re-evaluating it for every batch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Use GPU if available
model.to(device)
model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for inputs, labels in train_dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Inputs are padded to a fixed length; mask out the padding positions
        # so the model does not attend to them.
        attention_mask = (inputs != tokenizer.pad_token_id).long()
        optimizer.zero_grad()
        # Passing `labels` makes the model compute the loss itself.
        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Guard against an empty dataloader (no batches) to avoid dividing by zero.
    average_loss = total_loss / max(len(train_dataloader), 1)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss:.4f}")

# Evaluate the model on the validation set
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.eval()
correct = 0
total = 0
with torch.no_grad():  # no gradients are needed for evaluation
    for inputs, labels in val_dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Inputs are padded to a fixed length; mask out the padding positions
        # so the model does not attend to them.
        attention_mask = (inputs != tokenizer.pad_token_id).long()
        outputs = model(inputs, attention_mask=attention_mask)
        # The predicted class is the index of the largest logit per example.
        predicted = outputs.logits.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Report 0.0 rather than dividing by zero when the validation set is empty.
accuracy = correct / total if total else 0.0
print(f"Validation Accuracy: {accuracy:.4f}")

# Save the fine-tuned model together with its tokenizer, so the output
# directory holds everything needed to reload both with `from_pretrained`.
output_dir = "fine_tuned_bert"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
