<a href="https://colab.research.google.com/github/RafaelNovais/MasterAI/blob/master/Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers datasets torch scikit-learn


In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset

# Load the train, dev, and test splits from their CSV files (in that order)
train_df, dev_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('path_to_train.csv', 'path_to_dev.csv', 'path_to_test.csv')
)

# Preprocessing: Tokenize the text data with BERT tokenizer
# Loads the tokenizer for the 'bert-base-uncased' checkpoint (the same checkpoint
# name the model is loaded from further down); tokenize_data() reads this global.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(df):
    """Tokenize the ``text`` column of *df* with the module-level tokenizer.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        The tokenizer output for all rows, requested as PyTorch tensors
        with padding and truncation enabled and a 128-token maximum length.
    """
    texts = df['text'].tolist()
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'max_length': 128,
        'return_tensors': "pt",
    }
    return tokenizer(texts, **tokenizer_options)

# Encode labels
def encode_labels(df):
    """Map the ``sentiment`` column of *df* to integer class ids.

    The mapping is ``negative -> 0``, ``neutral -> 1``, ``positive -> 2``.

    Args:
        df: DataFrame with a ``sentiment`` column holding the label strings.

    Returns:
        A NumPy int64 array with one class id per row.

    Raises:
        ValueError: If any row holds a value that is not one of the known
            labels (including missing values).
    """
    label_dict = {'positive': 2, 'neutral': 1, 'negative': 0}
    encoded = df['sentiment'].map(label_dict)

    # Series.map yields NaN for values missing from label_dict, which would
    # silently turn the whole label array into floats; fail loudly instead.
    unknown = encoded.isna()
    if unknown.any():
        bad_values = df['sentiment'][unknown].unique().tolist()
        raise ValueError(
            f"Unrecognized sentiment label(s): {bad_values!r}; "
            f"expected one of {sorted(label_dict)}"
        )

    # Explicit integer dtype so the result is integral even for an empty frame.
    return encoded.astype('int64').values

# Tokenize the text and encode the labels of each split: train, dev, then test
train_encodings, train_labels = tokenize_data(train_df), encode_labels(train_df)
dev_encodings, dev_labels = tokenize_data(dev_df), encode_labels(dev_df)
test_encodings, test_labels = tokenize_data(test_df), encode_labels(test_df)

# Wrap tokenized inputs and labels in a Dataset for Hugging Face's Trainer API
def create_dataset(encodings, labels):
    """Build a ``Dataset`` from tokenizer output and labels.

    Args:
        encodings: Mapping providing ``input_ids`` and ``attention_mask``.
        labels: Label values stored under the ``labels`` column.

    Returns:
        A ``Dataset`` with ``input_ids``, ``attention_mask`` and ``labels``
        columns, in that order.
    """
    columns = {key: encodings[key] for key in ('input_ids', 'attention_mask')}
    columns['labels'] = labels
    return Dataset.from_dict(columns)

# Build one Dataset per split from its encodings and labels (train, dev, test)
train_dataset, dev_dataset, test_dataset = (
    create_dataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (dev_encodings, dev_labels),
        (test_encodings, test_labels),
    )
)

# Load the pre-trained BERT model
# num_labels=3 matches the three sentiment classes (negative, neutral, positive)
# produced by encode_labels().
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Define evaluation metrics
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; the argmax over
            the last axis is taken as the predicted class) and ``label_ids``
            (the reference labels).

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``.
    """
    pred_labels = p.predictions.argmax(-1)
    accuracy = accuracy_score(p.label_ids, pred_labels)
    # zero_division=0 keeps the score at 0 for a class that is never predicted
    # (or never present) without emitting an UndefinedMetricWarning on every
    # evaluation, which is common while the model still predicts few classes.
    precision, recall, f1, _ = precision_recall_fscore_support(
        p.label_ids, pred_labels, average='weighted', zero_division=0
    )
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Define training arguments
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`,
# and the install command does not pin a version. Use whichever keyword the
# installed TrainingArguments dataclass declares so this works on both sides
# of the rename.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # Evaluate at the end of every epoch.
    **{_eval_strategy_key: "epoch"},
)

# Initialize Trainer
# Wires together the model, the training arguments, the train split (for
# fitting), the dev split (as the evaluation set) and the metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate on the test set
# Passing test_dataset explicitly evaluates on the test split rather than the
# dev split the Trainer was constructed with; the result is printed as-is.
test_result = trainer.evaluate(test_dataset)
print(test_result)
