In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# current_epoch = 0

# Load your dataset
data = pd.read_excel('Training Data.xlsx')  # Replace with your actual dataset path

# Clean the text data (handle missing values and ensure all are strings)
data['Text'] = data['Text'].fillna('')  # Fill NaN values with empty string
data['Text'] = data['Text'].astype(str)  # Ensure all entries are strings

# Preprocess Sentiment labels (convert them to numerical values)
label_encoder = LabelEncoder()
data['sentiment'] = label_encoder.fit_transform(data['sentiment'])

# Split the dataset into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)


---
# Start here for a new epoch 
---

In [3]:
# Load BERT tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
tokenizer = BertTokenizer.from_pretrained('./final')

# Tokenization function
def tokenize_function(examples):
    # Tokenize the text and make sure to include 'labels' for the target
    return tokenizer(examples['Text'], padding='max_length', truncation=True)

# Convert data to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data[['Text', 'sentiment']])
val_dataset = Dataset.from_pandas(val_data[['Text', 'sentiment']])

# Add the 'labels' column, which should be the sentiment column
train_dataset = train_dataset.map(lambda x: {'labels': x['sentiment']}, batched=True)
val_dataset = val_dataset.map(lambda x: {'labels': x['sentiment']}, batched=True)

# Apply tokenization to both train and validation datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


Map: 100%|██████████| 45074/45074 [00:00<00:00, 1155629.54 examples/s]
Map: 100%|██████████| 11269/11269 [00:00<00:00, 1024973.15 examples/s]
Map: 100%|██████████| 45074/45074 [00:13<00:00, 3239.82 examples/s]
Map: 100%|██████████| 11269/11269 [00:03<00:00, 3322.42 examples/s]


In [4]:

# Load pre-trained BERT model
# model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=len(label_encoder.classes_))
model = BertForSequenceClassification.from_pretrained('./final', num_labels=len(label_encoder.classes_))


In [5]:

# Define TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',              # Output directory
    num_train_epochs=1,                  # Number of epochs
    per_device_train_batch_size=8,       # Batch size for training
    per_device_eval_batch_size=16,       # Batch size for evaluation
    warmup_steps=500,                    # Warmup steps for learning rate scheduler
    weight_decay=0.01,                   # Strength of weight decay
    logging_dir='./logs',                # Directory for storing logs
    logging_steps=10,                    # Log every 10 steps
    evaluation_strategy='epoch',         # Evaluate every epoch
    save_strategy='epoch',               # Save model every epoch
)




In [6]:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)  # Convert logits to predicted class

    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average="weighted")
    recall = recall_score(labels, predictions, average="weighted")
    f1 = f1_score(labels, predictions, average="weighted")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train(resume_from_checkpoint=True)


  trainer = Trainer(


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=5635, training_loss=0.0, metrics={'train_runtime': 0.022, 'train_samples_per_second': 2048855.662, 'train_steps_per_second': 256141.049, 'total_flos': 1.1859893634797568e+16, 'train_loss': 0.0, 'epoch': 1.0})

In [7]:

# current_epoch = trainer.state.epoch
# print(f"Epoch {current_epoch}")

In [1]:

# Train the model0
trainer.train()

trainer.save_model('./final')
tokenizer.save_pretrained('./final')
trainer.save_state()

# print(f"\nEpoch {epoch + 1} completed and model saved at {checkpoint_path}\n")


NameError: name 'trainer' is not defined

In [None]:
# Evaluate the model
results = trainer.evaluate()


NameError: name 'np' is not defined

In [None]:

# Print the evaluation results, which include accuracy
print(f"Accuracy: {results['eval_accuracy']}")

KeyError: 'eval_accuracy'