In [1]:
%%capture
%pip install transformers
%pip install evaluate

In [2]:
import pandas as pd
import datasets
from sklearn.preprocessing import LabelEncoder
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification

# Load the dataset
train_df = pd.read_csv('./NLP Dataset/final_train.csv')
test_df = pd.read_csv('./NLP Dataset/final_test.csv')

# Rename the columns to match the expected column names
train_df = train_df.rename(columns={'unprocessed_text': 'text', 'class': 'label'})
test_df = test_df.rename(columns={'unprocessed_text': 'text', 'class': 'label'})

#remove tweeetid
train_df = train_df.drop('tweetid',axis=1,)
train_df = test_df.drop('tweetid',axis=1,)

In [3]:
# Convert the labels to integers (m=2, c=1, a=0, u=3)
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['label'])
test_df['label'] = label_encoder.transform(test_df['label'])

In [5]:
# Load the tokenizer and model
model_name = "vinai/bertweet-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4) #model with added classification head

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [6]:
# Tokenize the dataset
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)

tokenized_train_dataset = datasets.Dataset.from_pandas(train_df)
tokenized_train_dataset = tokenized_train_dataset.map(tokenize, batched=True)

tokenized_test_dataset = datasets.Dataset.from_pandas(test_df)
tokenized_test_dataset = tokenized_test_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/3271 [00:00<?, ? examples/s]

Map:   0%|          | 0/3271 [00:00<?, ? examples/s]

In [7]:
tokenized_train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3271
})

In [8]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./bertweet_results',          # Output directory
    evaluation_strategy = "epoch",   # Evaluation strategy
    # learning_rate=2e-5,              # Learning rate
    # per_device_train_batch_size=16,  # Batch size
    # num_train_epochs=3,              # Number of training epochs
    # weight_decay=0.01,               # Weight decay
    # push_to_hub=False,
)

In [9]:
# Define the trainer
trainer = Trainer(
    model=model,                         # The model to be trained
    args=training_args,                  # Training arguments
    train_dataset=tokenized_train_dataset, # Training dataset
    eval_dataset=tokenized_test_dataset,  # Evaluation dataset
)

In [10]:
# Train the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3271
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1227
  Number of trainable parameters = 134903044


  0%|          | 0/1227 [00:00<?, ?it/s]

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, tweetid. If text, tweetid are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3271
  Batch size = 8


  0%|          | 0/409 [00:00<?, ?it/s]

{'eval_loss': 0.4592694044113159, 'eval_runtime': 131.0028, 'eval_samples_per_second': 24.969, 'eval_steps_per_second': 3.122, 'epoch': 1.0}


Saving model checkpoint to ./results2/checkpoint-500
Configuration saved in ./results2/checkpoint-500/config.json


{'loss': 0.7534, 'learning_rate': 2.962510187449063e-05, 'epoch': 1.22}


Model weights saved in ./results2/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, tweetid. If text, tweetid are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3271
  Batch size = 8


  0%|          | 0/409 [00:00<?, ?it/s]

{'eval_loss': 0.23849736154079437, 'eval_runtime': 131.325, 'eval_samples_per_second': 24.908, 'eval_steps_per_second': 3.114, 'epoch': 2.0}


Saving model checkpoint to ./results2/checkpoint-1000
Configuration saved in ./results2/checkpoint-1000/config.json


{'loss': 0.3934, 'learning_rate': 9.250203748981255e-06, 'epoch': 2.44}


Model weights saved in ./results2/checkpoint-1000/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, tweetid. If text, tweetid are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3271
  Batch size = 8


  0%|          | 0/409 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.13533981144428253, 'eval_runtime': 134.6034, 'eval_samples_per_second': 24.301, 'eval_steps_per_second': 3.039, 'epoch': 3.0}
{'train_runtime': 1755.3177, 'train_samples_per_second': 5.59, 'train_steps_per_second': 0.699, 'train_loss': 0.5153115046344934, 'epoch': 3.0}


TrainOutput(global_step=1227, training_loss=0.5153115046344934, metrics={'train_runtime': 1755.3177, 'train_samples_per_second': 5.59, 'train_steps_per_second': 0.699, 'train_loss': 0.5153115046344934, 'epoch': 3.0})

In [11]:
# Evaluate the model
eval_results = trainer.evaluate(tokenized_test_dataset)
print(eval_results)

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, tweetid. If text, tweetid are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3271
  Batch size = 8


  0%|          | 0/409 [00:00<?, ?it/s]

{'eval_loss': 0.13533981144428253, 'eval_runtime': 125.3816, 'eval_samples_per_second': 26.088, 'eval_steps_per_second': 3.262, 'epoch': 3.0}


In [12]:
#calculate overall f1 score

import numpy as np

# Make predictions on the test set
predictions, _, _ = trainer.predict(tokenized_test_dataset)

# Convert predicted probabilities to predicted classes
predicted_classes = np.argmax(predictions, axis=1)

# Calculate F1 score
from sklearn.metrics import f1_score
f1 = f1_score(test_df['label'], predicted_classes, average='macro')
print("F1 score:", f1)

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, tweetid. If text, tweetid are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3271
  Batch size = 8


  0%|          | 0/409 [00:00<?, ?it/s]

F1 score: 0.9539286357412506


In [13]:
# Calculate F1 score for label 0 (a)
f1 = f1_score(test_df['label'], predicted_classes, labels=[0], average='macro')
print("F1 score for label 0:", f1)

F1 score for label 0: 0.9197969543147206
