In [93]:
# Dieser Approach finetuned BERT. Es würde wahrscheinlich schneller und mindestens gleichgut gehen wenn man nen eigenes Modell (siehe https://www.kaggle.com/code/youssefabdelghfar/twitter-sentiment-analysis-nlp-lstm/notebook),
# oder ein dictionaryBased Modell wie Vader benutzen würde,
# aber ich wollte schlichtweg mal ausprobieren wie gut es klappt wenn man BERT finetuned.
# Wegen des finetunes wird eine GPU vorrausgesetzt -> alle params außer dem classifier sind gefreezed. Braucht trotzdem erstaunlich lange (ca. 7 min pro epoche auf colab)

# Ich hab mir allerdings das Data cleaning und preprocessing von https://www.kaggle.com/code/youssefabdelghfar/twitter-sentiment-analysis-nlp-lstm/notebook ausgeliehen :-)


In [80]:
import pandas as pd

from transformers import BertForSequenceClassification, AdamW, BertTokenizer

from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR
from torch.nn import CrossEntropyLoss
import torch

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

from tqdm import tqdm

import re

In [81]:
# Custom Dataset class
class TweetsDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe['text']
        self.labels = dataframe['label']
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = str(self.text[index])
        label = self.labels[index]

        # Tokenize the input text - später maybe noch im preprocessing machen und nicht on-demand im getter
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [82]:
# Preprocess
def clean_tweet(tweet):
    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    tweet = re.sub(r'\@\w+|\#','', tweet)
    # Remove special characters, numbers, and punctuations
    tweet = re.sub(r'\W', ' ', tweet)
    tweet = re.sub(r'\d', ' ', tweet)
    tweet = re.sub(r'\s+', ' ', tweet)
    tweet = tweet.strip()
    return tweet

In [83]:
# load data and add columns
column_names = ["id","category","labelStr","text"]
trainingData = pd.read_csv(r"/content/training.csv",names=column_names)
validationData = pd.read_csv(r"/content/validation.csv",names=column_names)

trainingData = trainingData.drop(columns=['id', 'category'])
validationData = validationData.drop(columns=['id', 'category'])

########## Data Cleaning ausgeborgt von kaggle

# clean data
trainingData = trainingData.dropna().reset_index(drop=True) # Das wirft im Dataloader nen keyError wenn man den index nicht resettet. Das hat mich locker ne Stunde gekostet zu schauen woran das liegt. Warum wird das nicht automatisch gemacht.
trainingData = trainingData.drop_duplicates().reset_index(drop=True)

# preprocess
trainingData['text'] = trainingData['text'].apply(clean_tweet)
validationData['text'] = validationData['text'].apply(clean_tweet)

# lowercase everything
trainingData['text'] = trainingData['text'].str.lower()
validationData['text'] = validationData['text'].str.lower()

# Drop rows where 'OriginalTweet' is empty
trainingData = trainingData.dropna(subset=['text']).reset_index(drop=True)
validationData = validationData.dropna(subset=['text']).reset_index(drop=True)

# Alternatively, if there are rows with just whitespace, use this to remove them as well:
trainingData = trainingData[trainingData['text'].str.strip() != ''].reset_index(drop=True)
validationData = validationData[validationData['text'].str.strip() != ''].reset_index(drop=True)

trainingData = trainingData.drop_duplicates(subset=['text']).reset_index(drop=True)
###########
print(len(trainingData))

# convert String into int
label_encoder = LabelEncoder()
trainingData['label'] = label_encoder.fit_transform(trainingData[["labelStr"]])
validationData['label'] = label_encoder.transform(validationData[["labelStr"]])

# initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_LEN = 128  # Length should be enough as tweets are limited in length anyway

# create datasets
train_dataset = TweetsDataset(trainingData, tokenizer, MAX_LEN)
val_dataset = TweetsDataset(validationData, tokenizer, MAX_LEN)
print(trainingData.head())

66388


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


   labelStr                                               text  label
0  Positive  im getting on borderlands and i will murder yo...      3
1  Positive  i am coming to the borders and i will kill you...      3
2  Positive  im getting on borderlands and i will kill you all      3
3  Positive  im coming on borderlands and i will murder you...      3
4  Positive  im getting on borderlands and i will murder yo...      3




In [94]:
# create Dataloader
BATCH_SIZE = 32

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [95]:
# load model and set output to the 4 classes
# This will cause the parameter "warnings" below, as its not originally trained on 4 classes.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)


# freeze all params except for the classifier one (classifier heißt: model.classifier.*)
for param in model.bert.parameters():
    param.requires_grad = False


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [97]:
# optimizer and loss
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01  ,correct_bias=False) # AdamW optimizer because is rarely not good lol
loss_fn = CrossEntropyLoss() # Cross entropy is standard bert loss

# move to gpu
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(torch.cuda.is_available())
model.to(device)

True




BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [98]:
# train method
def train_epoch(model, data_loader, loss_fn, optimizer, device):
    model.train()
    total_loss = 0
    total_correct = 0
    all_predictions = []
    all_labels = []

    for batch in tqdm(data_loader):

        # Move input tensors to device (GPU/CPU)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_correct += (logits.argmax(dim=1) == labels).sum().item()

        # Collect predictions and labels for F1 score calculation
        all_predictions.extend(logits.argmax(dim=1).cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    # Calculate the average loss and accuracy
    avg_loss = total_loss / len(data_loader)
    accuracy = total_correct / len(data_loader.dataset)

    # Calculate the F1 score
    f1 = f1_score(all_labels, all_predictions, average='weighted')

    return avg_loss, accuracy, f1

# eval method
def eval_model(model, data_loader, loss_fn, device):
    model.eval()
    total_loss = 0
    total_correct = 0
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()
            total_correct += (logits.argmax(dim=1) == labels).sum().item()

            # Collect predictions and labels for F1 score calculation
            all_predictions.extend(logits.argmax(dim=1).cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate the average loss and accuracy
    avg_loss = total_loss / len(data_loader)
    accuracy = total_correct / len(data_loader.dataset)

    # Calculate the F1 score
    f1 = f1_score(all_labels, all_predictions, average='weighted')

    return avg_loss, accuracy, f1

In [None]:

# Train / Eval / Save dict
EPOCHS = 5

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')

    train_loss, train_acc, train_f1 = train_epoch(model, train_loader, loss_fn, optimizer, device)
    val_loss, val_acc, val_f1 = eval_model(model, val_loader, loss_fn, device)

    print(f"Train loss: {train_loss}, accuracy: {train_acc}, F1: {train_f1}")
    print(f"Validation loss: {val_loss}, accuracy: {val_acc}, F1: {val_f1}")

torch.save(model.state_dict(), "model.pth")

# BERT scheint tatsächlich nicht so gut hierfür zu klappen.
# Ich hab (tatsächlich erst nachdem ich das geschrieben hab). Mal bei Kaggle nach anderen Bert Ansätzen für das Dataset geschaut,
# und da kommen ähnliche Ergebnisse raus wie hier. https://www.kaggle.com/code/kirollosashraf/twitter-sentiment-analysis-nlp-using-bert
# Ich habe leider auch nicht groß hyperparameter tuning betrieben, weil mein Cuda lokal irgendwie kaputt gegangen ist, und ich ich bei colab im Nutzungslimit bin.


Epoch 1/5


100%|██████████| 2075/2075 [07:44<00:00,  4.47it/s]
100%|██████████| 32/32 [00:07<00:00,  4.37it/s]


Train loss: 1.261530587587012, accuracy: 0.4441615954690607, F1: 0.4172665033798929
Validation loss: 1.156990835443139, accuracy: 0.528, F1: 0.4938582249817842
Epoch 2/5


100%|██████████| 2075/2075 [07:58<00:00,  4.33it/s]
100%|██████████| 32/32 [00:07<00:00,  4.38it/s]


Train loss: 1.224975165257971, accuracy: 0.4680966439718021, F1: 0.44208599357681255
Validation loss: 1.1689600814133883, accuracy: 0.477, F1: 0.4499318404128305
Epoch 3/5


100%|██████████| 2075/2075 [07:55<00:00,  4.36it/s]
100%|██████████| 32/32 [00:07<00:00,  4.57it/s]


Train loss: 1.2170501064392458, accuracy: 0.47297704404410434, F1: 0.4489657134316694
Validation loss: 1.1978987660259008, accuracy: 0.506, F1: 0.4625993374245612
Epoch 4/5


100%|██████████| 2075/2075 [07:55<00:00,  4.36it/s]
100%|██████████| 32/32 [00:06<00:00,  4.60it/s]


Train loss: 1.2151943658058901, accuracy: 0.47231427366391515, F1: 0.4491744151914261
Validation loss: 1.136104928329587, accuracy: 0.527, F1: 0.5028788179338414
Epoch 5/5


 96%|█████████▌| 1997/2075 [07:37<00:18,  4.32it/s]