# BERT 
# PART A

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [3]:
# Load datasets
train_df = pd.read_csv('/kaggle/input/emotion-dataset/training.csv')
val_df = pd.read_csv('/kaggle/input/emotion-dataset/validation.csv')
test_df = pd.read_csv('/kaggle/input/emotion-dataset/test.csv')

# Encode labels
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['label'])  # Adjust column name as needed
val_df['label'] = label_encoder.transform(val_df['label'])
test_df['label'] = label_encoder.transform(test_df['label'])

In [15]:
label_encoder.classes_

array([0, 1, 2, 3, 4, 5])

In [10]:
#tokenize and encode them
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_df['text'].tolist(), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_df['text'].tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_df['text'].tolist(), truncation=True, padding=True, max_length=128)

In [5]:
# create custom dataset class to feed it to model
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, train_df['label'].tolist())
val_dataset = TextDataset(val_encodings, val_df['label'].tolist())
test_dataset = TextDataset(test_encodings, test_df['label'].tolist())

In [6]:
# COnfigure data pipelines
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

In [7]:
#initialise models
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
model.to('cuda' if torch.cuda.is_available() else 'cpu')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [8]:
# initialise optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)



In [9]:
# train the model
for epoch in range(3):  # Total number of epochs
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        inputs = {key: val.to('cuda' if torch.cuda.is_available() else 'cpu') for key, val in batch.items()}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch + 1} completed.')

Epoch 1 completed.
Epoch 2 completed.
Epoch 3 completed.


In [11]:
# evaluate on validation set
model.eval()
val_preds = []
val_labels = []
with torch.no_grad():
    for batch in val_loader:
        inputs = {key: val.to('cuda' if torch.cuda.is_available() else 'cpu') for key, val in batch.items()}
        outputs = model(**inputs)
        preds = torch.argmax(outputs.logits, dim=1)
        val_preds.extend(preds.cpu().numpy())
        val_labels.extend(batch['labels'].cpu().numpy())

val_accuracy = (np.array(val_preds) == np.array(val_labels)).mean()
print(f'Validation Accuracy: {val_accuracy:.4f}')


Validation Accuracy: 0.9300


In [12]:
# evaluate on testing set
model.eval()
test_preds = []
test_labels = []
with torch.no_grad():
    for batch in test_loader:
        inputs = {key: val.to('cuda' if torch.cuda.is_available() else 'cpu') for key, val in batch.items()}
        outputs = model(**inputs)
        preds = torch.argmax(outputs.logits, dim=1)
        test_preds.extend(preds.cpu().numpy())
        test_labels.extend(batch['labels'].cpu().numpy())

test_accuracy = (np.array(test_preds) == np.array(test_labels)).mean()
print(f'Test Accuracy: {test_accuracy:.4f}')

Test Accuracy: 0.9170


In [14]:
# try inference with random text
def infer(text):
    model.eval()
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128).to('cuda' if torch.cuda.is_available() else 'cpu')
    with torch.no_grad():
        outputs = model(**inputs)
        preds = torch.argmax(outputs.logits, dim=1)
    return label_encoder.inverse_transform(preds.cpu().numpy())

# Example inference
new_text = "I am feeling great today!"
prediction = infer(new_text)
print(f'Predicted Emotion: {prediction}')

Predicted Emotion: [1]


# PART B
# Using German dataset and German-Bert Model

In [3]:
data = pd.read_csv('/kaggle/input/german-rap-de/final_raps_de.csv')
split_index = int(len(data) * 0.8)
train_df = data.iloc[:split_index] 
test_df = data.iloc[split_index:] 
train_df['grade'] = label_encoder.fit_transform(train_df['grade'])
test_df['grade'] = label_encoder.transform(test_df['grade'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['grade'] = label_encoder.fit_transform(train_df['grade'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['grade'] = label_encoder.transform(test_df['grade'])


In [5]:
model_name = 'bert-base-german-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
train_encodings = tokenizer(train_df['lyrics'].tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_df['lyrics'].tolist(), truncation=True, padding=True, max_length=128)

In [7]:
# create custom dataset class to feed it to model
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, train_df['grade'].tolist())
test_dataset = TextDataset(test_encodings, test_df['grade'].tolist())

In [8]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [9]:
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))
model.to('cuda' if torch.cuda.is_available() else 'cpu')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [10]:
# initialise optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)



In [11]:
# train the model
for epoch in range(3):  # Total number of epochs
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        inputs = {key: val.to('cuda' if torch.cuda.is_available() else 'cpu') for key, val in batch.items()}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch + 1} completed.')

Epoch 1 completed.
Epoch 2 completed.
Epoch 3 completed.


In [12]:
# evaluate on test set
model.eval()
val_preds = []
val_labels = []
with torch.no_grad():
    for batch in test_loader:
        inputs = {key: val.to('cuda' if torch.cuda.is_available() else 'cpu') for key, val in batch.items()}
        outputs = model(**inputs)
        preds = torch.argmax(outputs.logits, dim=1)
        val_preds.extend(preds.cpu().numpy())
        val_labels.extend(batch['labels'].cpu().numpy())

val_accuracy = (np.array(val_preds) == np.array(val_labels)).mean()
print(f'Validation Accuracy: {val_accuracy:.4f}')


Validation Accuracy: 0.4787
