<a href="https://colab.research.google.com/github/Theieyrre/Hate-Speech-NLP/blob/main/BERT_Model_with_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT Model with torch

### For jupyter install:

In [None]:
#%pip install torch
#%pip install pandas
#%pip install transformers
#%pip install numpy
#%pip install tqdm

### For Colab install:
Google Colab already has torch, numpy, tqdm and pandas installed, so there is no need to install them again.

In [None]:
%pip install transformers



In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from collections import defaultdict

## Parameters

In [None]:
# Data Parameters
h_na = '-'  # cell value that pd.read_csv treats as missing (passed as na_values)
h_train_filename = 'offenseval-tr-training-v1.tsv'
h_test_filename = 'offenseval-tr-testset-v1.tsv'
h_test_labels = 'offenseval-tr-labela-v1.tsv'
h_t_sep = '\t'  # separator used when reading the train and test files
h_c_sep = ','  # separator used when reading the test-label file
h_index = 'id'  # column that becomes the dataframe index
h_num_labels = 2  # size of the classifier's output layer

# BERT Parameters
h_preprocess_mode = 'dbmdz/bert-base-turkish-cased'  # pretrained name passed to from_pretrained for both tokenizer and encoder
h_max_len = 280  # max_length handed to the tokenizer (sequence length in tokens)
h_batch_size = 16
h_epoch = 5  # also used to size the LR scheduler's total number of steps
h_text = 'tweet'  # dataframe column holding the input text
h_label = 'label'  # dataframe column holding the class name

# Adam Optimizer Parameters
h_learning_rate = 2e-5
h_eps = 1e-8

## Import and prepare data
Import train file

In [None]:
# Load the training split; cells equal to h_na are read as missing values.
df_train = pd.read_csv(h_train_filename, na_values=h_na, sep=h_t_sep)
# Rename to the configured label column so later df_train[h_label] lookups
# keep working if h_label is ever changed in the parameter cell.
df_train = df_train.rename(columns={"subtask_a": h_label})
df_train.head()

Unnamed: 0,id,tweet,label
0,20948,@USER en güzel uyuyan insan ödülü jeon jungkoo...,NOT
1,10134,"@USER Mekanı cennet olsun, saygılar sayın avuk...",NOT
2,23457,Kızlar aranızda kas yığını beylere düşenler ol...,NOT
3,18401,Biraz ders çalışayım. Tembellik ve uyku düşman...,NOT
4,17525,@USER Trezeguet yerine El Sharawy daha iyi olm...,NOT


Import test file and labels

In [None]:
# Load the test tweets (no labels in this file); cells equal to h_na are read as missing values.
df_test = pd.read_csv(h_test_filename, na_values=h_na, sep=h_t_sep)
df_test.head()

Unnamed: 0,id,tweet
0,41993,@USER Sayın başkanım bu şekilde devam inşallah👏
1,23000,"Herkes gevşekliği kadar duyar kasıyor,hayat bö..."
2,42478,Olgun ilişkisi olan arkadaş size en güzel hedi...
3,21748,@USER @USER Burada atıp tutacağına o kötü koşu...
4,13607,@USER İşte o onur dediğin sende yok sorun o işte


In [None]:
# The label file is read with explicit column names (so its first row is data, not a header).
# The names come from the parameter cell so they match the columns used
# everywhere else in the notebook.
df_test_label = pd.read_csv(h_test_labels, na_values=h_na, sep=h_c_sep, names=[h_index, h_label])
df_test_label.head()

Unnamed: 0,id,label
0,41993,NOT
1,23000,NOT
2,42478,NOT
3,21748,OFF
4,13607,OFF


### Add Labels to test dataframe

In [None]:
# Attach the gold labels to the test tweets by id. merge() defaults to an inner
# join, so tweets without a matching label row are dropped.
df_test = df_test.merge(df_test_label, on=h_index)
df_test.head()

Unnamed: 0,id,tweet,label
0,41993,@USER Sayın başkanım bu şekilde devam inşallah👏,NOT
1,23000,"Herkes gevşekliği kadar duyar kasıyor,hayat bö...",NOT
2,42478,Olgun ilişkisi olan arkadaş size en güzel hedi...,NOT
3,21748,@USER @USER Burada atıp tutacağına o kötü koşu...,OFF
4,13607,@USER İşte o onur dediğin sende yok sorun o işte,OFF


Value counts

In [None]:
# Class balance of the training split.
df_train[h_label].value_counts()

NOT    25231
OFF     6046
Name: label, dtype: int64

In [None]:
# Class balance of the test split.
df_test[h_label].value_counts()

NOT    2804
OFF     711
Name: label, dtype: int64

### Label encoding (class names to integer ids)

In [None]:
# Map every class name to an integer id, numbered in order of first appearance.
# Missing labels are excluded from the mapping: otherwise NaN would get its own
# class id and such rows would survive the later dropna() as a spurious class.
possible_labels = df_train[h_label].dropna().unique()
label_train_dict = {name: idx for idx, name in enumerate(possible_labels)}
df_train['category'] = df_train[h_label].replace(label_train_dict)
# Guarded so the cell can be re-run: once the id column has become the index
# it is no longer a column and set_index would raise KeyError.
if h_index in df_train.columns:
    df_train = df_train.set_index(h_index)
df_train.head()

Unnamed: 0_level_0,tweet,label,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20948,@USER en güzel uyuyan insan ödülü jeon jungkoo...,NOT,0
10134,"@USER Mekanı cennet olsun, saygılar sayın avuk...",NOT,0
23457,Kızlar aranızda kas yığını beylere düşenler ol...,NOT,0
18401,Biraz ders çalışayım. Tembellik ve uyku düşman...,NOT,0
17525,@USER Trezeguet yerine El Sharawy daha iyi olm...,NOT,0


In [None]:
# Encode the test labels with ids derived from the same `possible_labels`
# array as the training mapping, so class ids agree across both splits.
label_test_dict = {name: idx for idx, name in enumerate(possible_labels)}
df_test['category'] = df_test[h_label].replace(label_test_dict)
df_test.set_index(h_index, inplace=True)
df_test.head()

Unnamed: 0_level_0,tweet,label,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
41993,@USER Sayın başkanım bu şekilde devam inşallah👏,NOT,0
23000,"Herkes gevşekliği kadar duyar kasıyor,hayat bö...",NOT,0
42478,Olgun ilişkisi olan arkadaş size en güzel hedi...,NOT,0
21748,@USER @USER Burada atıp tutacağına o kötü koşu...,OFF,1
13607,@USER İşte o onur dediğin sende yok sorun o işte,OFF,1


In [None]:
# Drop every row that has a missing value in any column (in place), then show
# the class balance of what remains.
df_train.dropna(inplace=True)
df_train["category"].value_counts()

0    25231
1     6046
Name: category, dtype: int64

In [None]:
# Same clean-up for the test split: drop rows with any missing value (in place).
df_test.dropna(inplace=True)
df_test["category"].value_counts()

0    2804
1     711
Name: category, dtype: int64

## Loading the Tokenizer and Encoding

In [None]:
from transformers import BertTokenizer
# NOTE(review): TensorDataset is not referenced anywhere in the visible notebook.
from torch.utils.data import TensorDataset
# Tokenizer matching the pretrained checkpoint named by h_preprocess_mode.
tokenizer = BertTokenizer.from_pretrained(h_preprocess_mode)

## Performance Metrics

In [None]:
def accuracy_per_class(preds, labels):
    """Print and return the accuracy of every class that occurs in `labels`.

    Args:
        preds: 2-D array of per-class scores; the predicted class of a row is
            the argmax along axis 1.
        labels: array of integer class ids (flattened before use).

    Returns:
        dict mapping class name (looked up through the module-level
        `label_train_dict`) to a string of the form "hits/total=ratio".
    """
    id_to_name = dict((idx, name) for name, idx in label_train_dict.items())

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    report = {}
    for class_id in np.unique(actual):
        in_class = actual == class_id
        class_preds = np.round(predicted[in_class])
        n_total = len(actual[in_class])
        n_hit = len(class_preds[class_preds == class_id])

        name = id_to_name[class_id]
        summary = f"{n_hit}/{n_total}={n_hit / n_total}"
        print(f'Class: {name}')
        print(f'Accuracy: {summary}\n')
        report[name] = summary
    return report

In [None]:
def get_scores(preds, labels):
    """Print a confusion matrix and return precision, recall and F1 for class 1.

    Class 1 is treated as the positive ("offensive") class and class 0 as the
    negative ("not offensive") class.

    Args:
        preds: predicted class ids (array-like or CPU tensor); values are
            rounded, so scores in [0, 1] are accepted as well.
        labels: true class ids, same length as `preds`.

    Returns:
        Tuple (precision, recall, f1). A metric whose denominator is zero
        (e.g. the model never predicts class 1) is reported as 0.0 instead of
        raising ZeroDivisionError.
    """
    preds = np.round(np.asarray(preds)).ravel()
    labels = np.asarray(labels).ravel()

    pred_off, pred_not = preds == 1, preds == 0
    real_off, real_not = labels == 1, labels == 0

    tp = int(np.sum(pred_off & real_off))  # offensive, predicted offensive
    fp = int(np.sum(pred_off & real_not))  # not offensive, predicted offensive
    fn = int(np.sum(pred_not & real_off))  # offensive, predicted not offensive
    tn = int(np.sum(pred_not & real_not))  # not offensive, predicted not offensive

    # Rows are the model's decision, columns are the ground truth.
    matrix = np.array([["                   ", "Real Offensive", "Real Not-Offensive"],
                       ["Model Offensive    ", str(tp)+"          ", fp],
                       ["Model Not-Offensive", str(fn)+"          ", tn]])

    print(matrix)
    prec = tp / (tp + fp) if (tp + fp) else 0.0
    rec = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0

    return prec, rec, f1

### Device control
Use CUDA if it is available, otherwise fall back to the CPU; print the selected device as a sanity check.

In [None]:
# Device used for the model and every batch below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


## Data Loader

In [None]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

### Create a custom Dataset

In [None]:
class BERTDataset(Dataset):
  """Dataset that tokenizes one text per item for BERT classification.

  Args:
    text: indexable collection of texts (items are passed through str()).
    label: indexable collection of integer class ids, aligned with `text`.
    tokenizer: object exposing `encode_plus` (a Hugging Face tokenizer).
    max_len: every item is truncated / padded to exactly this many tokens.
  """

  def __init__(self, text, label, tokenizer, max_len):
    self.text = text
    self.label = label
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.text)

  def __getitem__(self, item):
    """Return the raw text, its token ids, attention mask and label tensor."""
    text = str(self.text[item])
    # Use the tokenizer given to the constructor rather than whatever global
    # named `tokenizer` happens to exist in the notebook.
    encoding = self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        # `padding='max_length'` replaces the legacy `pad_to_max_length=True`
        # flag and belongs to the same API generation as `truncation=True`.
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
        return_tensors='pt'
    )

    return {
        'text': text,
        # The tokenizer returns tensors with a leading batch axis of size 1;
        # flatten so the DataLoader can stack items into (batch, max_len).
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labels': torch.tensor(self.label[item], dtype=torch.long)
    }

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a dataframe in a BERTDataset and return a DataLoader over it.

  Args:
    df: dataframe with the text column `h_text` and an integer 'category' column.
    tokenizer: tokenizer handed to BERTDataset.
    max_len: token length every example is padded / truncated to.
    batch_size: number of examples per batch.
    shuffle: reshuffle the examples every epoch. Defaults to False, which
      keeps the dataframe order (the previous behaviour).
    num_workers: number of loader worker processes. Defaults to 4 (the
      previously hard-coded value); use 0 to load in the main process.

  Returns:
    torch.utils.data.DataLoader yielding dicts with the keys produced by
    BERTDataset.__getitem__.
  """
  ds = BERTDataset(
      text=df[h_text].to_numpy(),
      label=df['category'].to_numpy(),
      tokenizer=tokenizer,
      max_len=max_len
  )

  return DataLoader(
      ds,
      batch_size=batch_size,
      shuffle=shuffle,
      num_workers=num_workers
  )

### Prepare train data

In [None]:
# NOTE(review): no shuffling is requested here, so training batches come in
# dataframe order every epoch — confirm this is intended.
dataloader_train = create_data_loader(df_train, tokenizer, h_max_len, h_batch_size)

### Prepare test data

In [None]:
# Test loader built with the same tokenizer, sequence length and batch size as training.
dataloader_test = create_data_loader(df_test, tokenizer, h_max_len, h_batch_size)

## Build Classifier

In [None]:
from transformers import BertForSequenceClassification
from transformers import BertModel

class Classifier(nn.Module):
  """Pretrained BERT encoder followed by dropout and a linear classification head.

  Args:
    n_classes: number of output classes (width of the final linear layer).
  """

  def __init__(self, n_classes):
    super(Classifier, self).__init__()
    self.bert = BertModel.from_pretrained(h_preprocess_mode)
    self.drop = nn.Dropout(0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    # Not applied in forward(): the model returns raw logits. Kept so callers
    # can turn logits into probabilities with `model.softmax(logits)`.
    self.softmax = nn.Softmax(dim=1)

  def forward(self, input_ids, attention_mask):
    """Return unnormalised class scores (logits), one row per input sequence."""
    bert_outputs = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask
    )
    # Position 1 is the pooled output. Indexing works whether the encoder
    # returns a plain tuple or a dict-like output object, whereas tuple
    # unpacking a dict-like object would yield its keys instead of tensors.
    pooled_output = bert_outputs[1]
    output = self.drop(pooled_output)
    return self.out(output)


In [None]:
# Build the classifier with h_num_labels outputs and move it to the selected device.
model = Classifier(h_num_labels)
model = model.to(device)

### Setting up optimizer

In [None]:
# NOTE(review): AdamW is imported from transformers here; newer transformers
# releases reportedly deprecate it in favour of torch.optim.AdamW — confirm
# against the installed version.
from transformers import AdamW, get_linear_schedule_with_warmup
# Optimizes every model parameter (encoder and classification head) with one
# learning rate. correct_bias=False presumably disables Adam's bias-correction
# step, as in the original BERT optimizer — TODO confirm.
optimizer = AdamW(model.parameters(),
                  lr=h_learning_rate,
                  correct_bias=False, 
                  eps=h_eps)

#### Get Scheduler

In [None]:
# Linear schedule without warm-up, sized for len(dataloader_train) * h_epoch
# optimizer steps (one scheduler step per training batch). The number of
# epochs actually trained must match h_epoch for the schedule to span the
# whole run.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train)*h_epoch)


#### Loss Function

In [None]:
# Cross-entropy over the classifier's outputs; Classifier.forward returns the
# linear layer's raw scores (no softmax applied), which is what this loss consumes.
loss_fn = nn.CrossEntropyLoss().to(device)

## Training

In [None]:
def train_epoch(
    model,
    dataloader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
  """Train `model` for one pass over `dataloader`.

  Args:
    model: module called as model(input_ids, attention_mask), returning logits.
    dataloader: iterable of batches with 'input_ids', 'attention_mask', 'labels'.
    loss_fn: loss taking (logits, labels).
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.
    n_examples: number of examples in the dataloader (accuracy denominator).

  Returns:
    Tuple (accuracy, mean_loss): accuracy is a 0-dim double tensor, mean_loss
    a 0-dim tensor that is detached from the autograd graph.
  """
  model = model.train()
  losses = []
  correct_predictions = 0
  for d in tqdm(dataloader):
    input_ids = d['input_ids'].to(device)
    attention_mask = d['attention_mask'].to(device)
    labels = d['labels'].to(device)

    outputs = model(
        input_ids,
        attention_mask
    )

    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, labels)

    correct_predictions += torch.sum(preds == labels)
    # Store a detached copy: keeping the live loss would keep every batch's
    # autograd graph referenced and hand callers a mean still attached to it.
    losses.append(loss.detach())

    loss.backward()
    # Clip the global gradient norm to 1.0 before the update.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  
  return correct_predictions.double() / n_examples, torch.mean(torch.stack(losses))

In [None]:
def eval_model(model, dataloader, loss_fn, device, n_examples):
  """Evaluate `model` on `dataloader` without computing gradients.

  Args:
    model: module called as model(input_ids, attention_mask), returning logits.
    dataloader: iterable of batches with 'input_ids', 'attention_mask', 'labels'.
    loss_fn: loss taking (logits, labels).
    device: device the batch tensors are moved to.
    n_examples: number of examples in the dataloader (accuracy denominator).

  Returns:
    Tuple (accuracy, mean_loss): accuracy is a 0-dim double tensor, mean_loss
    the NumPy mean of the per-batch losses.
  """
  model = model.eval()
  n_correct = 0
  batch_losses = []

  with torch.no_grad():
      for batch in tqdm(dataloader):
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        logits = model(ids, mask)
        # Index 1 of torch.max(..., dim=1) holds the argmax positions.
        predicted = torch.max(logits, dim=1)[1]

        batch_loss = loss_fn(logits, targets)
        batch_losses.append(batch_loss.detach().cpu().numpy())
        n_correct += torch.sum(predicted == targets)

  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

#### Training Loop

In [None]:
# Per-epoch metrics. `histroy` is the original (misspelled) name and is kept
# as an alias of the same dict so existing references keep working.
history = defaultdict(list)
histroy = history
best_accuracy = 0

# Train for exactly h_epoch epochs: the LR scheduler is sized for
# len(dataloader_train) * h_epoch steps, so an extra epoch would run with the
# learning rate already decayed to zero.
for epoch in tqdm(range(h_epoch), desc='Epochs'):
  train_acc, train_loss = train_epoch(
      model,
      dataloader_train,
      loss_fn,
      optimizer,
      device,
      scheduler,
      len(df_train)
  )

  tqdm.write(f'Train Loss: {train_loss}')
  tqdm.write(f'Train Acc: {train_acc}')

  test_acc, test_loss = eval_model(
      model,
      dataloader_test,
      loss_fn,
      device,
      len(df_test)
  )

  tqdm.write(f'Test Loss: {test_loss}')
  tqdm.write(f'Test Acc: {test_acc}')

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)

  history['test_acc'].append(test_acc)
  history['test_loss'].append(test_loss)

  # Keep the weights of the epoch with the best test accuracy so far.
  # NOTE(review): the checkpoint is selected on the test set itself; consider a
  # separate validation split for model selection.
  if test_acc > best_accuracy:
    torch.save(model.state_dict(), "model.bin")
    best_accuracy = test_acc


HBox(children=(FloatProgress(value=0.0, description='Epoch 0', max=6.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, max=1955.0), HTML(value='')))


Train Loss: 0.4053685963153839
Train Acc: 0.8494740544169836


HBox(children=(FloatProgress(value=0.0, max=220.0), HTML(value='')))


Test Loss: 0.3487786054611206
Test Acc: 0.8711237553342818


HBox(children=(FloatProgress(value=0.0, max=1955.0), HTML(value='')))


Train Loss: 0.32587501406669617
Train Acc: 0.8822137673050484


HBox(children=(FloatProgress(value=0.0, max=220.0), HTML(value='')))


Test Loss: 0.3259274661540985
Test Acc: 0.8714082503556189


HBox(children=(FloatProgress(value=0.0, max=1955.0), HTML(value='')))


Train Loss: 0.3312501311302185
Train Acc: 0.8795920324839338


HBox(children=(FloatProgress(value=0.0, max=220.0), HTML(value='')))


Test Loss: 0.3949303925037384
Test Acc: 0.8475106685633003


HBox(children=(FloatProgress(value=0.0, max=1955.0), HTML(value='')))


Train Loss: 0.2601981461048126
Train Acc: 0.9106052370751669


HBox(children=(FloatProgress(value=0.0, max=220.0), HTML(value='')))


Test Loss: 0.36433494091033936
Test Acc: 0.8731152204836417


HBox(children=(FloatProgress(value=0.0, max=1955.0), HTML(value='')))


Train Loss: 0.23249003291130066
Train Acc: 0.9255043642293058


HBox(children=(FloatProgress(value=0.0, max=220.0), HTML(value='')))


Test Loss: 0.40421155095100403
Test Acc: 0.8728307254623044


HBox(children=(FloatProgress(value=0.0, max=1955.0), HTML(value='')))


Train Loss: 0.21871335804462433
Train Acc: 0.9303002206093934


HBox(children=(FloatProgress(value=0.0, max=220.0), HTML(value='')))


Test Loss: 0.40421155095100403
Test Acc: 0.8728307254623044



# Evaluate

In [None]:
def get_texts(model, dataloader):
  """Run `model` over `dataloader` and collect texts, predictions and labels.

  Args:
    model: module called as model(input_ids, attention_mask), returning one
      row of class scores per example.
    dataloader: iterable of batches with the keys 'text', 'input_ids',
      'attention_mask' and 'labels'.

  Returns:
    Tuple (texts, predictions, prediction_probs, real_values):
      texts: list with the text of every example, in loader order.
      predictions: 1-D CPU tensor of predicted class ids.
      prediction_probs: 2-D CPU tensor with the model's raw output scores
        (whatever the model returns; no softmax is applied here).
      real_values: 1-D CPU tensor of true class ids.
    For an empty dataloader the three tensors are empty.
  """
  model = model.eval()
  # Run on the device the model's weights live on instead of relying on a
  # notebook-level `device` variable; a parameter-free model runs on the CPU.
  first_param = next(model.parameters(), None)
  run_device = first_param.device if first_param is not None else torch.device('cpu')

  texts = []
  predictions = []
  prediction_probs = []
  real_values = []

  with torch.no_grad():
      for d in dataloader:
        # Distinct name for the batch texts: rebinding `texts` here would
        # throw away everything collected from earlier batches.
        batch_texts = d['text']
        input_ids = d['input_ids'].to(run_device)
        attention_mask = d['attention_mask'].to(run_device)
        labels = d['labels'].to(run_device)

        outputs = model(
          input_ids,
          attention_mask
        )

        _, preds = torch.max(outputs, dim=1)

        texts.extend(batch_texts)
        predictions.extend(preds)
        prediction_probs.extend(outputs)
        real_values.extend(labels)

  if not predictions:
    # torch.stack rejects an empty list; return empty tensors instead.
    return texts, torch.empty(0, dtype=torch.long), torch.empty(0), torch.empty(0, dtype=torch.long)

  predictions = torch.stack(predictions).cpu()
  prediction_probs = torch.stack(prediction_probs).cpu()
  real_values = torch.stack(real_values).cpu()

  return texts, predictions, prediction_probs, real_values


# Load Model


In [None]:
# Restore the weights saved as "model.bin" by the training loop. map_location
# remaps the stored tensors onto the current device, so a checkpoint written
# on a GPU runtime can also be loaded on a CPU-only one.
model.load_state_dict(torch.load('model.bin', map_location=device))
model = model.to(device)

In [None]:
# Run the reloaded model over the test loader and collect its predictions and the true labels.
y_texts, y_preds, y_pred_probs, y_test = get_texts(model, dataloader_test)

In [None]:
# get_scores prints a confusion matrix and returns the three metrics, which are printed below.
prec, rec, f1 = get_scores(y_preds, y_test)
print(f'Precision: {prec}')
print(f'Recall: {rec}')
print(f'F1 Score: {f1}')

[['                   ' 'Real Offensive' 'Real Not-Offensive']
 ['Model Offensive    ' '443          ' '268']
 ['Model Not-Offensive' '178          ' '2626']]
Precision: 0.6230661040787623
Recall: 0.7133655394524959
F1 Score: 0.6651651651651651
