<a href="https://colab.research.google.com/github/Theieyrre/Hate-Speech-NLP/blob/main/%5BTRAIN_ONLY%5D_BERT_Model_with_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT Model with torch

### For jupyter install:

In [None]:
#%pip install torch
#%pip install pandas
#%pip install transformers
#%pip install numpy
#%pip install tqdm

### For Colab install:
Google Colab already has torch, numpy, tqdm and pandas installed, so there is no need to install them again.

In [None]:
%pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |▍                               | 10kB 21.0MB/s eta 0:00:01[K     |▉                               | 20kB 4.7MB/s eta 0:00:01[K     |█▎                              | 30kB 6.1MB/s eta 0:00:01[K     |█▊                              | 40kB 6.2MB/s eta 0:00:01[K     |██▏                             | 51kB 5.3MB/s eta 0:00:01[K     |██▋                             | 61kB 5.7MB/s eta 0:00:01[K     |███                             | 71kB 6.2MB/s eta 0:00:01[K     |███▍                            | 81kB 5.7MB/s eta 0:00:01[K     |███▉                            | 92kB 6.3MB/s eta 0:00:01[K     |████▎                           | 102kB 6.5MB/s eta 0:00:01[K     |████▊                           | 112kB 6.5MB/s eta 0:00:01[K     |█████▏                          | 122kB 6.5M

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from collections import defaultdict

## Parameters

In [None]:
# Data Parameters
h_na = '-'  # string treated as a missing value when the data files are read
h_train_filename = 'offenseval-tr-training-v1.tsv'
h_test_filename = 'offenseval-tr-testset-v1.tsv'
h_test_labels = 'offenseval-tr-labela-v1.tsv'
h_t_sep = '\t'  # separator of the train/test tweet files
h_c_sep = ','  # separator of the test labels file
h_index = 'id'  # column that becomes the DataFrame index
h_num_labels = 2  # number of output classes of the classifier

# BERT Parameters
h_preprocess_mode = 'dbmdz/bert-base-turkish-cased'  # pretrained checkpoint used for both tokenizer and model
h_max_len = 280  # token length every tweet is padded/truncated to
h_batch_size = 16
h_epoch = 3  # number of epochs the learning-rate schedule is sized for
h_text = 'tweet'  # name of the text column
h_label = 'label'  # name of the label column

# Adam Optimizer Parameters
h_learning_rate = 2e-5
h_eps = 1e-8

## Import and prepare data
Import train file

In [None]:
# Read the training split and expose its "subtask_a" column under the name "label".
df_train = (
    pd.read_csv(h_train_filename, sep=h_t_sep, na_values=h_na)
    .rename(columns={"subtask_a": "label"})
)
df_train.head()

Unnamed: 0,id,tweet,label
0,20948,@USER en güzel uyuyan insan ödülü jeon jungkoo...,NOT
1,10134,"@USER Mekanı cennet olsun, saygılar sayın avuk...",NOT
2,23457,Kızlar aranızda kas yığını beylere düşenler ol...,NOT
3,18401,Biraz ders çalışayım. Tembellik ve uyku düşman...,NOT
4,17525,@USER Trezeguet yerine El Sharawy daha iyi olm...,NOT


Import test file and labels

In [None]:
# Read the test tweets; this file carries no label column.
df_test = pd.read_csv(
    h_test_filename,
    sep=h_t_sep,
    na_values=h_na,
)
df_test.head()

Unnamed: 0,id,tweet
0,41993,@USER Sayın başkanım bu şekilde devam inşallah👏
1,23000,"Herkes gevşekliği kadar duyar kasıyor,hayat bö..."
2,42478,Olgun ilişkisi olan arkadaş size en güzel hedi...
3,21748,@USER @USER Burada atıp tutacağına o kötü koşu...
4,13607,@USER İşte o onur dediğin sende yok sorun o işte


In [None]:
# Read the test labels; column names are supplied here, so the first row is treated as data.
df_test_label = pd.read_csv(
    h_test_labels,
    sep=h_c_sep,
    na_values=h_na,
    names=['id', 'label'],
)
df_test_label.head()

Unnamed: 0,id,label
0,41993,NOT
1,23000,NOT
2,42478,NOT
3,21748,OFF
4,13607,OFF


### Add Labels to test dataframe

In [None]:
# Attach the label to every test tweet by joining the two frames on their shared "id" column.
df_test = pd.merge(df_test, df_test_label, on='id')
df_test.head()

Unnamed: 0,id,tweet,label
0,41993,@USER Sayın başkanım bu şekilde devam inşallah👏,NOT
1,23000,"Herkes gevşekliği kadar duyar kasıyor,hayat bö...",NOT
2,42478,Olgun ilişkisi olan arkadaş size en güzel hedi...,NOT
3,21748,@USER @USER Burada atıp tutacağına o kötü koşu...,OFF
4,13607,@USER İşte o onur dediğin sende yok sorun o işte,OFF


Concatenate train and test

In [None]:
# Stack the labeled train and test frames row-wise; each frame keeps its own row index.
df = pd.concat((df_train, df_test), axis=0)
df

Unnamed: 0,id,tweet,label
0,20948,@USER en güzel uyuyan insan ödülü jeon jungkoo...,NOT
1,10134,"@USER Mekanı cennet olsun, saygılar sayın avuk...",NOT
2,23457,Kızlar aranızda kas yığını beylere düşenler ol...,NOT
3,18401,Biraz ders çalışayım. Tembellik ve uyku düşman...,NOT
4,17525,@USER Trezeguet yerine El Sharawy daha iyi olm...,NOT
...,...,...,...
3510,27484,@USER Çok iyi oyuncu. Serdar’dan çok çok iyi,NOT
3511,41742,@USER bu gün mənim Bəbəşimin ad günüdü💜💜💜💜. Tə...,NOT
3512,45705,bu gecelik bu kadar yarın gün içerisinde göz...,NOT
3513,29225,@USER Okulu bırak o zaman fbshshsb,NOT


In [None]:
# Class balance of the combined data.
df[h_label].value_counts()

NOT    28035
OFF     6757
Name: label, dtype: int64

### Encode labels as integer categories

In [None]:
# Give every distinct label string an integer id, in order of first appearance.
# Missing labels are excluded from the mapping, so those rows get a NaN category.
possible_labels = df[h_label].dropna().unique()
label_dict = {
    possible_label: index
    for index, possible_label in enumerate(possible_labels)
}

# Series.map is an exact dictionary lookup and yields numeric ids directly,
# without relying on the dtype inference of Series.replace.
df['category'] = df[h_label].map(label_dict)

# Re-assign instead of mutating in place, and only move the id column into the
# index while it is still a column, so this cell can be re-run without a KeyError.
if h_index in df.columns:
    df = df.set_index(h_index)
df.head()

Unnamed: 0_level_0,tweet,label,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20948,@USER en güzel uyuyan insan ödülü jeon jungkoo...,NOT,0
10134,"@USER Mekanı cennet olsun, saygılar sayın avuk...",NOT,0
23457,Kızlar aranızda kas yığını beylere düşenler ol...,NOT,0
18401,Biraz ders çalışayım. Tembellik ve uyku düşman...,NOT,0
17525,@USER Trezeguet yerine El Sharawy daha iyi olm...,NOT,0


In [None]:
# Discard every row that has a missing value, then recount the classes.
df = df.dropna(axis=0, how='any')
df["category"].value_counts()

0    28035
1     6757
Name: category, dtype: int64

## Loading Tokenizer and Encoding


In [None]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
# Load the pretrained tokenizer for the checkpoint named in h_preprocess_mode.
tokenizer = BertTokenizer.from_pretrained(h_preprocess_mode)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=251003.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=60.0, style=ProgressStyle(description_w…




### Device control
Use CUDA if it is available, otherwise fall back to the CPU. The selected device is printed as a sanity check.

In [None]:
# Prefer the GPU when one is visible to torch; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


## Data Loader

In [None]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

## Create Special Dataset


In [None]:
class BERTDataset(Dataset):
  """Map-style dataset that tokenizes one text per item for a BERT model.

  Args:
    text: indexable sequence of raw texts.
    label: indexable sequence of integer class ids, aligned with ``text``.
    tokenizer: object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
    max_len: length every encoded sequence is padded or truncated to.
  """

  def __init__(self, text, label, tokenizer, max_len):
    self.text = text
    self.label = label
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of texts in the dataset."""
    return len(self.text)

  def __getitem__(self, item):
    """Encode the ``item``-th text.

    Returns:
      dict with the raw ``text``, 1-D ``input_ids`` and ``attention_mask``
      tensors, and the class id as a ``torch.long`` tensor under ``labels``.
    """
    text = str(self.text[item])
    # Use the tokenizer handed to this instance; relying on a notebook-level
    # global would silently ignore the constructor argument (or fail with a
    # NameError when no such global exists).
    encoding = self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        # Replaces the deprecated pad_to_max_length=True flag.
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
        return_tensors='pt'
    )

    return {
        'text': text,
        # The tokenizer returns a leading batch dimension; flatten it away so
        # the DataLoader can stack items into (batch, max_len) tensors.
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labels': torch.tensor(self.label[item], dtype=torch.long)
    }

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=True, num_workers=4):
  """Wrap a DataFrame in a ``BERTDataset`` and return a ``DataLoader`` over it.

  Args:
    df: frame holding the text column named by ``h_text`` and an integer
      ``category`` column.
    tokenizer: tokenizer forwarded to ``BERTDataset``.
    max_len: padded/truncated sequence length forwarded to ``BERTDataset``.
    batch_size: number of samples per batch.
    shuffle: reshuffle the samples every epoch. Defaults to True because the
      loader feeds training; pass False to iterate in frame order.
    num_workers: number of worker processes used for loading.

  Returns:
    A ``torch.utils.data.DataLoader`` yielding the dicts produced by
    ``BERTDataset.__getitem__``, collated into batches.
  """
  ds = BERTDataset(
      text=df[h_text].to_numpy(),
      label=df['category'].to_numpy(),
      tokenizer=tokenizer,
      max_len=max_len
  )

  return DataLoader(
      ds,
      batch_size=batch_size,
      shuffle=shuffle,
      num_workers=num_workers
  )

### Prepare data

In [None]:
# Build the DataLoader over the combined frame using the configured length and batch size.
dataloader = create_data_loader(df, tokenizer, h_max_len, h_batch_size)

# Build Classifier

In [None]:
from transformers import BertForSequenceClassification
from transformers import BertModel

class Classifier(nn.Module):
  """Pretrained BERT encoder followed by dropout and a linear classification head.

  Args:
    n_classes: number of output classes (width of the returned logits).
  """

  def __init__(self, n_classes):
    super().__init__()
    self.bert = BertModel.from_pretrained(h_preprocess_mode)
    self.drop = nn.Dropout(0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    # Not applied in forward(), which returns raw logits; kept as an attribute
    # so callers can still turn logits into probabilities.
    self.softmax = nn.Softmax(dim=1)

  def forward(self, input_ids, attention_mask):
    """Return unnormalized class logits, one row per input sequence."""
    encoder_outputs = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask
    )
    # Take the pooled output by position instead of unpacking a 2-tuple:
    # indexing keeps working when the encoder returns extra elements or a
    # dict-like output object, where `_, pooled = ...` would fail or mis-bind.
    pooled_output = encoder_outputs[1]
    return self.out(self.drop(pooled_output))


In [None]:
# Instantiate the classifier and place its parameters on the selected device.
model = Classifier(h_num_labels).to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=445018508.0, style=ProgressStyle(descri…




### Setting up optimizer

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
# AdamW over all model parameters, configured from the optimizer hyper-parameters defined above.
optimizer = AdamW(model.parameters(),
                  lr=h_learning_rate,
                  correct_bias=False, 
                  eps=h_eps)

### Get scheduler

In [None]:
# Learning-rate schedule with no warm-up steps, sized for h_epoch passes over the dataloader.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader)*h_epoch)

### Loss function

In [None]:
# Cross-entropy loss, moved to the selected device.
loss_fn = nn.CrossEntropyLoss().to(device)

# Training

In [None]:
def train_epoch(
    model,
    dataloader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
  """Run one optimisation pass over ``dataloader``.

  Args:
    model: module called as ``model(input_ids, attention_mask)`` that returns
      per-class scores of shape (batch, n_classes).
    dataloader: iterable of dicts with ``input_ids``, ``attention_mask`` and
      ``labels`` tensors.
    loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.
    n_examples: total number of samples, used as the accuracy denominator.

  Returns:
    Tuple ``(accuracy, mean_loss)`` of 0-dim tensors: the fraction of correct
    predictions (float64) and the mean of the per-batch losses (detached).
  """
  model = model.train()
  losses = []
  correct_predictions = 0
  for d in tqdm(dataloader):
    input_ids = d['input_ids'].to(device)
    attention_mask = d['attention_mask'].to(device)
    labels = d['labels'].to(device)

    outputs = model(
        input_ids,
        attention_mask
    )

    # The predicted class is the index of the largest score in each row.
    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, labels)

    correct_predictions += torch.sum(preds == labels)
    # Keep only a detached copy for reporting: appending the live loss would
    # hold a graph-attached tensor for every batch of the epoch and make the
    # returned mean loss require grad.
    losses.append(loss.detach())

    loss.backward()
    # Clip the global gradient norm to 1.0 before the parameter update.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return correct_predictions.double() / n_examples, torch.mean(torch.stack(losses))

## Training Loop

In [None]:
# Per-epoch training metrics, keyed by metric name.
history = defaultdict(list)
histroy = history  # backward-compatible alias for the misspelled name this cell originally used
best_accuracy = 0

# Train for exactly h_epoch epochs. The learning-rate scheduler is sized for
# len(dataloader)*h_epoch steps, so an extra epoch would run after the
# schedule is already exhausted.
for epoch in tqdm(range(h_epoch)):
  train_acc, train_loss = train_epoch(
      model,
      dataloader,
      loss_fn,
      optimizer,
      device,
      scheduler,
      len(df)
  )

  tqdm.write(f'Train Loss: {train_loss}')
  tqdm.write(f'Train Acc: {train_acc}')

  # Store plain Python floats so the history does not hold device tensors.
  history['train_acc'].append(train_acc.item())
  history['train_loss'].append(train_loss.item())

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2175.0), HTML(value='')))


Train Loss: 0.12143976986408234
Train Acc: 0.9717751207174063


HBox(children=(FloatProgress(value=0.0, max=2175.0), HTML(value='')))


Train Loss: 0.10211078077554703
Train Acc: 0.9778972177512072


HBox(children=(FloatProgress(value=0.0, max=2175.0), HTML(value='')))


Train Loss: 0.10029123723506927
Train Acc: 0.9785870315014946


HBox(children=(FloatProgress(value=0.0, max=2175.0), HTML(value='')))


Train Loss: 0.1009930893778801
Train Acc: 0.9785582892618994



## Save the model

In [None]:
# Save the model's state_dict (not the module object itself) to "model.bin".
torch.save(model.state_dict(), "model.bin")