### Imports

In [1]:
import pandas as pd
import numpy as np

import random
import pickle

import torch
import torch.nn as nn
from torch import Tensor
from torch.utils.data import Dataset, TensorDataset
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

import matplotlib.pyplot as plt

from transformers import *
from datasets import load_dataset, load_metric

from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel



### Set seeds

In [2]:
torch.manual_seed(42)  # Sets up seed for both devices
np.random.seed(42)
random.seed(42)

### Device

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cpu


### Loading data

In [4]:
dataset_dict = load_dataset('csv', data_files='hate_train.csv', sep=',')
dataset_dict

Using custom data configuration default-3a271b8e32bc9cdb
Reusing dataset csv (C:\Users\patry\.cache\huggingface\datasets\csv\default-3a271b8e32bc9cdb\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 10041
    })
})

In [5]:
dataset_dict['train']['sentence'][0]

'Dla mnie faworytem do tytułu będzie Cracovia. Zobaczymy, czy typ się sprawdzi.'

### Find max length of sentence

In [6]:
max_length = len(max(dataset_dict['train']['sentence'], key=len))

In [7]:
max_length

214

### Splitting the dataset in train and test split

In [8]:
dataset = dataset_dict['train']
datasets = dataset.train_test_split(test_size=0.1)

Loading cached split indices for dataset at C:\Users\patry\.cache\huggingface\datasets\csv\default-3a271b8e32bc9cdb\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e\cache-12b53225ce4d4d12.arrow and C:\Users\patry\.cache\huggingface\datasets\csv\default-3a271b8e32bc9cdb\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e\cache-6acb14e16d3c714b.arrow


In [9]:
datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 9036
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 1005
    })
})

### Tokenizer & Model

In [10]:
model_name = 'dkleczek/Polish-Hate-Speech-Detection-Herbert-Large'
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# nlp = pipeline('fill-mask', model=model, tokenizer=tokenizer)
# for pred in nlp(f"Adam Mickiewicz wielkim polskim {nlp.tokenizer.mask_token} był."):
#     print(pred)

Some weights of the model checkpoint at dkleczek/Polish-Hate-Speech-Detection-Herbert-Large were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
tokenizer

PreTrainedTokenizerFast(name_or_path='dkleczek/Polish-Hate-Speech-Detection-Herbert-Large', vocab_size=50000, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'})

In [12]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(50000, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

### Tokenizing

In [13]:
def tokenize_function(examples):
    return tokenizer(examples['sentence'], padding='max_length', max_length=max_length, truncation=True)

In [14]:
tokenized_datasets = datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['sentence', 'token_type_ids'])
tokenized_datasets.set_format('torch')
train_dataset = tokenized_datasets['train'].shuffle(seed=42)
test_dataset = tokenized_datasets['test']

Loading cached processed dataset at C:\Users\patry\.cache\huggingface\datasets\csv\default-3a271b8e32bc9cdb\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e\cache-6a8338dba4127db0.arrow
Loading cached processed dataset at C:\Users\patry\.cache\huggingface\datasets\csv\default-3a271b8e32bc9cdb\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e\cache-0ba652be8c07a96e.arrow
Loading cached shuffled indices for dataset at C:\Users\patry\.cache\huggingface\datasets\csv\default-3a271b8e32bc9cdb\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e\cache-7f1f70e511cc19a2.arrow


In [15]:
train_dataset

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 9036
})

In [16]:
test_dataset

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 1005
})

### DataLoaders

In [17]:
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_dataloader = DataLoader(test_dataset, batch_size=16)

### Training

In [18]:
hate_speech_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [19]:
hate_speech_model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1

In [20]:
optimizer = Adam(hate_speech_model.parameters(), lr=5e-5)
hate_speech_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1

In [21]:
num_epochs = 3
loss_fun = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    losses = []
    for batch in train_dataloader:
        labels = batch['label'].to(device)
        batch = {'attention_mask': batch['attention_mask'].to(device), 'input_ids': batch['input_ids'].to(device)}
        outputs = hate_speech_model(**batch)
        loss = loss_fun(outputs.logits, labels)
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()
        losses.append(loss.item())
    print(np.mean(losses))

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt



In [None]:
torch.save(hate_speech_model.state_dict(), 'hate_speech_model.tar')

### Validation

In [None]:
hate_speech_model.load_state_dict(torch.load('hate_speech_model.tar'))

In [23]:
metric = load_metric('accuracy')
model.eval()
for batch in test_dataloader:
    labels = batch['label'].to(device)
    batch = {'attention_mask': batch['attention_mask'].to(device), 'input_ids': batch['input_ids'].to(device)}

    with torch.no_grad():
        outputs = hate_speech_model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    print(f'Predictions: {predictions}')
    print(f'Labels:      {labels}')
    metric.add_batch(predictions=predictions, references=labels)

metric.compute()

Predictions: tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Labels:      tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0])
Predictions: tensor([1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])
Labels:      tensor([1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Predictions: tensor([0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0])
Labels:      tensor([0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0])
Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])
Labels:      tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0])
Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0])
Labels:      tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0])
Labels:      tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0])
Predictions: tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0])
Labels:      tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0])
Predictions: tensor(

{'accuracy': 0.8746268656716418}

### Generating predictions

In [43]:
dataset_dict = load_dataset('text', data_files='hate_test_data.txt')
dataset_dict



  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1000
    })
})

In [39]:
dataset_dict['train']['text'][0]

'@anonymized_account Spoko, jak im Duda z Morawieckim zamówią po pięć piw to wszystko będzie ok.'

In [40]:
max_length = len(max(dataset_dict['train']['text'], key=len))
max_length

182

In [44]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', max_length=max_length, truncation=True)

In [45]:
dataset_dict = dataset_dict.map(tokenize_function, batched=True)
dataset_dict = dataset_dict.remove_columns(['text', 'token_type_ids'])
dataset_dict.set_format('torch')
dataset = dataset_dict['train']

  0%|          | 0/1 [00:00<?, ?ba/s]

In [46]:
dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 1000
})

In [47]:
dataloader = DataLoader(dataset, batch_size=16)

In [48]:
outs = list()
with torch.no_grad():
    for batch in dataloader:
        batch = {'attention_mask': batch['attention_mask'].to(device), 'input_ids': batch['input_ids'].to(device)}

        outputs = hate_speech_model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        for i in predictions:
            outs.append(i.tolist())

In [49]:
outs = Tensor(outs)

In [53]:
outs_n = outs.numpy().astype(int)
outs_n = pd.DataFrame(outs_n)
outs_n.to_csv('poniedzialek_Bandyra_Nowakowski.csv', index=False, header=False)

In [54]:
outs_n

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
995,0
996,0
997,0
998,1
