In [34]:
import pandas as pd
import torch
print(torch.cuda.is_available())
device = torch.device("cuda")
print('GPU:', torch.cuda.get_device_name(0))

True
GPU: NVIDIA GeForce GTX 1650 Ti


In [21]:
train_df = pd.read_csv("../data/jigsaw-toxic-comment-train.csv")
dev_df = pd.read_csv("../data/validation.csv")
print("Number of training examples:", train_df.shape[0])
train_df.head(5)

Number of training examples: 223549


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [22]:
train_df.sample(5)[['comment_text', 'toxic']]

Unnamed: 0,comment_text,toxic
104192,"""\n\n Opinion please \n\nAs a result of a leng...",0
200058,"""1215 Hours 1 April, 2006 \n\n The great unwas...",0
199857,तेव्हा \n सूर्याचे असंख्य असह्य किरण \n शोषून ...,0
3399,Do you support confessions produced by torture...,0
112912,"My mistake, completely missed the GFDL tag. R...",0


In [23]:
dev_df.head(5)

Unnamed: 0,id,comment_text,lang,toxic
0,0,Este usuario ni siquiera llega al rango de ...,es,0
1,1,Il testo di questa voce pare esser scopiazzato...,it,0
2,2,Vale. Sólo expongo mi pasado. Todo tiempo pasa...,es,1
3,3,Bu maddenin alt başlığı olarak uluslararası i...,tr,0
4,4,Belçika nın şehirlerinin yanında ilçe ve belde...,tr,0


In [25]:
train_sentences = train_df.comment_text.values
train_labels = train_df.toxic.values

dev_sentences = dev_df.comment_text.values
dev_labels = dev_df.toxic.values

In [15]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [16]:
max_length = 0

for sentence in train_sentences:
    sentence = tokenizer.encode(sentence, add_special_tokens = True)
    max_length = max(max_length, len(sentence))

print("Maximum sentence length:", max_length)

Token indices sequence length is longer than the specified maximum sequence length for this model (631 > 512). Running this sequence through the model will result in indexing errors


Maximum sentence length: 4952


In [26]:
train_input_ids = []
train_attention_masks = []

for sentence in train_sentences:
    encoded_dict = tokenizer.encode_plus(
                        sentence,
                        add_special_tokens = True,
                        max_length = 256,
                        truncation = True,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                    )
    train_input_ids.append(encoded_dict['input_ids'])
    train_attention_masks.append(encoded_dict['attention_mask'])

train_input_ids = torch.cat(train_input_ids, dim = 0)
train_attention_masks = torch.cat(train_attention_masks, dim = 0)
train_labels = torch.tensor(train_labels)

print("Example:", train_sentences[0])
print("Tokens:", train_input_ids[0])

dev_input_ids = []
dev_attention_masks = []

for sentence in dev_sentences:
    encoded_dict = tokenizer.encode_plus(
                        sentence,
                        add_special_tokens = True,
                        max_length = 256,
                        truncation = True,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                    )
    dev_input_ids.append(encoded_dict['input_ids'])
    dev_attention_masks.append(encoded_dict['attention_mask'])

dev_input_ids = torch.cat(dev_input_ids, dim = 0)
dev_attention_masks = torch.cat(dev_attention_masks, dim = 0)
dev_labels = torch.tensor(dev_labels)

print("Example:", dev_sentences[0])
print("Tokens:", dev_input_ids[0])



Example: Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27
Tokens: tensor([  101,  7526,  2339,  1996, 10086,  2015,  2081,  2104,  2026,  5310,
        18442, 13076, 12392,  2050,  5470,  2020, 16407,  1029,  2027,  4694,
         1005,  1056,  3158,  9305, 22556,  1010,  2074,  8503,  2006,  2070,
         3806,  2044,  1045,  5444,  2012,  2047,  2259, 14421,  6904,  2278,
         1012,  1998,  3531,  2123,  1005,  1056,  6366,  1996, 23561,  2013,
         1996,  2831,  3931,  2144,  1045,  1005,  1049,  3394,  2085,  1012,
         6486,  1012, 16327,  1012,  4229,  1012,  2676,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,  

In [27]:
from torch.utils.data import TensorDataset

train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
dev_data = TensorDataset(dev_input_ids, dev_attention_masks, dev_labels)

In [28]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

train_dataloader = DataLoader(
                        train_data,
                        sampler = RandomSampler(train_data),
                        batch_size = batch_size
                    )

dev_dataloader = DataLoader(
                        dev_data,
                        sampler = RandomSampler(dev_data),
                        batch_size = batch_size
                    )

In [29]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

model.cuda()

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [31]:
optimizer = torch.optim.AdamW(  
            model.parameters(),
            lr = 2e-5,
            eps = 1e-8
                )

In [32]:
from transformers import get_linear_schedule_with_warmup

epochs = 4
number_of_training_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps = 0,
                num_training_steps = number_of_training_steps 
            )

In [33]:
import numpy as np

def display_accuracy(predicted_labels, actual_labels):
    predicted_labels = np.argmax(predicted_labels, axis = 1).flatten()
    actual_labels = actual_labels.flatten()
    return np.sum(predicted_labels == actual_labels)/len(actual_labels)

In [41]:
import random
import numpy as np

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

training_statistics = []

for epoch in range(0, epochs):
    print("Epoch:", epoch)
    total_training_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()
        result = model(
                    b_input_ids,
                    attention_mask = b_input_mask,
                    labels = b_labels,
                    return_dict = True
                )
        loss = result.loss
        logits = result.logits
        total_training_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    
    average_training_loss = total_training_loss/len(train_dataloader)
    print("Average training loss:", round(average_training_loss, ndigits = 4))

    model.eval()
    total_dev_accuracy = 0
    total_dev_loss = 0
    number_of_dev_steps = 0

    for batch in dev_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            result = model(
                    b_input_ids,
                    attention_mask = b_input_mask,
                    labels = b_labels,
                    return_dict = True
                )
        loss = result.loss
        logits = result.logits
        total_dev_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_dev_accuracy += display_accuracy(logits, label_ids)
    
    average_dev_accuracy = total_dev_accuracy/len(dev_dataloader)
    print("dev-set Accuracy:", round(average_dev_accuracy, ndigits = 4))
    average_dev_loss = total_dev_loss/len(dev_dataloader)
    print("Average dev-set loss:", round(average_dev_accuracy, ndigits = 4))

    training_statistics.append(
        {
            'epoch': epoch + 1,
            'Training Loss': average_training_loss,
            'Valid. Loss': average_dev_loss,
            'Valid. Accur.': average_dev_accuracy
        }
    )



Epoch: 0


RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 4.00 GiB total capacity; 3.30 GiB already allocated; 0 bytes free; 3.41 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [39]:
import pandas as pd

pd.set_option('precision', 2)
df_stats = pd.DataFrame(data=training_statistics)
df_stats = df_stats.set_index('epoch')
df_stats

OptionError: Pattern matched multiple keys

In [None]:
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns
sns.set(style='darkgrid')
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([1, 2, 3, 4])

plt.show()