In [1]:
!pip install transformers -U



In [2]:
!pip install accelerate -U



In [3]:
import pandas as pd

In [4]:
data = pd.read_csv("https://raw.githubusercontent.com/iampukar/toxic-comments-classification/master/train.csv")

In [5]:
data = data.head(500)

In [6]:
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
495,014b44616d8cb457,"Sarek of Vulcan: Unfortunately for you, you ca...",0,0,0,0,0,0
496,014bb932bd289352,Keep your chin up! Darwinism was not accepted ...,0,0,0,0,0,0
497,014c96f873db11ff,"""""""Nazi filth"""" is impolite 04:27, 20 Jan 200...",1,0,0,0,1,0
498,014d00c8f2a76df4,Interesting. I checked the other case number K...,0,0,0,0,0,0


In [7]:
data['toxic'].value_counts()

0    451
1     49
Name: toxic, dtype: int64

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
import accelerate

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [11]:
model = model.to('cuda')

In [12]:
sample_data = ["I am eating","I am playing "]
tokenizer(sample_data, padding=True, truncation=True, max_length=512)

{'input_ids': [[101, 1045, 2572, 5983, 102], [101, 1045, 2572, 2652, 102]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}

In [13]:
X = list(data["comment_text"])
y = list(data["toxic"])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,stratify=y)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [14]:
len(X_train),len(X_val)

(400, 100)

Now to finetune model on our own custom dataset, we need to convert our tokenized output into a dataset format.

Ref: PyTorch docs: https://pytorch.org/tutorials/beginner/basics/data_tutorial.html#creating-a-custom-dataset-for-your-files

In [15]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [16]:
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [17]:
train_dataset[2]

{'input_ids': tensor([  101,  1000,  2383,  2056,  2008,  1010,  1045,  1005,  2310,  8184,
          3718,  2026, 11186,  2241,  2006, 22330,  3207,  1005,  1055,  6040,
          1010, 14223,  1037,  1000,  1000,  5227,  2005, 10465,  1000,  1000,
          1045,  1005,  2310,  2356,  2005,  2006,  1996,  2831,  3931,  1012,
          1045,  9075,  3087,  3752,  2023,  2000,  3789,  2061,  2057,  2035,
          2113,  2054,  1996,  2451,  4122,  1012,  1000,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [18]:
def compute_metrics(p):
    print(type(p))
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [20]:
# Define Trainer
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=5,
    per_device_train_batch_size=8

)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [21]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=250, training_loss=0.10149684143066406, metrics={'train_runtime': 175.3572, 'train_samples_per_second': 11.405, 'train_steps_per_second': 1.426, 'total_flos': 526222110720000.0, 'train_loss': 0.10149684143066406, 'epoch': 5.0})

In [22]:
trainer.evaluate()

<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 0.4122975170612335,
 'eval_accuracy': 0.93,
 'eval_precision': 0.7142857142857143,
 'eval_recall': 0.5,
 'eval_f1': 0.588235294117647,
 'eval_runtime': 3.2167,
 'eval_samples_per_second': 31.087,
 'eval_steps_per_second': 4.041,
 'epoch': 5.0}

In [23]:
text = "That was good point"
# text = "go to hell"
inputs = tokenizer(text,padding = True, truncation = True, return_tensors='pt').to('cuda')
outputs = model(**inputs)
print(outputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)
predictions = predictions.cpu().detach().numpy()
predictions

SequenceClassifierOutput(loss=None, logits=tensor([[ 2.8728, -3.4365]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor([[0.9982, 0.0018]], device='cuda:0', grad_fn=<SoftmaxBackward0>)


array([[0.9981839 , 0.00181606]], dtype=float32)

In [24]:
trainer.save_model('CustomModel')

In [None]:
#load the save model

model_2 = BertForSequenceClassification.from_pretrained("/content/CustomModel")
model_2.to('cuda')

In [26]:
# text = "That was good point"
text = "go to hell"
inputs = tokenizer(text,padding = True, truncation = True, return_tensors='pt').to('cuda')
outputs = model_2(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions = predictions.cpu().detach().numpy()
predictions

array([[0.00265655, 0.9973435 ]], dtype=float32)