In [8]:
pip install transformers



In [9]:
import pandas as pd
import numpy as np

import transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig

import torch
from torch import nn
from torch.utils.data import Dataset
from torch import cuda

from sklearn.model_selection import train_test_split

import random
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', 100)


In [10]:
# To use GPU if available, else CPU
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [46]:
# Reading train and test data sets
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

In [47]:
# train = train.sample(5000)
# test = test.sample(2000)

In [48]:
print(train.shape)
print(test.shape)

(159571, 8)
(153164, 2)


In [49]:
print(train.isna().sum())
print(test.isna().sum())

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64
id              0
comment_text    0
dtype: int64


In [50]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They wer...,0,0,0,0,0,0
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, Januar...",0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relev...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember what page that's on?",0,0,0,0,0,0


In [51]:
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofu...
1,0000247867823ef7,"== From RfC == \n\n The title is fine as it is, IMO."
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lapland — / """
3,00017563c3f7919a,":If you have a look back at the source, the information I updated was the correct form. I can on..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [52]:
for col_name in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:
    print("For", col_name, '\n')
    print(train[col_name].value_counts())
    print("\n")

For toxic 

0    144277
1     15294
Name: toxic, dtype: int64


For severe_toxic 

0    157976
1      1595
Name: severe_toxic, dtype: int64


For obscene 

0    151122
1      8449
Name: obscene, dtype: int64


For threat 

0    159093
1       478
Name: threat, dtype: int64


For insult 

0    151694
1      7877
Name: insult, dtype: int64


For identity_hate 

0    158166
1      1405
Name: identity_hate, dtype: int64




In [53]:
# Preparation of labels for multi-label classification
train['list'] = train[train.columns[2:]].values.tolist()
new_df = train[['comment_text', 'list']].copy()
new_df.head()

Unnamed: 0,comment_text,list
0,Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They wer...,"[0, 0, 0, 0, 0, 0]"
1,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, Januar...","[0, 0, 0, 0, 0, 0]"
2,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relev...","[0, 0, 0, 0, 0, 0]"
3,"""\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics...","[0, 0, 0, 0, 0, 0]"
4,"You, sir, are my hero. Any chance you remember what page that's on?","[0, 0, 0, 0, 0, 0]"


In [54]:
# Loading bert model from transformers
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(
    model_name,
    num_labels=6,
    output_attentions=False,
    output_hidden_states=False,
)

model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.9.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/45c3f7

In [55]:
# Splitting train set into train and validation sets
X = list(new_df["comment_text"])
y = list(new_df["list"])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=128)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=128)

In [56]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])


In [57]:
# Creating torch dataset for train and validation sets to pass as input to model
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [58]:
train_dataset[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids': tensor([  101, 13055, 26568,  2323,  6402,  1999, 11669, 13055, 26568,  2003,
         11669,  1012,  1045,  5223, 13055, 26568,  1012,  1042,  1003,  1003,
          1047,  2014,  2000,  3109,   999,  6390,  1012,  6356,  1012,  6146,
          1012,  2871,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     

In [59]:
X_train[2]

'"\n\nThe Objectivity of this Discussion is doubtful (non-existent)\n\n(1) As indicated earlier, the section on Marxist leaders’ views is misleading:\n\n(a) it lays unwarranted and excessive emphasis on Trotsky, creating the misleading impression that other prominent Marxists (Marx, Engels, Lenin) did not advocate and/or practiced terrorism;\n\n(b) it lays unwarranted and excessive emphasis on the theoretical “rejection of individual terrorism”, creating the misleading impression that this is the main (only) Marxist position on terrorism. \n\n(2) The discussion is not being properly monitored:\n\n(a) no discernible attempt is being made to establish and maintain an acceptable degree of objectivity;\n\n(b) important and relevant scholarly works such as the International Encyclopedia of Terrorism are being ignored or illicitly excluded from the discussion;\n\n(c) though the only logical way to remedy the blatant imbalance in the above section is to include quotes by/on other leaders who 

In [60]:
# sanity check
tokenizer.decode(train_dataset[2]["input_ids"])

'[CLS] " the objectivity of this discussion is doubtful ( non - existent ) ( 1 ) as indicated earlier, the section on marxist leaders ’ views is misleading : ( a ) it lays unwarranted and excessive emphasis on trotsky, creating the misleading impression that other prominent marxists ( marx, engels, lenin ) did not advocate and / or practiced terrorism ; ( b ) it lays unwarranted and excessive emphasis on the theoretical “ rejection of individual terrorism ”, creating the misleading impression that this is the main ( only ) marxist position on terrorism. ( 2 ) the discussion is not being properly monitored : ( a [SEP]'

In [61]:
# Overriding compute_loss function of trainer to use BCE loss, since this is a multilabel classification task
class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss

In [62]:
# Function to calculate accuracy for multilabel classification
def accuracy_thresh(y_pred, y_true, thresh=0.5, sigmoid=True): 
    y_pred = torch.from_numpy(y_pred)
    y_true = torch.from_numpy(y_true)
    if sigmoid: 
      y_pred = y_pred.sigmoid()
    return ((y_pred>thresh)==y_true.bool()).float().mean().item()

In [63]:
# Function to compute metrics during training Epochs
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return {'accuracy_thresh': accuracy_thresh(predictions, labels)}

In [64]:
# configure logging so we see training loss
BATCH_SIZE = 16
logging_steps = len(train_dataset) // BATCH_SIZE

args = TrainingArguments(
    output_dir="bert_base_toxic",             # Path to output the model files
    evaluation_strategy="epoch",              # Evaluation to be done after each epoch 
    per_device_train_batch_size=BATCH_SIZE,   # Batch size for train set
    per_device_eval_batch_size=BATCH_SIZE,    # Batch size for test set
    num_train_epochs=1,                       # Number of epochs for model training
    seed=RANDOM_SEED,                 
    learning_rate=2e-5,
    fp16=True,
    weight_decay=0.001,
    logging_steps=logging_steps
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [65]:
# creating Trainer object
trainer = MultilabelTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer)

Using amp fp16 backend


In [66]:
# Train pre-trained model (Finetuning)
trainer.train()

***** Running training *****
  Num examples = 127656
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 7979


Epoch,Training Loss,Validation Loss,Accuracy Thresh
1,0.0469,0.037657,0.985122


  args.max_grad_norm,
Saving model checkpoint to bert_base_toxic/checkpoint-500
Configuration saved in bert_base_toxic/checkpoint-500/config.json
Model weights saved in bert_base_toxic/checkpoint-500/pytorch_model.bin
tokenizer config file saved in bert_base_toxic/checkpoint-500/tokenizer_config.json
Special tokens file saved in bert_base_toxic/checkpoint-500/special_tokens_map.json
  args.max_grad_norm,
Saving model checkpoint to bert_base_toxic/checkpoint-1000
Configuration saved in bert_base_toxic/checkpoint-1000/config.json
Model weights saved in bert_base_toxic/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in bert_base_toxic/checkpoint-1000/tokenizer_config.json
Special tokens file saved in bert_base_toxic/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to bert_base_toxic/checkpoint-1500
Configuration saved in bert_base_toxic/checkpoint-1500/config.json
Model weights saved in bert_base_toxic/checkpoint-1500/pytorch_model.bin
tokenizer config file sa

TrainOutput(global_step=7979, training_loss=0.04693829012722004, metrics={'train_runtime': 1677.5327, 'train_samples_per_second': 76.097, 'train_steps_per_second': 4.756, 'total_flos': 8397227791208448.0, 'train_loss': 0.04693829012722004, 'epoch': 1.0})

In [67]:
# sanity check that we can run evaluation
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 31915
  Batch size = 16


{'epoch': 1.0,
 'eval_accuracy_thresh': 0.9851219654083252,
 'eval_loss': 0.03765677288174629,
 'eval_runtime': 259.9185,
 'eval_samples_per_second': 122.789,
 'eval_steps_per_second': 7.675}

### Testing model

In [68]:
# Tokenizing test data
X_test = list(test["comment_text"])
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=128)

In [69]:
# Creating torch dataset for test data
test_dataset = Dataset(X_test_tokenized)
# Performing predictions for test data using trained model
raw_pred, _, _ = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 153164
  Batch size = 16


In [70]:
raw_pred

array([[ 5.836 ,  0.3877,  4.32  , -1.104 ,  3.178 ,  0.6357],
       [-6.977 , -7.85  , -7.582 , -7.973 , -7.574 , -7.83  ],
       [-6.81  , -7.914 , -7.473 , -8.12  , -7.613 , -7.934 ],
       ...,
       [-6.977 , -7.85  , -7.535 , -7.977 , -7.61  , -7.85  ],
       [-6.816 , -7.83  , -7.574 , -7.96  , -7.613 , -7.668 ],
       [ 2.5   , -3.924 ,  1.187 , -5.605 , -0.3777, -4.734 ]],
      dtype=float16)

In [71]:
# Sigmoid function to calculate probablities of predictions from trainer
def sigmoid(X):
   return 1/(1+np.exp(-X))

In [72]:
# Calculating probabilities 
probabilities = sigmoid(raw_pred)

In [73]:
probabilities

array([[9.971e-01, 5.957e-01, 9.863e-01, 2.490e-01, 9.595e-01, 6.538e-01],
       [9.327e-04, 3.889e-04, 5.093e-04, 3.448e-04, 5.136e-04, 3.982e-04],
       [1.103e-03, 3.655e-04, 5.684e-04, 2.983e-04, 4.935e-04, 3.581e-04],
       ...,
       [9.327e-04, 3.889e-04, 5.336e-04, 3.433e-04, 4.954e-04, 3.889e-04],
       [1.095e-03, 3.982e-04, 5.136e-04, 3.486e-04, 4.935e-04, 4.673e-04],
       [9.243e-01, 1.938e-02, 7.666e-01, 3.662e-03, 4.067e-01, 8.713e-03]],
      dtype=float16)

In [74]:
raw_pred_df = pd.DataFrame(probabilities)

In [75]:
raw_pred_df

Unnamed: 0,0,1,2,3,4,5
0,0.997070,0.595703,0.986328,0.249023,0.959473,0.653809
1,0.000933,0.000389,0.000509,0.000345,0.000514,0.000398
2,0.001103,0.000365,0.000568,0.000298,0.000494,0.000358
3,0.000863,0.000411,0.000503,0.000377,0.000514,0.000429
4,0.001400,0.000325,0.000527,0.000289,0.000525,0.000335
...,...,...,...,...,...,...
153159,0.691895,0.003662,0.254883,0.001484,0.127563,0.003325
153160,0.004719,0.000267,0.001012,0.000245,0.000863,0.000278
153161,0.000933,0.000389,0.000534,0.000343,0.000495,0.000389
153162,0.001095,0.000398,0.000514,0.000349,0.000494,0.000467


In [96]:
test_with_pred_df = test.join(raw_pred_df)

In [97]:
test_with_pred_df

Unnamed: 0,id,comment_text,0,1,2,3,4,5
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofu...,0.997070,0.595703,0.986328,0.249023,0.959473,0.653809
1,0000247867823ef7,"== From RfC == \n\n The title is fine as it is, IMO.",0.000933,0.000389,0.000509,0.000345,0.000514,0.000398
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lapland — / """,0.001103,0.000365,0.000568,0.000298,0.000494,0.000358
3,00017563c3f7919a,":If you have a look back at the source, the information I updated was the correct form. I can on...",0.000863,0.000411,0.000503,0.000377,0.000514,0.000429
4,00017695ad8997eb,I don't anonymously edit articles at all.,0.001400,0.000325,0.000527,0.000289,0.000525,0.000335
...,...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing but too-long-crap",0.691895,0.003662,0.254883,0.001484,0.127563,0.003325
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n Does it get there faster by throwing to cut off m...,0.004719,0.000267,0.001012,0.000245,0.000863,0.000278
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I see your changes and agree this is """"more correct....",0.000933,0.000389,0.000534,0.000343,0.000495,0.000389
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the EU - Germany - has a Law of Return quite similar ...",0.001095,0.000398,0.000514,0.000349,0.000494,0.000467


In [78]:
test_with_pred_df.shape

(153164, 8)

In [79]:
test_with_pred_df.to_csv("./test_with_prob_sigmoid_whole_train.csv")