In [1]:
import numpy as np
import pandas as pd
import transformers
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
import torch
from transformers import Trainer, TrainingArguments
from transformers import BertTokenizer, BertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
model = model.to("cuda")

In [5]:
data = data[["comment_text","toxic"]]
data = data[0:1000]

In [6]:
sample_data = ["i love eating", "i am playing"]
tokenizer(sample_data,truncate=True,paddings=True)

Keyword arguments {'truncate': True, 'paddings': True} not recognized.
Keyword arguments {'truncate': True, 'paddings': True} not recognized.


{'input_ids': [[101, 1045, 2293, 5983, 102], [101, 1045, 2572, 2652, 102]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}

In [7]:
X = list(data["comment_text"])
y = list(data["toxic"])
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2,stratify=y)
X_train_tokenized = tokenizer(X_train, truncation=True, padding=True, max_length=512)
X_val_tokenized = tokenizer(X_val, truncation=True, padding=True, max_length=512)

In [43]:
for x in X_train_tokenized.items():
    print(x)
    break

('input_ids', [[101, 1000, 11721, 3319, 2462, 1045, 1005, 1049, 3374, 2000, 2360, 2023, 1010, 2021, 1045, 2031, 2000, 8246, 2023, 3720, 1005, 1055, 25957, 1012, 1996, 2195, 2350, 3471, 2008, 2716, 2039, 1999, 2023, 3720, 1005, 1055, 3025, 11721, 7667, 2031, 2025, 2042, 3843, 1012, 2004, 1055, 1013, 2002, 2038, 2056, 2077, 2033, 1010, 1000, 1000, 2002, 3720, 1005, 1055, 1059, 2361, 1024, 2599, 2323, 2022, 2936, 1012, 1996, 2381, 1010, 2865, 3086, 1010, 15032, 1010, 3076, 2231, 1010, 28321, 1010, 13012, 9035, 1010, 1998, 3176, 2592, 2930, 2024, 2200, 13366, 20132, 1999, 25022, 10711, 9285, 1012, 2036, 1010, 13012, 3567, 1998, 3176, 2592, 2323, 2022, 6377, 2046, 1996, 2717, 1997, 1996, 3720, 1012, 1000, 1000, 2153, 1010, 3531, 2298, 2058, 1059, 2361, 1024, 15536, 16098, 1998, 8081, 2122, 3291, 1010, 2059, 17738, 19269, 2009, 2005, 11721, 1011, 2465, 1012, 2174, 1010, 1045, 2031, 2128, 1011, 14155, 2023, 3720, 2004, 1038, 1011, 2465, 1010, 2004, 2009, 2515, 3113, 1038, 1011, 2465, 9181, 10

In [27]:
print(len(X_train), len(X_val))
print(len(y_train), len(y_val))

800 200
800 200


In [28]:
class Dataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels=None):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, index):
    item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
    if self.labels:
      item["labels"] = torch.tensor(self.labels[index])
    return item

  def __len__(self):
    return len(self.encodings["input_ids"])

In [29]:
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [13]:
def compute_metrics(p):
  pred, labels = p
  pred = np.argmax(pred, axis=1)

  accuracy = accuracy_score(y_true=labels, y_pred=pred)
  recall = recall_score(y_true=labels, y_pred=pred)
  precision = precision_score(y_true=labels, y_pred=pred)
  f1 = f1_score(y_true=labels,y_pred=pred)

  return {"accuracy": accuracy, "recall": recall, "precision": precision, "f1": f1}

In [14]:
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=3,
    per_device_train_batch_size=8
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [15]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=300, training_loss=0.12489206949869791, metrics={'train_runtime': 149.8991, 'train_samples_per_second': 16.011, 'train_steps_per_second': 2.001, 'total_flos': 631466532864000.0, 'train_loss': 0.12489206949869791, 'epoch': 3.0})

In [16]:
trainer.evaluate()

{'eval_loss': 0.3224886357784271,
 'eval_accuracy': 0.93,
 'eval_recall': 0.7142857142857143,
 'eval_precision': 0.6521739130434783,
 'eval_f1': 0.6818181818181818,
 'eval_runtime': 3.3867,
 'eval_samples_per_second': 59.055,
 'eval_steps_per_second': 7.382,
 'epoch': 3.0}

In [17]:
text = "That point was good"

# Move model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenize input and move to the same device
inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
inputs = {key: val.to(device) for key, val in inputs.items()}

# Forward pass
outputs = model(**inputs)

# Get predictions
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions = predictions.cpu().detach().numpy()

print(predictions)

[[0.9980444  0.00195564]]


In [18]:
np.set_printoptions(suppress=True)

In [19]:
trainer.save_model("CustomModel")