In [1]:
import pandas as pd

df = pd.read_csv('./jigsaw-toxic-comment-classification-challenge/train.csv')

In [2]:
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [3]:
df['comment_text'].apply(len).mean()

394.0732213246768

In [4]:
# List of all toxic labels
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Count how many comments belong to each label
label_counts = df[labels].sum().sort_values(ascending=False)

label_counts

toxic            15294
obscene           8449
insult            7877
severe_toxic      1595
identity_hate     1405
threat             478
dtype: int64

In [5]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

# Download stop words 
def preprocess(text):
    # remove urls
    text = re.sub(r'http\S+|www\S+|https\S+','',text)
    # lowercase the entire text
    text = text.lower()
    # remove extra spaces
    text = re.sub(r'\s+',' ',text)
    # remove mentions
    text = re.sub(r'@\w+','',text)
    # remove hashtags
    text = re.sub(r'#\w+','',text)
    # remove punctuation
    text = text.translate(str.maketrans('','',string.punctuation))
    # get rid of stop words
    words = text.split()
    words = [word for word in words if word.lower() not in stop_words]
    text = ' '.join(words)
    return text

def tokenize_and_lemmatize(text):
    tokens = text.split()
    #Lemmatize
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return tokens

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pragatibagul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pragatibagul/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/pragatibagul/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
df['preprocessed_text'] = df['comment_text'].apply(preprocess)
df['preprocessed_text'] = df['preprocessed_text'].apply(tokenize_and_lemmatize)

In [7]:
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate', 'preprocessed_text'],
      dtype='object')

In [8]:
!pip install iterative-stratification



In [9]:
import pandas as pd
import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate']
def print_label_dist(df,label_cols,title=''):
    counts = df[label_cols].sum().astype(int)
    pct = (counts/len(df)*100).round(3)
    out = pd.DataFrame({'count':counts,'pct':pct})
    print(f'\n{title}')
    print(out)

# Quick check of full data label distribution
print_label_dist(df,label_cols,'Full dataset distribution')

#Create multilabel stratified split (80% train, 20% val)
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# iterstrat expects a 2D array for X
X = df['preprocessed_text'].values
y = df[label_cols].values

train_idx,val_idx = next(msss.split(X,y))
train = df.iloc[train_idx].reset_index(drop=True)
val = df.iloc[val_idx].reset_index(drop=True)

print_label_dist(train,label_cols,"Train distribution")
print_label_dist(val,label_cols,"Validation distribution")


Full dataset distribution
               count    pct
toxic          15294  9.584
severe_toxic    1595  1.000
obscene         8449  5.295
threat           478  0.300
insult          7877  4.936
identity_hate   1405  0.880

Train distribution
               count    pct
toxic          12235  9.584
severe_toxic    1276  1.000
obscene         6759  5.295
threat           382  0.299
insult          6302  4.937
identity_hate   1124  0.880

Validation distribution
               count    pct
toxic           3059  9.585
severe_toxic     319  1.000
obscene         1690  5.295
threat            96  0.301
insult          1575  4.935
identity_hate    281  0.880


In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_recall_fscore_support
from sklearn.preprocessing import MultiLabelBinarizer
import joblib
import json
import os

text_col = 'preprocessed_text'
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate']
max_features = 50000
ngram_range = (1,2)
C = 1.0
model_dir = 'baseline_model'
os.makedirs(model_dir,exist_ok=True)

# Input (train, val)
df_train = train
df_val = val

# 1. Convert comments to TF IDF Vectorizer
tfidf = TfidfVectorizer(
            max_features=max_features,
            ngram_range=ngram_range,
            strip_accents='unicode',
            analyzer='word',
            lowercase=True,
            token_pattern=r'(?u)\b\w+\b')

X_train = tfidf.fit_transform(df_train[text_col].fillna('').astype(str).tolist())
X_val = tfidf.transform(df_val[text_col].fillna('').astype(str).tolist())

# Multi-label target matrix (shape: n_samples x n_labels)
y_train = df_train[label_cols].values
y_val = df_val[label_cols].values

# Sanity check shapes
print('X_train:',X_train.shape,'X_val:',X_val.shape,'y_train:',y_train.shape,'y_val:',y_val.shape)

X_train: (127656, 50000) X_val: (31915, 50000) y_train: (127656, 6) y_val: (31915, 6)


In [11]:
# Train Logistic Regression for multi-label
base_clf = LogisticRegression(
    penalty='l2',
    C=C,
    solver='saga',
    max_iter=200,
    n_jobs=-1,
    class_weight=None
)

clf = OneVsRestClassifier(base_clf,n_jobs=-1)
clf.fit(X_train,y_train)
print('Training complete')

Training complete


In [12]:
#Evaluate using F1 score
# Predict binary with default threshold of 0.5 on probabilities
y_val_probs = clf.predict_proba(X_val)
y_val_pred = (y_val_probs >= 0.5).astype(int)

#Per-label metrics
precision,recall,f1,support = precision_recall_fscore_support(
    y_val,
    y_val_pred,
    average=None,
    labels=range(len(label_cols)))

per_label = []
for i, lbl in enumerate(label_cols):
    per_label.append({
        'label':lbl,
        'precision':float(precision[i]),
        'recall':float(recall[i]),
        'f1':float(f1[i]),
        'support':int(support[i])
    })

#Macro and Micro f1
macro_f1 = float(f1_score(y_val,y_val_pred,average='macro'))
micro_f1 = float(f1_score(y_val,y_val_pred,average='micro'))

#Print summary
print('\n Per-label summary')
for r in per_label:
    print(f"{r['label']:15s} | precision={r['precision']:.3f} recall={r['recall']:.3f} f1={r['f1']:.3f} support={r['support']}")

print(f'\n Macro F1: {macro_f1:.4f}')
print(f'\n Micro F1: {macro_f1:.4f}')


 Per-label summary
toxic           | precision=0.925 recall=0.598 f1=0.726 support=3059
severe_toxic    | precision=0.535 recall=0.213 f1=0.305 support=319
obscene         | precision=0.925 recall=0.646 f1=0.761 support=1690
threat          | precision=0.438 recall=0.073 f1=0.125 support=96
insult          | precision=0.827 recall=0.521 f1=0.639 support=1575
identity_hate   | precision=0.567 recall=0.135 f1=0.218 support=281

 Macro F1: 0.4624

 Micro F1: 0.4624


In [13]:
# Save baseline results
joblib.dump(tfidf,os.path.join(model_dir,'tfidf_vectorizer.joblib'))
joblib.dump(clf,os.path.join(model_dir,'ovr_logreg.joblib'))

#Save evaluation results to JSON/CSV
results = {
    'per_label':per_label,
    'macro_f1':macro_f1,
    'micro_f1':micro_f1,
    'params':{
        'max_features':max_features,
        'ngram_range':ngram_range,
        'C':C
    }
}

with open(os.path.join(model_dir,'baseline_results.json'),'w') as f:
    json.dump(results,f,indent=2)
    
pd.DataFrame(per_label).to_csv(os.path.join(model_dir,'per_label_results.csv'),index=False)
print(f"\nSaved vectorizer, model, and results to: {model_dir}")


Saved vectorizer, model, and results to: baseline_model


It trains a pretrained transformer (BERT by default) with a simple linear classifier head, uses BCEWithLogitsLoss, trains for a few epochs, and evaluates on a validation set with per-label and macro F1. It also shows how to tokenize the dataframe column in batches and how to build a Dataset / DataLoader.

In [14]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.metrics import f1_score,classification_report
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoConfig, get_linear_schedule_with_warmup

MODEL_NAME = 'bert-base-cased'
TEXT_COL = 'comment_text'
LABEL_COLS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

MAX_LEN = 64
BATCH_SIZE = 24
LR = 2e-5
WEIGHT_DECAY = 0.01
NUM_EPOCHS = 3
TEST_SIZE = 0.2
RANDOM_STATE = 42
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [15]:
msss = MultilabelStratifiedShuffleSplit(n_splits=1,test_size=TEST_SIZE,random_state=RANDOM_STATE)
X = np.arange(len(df)).reshape(-1,1)
y = df[LABEL_COLS].values
train_idx, val_idx = next(msss.split(X,y))

In [16]:
train_df = df.iloc[train_idx].reset_index(drop=True)
val_df = df.iloc[val_idx].reset_index(drop=True)
print('Sizes -> full : ',len(df),' Train: ',len(train_df),' Validation : ',len(val_df))

Sizes -> full :  159571  Train:  127656  Validation :  31915


In [17]:
# Tokenizer Batch Encode
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def batch_encode_texts(texts,tokenizer,max_length=MAX_LEN,batch_size=512):
    input_ids_list = []
    attention_masks_list = []
    for i in range(0,len(texts),batch_size):
        batch = texts[i:i+batch_size]
        enc = tokenizer(
                batch,
                padding='max_length',
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            )
        input_ids_list.append(enc['input_ids'])
        attention_masks_list.append(enc['attention_mask'])
    input_ids = torch.cat(input_ids_list,dim=0)
    attention_mask = torch.cat(attention_masks_list,dim=0)
    return {'input_ids':input_ids,'attention_mask':attention_mask}

print('Tokenising train...')
train_enc = batch_encode_texts(train_df[TEXT_COL].astype(str).tolist(),tokenizer)
print('Tokenising val...')
val_enc = batch_encode_texts(val_df[TEXT_COL].astype(str).tolist(),tokenizer)

Tokenising train...
Tokenising val...


In [18]:
# Create TensorDatasets + DataLoaders
y_train = torch.tensor(train_df[LABEL_COLS].values.astype(float),dtype=torch.float)
y_val = torch.tensor(val_df[LABEL_COLS].values.astype(float),dtype=torch.float)

train_dataset = TensorDataset(train_enc['input_ids'],train_enc['attention_mask'],y_train)
val_dataset = TensorDataset(val_enc['input_ids'],val_enc['attention_mask'],y_val)

train_loader = DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True,num_workers=2)
val_loader = DataLoader(val_dataset,batch_size=BATCH_SIZE,shuffle=False,num_workers=2)

In [19]:
# Pos weight for BCE (HELPS WITH IMBALANCE)
pos = train_df[LABEL_COLS].sum().values.astype(float)
neg = len(train_df) - pos
pos_weight_arr = np.where(pos == 0,1.0,neg/np.where(pos==0,1.0,pos))
pos_weight = torch.tensor(pos_weight_arr,dtype=torch.float).to(DEVICE)
print('pos_weight per label',dict(zip(LABEL_COLS,np.round(pos_weight_arr,3))))

pos_weight per label {'toxic': 9.434, 'severe_toxic': 99.044, 'obscene': 17.887, 'threat': 333.178, 'insult': 19.256, 'identity_hate': 112.573}


In [20]:
# Model: transformer backbone + linear head
class TransformerForMultiLabel(nn.Module):
    def __init__(self,model_name,num_labels):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.backbone = AutoModel.from_pretrained(model_name,config=self.config)
        self.classifier = nn.Linear(self.backbone.config.hidden_size,num_labels)
    def forward(self,input_ids,attention_mask):
        out=self.backbone(input_ids=input_ids,attention_mask=attention_mask)
        cls = out.last_hidden_state[:,0,:]
        logits = self.classifier(cls)
        return logits

In [21]:
model = TransformerForMultiLabel(MODEL_NAME,num_labels=len(LABEL_COLS))
model.to(DEVICE)

TransformerForMultiLabel(
  (backbone): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-

In [22]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(),lr=LR,weight_decay=WEIGHT_DECAY)
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=int(0.05*total_steps),num_training_steps=total_steps)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

In [23]:
#Eval function
def evaluate(model,loader,device,threshold=0.5):
    model.eval()
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(loader,desc='Eval',leave=False):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels_np = labels.cpu().numpy()
            logits = model(input_ids=input_ids,attention_mask=attention_mask)
            probs = torch.sigmoid(logits).cpu().numpy()
            preds = (probs >= threshold).astype(int)
            all_labels.append(labels_np)
            all_preds.append(preds)
    y_true = np.vstack(all_labels)
    y_pred = np.vstack(all_preds)
    per_label_f1 = f1_score(y_true,y_pred,average=None,zero_division=0)
    macro_f1 = f1_score(y_true,y_pred,average='macro',zero_division=0)
    report = classification_report(y_true,y_pred,target_names=LABEL_COLS,zero_division=0)
    return per_label_f1,macro_f1,report
        

In [24]:
# Training loop
best_macro = -1.0
for epoch in range(1,NUM_EPOCHS+1):
    model.train()
    running_loss = 0.0
    loop = tqdm(train_loader,desc=f'Epoch {epoch}/{NUM_EPOCHS}')
    for input_ids, attention_mask, labels in loop:
        input_ids = input_ids.to(DEVICE)
        attention_mask = attention_mask.to(DEVICE)
        labels = labels.to(DEVICE)
        
        optimizer.zero_grad()
        logits = model(input_ids=input_ids,attention_mask=attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        running_loss += loss.item()
        loop.set_postfix(loss=loss.item())
        
    avg_loss = running_loss / len(train_loader)
    print(f'\n Epoch {epoch} avg loss : {avg_loss:.4f}')
    
    # Evaluate
    per_label_f1, macro_f1, report = evaluate(model, val_loader, DEVICE, threshold=0.5)
    print("Validation Macro F1: {:.4f}".format(macro_f1))
    print("Per-label F1:", dict(zip(LABEL_COLS, np.round(per_label_f1, 4))))
    print("Classification report:\n", report)

    if macro_f1 > best_macro:
        best_macro = macro_f1
        save_path = "best_transformer_multilabel.pt"
        torch.save({
            'model_state_dict': model.state_dict(),
            'tokenizer_name': MODEL_NAME,
            'label_cols': LABEL_COLS,
            'max_len': MAX_LEN
        }, save_path)
        print("Saved best model:", save_path)

print("Training complete. Best macro F1:", best_macro)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 1/3:   0%|          | 0/5319 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



 Epoch 1 avg loss : 0.4218


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Eval:   0%|          | 0/1330 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Validation Macro F1: 0.4008
Per-label F1: {'toxic': 0.7045, 'severe_toxic': 0.2555, 'obscene': 0.6367, 'threat': 0.0693, 'insult': 0.5688, 'identity_hate': 0.1701}
Classification report:
                precision    recall  f1-score   support

        toxic       0.57      0.91      0.70      3059
 severe_toxic       0.15      0.98      0.26       319
      obscene       0.48      0.95      0.64      1690
       threat       0.04      0.94      0.07        96
       insult       0.41      0.95      0.57      1575
identity_hate       0.09      0.98      0.17       281

    micro avg       0.34      0.94      0.50      7020
    macro avg       0.29      0.95      0.40      7020
 weighted avg       0.47      0.94      0.61      7020
  samples avg       0.04      0.09      0.06      7020

Saved best model: best_transformer_multilabel.pt


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 2/3:   0%|          | 0/5319 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



 Epoch 2 avg loss : 0.2352


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
TOKENIZERS_PARALLELISM=(true | false)


Eval:   0%|          | 0/1330 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Validation Macro F1: 0.4288
Per-label F1: {'toxic': 0.7007, 'severe_toxic': 0.2642, 'obscene': 0.6707, 'threat': 0.1303, 'insult': 0.5913, 'identity_hate': 0.2154}
Classification report:
                precision    recall  f1-score   support

        toxic       0.57      0.92      0.70      3059
 severe_toxic       0.15      0.98      0.26       319
      obscene       0.52      0.95      0.67      1690
       threat       0.07      0.93      0.13        96
       insult       0.43      0.95      0.59      1575
identity_hate       0.12      0.98      0.22       281

    micro avg       0.38      0.94      0.55      7020
    macro avg       0.31      0.95      0.43      7020
 weighted avg       0.48      0.94      0.62      7020
  samples avg       0.05      0.09      0.06      7020

Saved best model: best_transformer_multilabel.pt


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 3/3:   0%|          | 0/5319 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



 Epoch 3 avg loss : 0.1635


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
TOKENIZERS_PARALLELISM=(true | false)


Eval:   0%|          | 0/1330 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Validation Macro F1: 0.4993
Per-label F1: {'toxic': 0.7226, 'severe_toxic': 0.3647, 'obscene': 0.7032, 'threat': 0.2393, 'insult': 0.6292, 'identity_hate': 0.337}
Classification report:
                precision    recall  f1-score   support

        toxic       0.60      0.91      0.72      3059
 severe_toxic       0.23      0.96      0.36       319
      obscene       0.56      0.94      0.70      1690
       threat       0.14      0.81      0.24        96
       insult       0.48      0.93      0.63      1575
identity_hate       0.21      0.88      0.34       281

    micro avg       0.47      0.92      0.63      7020
    macro avg       0.37      0.91      0.50      7020
 weighted avg       0.52      0.92      0.66      7020
  samples avg       0.06      0.09      0.07      7020

Saved best model: best_transformer_multilabel.pt
Training complete. Best macro F1: 0.4993244255850884


In [25]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def predict(text,threshold=0.5):
    #Tokenize
    enc = tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=MAX_LEN,
                return_tensors='pt'
            )
    #Move to CPU or CUDA
    input_ids = enc['input_ids']
    attention_mask = enc['attention_mask']
    
    #Forward pass
    with torch.no_grad():
        logits = model(input_ids=input_ids,attention_mask=attention_mask)
        probs = torch.sigmoid(logits).numpy()[0]
        
    #Convert probabilities to 0/1
    preds = (probs >= threshold).astype(int)
    
    #Map to dictionary
    result = {
        'probabilities':dict(zip(LABEL_COLS,np.round(probs,4))),
        'predictions':dict(zip(LABEL_COLS,preds.astype(int)))
    }
    return result

In [26]:
text = 'You are a complete idiot'
output = predict(text)

In [27]:
print('Probabilities : ',output['probabilities'])
print('Binary Predictions : ',output['predictions'])

Probabilities :  {'toxic': 0.9938, 'severe_toxic': 0.2402, 'obscene': 0.9714, 'threat': 0.0198, 'insult': 0.9944, 'identity_hate': 0.0311}
Binary Predictions :  {'toxic': 1, 'severe_toxic': 0, 'obscene': 1, 'threat': 0, 'insult': 1, 'identity_hate': 0}


In [28]:
text = 'You are a piece of shit. You scoundrel!'
output = predict(text)
print('Probabilities : ',output['probabilities'])
print('Binary Predictions : ',output['predictions'])

Probabilities :  {'toxic': 0.9978, 'severe_toxic': 0.8452, 'obscene': 0.993, 'threat': 0.0261, 'insult': 0.9977, 'identity_hate': 0.1857}
Binary Predictions :  {'toxic': 1, 'severe_toxic': 1, 'obscene': 1, 'threat': 0, 'insult': 1, 'identity_hate': 0}
