In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import zipfile
import matplotlib.pyplot as plt

from nltk.corpus import stopwords

import transformers
from transformers import AdamW, get_linear_schedule_with_warmup

import tokenizers
from sklearn.metrics import mean_squared_error, roc_auc_score, roc_curve, auc
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from tqdm import tqdm

import os
# Print the full path of every file found under the read-only input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data Visualization

In [None]:
# Load the Jigsaw training comments, the test comments and the test labels.
# The files are zipped CSVs; this relies on pandas inferring the compression
# from the ".zip" extension.
train_table = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
test_table = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')
test_tags = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip')
# Cell output: summary statistics of the training table.
train_table.describe()

Number of Missing Values

We only use the first 30,000 rows of the training data.

In [None]:
# Keep only the first 30,000 rows. The explicit .copy() turns the slice into an
# independent frame, so the columns added to it in later cells
# (processed_text, processed_text_length, easy_label) are written to this frame
# itself and do not raise pandas' SettingWithCopyWarning.
train_table = train_table.iloc[0:30000,:].copy()
train_table.head()

In [None]:
# Count the missing values in each column of the training subset.
train_table.isna().sum()

In [None]:
# Label matrix: every column except the row identifier and the raw text.
train_tags = train_table.drop(columns=['id', 'comment_text'])

# Column-wise totals, shown as a one-column table sorted from largest to smallest.
label_counts = train_tags.sum()
df_counts = label_counts.to_frame(name='counts').sort_values(by='counts', ascending=False)
df_counts

We can see that the data is quite imbalanced: among the 30,000 rows, only about 10% are labeled "toxic".

In [None]:
# Horizontal bar chart of the per-label counts, each bar annotated with its value.
ax = df_counts.plot.barh(width=0.7, fontsize='24', figsize=(12,5));
ax.legend(bbox_to_anchor=(1, 0.3), fontsize='16');
for p in ax.patches:
    w = p.get_width()
    # The patch width can come back as a float, and the 'd' format code raises
    # ValueError for floats, so convert to an int before formatting the label.
    ax.annotate(f'{int(round(w)):d}', (w, p.get_y() + 0.1))

It seems that comments with at least one tag are far fewer than comments with none.

In [None]:
import random

# Show the labels of five randomly drawn rows (drawn without replacement).
sampled_rows = random.sample(range(len(train_tags.index)), 5)
train_tags.loc[sampled_rows]

We can tell from the table that a single row can carry multiple tags.

In [None]:
# Raw comment text of the training subset; reused by the sampling and
# preprocessing cells below.
comment = train_table['comment_text']
comment.head()

In [None]:
# Print the first 128 characters of five randomly chosen comments; repr() keeps
# newlines and other control characters visible as escape sequences.
idxs = random.sample(range(comment.shape[0]),5)
texts = comment.loc[idxs].reset_index(drop=True)
for i, text in enumerate(texts):
    numbered = '%d: %s'%(i,text[:128])
    print(repr(numbered))

    

## Text preprocessing

In [None]:
import re

def text_preprocessing(text):
    """Normalise a raw comment into lower-case, ASCII-only, single-spaced text.

    Steps, in order: lower-case; drop ``[...]`` and ``(...)`` spans; drop URLs;
    drop ``@mentions``; decode ``&amp;``; turn ``#`` into a space; drop
    non-ASCII characters; finally collapse every run of whitespace (newlines
    included) into one space and trim both ends.

    The whitespace collapse is deliberately the last step: the ``#`` replacement
    and the non-ASCII removal can both leave doubled or leading/trailing spaces,
    so running them afterwards would hand back text that is not normalised.

    Args:
        text (str): the raw comment.

    Returns:
        str: the cleaned comment (possibly empty).
    """
    #lower case
    text = text.lower()

    #pattern = [zero or more character]   ('.' does not cross line breaks)
    text = re.sub(r'\[.*?\]', '', text)

    #pattern = (zero or more character)
    text = re.sub(r'\(.*?\)', '', text)

    #pattern = with or without(http),://, one or more non-white space character, OR www, .,one or more non-white space character
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    #pattern = @some characters + space
    text = re.sub(r'(@.*?)[\s]', ' ', text)

#     #pattern = num
#     text = re.sub(r'[0-9]+' , '' ,text)

    #pattern = space+@+A-Za-z0-9_   (catches a mention at the very end of the text)
    text = re.sub(r'\s([@][\w_-]+)', '', text).strip()

    #pattern = &amp
    text = re.sub(r'&amp;', '&', text)

    #replace #
    text = text.replace("#" , " ")

    #drop every character that cannot be encoded as ASCII
    text = text.encode("ascii", "ignore").decode()

    #pattern = multiple space (also removes new lines, which count as whitespace)
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# import re
# import string

# def clean_text(text):
#     '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
#     and remove words containing numbers.'''
#     #text = text.lower()
    
#     #pattern = [zero or more character]
#     text = re.sub('\[.*?\]', '', text)
    
#     #pattern = with or without(http),://, one or more non-white space character, OR www, .,one or more non-white space character
#     text = re.sub('https?://\S+|www\.\S+', '', text)
    
#     #pattern = <, zero or more characters, >, (one or more occurance of >)
#     text = re.sub('<.*?>+', '', text)
    
#     #pattern = any punctionation
#     text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    
#     #pattern = any new line
#     text = re.sub('\n', '', text)
    
#     #pattern = any from[a-zA-Z0-9_], any from[0-9], any from [a-zA-Z0-9_]
#     text = re.sub('\w*\d\w*', '', text)
#     return text

In [None]:
# Clean every comment (after coercing it to str), then store the number of
# characters of the cleaned text next to it.
cleaned_comments = comment.map(str).map(text_preprocessing)
train_table['processed_text'] = cleaned_comments
train_table['processed_text_length'] = train_table['processed_text'].map(len)

# Analyze comment length by label group

In [None]:
# Length histogram for the comments where none of the six label columns equals 1.
untagged_check_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
has_no_label = train_table[untagged_check_columns].ne(1).all(axis=1)
other_comment_length = train_table.loc[has_no_label, 'processed_text_length'].reset_index(drop = True)
ax1 = other_comment_length.plot.hist(title = "other_comments_counts")

In [None]:
# Length histogram for the comments where at least one of the six label columns equals 1.
tagged_check_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
has_some_label = train_table[tagged_check_columns].eq(1).any(axis=1)
toxic_comment_length = train_table.loc[has_some_label, 'processed_text_length'].reset_index(drop = True)
ax2 = toxic_comment_length.plot.hist(title = "toxic_comments_counts")

## Analyze most common word in text

In [None]:
# Word cloud built from every cleaned comment of the training subset.
from wordcloud import WordCloud,STOPWORDS

# One long space-separated string, which is the input WordCloud.generate is given.
all_comments = list(train_table['processed_text'])
commonWord = ' '.join(all_comments)

cloud_builder = WordCloud(
    stopwords=STOPWORDS,
    background_color='black',
    width=3000,
    height=2500,
)
common_word_cloud = cloud_builder.generate(commonWord)

# Render the cloud as an image without axes.
plt.figure(1,figsize=(12, 12))
plt.imshow(common_word_cloud)
plt.axis('off')
plt.show()

In [None]:
# Word cloud restricted to the comments where at least one label column equals 1.
flag_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
is_flagged = train_table[flag_columns].eq(1).any(axis=1)
toxic_comments = train_table.loc[is_flagged, "processed_text"].tolist()
toxicWord = ' '.join(toxic_comments)

toxic_cloud_builder = WordCloud(
    stopwords=STOPWORDS,
    background_color='black',
    width=3000,
    height=2500,
)
toxic_word_cloud = toxic_cloud_builder.generate(toxicWord)

# Render the cloud as an image without axes.
plt.figure(1,figsize=(12, 12))
plt.imshow(toxic_word_cloud)
plt.axis('off')
plt.show()

## Naive Bayes

related_link: https://www.analyticsvidhya.com/blog/2021/07/performing-sentiment-analysis-with-naive-bayes-classifier/

In [None]:
from sklearn.model_selection import train_test_split
import joblib
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Binary target: 1 when at least one of the six label columns equals 1, else 0.
toxicity_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
has_any_label = train_table[toxicity_columns].eq(1).any(axis=1)
train_table['easy_label'] = np.where(has_any_label, 1, 0)


In [None]:
# Modelling table: cleaned text plus the binary target. It is an explicit copy
# because later cells add columns to it (tokenized_length, kfold); writing into
# a bare column selection of train_table raises pandas' SettingWithCopyWarning.
nb_data = train_table[["processed_text", "easy_label"]].copy()

In [None]:
# Hold out 25% of the rows for testing; the split is stratified on the binary
# label and uses a fixed random_state so it is repeatable.
nb_x, nb_x_test, nb_y, nb_y_test = train_test_split(
    nb_data['processed_text'],
    nb_data['easy_label'],
    test_size=0.25,
    stratify=nb_data['easy_label'],
    random_state=42,
)

In [None]:
# Bag-of-words features. The vocabulary is fitted on the training split only and
# then applied unchanged to the held-out split.
# The matrices are kept in the sparse format CountVectorizer returns: calling
# .toarray() would allocate a dense (documents x vocabulary) array, which is
# mostly zeros and can exhaust memory, while MultinomialNB and the scikit-learn
# display helpers used below accept sparse input directly.
vec = CountVectorizer(stop_words='english')
nb_x = vec.fit_transform(nb_x)
nb_x_test = vec.transform(nb_x_test)

In [None]:
# Shape of the training feature matrix produced by the vectorizer cell above.
nb_x.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier (default settings) on the
# bag-of-words training features and the binary label.
nb_model = MultinomialNB()
nb_model.fit(nb_x, nb_y)

In [None]:
# Score of the fitted model on the held-out split (scikit-learn documents
# `score` for classifiers as mean accuracy).
nb_model.score(nb_x_test, nb_y_test)

In [None]:
# `plot_roc_curve` is deliberately not imported: no visible cell calls it, and it
# was removed in scikit-learn 1.2, where importing it raises ImportError and
# stops the notebook. RocCurveDisplay.from_estimator is its replacement.
from sklearn.metrics import roc_auc_score, RocCurveDisplay, ConfusionMatrixDisplay

# Confusion matrix of the Naive Bayes predictions on the held-out split.
ConfusionMatrixDisplay.from_estimator(nb_model, nb_x_test, nb_y_test)
plt.show()

In [None]:
# ROC analysis on the held-out split: the AUC is computed from the second
# column of predict_proba, and the curve itself is drawn from the estimator.
nb_test_scores = nb_model.predict_proba(nb_x_test)[:,1]
nb_auc_score = roc_auc_score(y_true=nb_y_test, y_score=nb_test_scores)
RocCurveDisplay.from_estimator(nb_model, nb_x_test, nb_y_test)
plt.show()

# Converting to Tokens

In [None]:
from transformers import BertTokenizer

# Tokenizer shared by the length analysis and by BertDataSet below.
# NOTE(review): text_preprocessing lower-cases every comment, so a *cased*
# vocabulary may be a mismatch; if this is switched to an uncased checkpoint,
# the model checkpoint loaded in the training loop must be switched with it.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
# Number of tokens the tokenizer produces for each cleaned comment.
token_counts = [len(tokenizer.tokenize(text)) for text in nb_data["processed_text"]]
nb_data["tokenized_length"] = token_counts
nb_data.head()

In [None]:
import seaborn as sns

# Histogram of the per-comment token counts, with the x-axis limited to 0-350.
sns.set_theme(style="whitegrid")
ax = sns.histplot(x = "tokenized_length", data=nb_data)
ax.set(xlim=(0,350))

In [None]:
# Fraction of comments whose token count is at most 350.
len(nb_data.loc[nb_data["tokenized_length"]<=350])/len(nb_data["tokenized_length"])

We can see that most comments are at most 350 tokens long, so 350 is used as the maximum sequence length below.

In [None]:
from torch.utils.data import DataLoader, Dataset

max_len = 350

class BertDataSet(Dataset):
    """Torch dataset that tokenises comments on the fly for a BERT model.

    Every item is a dict with three tensors:
        'ids'         -- token ids, padded/truncated to ``max_len`` (torch.long)
        'mask'        -- attention mask that goes with 'ids' (torch.long)
        'toxic_label' -- the target value(s) of that comment (torch.float)

    The class relies on the notebook-level ``tokenizer`` and ``max_len``.
    """

    def __init__(self, sentences, toxic_labels):
        """Store the texts and their targets.

        Args:
            sentences: iterable of comment strings (e.g. a pandas Series).
            toxic_labels: pandas object with the targets, in the same row
                order as ``sentences``; converted with ``.to_numpy()``.
        """
        # A plain list makes __getitem__ purely positional; indexing a Series
        # with ``series[idx]`` is label-based and fails when its index is not
        # exactly 0..n-1.
        self.sentences = list(sentences)
        self.targets = toxic_labels.to_numpy()

    def __len__(self):
        """Number of comments in the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Tokenise the comment at position ``idx`` and return it with its target."""
        sentence = self.sentences[idx]
        bert_senten = tokenizer.encode_plus(
            text = sentence,
            add_special_tokens = True,      # Add `[CLS]` and `[SEP]`
            max_length = max_len,           # Max length to truncate/pad
            padding = 'max_length',         # Pad to max_length (replaces the deprecated pad_to_max_length=True)
            truncation = True,              # Truncate long sentences
            return_attention_mask = True    # Return attention mask
        )
        ids = torch.tensor(bert_senten['input_ids'], dtype = torch.long)
        mask = torch.tensor(bert_senten['attention_mask'], dtype = torch.long)
        toxic_label = torch.tensor(self.targets[idx], dtype = torch.float)

        return {
            'ids' : ids,
            'mask' : mask,
            'toxic_label':toxic_label
        }

In [None]:
# from torch.utils.data import DataLoader, Dataset

# max_len = 350

# Class BertDataSet(Dataset):
#     def __init__(sentences, labels):
#         self.sentences = sentences
#         self.labels = labels.to_numpy()
    
#     def __len__(self):
#         return len(self.sentences)
    
#     def __getitem__(self,idx):
        
        

@ohmeow you're loading the bert-base-cased checkpoint (which is a checkpoint that was trained using a similar architecture to BertForPreTraining) in a BertForSequenceClassification model.

This means that:

The layers that BertForPreTraining has, but BertForSequenceClassification does not have will be discarded
The layers that BertForSequenceClassification has but BertForPreTraining does not have will be randomly initialized.
This is expected, and tells you that you won't have good performance with your BertForSequenceClassification model before you fine-tune it 🙂.

@fliptrail this warning means that during your training, you're not using the pooler in order to compute the loss. I don't know how you're finetuning your model, but if you're not using the pooler layer then there's no need to worry about that warning.

# Model & Training

In [None]:
# Assign every row to one of Kfold folds using its index value modulo Kfold.
# NOTE(review): this split is not stratified on easy_label, so the share of
# positive rows may differ between folds.
Kfold = 5
nb_data['kfold'] = nb_data.index % Kfold
nb_data.head()

# model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as func

# Training hyper-parameters.
batch_size, epochs = 16, 3

# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Binary cross-entropy computed on raw logits, placed on the chosen device.
loss_fn = nn.BCEWithLogitsLoss().to(device)
# Gradient scaler used by the mixed-precision training step.
scaler = torch.cuda.amp.GradScaler()

In [None]:
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup
def train(train_dataloader, model, optimizer, scheduler):
    """Run one mixed-precision training epoch.

    Relies on the notebook-level ``device``, ``loss_fn`` and ``scaler``.

    Args:
        train_dataloader: yields dicts with 'ids', 'mask' and 'toxic_label'.
        model: model whose output exposes ``['logits']``.
        optimizer: optimizer stepped once per batch (through ``scaler``).
        scheduler: learning-rate scheduler stepped once per batch.

    Returns:
        (losses, accuracy): the mean of the per-batch losses over the whole
        epoch, and the fraction of target values predicted correctly with a
        0.5 threshold on the sigmoid of the logits. Both are NaN when the
        dataloader yields no batch.
    """
    model.train()
    torch.backends.cudnn.benchmark = True

    # Accumulated over the whole epoch. Re-creating the loss list inside the
    # batch loop would make the reported loss that of the last batch only.
    batch_losses = []
    correct_predictions = 0
    seen_targets = 0

    for a in tqdm(train_dataloader):
        optimizer.zero_grad()

        ids = a['ids'].to(device, non_blocking = True)
        mask = a['mask'].to(device, non_blocking = True)
        toxic_label = a['toxic_label'].to(device, non_blocking = True)

        with torch.cuda.amp.autocast():
            output = model(ids, mask) #This gives model as output, however we want the values at the output
            output = output['logits'].squeeze(-1).to(torch.float32)
            loss = loss_fn(output, toxic_label)

        scaler.scale(loss).backward() # backward pass on the scaled loss
        scaler.step(optimizer)        # optimizer step, performed through the scaler
        scaler.update()               # update the scale factor for the next iteration
        scheduler.step()              # update learning rate schedule

        batch_losses.append(loss.item())

        # Bookkeeping only, so it is computed on the detached logits.
        preds = (torch.sigmoid(output.detach()) > 0.5).to(toxic_label.dtype)
        correct_predictions += (preds == toxic_label).sum().item()
        # Count the targets actually seen instead of assuming that every batch
        # is full: the last batch is usually smaller than batch_size.
        seen_targets += toxic_label.numel()

    losses = np.mean(batch_losses) if batch_losses else float('nan')
    accuracy = correct_predictions / seen_targets if seen_targets else float('nan')

    return losses, accuracy

def validate(valid_dataloader, model):
    """Evaluate the model on a validation dataloader.

    Relies on the notebook-level ``device`` and ``loss_fn``.

    Args:
        valid_dataloader: yields dicts with 'ids', 'mask' and 'toxic_label'.
        model: model whose output exposes ``['logits']``.

    Returns:
        (losses, accuracy, all_output_probs): the mean of the per-batch losses,
        the fraction of target values predicted correctly with a 0.5 threshold,
        and the sigmoid probabilities of every sample in the order the
        dataloader produced them. Loss and accuracy are NaN when the
        dataloader yields no batch.
    """
    model.eval()

    # Accumulated over the whole pass. Re-creating the loss list inside the
    # batch loop would make the reported loss that of the last batch only.
    batch_losses = []
    correct_predictions = 0
    seen_targets = 0
    all_output_probs = []

    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for a in valid_dataloader:
            ids = a['ids'].to(device, non_blocking = True)
            mask = a['mask'].to(device, non_blocking = True)
            toxic_label = a['toxic_label'].to(device, non_blocking = True)

            output = model(ids, mask)
            output = output['logits'].squeeze(-1).to(torch.float32)
            output_probs = torch.sigmoid(output)
            preds = (output_probs > 0.5).to(toxic_label.dtype)

            batch_losses.append(loss_fn(output, toxic_label).item())
            all_output_probs.extend(output_probs.cpu().numpy())

            correct_predictions += (preds == toxic_label).sum().item()
            # Count the targets actually seen instead of assuming that every
            # batch is full: the last batch is usually smaller than batch_size.
            seen_targets += toxic_label.numel()

    losses = np.mean(batch_losses) if batch_losses else float('nan')
    accuracy = correct_predictions / seen_targets if seen_targets else float('nan')

    return losses, accuracy, all_output_probs
        

In [None]:
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup
def train(train_dataloader, model, optimizer, scheduler):
    """Run one mixed-precision training epoch.

    Relies on the notebook-level ``device``, ``loss_fn`` and ``scaler``.

    Args:
        train_dataloader: yields dicts with 'ids', 'mask' and 'toxic_label'.
        model: model whose output exposes ``['logits']``.
        optimizer: optimizer stepped once per batch (through ``scaler``).
        scheduler: learning-rate scheduler stepped once per batch.

    Returns:
        (losses, accuracy): the mean of the per-batch losses over the whole
        epoch, and the fraction of target values predicted correctly with a
        0.5 threshold on the sigmoid of the logits. Both are NaN when the
        dataloader yields no batch.
    """
    model.train()
    torch.backends.cudnn.benchmark = True

    # Accumulated over the whole epoch. Re-creating the loss list inside the
    # batch loop would make the reported loss that of the last batch only.
    batch_losses = []
    correct_predictions = 0
    seen_targets = 0

    for a in tqdm(train_dataloader):
        optimizer.zero_grad()

        ids = a['ids'].to(device, non_blocking = True)
        mask = a['mask'].to(device, non_blocking = True)
        toxic_label = a['toxic_label'].to(device, non_blocking = True)

        with torch.cuda.amp.autocast():
            output = model(ids, mask) #This gives model as output, however we want the values at the output
            output = output['logits'].squeeze(-1).to(torch.float32)
            loss = loss_fn(output, toxic_label)

        scaler.scale(loss).backward() # backward pass on the scaled loss
        scaler.step(optimizer)        # optimizer step, performed through the scaler
        scaler.update()               # update the scale factor for the next iteration
        scheduler.step()              # update learning rate schedule

        batch_losses.append(loss.item())

        # Bookkeeping only, so it is computed on the detached logits.
        preds = (torch.sigmoid(output.detach()) > 0.5).to(toxic_label.dtype)
        correct_predictions += (preds == toxic_label).sum().item()
        # Divide by the number of targets actually compared. A fixed
        # "batches * batch_size * 6" denominator is only right for six labels
        # per sample and full batches; with a single label per sample it makes
        # the reported accuracy six times too small.
        seen_targets += toxic_label.numel()

    losses = np.mean(batch_losses) if batch_losses else float('nan')
    accuracy = correct_predictions / seen_targets if seen_targets else float('nan')

    return losses, accuracy

def validate(valid_dataloader, model):
    """Evaluate the model on a validation dataloader.

    Relies on the notebook-level ``device`` and ``loss_fn``.

    Args:
        valid_dataloader: yields dicts with 'ids', 'mask' and 'toxic_label'.
        model: model whose output exposes ``['logits']``.

    Returns:
        (losses, accuracy, all_output_probs): the mean of the per-batch losses,
        the fraction of target values predicted correctly with a 0.5 threshold,
        and the sigmoid probabilities of every sample in the order the
        dataloader produced them. Loss and accuracy are NaN when the
        dataloader yields no batch.
    """
    model.eval()

    # Accumulated over the whole pass. Re-creating the loss list inside the
    # batch loop would make the reported loss that of the last batch only.
    batch_losses = []
    correct_predictions = 0
    seen_targets = 0
    all_output_probs = []

    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for a in valid_dataloader:
            ids = a['ids'].to(device, non_blocking = True)
            mask = a['mask'].to(device, non_blocking = True)
            toxic_label = a['toxic_label'].to(device, non_blocking = True)

            output = model(ids, mask)
            output = output['logits'].squeeze(-1).to(torch.float32)
            output_probs = torch.sigmoid(output)
            preds = (output_probs > 0.5).to(toxic_label.dtype)

            batch_losses.append(loss_fn(output, toxic_label).item())
            all_output_probs.extend(output_probs.cpu().numpy())

            correct_predictions += (preds == toxic_label).sum().item()
            # Divide by the number of targets actually compared. A fixed
            # "batches * batch_size * 6" denominator is only right for six
            # labels per sample and full batches; with a single label per
            # sample it makes the reported accuracy six times too small.
            seen_targets += toxic_label.numel()

    losses = np.mean(batch_losses) if batch_losses else float('nan')
    accuracy = correct_predictions / seen_targets if seen_targets else float('nan')

    return losses, accuracy, all_output_probs
        

In [None]:
# K-fold fine-tuning of a single-logit BERT classifier on `easy_label`.
# For every fold a fresh pretrained model is trained for `epochs` epochs; the
# state with the lowest validation loss is saved as "model<fold>.pth", and the
# loss/accuracy curves of the fold are plotted.
best_scores = []        # lowest validation loss reached in each fold
best_valid_probs = []   # validation probabilities at that best epoch, one entry per fold

for fold in range(Kfold):

    in_valid_fold = nb_data['kfold'] == fold
    Xtrain = nb_data["processed_text"][~in_valid_fold].reset_index(drop = True)
    Ytrain = nb_data["easy_label"][~in_valid_fold].reset_index(drop = True)
    Xvalid = nb_data["processed_text"][in_valid_fold].reset_index(drop = True)
    Yvalid = nb_data["easy_label"][in_valid_fold].reset_index(drop = True)

    train_dataset = BertDataSet(Xtrain, Ytrain)
    valid_dataset = BertDataSet(Xvalid, Yvalid)

    train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True, num_workers = 2, pin_memory = True)
    # Validation is not shuffled, so the probabilities returned by validate()
    # line up row by row with Xvalid / Yvalid.
    valid_dataloader = DataLoader(valid_dataset, batch_size = batch_size, shuffle = False, num_workers = 2, pin_memory = True)

    model = transformers.BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels = 1) # one logit: the binary easy_label
    model.to(device)

    LR = 2e-5
    optimizer = AdamW(model.parameters(), LR, betas = (0.9, 0.999), weight_decay = 1e-2)

    # The scheduler is stepped once per batch, so the schedule length is the
    # number of batches per epoch (partial last batch included) times epochs.
    train_steps = len(train_dataloader) * epochs
    num_steps = int(train_steps * 0.1)   # warm-up over the first 10% of the steps

    scheduler = get_linear_schedule_with_warmup(optimizer, num_steps, train_steps)

    best_score = float('inf')
    best_valid_prob = None   # stays None if no epoch produced a usable loss
    train_accs = []
    valid_accs = []
    train_losses = []
    valid_losses = []

    print("-------------- Fold = " + str(fold) + "-------------")

    for epoch in range(epochs):
        print("-------------- Epoch = " + str(epoch) + "-------------")

        train_loss, train_acc = train(train_dataloader, model, optimizer, scheduler)
        valid_loss, valid_acc, valid_probs = validate(valid_dataloader, model)

        train_losses.append(train_loss)
        train_accs.append(train_acc)
        valid_losses.append(valid_loss)
        valid_accs.append(valid_acc)

        print('train losses: %.4f' %(train_loss), 'train accuracy: %.3f' %(train_acc))
        print('valid losses: %.4f' %(valid_loss), 'valid accuracy: %.3f' %(valid_acc))

        if (valid_loss < best_score):

            best_score = valid_loss
            print("Found an improved model! :)")

            state = {'state_dict': model.state_dict(),
                     'optimizer_dict': optimizer.state_dict(),
                     'best_score':best_score
                    }

            torch.save(state, "model" + str(fold) + ".pth")
            best_valid_prob = valid_probs

    best_scores.append(best_score)
    best_valid_probs.append(best_valid_prob)

    ##Plotting the result for each fold
    x = np.arange(epochs)
    fig, ax = plt.subplots(1, 2, figsize = (15,4))
    ax[0].plot(x, train_losses)
    ax[0].plot(x, valid_losses)
    ax[0].set_ylabel('Losses', weight = 'bold')
    ax[0].set_xlabel('Epochs')
    ax[0].grid(alpha = 0.3)
    ax[0].legend(labels = ['train losses', 'valid losses'])

    ax[1].plot(x, train_accs)
    ax[1].plot(x, valid_accs)
    ax[1].set_ylabel('Accuracy', weight = 'bold')
    ax[1].set_xlabel('Epochs')
    ax[1].legend(labels = ['train acc', 'valid acc'])

    ax[1].grid(alpha = 0.3)
    fig.suptitle('Fold = '+str(fold), weight = 'bold')
    

In [None]:
# Lowest validation loss reached in each fold.
best_scores

In [None]:
# Average of the per-fold best validation losses.
print('Mean of',Kfold, 'folds for best loss in', epochs, 'epochs cross-validation folds is %.4f.' %(np.mean(best_scores)))