In [215]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/harshdetection/sample.csv
/kaggle/input/harshdetection/train.csv
/kaggle/input/harshdetection/test.csv


# Load the Data Files

In [216]:
import pandas as pd

from sklearn.model_selection import train_test_split

In [217]:
main_df = pd.read_csv('/kaggle/input/harshdetection/train.csv')
test_df = pd.read_csv('/kaggle/input/harshdetection/test.csv')
print('Train Data Shape: ', main_df.shape)
print('Test Data Shape: ', test_df.shape)

Train Data Shape:  (89359, 8)
Test Data Shape:  (38297, 2)


In [218]:
main_df.head()

Unnamed: 0,id,text,harsh,extremely_harsh,vulgar,threatening,disrespect,targeted_hate
0,a8be7c5d4527adbbf15f,""", 6 December 2007 (UTC)\nI am interested, not...",0,0,0,0,0,0
1,0b7ca73f388222aad64d,I added about three missing parameters to temp...,0,0,0,0,0,0
2,db934381501872ba6f38,SANDBOX?? \n\nI DID YOUR MADRE DID IN THE SANDBOX,1,0,0,0,0,0
3,228015c4a87c4b1f09a7,"why good sir? Why? \n\nYou, sir, obviously do ...",1,0,1,1,1,0
4,b18f26cfa1408b52e949,"""\n\n Source \n\nIncase I forget, or someone e...",0,0,0,0,0,0


In [219]:
test_df.head()

Unnamed: 0,id,text
0,e0ae9d9474a5689a5791,in an interview before his execution
1,b64a191301cad4f11287,He knew what he was doing. The below posts are...
2,5e1953d9ae04bdc66408,Zzzzzzz... youre a real bore. Now go bore some...
3,23128f98196c8e8f7b90,"""\n\nYet, it remains confusion because the 910..."
4,2d3f1254f71472bf2b78,I was referring to them losing interest in van...


# BERT

In [220]:
# Device
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [221]:
class BertInputItem(object):
    
    def __init__(self, text, input_ids, input_mask, segment_ids, label_id):
        self.text = text
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

In [222]:
def formatInput(text, label, max_seq_length, tokenizer, verbose=0):
    
    bert_inputs = []
    input_pair = zip(text, label)
    
    for(index, (text, label)) in tqdm(enumerate(input_pair)):
        
        input_ids = tokenizer.encode(f"[CLS] {text} [SEP]")
        if len(input_ids) > max_seq_length:
            input_ids = input_ids[:max_seq_length]
            
        segment_ids = [0] * len(input_ids)
        input_mask = [1] * len(input_ids)
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding
        
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        
        bert_inputs.append(BertInputItem(text=text,
                                        input_ids = input_ids,
                                        input_mask = input_mask,
                                        segment_ids = segment_ids,
                                        label_id = label))
    
    return bert_inputs

In [223]:
def evaluate(model, dataloader):
    model.eval()
    
    eval_loss = 0
    nb_eval_steps = 0
    predicted_labels, correct_labels = [], []

    for step, batch in enumerate(tqdm(dataloader, desc="Evaluation iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        with torch.no_grad():
            tmp_eval_loss, logits = model(input_ids, attention_mask=input_mask,
                                          token_type_ids=segment_ids, labels=label_ids)

        outputs = np.argmax(logits.to('cpu'), axis=1)
        label_ids = label_ids.to('cpu').numpy()
        
        predicted_labels += list(outputs)
        correct_labels += list(label_ids)
        
        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    
    correct_labels = np.array(correct_labels)
    predicted_labels = np.array(predicted_labels)
        
    return eval_loss, correct_labels, predicted_labels

In [224]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

def get_data_loader(features, max_seq_length, batch_size, shuffle=True): 

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    dataloader = DataLoader(data, shuffle=shuffle, batch_size=batch_size)
    return dataloader

In [225]:
from transformers import BertForSequenceClassification
from transformers import BertTokenizer
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

from tqdm import trange
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

import time


MAX_SEQ_LENGTH = 100
BATCH_SIZE = 16
GRADIENT_ACCUMULATION_STEPS = 1
NUM_TRAIN_EPOCHS = 20
LEARNING_RATE = 5e-5
WARMUP_PROPORTION = 0.1
MAX_GRAD_NORM = 5
OUTPUT_DIR = "/tmp/"
MODEL_FILE_NAME = "bert_model.pth"
PATIENCE = 2
START_TIME = time.time()

def train(column, BERT_MODEL = 'bert-base-uncased'):
    
    model = BertForSequenceClassification.from_pretrained(BERT_MODEL, num_labels = 2)
    model.to(device)
    
    print(f'1. Model Loaded. Time = {time.time()-START_TIME}')
    
    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)
    
    print(f'2. Tokenizer Loaded, Time = {time.time()-START_TIME}')
    
    X_main = main_df['text']
    y_main = main_df[column]
    
    X_train, X_cv, y_train, y_cv = train_test_split(X_main, y_main, test_size=0.2, random_state=42)
    
    train_features = formatInput(X_train, y_train, MAX_SEQ_LENGTH, tokenizer, verbose=0)
    cv_features = formatInput(X_cv, y_cv, MAX_SEQ_LENGTH, tokenizer, verbose=0)
    
    print(f'3. Data Formatted, Time = {time.time()-START_TIME}')
    
    train_dataloader = get_data_loader(train_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=True)
    cv_dataloader = get_data_loader(cv_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=False)
    
    print(f'4. Data Loaded, Time = {time.time()-START_TIME}')
    
    num_train_steps = int(len(train_dataloader.dataset) / BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS * NUM_TRAIN_EPOCHS)
    num_warmup_steps = int(WARMUP_PROPORTION * num_train_steps)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps = -1)    
    
    print(f"Training Starts for {NUM_TRAIN_EPOCHS} epochs")
    
    loss_history = []
    no_improvement = 0
    for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Training iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch

            outputs = model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids, labels=label_ids)
            loss = outputs[0]

            if GRADIENT_ACCUMULATION_STEPS > 1:
                loss = loss / GRADIENT_ACCUMULATION_STEPS

            loss.backward()
            tr_loss += loss.item()

            if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)  

                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()

        dev_loss, _, _ = evaluate(model, dev_dataloader)

        print("Loss history:", loss_history)
        print("Dev loss:", dev_loss)

        if len(loss_history) == 0 or dev_loss < min(loss_history):
            no_improvement = 0
            model_to_save = model.module if hasattr(model, 'module') else model
            output_model_file = os.path.join(OUTPUT_DIR, MODEL_FILE_NAME)
            torch.save(model_to_save.state_dict(), output_model_file)
        else:
            no_improvement += 1

        if no_improvement >= PATIENCE: 
            print("No improvement on development set. Finish training.")
            break


        loss_history.append(dev_loss)


In [226]:
main_df.head()

Unnamed: 0,id,text,harsh,extremely_harsh,vulgar,threatening,disrespect,targeted_hate
0,a8be7c5d4527adbbf15f,""", 6 December 2007 (UTC)\nI am interested, not...",0,0,0,0,0,0
1,0b7ca73f388222aad64d,I added about three missing parameters to temp...,0,0,0,0,0,0
2,db934381501872ba6f38,SANDBOX?? \n\nI DID YOUR MADRE DID IN THE SANDBOX,1,0,0,0,0,0
3,228015c4a87c4b1f09a7,"why good sir? Why? \n\nYou, sir, obviously do ...",1,0,1,1,1,0
4,b18f26cfa1408b52e949,"""\n\n Source \n\nIncase I forget, or someone e...",0,0,0,0,0,0


In [227]:
# train(column = 'harsh')

In [None]:
model = BertForSequenceClassification.from_pretrained(BERT_MODEL, num_labels = 2)
model.to(device)

print(f'1. Model Loaded. Time = {time.time()-START_TIME}')

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

print(f'2. Tokenizer Loaded, Time = {time.time()-START_TIME}')

X_main = main_df['text']
y_main = main_df['harsh']

X_train, X_cv, y_train, y_cv = train_test_split(X_main, y_main, test_size=0.2, random_state=42)

train_features = formatInput(X_train, y_train, MAX_SEQ_LENGTH, tokenizer, verbose=0)
cv_features = formatInput(X_cv, y_cv, MAX_SEQ_LENGTH, tokenizer, verbose=0)

print(f'3. Data Formatted, Time = {time.time()-START_TIME}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


1. Model Loaded. Time = 1.573256015777588
2. Tokenizer Loaded, Time = 1.7353920936584473


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for(index, (text, label)) in tqdm(enumerate(input_pair)):


0it [00:00, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (803 > 512). Running this sequence through the model will result in indexing errors


In [None]:
train_dataloader = get_data_loader(train_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=True)
cv_dataloader = get_data_loader(cv_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=False)

print(f'4. Data Loaded, Time = {time.time()-START_TIME}')

num_train_steps = int(len(train_dataloader.dataset) / BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(WARMUP_PROPORTION * num_train_steps)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps = -1)    

print(f"Training Starts for {NUM_TRAIN_EPOCHS} epochs")

loss_history = []
no_improvement = 0

In [None]:
for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        outputs = model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids, labels=label_ids)
        loss = outputs[0]

        if GRADIENT_ACCUMULATION_STEPS > 1:
            loss = loss / GRADIENT_ACCUMULATION_STEPS

        loss.backward()
        tr_loss += loss.item()

        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)  

            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

    cv_loss, _, _ = evaluate(model, cv_dataloader)

    print("Loss history:", loss_history)
    print("Cv loss:", cv_loss)

    if len(loss_history) == 0 or cv_loss < min(loss_history):
        no_improvement = 0
        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = os.path.join(OUTPUT_DIR, MODEL_FILE_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
    else:
        no_improvement += 1

    if no_improvement >= PATIENCE: 
        print("No improvement on cv set. Finish training.")
        break


    loss_history.append(cv_loss)

In [None]:
for i in trange(100,desc="COVAXIN",unit="KAGGLE"):
    sleep(0.01)