<a href="https://colab.research.google.com/github/SaumilShah-7/Unintended-Bias-in-Toxicity-Classification-Kaggle/blob/master/BERT%20Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Remove whatever torch build the runtime ships with, so that the pinned
# version installed in the next cell is the only one present.
!pip uninstall torch -y

In [None]:
# Pin torch to 1.1.0.
# NOTE(review): presumably chosen for compatibility with the apex snapshot built
# further down — confirm before bumping the version.
!pip install torch==1.1.0

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
import sys
import shutil
from tqdm.notebook import tqdm
import gc
import pickle

In [None]:
# Build NVIDIA apex from the local source snapshot, with its C++ and CUDA
# extensions enabled via the two --global-option flags.
! pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ../input/nvidiaapex/repository/NVIDIA-apex-39e153a

# amp is used further down through amp.initialize(...) and amp.scale_loss(...).
from apex import amp

In [None]:
# Put the bundled pytorch-pretrained-BERT sources first on sys.path so the
# import below resolves to that copy.
bert_package_dir = "../input/ppbert/pytorch-pretrained-bert/pytorch-pretrained-BERT"
sys.path.insert(0, bert_package_dir)

from pytorch_pretrained_bert import (
    convert_tf_checkpoint_to_pytorch,
    BertConfig,
    BertTokenizer,
    BertForSequenceClassification,
    BertAdam,
)

In [None]:
# Directory holding the TensorFlow BERT release (checkpoint + config).
bert_model_path = '../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'
tf_checkpoint_file = bert_model_path + 'bert_model.ckpt'
bert_config_file = bert_model_path + 'bert_config.json'

# Convert the TF checkpoint into 'pytorch_model.bin' in the working directory.
convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_file,
    bert_config_file,
    'pytorch_model.bin',
)

# Keep a copy of the config next to the converted weights so that "./" can be
# passed to from_pretrained(...) later on.
shutil.copyfile(bert_config_file, 'bert_config.json')
bert_config = BertConfig(bert_config_file)

In [None]:
def convert_lines(example, max_seq_length, tokenizer):
    """Turn texts into fixed-length rows of token ids.

    Every text is tokenized, cut down to at most ``max_seq_length - 2`` tokens,
    wrapped in ``[CLS]`` / ``[SEP]`` and right-padded with id ``0`` so that each
    row holds exactly ``max_seq_length`` ids.

    Args:
        example: iterable of texts to encode.
        max_seq_length: row length, including the two special tokens. Must be
            at least 2.
        tokenizer: object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.

    Returns:
        ``np.ndarray`` built from the list of id rows, one row per text.

    Raises:
        ValueError: if ``max_seq_length`` is smaller than 2. Without this check
            the token budget goes negative and the rows come out with
            inconsistent lengths.

    Side effects:
        Shows a tqdm progress bar and prints how many texts were truncated.
    """
    if max_seq_length < 2:
        raise ValueError(
            "max_seq_length must be >= 2 to leave room for [CLS] and [SEP], got %r" % (max_seq_length,)
        )

    # Room left for real tokens once [CLS] and [SEP] are accounted for.
    token_budget = max_seq_length - 2
    all_tokens = []
    longer = 0  # number of texts that had to be truncated
    for text in tqdm(example):
        tokens = tokenizer.tokenize(text)
        if len(tokens) > token_budget:
            tokens = tokens[:token_budget]
            longer += 1
        padding = [0] * (token_budget - len(tokens))
        one_token = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"]) + padding
        all_tokens.append(one_token)
    print(longer)
    return np.array(all_tokens)

In [None]:
# Seed shared by the row sampling below and the RNG seeding later on.
SEED = 1234
# Number of training rows to draw from train.csv.
num_to_load = 700000

train_csv_path = '../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv'
train_df = pd.read_csv(train_csv_path).sample(n=num_to_load, random_state=SEED)
print(train_df.shape)

In [None]:
# Length of every encoded row, including the [CLS] and [SEP] tokens.
max_seq_length = 220

# Fill missing comments BEFORE casting to str: astype(str) turns NaN into the
# literal string 'nan', after which fillna has nothing left to fill.
train_df['comment_text'] = train_df['comment_text'].fillna('DUMMY_VALUE').astype(str)
tokenizer = BertTokenizer.from_pretrained(bert_model_path, cache_dir=None, do_lower_case=True)

sequences = convert_lines(train_df["comment_text"], max_seq_length, tokenizer)

# Cache the encoded rows so later runs can skip tokenization.
with open('sequences.pickle', 'wb') as handle:
    pickle.dump(sequences, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Alternative: reuse a previously cached encoding instead of re-tokenizing.
# Only unpickle files you produced yourself — pickle.load can execute arbitrary code.
# with open('../input/bert-unintended-bias/sequences.pickle', 'rb') as handle:
#     sequences = pickle.load(handle)

In [None]:
identity_columns = [
    'male',
    'female',
    'homosexual_gay_or_lesbian',
    'christian',
    'jewish',
    'muslim',
    'black',
    'white',
    'psychiatric_or_mental_illness',
]
y_columns = ['target']

# Missing values anywhere in the frame become 0.
train_df = train_df.fillna(0)
# Binarize the label columns: values of 0.5 and above map to 1.0, the rest to 0.0.
train_df[y_columns] = train_df[y_columns].ge(0.5).astype(float)

In [None]:
# Cap the encoded rows and the labels at the same row count so they stay aligned.
X, y = sequences[:num_to_load], train_df[y_columns].values[:num_to_load]

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Training hyper-parameters.
epochs = 1
accumulation_steps = 2  # batches whose gradients are accumulated per optimizer step
batch_size = 32
lr = 2e-5

# Seed numpy and torch (CPU + current CUDA device) and ask cuDNN for
# deterministic kernels.
torch.backends.cudnn.deterministic = True
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

In [None]:
# Pair the token-id rows (int64) with their labels (float32) and serve them in
# shuffled mini-batches.
train = torch.utils.data.TensorDataset(
    torch.tensor(X, dtype=torch.long),
    torch.tensor(y, dtype=torch.float),
)
train_loader = torch.utils.data.DataLoader(dataset=train, batch_size=batch_size, shuffle=True)

In [None]:
# Load the converted checkpoint from the working directory, with one output
# per label column.
model = BertForSequenceClassification.from_pretrained("./", cache_dir=None, num_labels=len(y_columns))
model.zero_grad()
model = model.to(device)

# Parameters whose name contains one of the no_decay substrings get no weight
# decay; every other parameter is decayed at 0.01.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {
        'params': [weight for key, weight in param_optimizer if all(tag not in key for tag in no_decay)],
        'weight_decay': 0.01,
    },
    {
        'params': [weight for key, weight in param_optimizer if any(tag in key for tag in no_decay)],
        'weight_decay': 0.0,
    },
]

# Number of optimizer steps over the whole run, passed to BertAdam as t_total.
num_train_optimization_steps = int(epochs * len(train) / batch_size / accumulation_steps)
optimizer = BertAdam(
    optimizer_grouped_parameters,
    lr=lr,
    warmup=0.05,
    t_total=num_train_optimization_steps,
)

# amp wraps both the model and the optimizer; done after the optimizer exists.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
model = model.train()

In [None]:
# checkpoint = torch.load('./bert_pytorch_checkpoint.bin')

# model = BertForSequenceClassification(bert_config, num_labels=len(y_columns))
# model.load_state_dict(checkpoint['model_state_dict'])
# model.zero_grad()
# model = model.to(device)

# param_optimizer = list(model.named_parameters())
# no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

# num_train_optimization_steps = int(epochs*len(train)/batch_size/accumulation_steps)
# optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=0.05, t_total=num_train_optimization_steps)
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
# model = model.eval()

# del checkpoint
# torch.cuda.empty_cache()
# gc.collect()
# print(round(torch.cuda.memory_allocated(device)/(1024**3),1), round(torch.cuda.memory_cached(device)/(1024**3),1))

In [None]:
# tq1 = tqdm(enumerate(train_loader), total=len(train_loader))

# for i, (x_batch, y_batch) in tq1:
#     if i == len(train_loader) / 2:
#         x_batch_t = x_batch.to(device)
#         y_batch_t = y_batch.to(device)
#         a_mask = (x_batch>0).to(device)
#         y_pred = model(x_batch_t, attention_mask=a_mask, labels=None)
#         loss =  F.binary_cross_entropy_with_logits(y_pred, y_batch_t)
#         print(loss.item())
#     else:
#         continue
        
# del model, optimizer, y_pred, x_batch, y_batch, x_batch_t, y_batch_t, tq1, a_mask, loss
# torch.cuda.empty_cache()
# gc.collect()
# print(round(torch.cuda.memory_allocated(device)/(1024**3),1), round(torch.cuda.memory_cached(device)/(1024**3),1))

In [None]:
tq = tqdm(range(epochs))

for epoch in tq:
    avg_loss = 0.
    avg_accuracy = 0.
    lossf = None  # exponentially smoothed loss shown on the inner progress bar
    optimizer.zero_grad()
    num_batches = len(train_loader)
    tq1 = tqdm(enumerate(train_loader), total=num_batches)

    for i, (x_batch, y_batch) in tq1:
        x_batch_t = x_batch.to(device)
        y_batch_t = y_batch.to(device)
        # Padding positions carry id 0, so the attention mask is "id > 0".
        a_mask = (x_batch>0).to(device)
        y_pred = model(x_batch_t, attention_mask=a_mask, labels=None)
        loss =  F.binary_cross_entropy_with_logits(y_pred, y_batch_t)
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        # Gradients of `accumulation_steps` consecutive batches are summed (the
        # loss is not divided by accumulation_steps) before each optimizer step.
        # If the batch count is not a multiple of accumulation_steps, the
        # trailing gradients are never applied.
        if (i+1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        # Read the scalar once per batch instead of once per use.
        batch_loss = loss.item()
        # `is None` rather than truthiness: a running loss of exactly 0.0 must
        # not be mistaken for "not initialised yet".
        if lossf is None:
            lossf = batch_loss
        else:
            lossf = 0.98*lossf+0.02*batch_loss
        tq1.set_postfix(loss = lossf)
        avg_loss += batch_loss / num_batches
        avg_accuracy += torch.mean(((torch.sigmoid(y_pred[:,0])>0.5) == (y_batch[:,0]>0.5).to(device)).to(torch.float)).item()/num_batches
    tq.set_postfix(avg_loss=avg_loss, avg_accuracy=avg_accuracy)
    
# checkpoint_model_file = 'bert_pytorch_checkpoint.bin'
# torch.save({'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, checkpoint_model_file)

# Persist the fine-tuned weights.
output_model_file = "bert_pytorch.bin"
torch.save(model.state_dict(), output_model_file)

# param_optimizer and optimizer_grouped_parameters hold references to every
# model parameter, so they have to go too or `del model` frees nothing.
del model, optimizer, param_optimizer, optimizer_grouped_parameters
del x_batch, y_batch, y_pred, tq, tq1, loss, scaled_loss, a_mask, x_batch_t, y_batch_t
# Collect unreachable objects first so their CUDA blocks can be handed back
# by empty_cache().
gc.collect()
torch.cuda.empty_cache()
print(round(torch.cuda.memory_allocated(device)/(1024**3),1), round(torch.cuda.memory_cached(device)/(1024**3),1))

In [None]:
test_df = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')
print(test_df.shape)

# Fill missing comments BEFORE casting to str: astype(str) turns NaN into the
# literal string 'nan', after which fillna has nothing left to fill.
test_df['comment_text'] = test_df['comment_text'].fillna('DUMMY_VALUE').astype(str)

# Encode with the same tokenizer and row length as the training data.
X_test = convert_lines(test_df["comment_text"], max_seq_length, tokenizer)

# shuffle=False keeps batches in file order, which the positional writes into
# the prediction array rely on.
test = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.long))
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

In [None]:
model = BertForSequenceClassification(bert_config, num_labels=len(y_columns))
# NOTE: the weights come from an external dataset, not from the checkpoint
# written by the training cell; swap in the commented line to use that instead.
# model.load_state_dict(torch.load(output_model_file))
# map_location lets a checkpoint saved from GPU load on whatever `device` is.
model.load_state_dict(torch.load('../input/toxic-bert-plain-vanila/bert_pytorch.bin', map_location=device))
model.to(device)

for param in model.parameters():
    param.requires_grad=False
model.eval()

# One raw logit per test row; the sigmoid is applied in a later cell.
test_preds = np.zeros((len(X_test)))

# no_grad makes it explicit that no autograd graph is needed for inference.
with torch.no_grad():
    for i, (x_batch, ) in tqdm(enumerate(test_loader), total=len(test_loader)):
        pred = model(x_batch.to(device), attention_mask=(x_batch>0).to(device), labels=None)
        # pred[:, 0] is already 1-D; no squeeze, so a final batch holding a
        # single row keeps its shape.
        test_preds[i*batch_size:(i+1)*batch_size] = pred[:,0].detach().cpu().numpy()

In [None]:
# Map the raw logits to probabilities.
# Caution: this overwrites test_preds, so running the cell twice applies the
# sigmoid twice.
test_preds = torch.tensor(test_preds).sigmoid().numpy()
print(test_preds.shape)

In [None]:
# Two-column submission file: the test row id and its predicted probability.
submission = pd.DataFrame({'id': test_df['id'], 'prediction': test_preds})
submission.to_csv('submission.csv', index=False)