In [333]:
import torch
import torch.nn as nn
import pickle
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
import torch.nn.functional as F
from tqdm import tqdm_notebook, trange
import os
from pytorch_transformers import BertConfig
# from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule

from torch.utils.data import Dataset, DataLoader
import re
import math

from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
import spacy
import nltk
import string
from collections import defaultdict
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import WordNetLemmatizer

In [2]:
from sklearn.model_selection import train_test_split

## BERT - fine tuning

In [3]:
model = BertModel.from_pretrained('bert-base-uncased',
           output_hidden_states = True, return_dict=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def get_embedding(text, model=model, tokenizer=tokenizer):
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model(inputs['input_ids'])
    hidden_states = outputs['hidden_states']
    return outputs['pooler_output']

In [5]:
# get_embedding('fair')

In [6]:
data = pd.read_csv('data/train_clean.csv', index_col = 'Unnamed: 0')

In [7]:
data=data.dropna()

In [12]:
data.head()

Unnamed: 0,comment,target
0,explanation why the edits made under username ...,0
1,aww matches this background colour seemingly s...,0
2,hey man really not trying edit war just that t...,0
3,more can make any real suggestions improvement...,0
4,you sir are hero any chance you remember what ...,0


In [8]:
## down sample majority group
data['target'].value_counts()

0    143332
1     16220
Name: target, dtype: int64

In [9]:
data_maj = data[data['target'] == 0].sample(16220)
data_mio = data[data['target'] == 1]

In [10]:
data_out = pd.concat([data_maj, data_mio], ignore_index=True)

In [11]:
data_out['target'].value_counts()

0    16220
1    16220
Name: target, dtype: int64

In [14]:
def tokenize(text):
    """
    Tokenize text and return a non-unique list of tokenized words
    found in the text. Normalize to lowercase, strip punctuation,
    remove stop words, drop words of length < 3, strip digits.
    """
    text = text.lower()
    text = re.sub('[' + string.punctuation + '0-9\\r\\t\\n]', ' ', text)
    tokens = nltk.word_tokenize(text)
    tokens = [w for w in tokens if (len(w) > 3) and (w not in ENGLISH_STOP_WORDS)]  # ignore a, an, to, at, be, ...
    return tokens

def normalizewords(words):
    """
    Given a list of tokens/words, return a new list of normalize words
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    normal = []
    for word, tag in nltk.pos_tag(words):
        wtag = tag[0].lower()
        wtag = wtag if wtag in ['a', 'r', 'n', 'v'] else None
        lemma = lemmatizer.lemmatize(word, wtag) if wtag else word
        normal.append(lemma)
    return ' '.join(normal)

def pre_proess_text(x):
    X = []
    for i in range(len(x)):
        X.append(normalizewords(tokenize(x[i])))
    return X

In [334]:
comment = pre_proess_text(data_out['comment'].values)

In [337]:
data_out['clean'] = comment

In [342]:
data_out['length'] = data_out['clean'].apply(lambda x: len(x))

In [344]:
data_out.shape

(32440, 4)

In [350]:
data_out = data_out[data_out['length'] > 5]

In [351]:
X_train, X_valid, y_train, y_valid = train_test_split(data_out['comment'].values, data_out['target'].values,
                                                      test_size=0.2, random_state=1)

In [355]:
# X_train

In [356]:
X = [t.split() for t in X_train]
X_test = [t.split() for t in X_valid]

In [357]:
def get_vocab(content):
    """Computes Dict of counts of words.
    
    Computes the number of times a word is on a document.
    """
    vocab = defaultdict(float)
    for line in content:
        words = set(line)
        for word in words:
            vocab[word] += 1
    return vocab  

In [358]:
word_count = get_vocab(X)

In [359]:
len(word_count)

55244

In [361]:
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, hidden_dropout_prob=0.25,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

In [418]:
def text2tks(text, max_seq_length=50, padding_start = False):
    tok_text = tokenizer.tokenize(text)
    if len(tok_text) > max_seq_length:
        tok_text = tok_text[:max_seq_length]
    ids_text  = tokenizer.convert_tokens_to_ids(tok_text)
    padding = [0] * (max_seq_length - len(ids_text))
    if padding_start:
        out = padding + ids_text
    else:
        out = ids_text + padding
    return np.array(out)

In [419]:
text2tks(X_train[600], padding_start=False)

array([ 6100,  2098,  4183,  5227,  7592,  2551,  1996,  6904,  2278,
        6798,  8670, 27488,  1998,  2070, 15814,  2031,  4081,  2008,
        1996, 12388,  3791,  2147,  2017,  2031,  2051,  2202,  2298,
        1996,  3720,  4283,  5083,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0])

In [420]:
class toxicDataset(Dataset):
    def __init__(self, X, y):
        self.x = X
        self.y = y
        
    def __getitem__(self, index):
        x = self.x[index]
        x = text2tks(x, padding_start=False)
        return x, self.y[index]
    
    def __len__(self):
        return len(self.y)

In [421]:
train_ds = toxicDataset(X_train, y_train)
valid_ds = toxicDataset(X_valid, y_valid)

In [422]:
batch_size = 10
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [424]:
# next(iter(train_dl))

In [425]:
class BertForSequenceClassification(nn.Module):
    def __init__(self):
        super(BertForSequenceClassification, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased',
                                               output_hidden_states = True, return_dict=True)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)
        nn.init.xavier_normal_(self.classifier.weight)
        
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        outputs = self.bert(input_ids, token_type_ids, attention_mask)
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits
    
    def freeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = False
    
    def unfreeze_bert_encoder(self):
        for param in self.bert.parameters():
            param.requires_grad = True

In [426]:
def train_model(model, loss_func, optimizer, num_epochs=25):
    for epoch in range(num_epochs):
        model.train()
        sum_loss = 0
        total = 0
        weights = [0.1]
        class_weight = torch.FloatTensor(weights).cuda()
        for x, y in train_dl:
            x = x.cuda()
            y = y.unsqueeze(1).float().cuda()
            optimizer.zero_grad()
            logits = model(x)
#             print(logits.shape, y.shape)
            loss = loss_func(logits, y)
            loss.backward()
#             print(loss.grad, logits.grad)
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        epoch_loss = sum_loss/total
        val_loss, accuracy = eval_model(model, loss_func)
        print('train loss: {:.3f}, valid loss {:.3f} accuracy {:.3f}'.format(epoch_loss, val_loss, accuracy))

In [427]:
def eval_model(model, loss_func):
    model.eval()
    sum_loss = 0
    total = 0
    correct = 0
    weights = [0.1, 0.9]
    class_weight = torch.FloatTensor(weights).cuda()
    for x, y in valid_dl:
        x = x.cuda()
        y = y.unsqueeze(1).float().cuda()
        y_hat = model(x)
#         print(y_hat)
        loss = F.binary_cross_entropy_with_logits(y_hat, y) 
        y_pred = y_hat > 0
        correct += (y_pred.float() == y).float().sum()
        sum_loss += loss.item()*y.shape[0]
        total += y.shape[0]
    accuracy = correct/total
    epoch_loss = sum_loss/total
    return epoch_loss, accuracy

In [429]:
torch.cuda.set_device(3)

In [439]:
model = BertForSequenceClassification().cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [432]:
# criterion = nn.BCEWithLogitsLoss()
# weights = [0.9]
# class_weight = torch.FloatTensor(weights).cuda()
loss_func = nn.BCEWithLogitsLoss()

In [431]:
# !nvidia-smi

In [440]:
lrlast = .0001
lrmain = .000001
optimizer = torch.optim.Adam(
    [
        {"params":model.bert.parameters(),"lr": lrmain},
        {"params":model.classifier.parameters(), "lr": lrlast},
       
   ])

In [441]:
train_model(model, loss_func, optimizer, num_epochs=8)

train loss: 0.422, valid loss 0.303 accuracy 0.870
train loss: 0.265, valid loss 0.238 accuracy 0.902
train loss: 0.224, valid loss 0.227 accuracy 0.908
train loss: 0.199, valid loss 0.209 accuracy 0.915
train loss: 0.185, valid loss 0.211 accuracy 0.916
train loss: 0.168, valid loss 0.211 accuracy 0.920
train loss: 0.153, valid loss 0.207 accuracy 0.919
train loss: 0.140, valid loss 0.218 accuracy 0.918


In [442]:
lrlast = .00001
lrmain = .000001
optimizer = torch.optim.Adam(
    [
        {"params":model.bert.parameters(),"lr": lrmain},
        {"params":model.classifier.parameters(), "lr": lrlast},
       
   ])

In [443]:
train_model(model, loss_func, optimizer, num_epochs=3)

train loss: 0.126, valid loss 0.227 accuracy 0.916
train loss: 0.113, valid loss 0.238 accuracy 0.916
train loss: 0.101, valid loss 0.264 accuracy 0.910


## BERT -Use as per-train Embedding and applied on CBOW

In [362]:
def delete_rare_words(word_count, min_df=4):
    """ Deletes rare words from word_count
    
    Deletes words from word_count if they are not in word_vecs
    and don't have at least min_df occurrencies in word_count.
    """
    words_delete = []
    for word in word_count:
        if word_count[word] < min_df:
            words_delete.append(word)
    for word in words_delete: 
        word_count.pop(word)
    return word_count

In [363]:
def create_embedding_matrix(get_emb, word_count, emb_size=768):
    """Creates embedding matrix from word vectors. """
    word_count = delete_rare_words(word_count, min_df=5)
    V = len(word_count.keys()) + 2
    vocab2index = {}
    W = torch.zeros((V, emb_size))
    # adding a vector for padding
    W[0] = torch.zeros(emb_size)
    # adding a vector for rare words 
    W[1] = torch.from_numpy(np.random.uniform(-0.25,0.25, emb_size))
    vocab2index["UNK"] = 1
    i = 2
    for word in word_count:
        if word in word_count.keys():
            word_vec = get_emb(word)
            vocab2index[word] = i
            W[i] = word_vec
            i += 1
        else:
            W[i] = torch.from_numpy(np.random.uniform(-0.25,0.25, emb_size))
            vocab2index[word] = i
            i += 1
    return W, vocab2index

In [58]:
# word_count

In [364]:
pretrained_weight, vocab2index = create_embedding_matrix(get_embedding, word_count)

In [366]:
# vocab2index

In [49]:
def encode_sentence(s, N=80):
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([vocab2index.get(w, vocab2index["UNK"]) for w in s])
    l = min(N, len(enc1))
    enc[:l] = enc1[:l]
    return enc, l

In [50]:
class toxicDataset2(Dataset):
    def __init__(self, X, y):
        self.x = X
        self.y = y
        
    def __getitem__(self, index):
        x = self.x[index]
        x, s = encode_sentence(x, 60)
        return x, self.y[index], s
    
    def __len__(self):
        return len(self.y)

In [367]:
train_ds2 = toxicDataset2(X, y_train)
valid_ds2 = toxicDataset2(X_test, y_valid)

In [368]:
batch_size = 500
train_dl2 = DataLoader(train_ds2, batch_size=batch_size, shuffle=True)
valid_dl2 = DataLoader(valid_ds2, batch_size=batch_size)

In [214]:
# for i in iter(train_dl2):
#     aa = i

In [408]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, emb_size=768, glove_weights=None):
        super(CBOW, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        if glove_weights is not None:
            self.embedding.weight.data.copy_(glove_weights)
            self.embedding.weight.requires_grad = False ## freeze embeddings
        self.bn = nn.BatchNorm1d(768)
        self.linear = nn.Linear(emb_size, 1)
        
        
    def forward(self, x, s):
        x = self.embedding(x)
        
#         print('emb')
#         x = self.bn(x)
#         print('norm')
#         print(x)
        x = torch.nansum(x, dim=1)/ s
        x = self.bn(x)
        x = self.linear(x)
#         print(x.shape)
#         print(x)
        return x

In [397]:
def train_epocs2(model, optimizer, epochs=10):
    for i in range(epochs):
        model.train()
        total_loss = 0
        total = 0
        count = 0
        for x, y, s in train_dl2:
            count += 1
            x = x.long()
            y = y.float().unsqueeze(1)
            s = s.float().view(s.shape[0], 1)
            out = model(x, s)
#             print(x)
#             print(count)
#             print(out)
            loss = F.binary_cross_entropy_with_logits(out, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += x.size(0)*loss.item()
            total += x.size(0)
#             print(count)
            if count > 2:
#                 print(out) 
                break
        train_loss = total_loss/total
        val_loss, val_accuracy = valid_metrics2(model)
        
        print("train_loss %.3f val_loss %.3f val_accuracy %.3f" % (
            train_loss, val_loss, val_accuracy))

In [378]:
def valid_metrics2(model):
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    for x, y, s in valid_dl2:
        x = x.long()  #.cuda()
        y = y.float().unsqueeze(1)
        s = s.float().view(s.shape[0], 1)
        batch = y.shape[0]
        out = model(x, s)
        loss = F.binary_cross_entropy_with_logits(out, y)
        sum_loss += batch*(loss.item())
        total += batch
        pred = (out > 0).float()
        correct += (pred == y).float().sum().item()
    val_loss = sum_loss/total
    val_acc = correct/total
    return val_loss, val_acc

In [409]:
model = CBOW(len(pretrained_weight), emb_size=768, glove_weights=pretrained_weight)

In [406]:
# for i in model.parameters():
#     print(i)
#     print(i.shape)

In [410]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
train_epocs2(model, optimizer, epochs=10)

train_loss 0.916 val_loss 0.684 val_accuracy 0.524
train_loss 0.693 val_loss 0.673 val_accuracy 0.621
train_loss 0.600 val_loss 0.721 val_accuracy 0.494
train_loss 0.633 val_loss 0.670 val_accuracy 0.499
train_loss 0.501 val_loss 0.647 val_accuracy 0.666
train_loss 0.506 val_loss 0.642 val_accuracy 0.626
train_loss 0.503 val_loss 0.629 val_accuracy 0.668
train_loss 0.480 val_loss 0.618 val_accuracy 0.673
train_loss 0.439 val_loss 0.596 val_accuracy 0.735
train_loss 0.439 val_loss 0.589 val_accuracy 0.702


In [411]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
train_epocs2(model, optimizer, epochs=10)

train_loss 0.447 val_loss 0.563 val_accuracy 0.761
train_loss 0.445 val_loss 0.550 val_accuracy 0.773
train_loss 0.422 val_loss 0.541 val_accuracy 0.763
train_loss 0.451 val_loss 0.526 val_accuracy 0.771
train_loss 0.411 val_loss 0.509 val_accuracy 0.790
train_loss 0.413 val_loss 0.495 val_accuracy 0.800
train_loss 0.441 val_loss 0.482 val_accuracy 0.800
train_loss 0.417 val_loss 0.472 val_accuracy 0.798
train_loss 0.445 val_loss 0.463 val_accuracy 0.802
train_loss 0.406 val_loss 0.457 val_accuracy 0.802


In [412]:
model.embedding.weight.requires_grad = True
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
train_epocs2(model, optimizer, epochs=10)

train_loss 0.409 val_loss 0.425 val_accuracy 0.819
train_loss 0.391 val_loss 0.397 val_accuracy 0.834
train_loss 0.367 val_loss 0.384 val_accuracy 0.836
train_loss 0.349 val_loss 0.366 val_accuracy 0.846
train_loss 0.346 val_loss 0.348 val_accuracy 0.854
train_loss 0.325 val_loss 0.339 val_accuracy 0.860
train_loss 0.314 val_loss 0.331 val_accuracy 0.862
train_loss 0.297 val_loss 0.322 val_accuracy 0.867
train_loss 0.300 val_loss 0.318 val_accuracy 0.869
train_loss 0.314 val_loss 0.316 val_accuracy 0.869
