<a href="https://colab.research.google.com/github/SaumilShah-7/Unintended-Bias-in-Toxicity-Classification-Kaggle/blob/master/BiLSTM%20Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.nn import functional as F
import time
from tqdm.notebook import tqdm_notebook as tqdm
import pickle
from tensorflow.keras.preprocessing import text, sequence
import regex as re
import copy
from sklearn.metrics import roc_auc_score
import gc

# Vocabulary cap: passed to the Keras tokenizer as `num_words` and used as an
# upper bound on the number of rows in the embedding matrix.
max_features = 100_000

In [None]:
# Read the competition's train and test CSVs from the Kaggle input directory.
train = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')

# Show (rows, columns) of each frame, one per line.
print(train.shape, test.shape, sep='\n')

In [None]:
# !pip install Unidecode
from unidecode import unidecode

# Matches every character that is neither an ASCII letter nor an apostrophe.
words_only = re.compile(r'[^A-Za-z\']')
def clean_text(x):
    """Transliterate `x` to ASCII, then replace each non-letter, non-apostrophe character with a space."""
    return words_only.sub(' ', unidecode(x))

# Store the normalised comment next to the raw text in both frames.
train['clean_text'] = train['comment_text'].apply(clean_text)
test['clean_text'] = test['comment_text'].apply(clean_text)

In [None]:
# Compare the first raw comment with its cleaned version, one per line.
print(train['comment_text'][0], train['clean_text'][0], sep='\n')

In [None]:
# Fit a single tokenizer on the cleaned train and test comments together so both
# splits share one vocabulary.
t = text.Tokenizer(num_words=max_features)
t.fit_on_texts([*train['clean_text'], *test['clean_text']])

# Number of distinct tokens seen (not limited by `num_words`).
print(len(t.word_index))

In [None]:
# Token -> integer id mapping learned by the tokenizer.
word_index = t.word_index
# Display only a small preview: echoing the whole mapping would dump every
# vocabulary entry into the cell output and bloat the saved notebook.
dict(list(word_index.items())[:20])

In [None]:
# Turn each cleaned comment into its list of vocabulary ids (train first, then test).
X_train, X_test = (t.texts_to_sequences(frame['clean_text']) for frame in (train, test))

# Token-count statistics of the training sequences, used to choose a padding length.
l = [len(seq) for seq in X_train]
print('Min: %d, Mean: %d, Q3: %d, Max: %d' % (min(l), sum(l)/len(l), np.percentile(l, 75), max(l)))

In [None]:
# Fixed sequence length fed to the network.
MAX_LEN = 220

# Pad/truncate every id sequence to MAX_LEN (Keras default padding/truncation modes).
x_train, x_test = (sequence.pad_sequences(seqs, maxlen=MAX_LEN) for seqs in (X_train, X_test))

# Binary label: 1 when the annotated toxicity score is at least 0.5, else 0.
y_train = (train['target'].values >= 0.5).astype(int)

# Auxiliary regression-style targets (missing values treated as 0).
aux_columns = [
    'target', 'severe_toxicity', 'obscene', 'identity_attack',
    'insult', 'threat', 'sexual_explicit',
]
y_aux_train = train[aux_columns].fillna(0)

print(x_train.shape, x_test.shape, y_train.shape, y_aux_train.shape)

In [None]:
identity_columns = ['male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish', 'muslim', 'black', 'white', 'psychiatric_or_mental_illness']

# Per-sample loss weight, built from four terms of 1/4 each.
# `np.int` was removed in NumPy 1.24 (it was only an alias of the built-in `int`),
# so the built-in is used directly; the resulting values are unchanged.

# Overall: every sample starts at 1/4.
weights = np.ones((len(train),)) / 4

# Subgroup: +1/4 when at least one identity column is >= 0.5.
weights += (train[identity_columns].fillna(0).values >= 0.5).sum(axis=1).astype(bool).astype(int) / 4

# Background Positive, Subgroup Negative: +1/4 when target >= 0.5 and at least
# one identity column is < 0.5.
# NOTE(review): as written this is "any identity column below 0.5", not "no
# identity mentioned" -- confirm that this is the intended definition.
weights += (((train['target'].values >= 0.5).astype(int) + (train[identity_columns].fillna(0).values < 0.5).sum(axis=1).astype(bool).astype(int)) > 1).astype(bool).astype(int) / 4

# Background Negative, Subgroup Positive: +1/4 when target < 0.5 and at least
# one identity column is >= 0.5.
weights += (((train['target'].values < 0.5).astype(int) + (train[identity_columns].fillna(0).values >= 0.5).sum(axis=1).astype(bool).astype(int)) > 1).astype(bool).astype(int) / 4

In [None]:
# Persist the preprocessing outputs to the working directory so they can be
# reloaded after the in-memory copies are deleted.
np.save('x_train.npy', x_train)
np.save('x_test.npy', x_test)
np.save('y_train.npy', y_train)
np.save('y_aux_train.npy', y_aux_train)
np.save('weights.npy', weights)

# The token -> id mapping is a plain dict, so it is pickled rather than np.save'd.
with open('word_index.pickle', 'wb') as handle:
    pickle.dump(word_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Release the preprocessing objects (all were saved to disk above where needed).
# `test` is deliberately kept alive.
del (
    train, X_train, X_test, x_train, x_test,
    y_train, y_aux_train, t, word_index, weights,
)

gc.collect()

In [None]:
# Locations of the two pretrained 300-d word-vector text files.
ft_path = "../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec"  # fastText vectors
gl_path = "../input/glove840b300dtxt/glove.840B.300d.txt"      # GloVe vectors

In [None]:
# import gensim
# model = gensim.models.KeyedVectors.load_word2vec_format(ft_path)

# words = model.index2word

# w_rank = {}
# for i,word in enumerate(words):
#     w_rank[word] = i

# WORDS = w_rank

# del model, words, w_rank
# gc.collect()

In [None]:
# def words(text): return re.findall(r'\w+', text.lower())

# def P(word): 
#     "Probability of `word`."
#     # use inverse of rank as proxy
#     # returns 0 if the word isn't in the dictionary
#     return - WORDS.get(word, 0)

# def correction(word): 
#     "Most probable spelling correction for word."
#     return max(candidates(word), key=P)

# def candidates(word): 
#     "Generate possible spelling corrections for word."
#     return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

# def known(words): 
#     "The subset of `words` that appear in the dictionary of WORDS."
#     return set(w for w in words if w in WORDS)

# def edits1(word):
#     "All edits that are one edit away from `word`."
#     letters    = 'abcdefghijklmnopqrstuvwxyz'
#     splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
#     deletes    = [L + R[1:]               for L, R in splits if R]
#     transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
#     replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
#     inserts    = [L + c + R               for L, R in splits for c in letters]
#     return set(deletes + transposes + replaces + inserts)

# def edits2(word): 
#     "All edits that are two edits away from `word`."
#     return (e2 for e1 in edits1(word) for e2 in edits1(e1))

In [None]:
def get_coefs(word, *arr):
    """Pair a token with its vector.

    `word` is returned unchanged; the remaining positional arguments (numbers or
    numeric strings) are packed into a 1-D float32 array.
    """
    vector = np.array(arr, dtype=np.float32)
    return word, vector

In [None]:
# with open('word_index.pickle', 'rb') as handle:
#     word_index = pickle.load(handle)

# nb_words = min(max_features, len(word_index))
# word_process = re.compile(r'[^A-Za-z]')

# def getword(embeddings_keys, word):
#     if word in embeddings_keys:
#         return word
#     elif word.lower() in embeddings_keys:
#         return word.lower()
#     elif word.upper() in embeddings_keys:
#         return word.upper()
#     elif word.capitalize() in embeddings_keys:
#         return word.capitalize()
#     elif word_process.sub('', word) in embeddings_keys:
#         return word_process.sub('', word)
#     elif len(word)>1 and len(word)<=15:
#         x = correction(word)
#         if x in embeddings_keys:
#             return x

#     return None

# def build_matrix(path, nb_words, embed_size):
#     embeddings = dict(get_coefs(*o.strip().split(' ')) for o in open(path))
#     embeddings_keys = list(embeddings.keys())
#     corrected = []
#     words_not_found = []
#     matrix = np.zeros((nb_words, embed_size))
    
#     for word, i in tqdm(word_index.items()):
#         if i >= nb_words:
#             break
#         else:
#             word2 = getword(embeddings_keys, word)
#             if word2 is not None:
#                 matrix[i] = embeddings.get(word2)
#                 if word2 != word:
#                     corrected.append((word, word2))
#             else:
#                 words_not_found.append(word)
#     return matrix, corrected, words_not_found

In [None]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file.
# Only load the mapping files below if the '../input/mappings' dataset is trusted.
with open('word_index.pickle', 'rb') as handle:
    word_index = pickle.load(handle)

# Precomputed token -> corrected-token maps, one per embedding source.
with open('../input/mappings/ft_map.pickle', 'rb') as handle:
    ft_map = pickle.load(handle)

with open('../input/mappings/gl_map.pickle', 'rb') as handle:
    gl_map = pickle.load(handle)

print(len(ft_map), len(gl_map))

# Number of rows in the embedding matrix. Keras Tokenizer ids start at 1 (id 0 is
# reserved), so a vocabulary of V tokens needs V + 1 rows. Without the "+ 1" the
# highest id would fall outside the matrix whenever the vocabulary is smaller
# than `max_features`. When the vocabulary exceeds the cap the value is unchanged.
nb_words = min(max_features, len(word_index) + 1)

def build_matrix_1(path, nb_words, embed_size, correction_map):
    """Build an embedding matrix for the tokens in the global `word_index`.

    Args:
        path: Text file with one `token v1 v2 ...` entry per line, separated by
            single spaces.
        nb_words: Number of rows of the matrix; tokens whose id is >= nb_words
            are skipped.
        embed_size: Number of columns (vector dimensionality).
        correction_map: dict mapping a token to a replacement token to look up
            when the token itself has no vector.

    Returns:
        (matrix, corrected, words_not_found):
        matrix is a float array of shape (nb_words, embed_size), with zero rows
        for tokens without a vector; corrected is a list of
        (token, replacement) pairs that were resolved through `correction_map`;
        words_not_found lists the tokens that could not be resolved at all.
    """
    # Context manager so the vector file is closed as soon as it has been parsed.
    with open(path, encoding='utf-8') as vec_file:
        embeddings = dict(get_coefs(*line.strip().split(' ')) for line in vec_file)

    corrected = []
    words_not_found = []
    matrix = np.zeros((nb_words, embed_size))

    for word, i in tqdm(word_index.items()):
        if i >= nb_words:
            # Stops at the first out-of-range id; this relies on `word_index`
            # being iterated in ascending id order.
            break

        vector = embeddings.get(word)
        if vector is None:
            replacement = correction_map.get(word)
            if replacement is not None:
                vector = embeddings.get(replacement)
                # Record the correction only when the replacement really has a
                # vector; otherwise `None` would be assigned into the matrix.
                if vector is not None:
                    corrected.append((word, replacement))

        if vector is None:
            words_not_found.append(word)
        else:
            matrix[i] = vector
    return matrix, corrected, words_not_found

In [None]:
# Build one 300-column matrix per embedding source, together with the lists of
# corrected and unresolved tokens for each.
embedding_ft, corrected_ft, words_not_found_ft = build_matrix_1(ft_path, nb_words, 300, ft_map)
embedding_gl, corrected_gl, words_not_found_gl = build_matrix_1(gl_path, nb_words, 300, gl_map)

# Place the two sources side by side: each token row holds both vectors.
embedding_matrix = np.hstack([embedding_ft, embedding_gl])

print(embedding_matrix.shape)

In [None]:
# fastText coverage: counts first, then the corrected pairs and the unresolved tokens.
print(len(corrected_ft), len(words_not_found_ft))

print(corrected_ft, words_not_found_ft, sep='\n')

In [None]:
# GloVe coverage: counts first, then the corrected pairs and the unresolved tokens.
print(len(corrected_gl), len(words_not_found_gl))

print(corrected_gl, words_not_found_gl, sep='\n')

In [None]:
# Persist the combined matrix, then drop it and all embedding-building intermediates.
np.save('embedding_matrix.npy', embedding_matrix)

del (
    embedding_matrix,
    words_not_found_ft, words_not_found_gl,
    corrected_ft, corrected_gl,
    embedding_ft, embedding_gl,
)
gc.collect()

In [None]:
# Reload every saved array from the working directory, in the order listed.
(x_train, x_test, y_train,
 y_aux_train, weights, embedding_matrix) = map(np.load, (
    'x_train.npy', 'x_test.npy', 'y_train.npy',
    'y_aux_train.npy', 'weights.npy', 'embedding_matrix.npy',
))

In [None]:
NUM_MODELS = 1
LSTM_UNITS = 128
# Width of the pooled feature vector: max-pool and mean-pool over a bidirectional
# LSTM output, i.e. 2 * (2 * LSTM_UNITS).
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
    
class NeuralNet(nn.Module):
  """Two stacked bidirectional LSTMs over frozen pretrained embeddings.

  The LSTM output is max- and mean-pooled over time, passed through two
  residual dense branches, and projected to 1 main logit plus
  `num_aux_targets` auxiliary logits.

  Args:
    embedding_matrix: 2-D array of shape (vocab_size, embed_size) holding the
      pretrained vectors; row i is the vector of token id i.
    num_aux_targets: Number of auxiliary output logits.
  """

  def __init__(self, embedding_matrix, num_aux_targets):
    super().__init__()
    pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
    embed_size = pretrained.shape[1]

    # Build the table directly from the pretrained weights: the table size then
    # always matches the matrix (no dependency on a global vocabulary cap) and
    # no throwaway randomly-initialised table is allocated. freeze=True keeps
    # the vectors fixed during training.
    self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=True)
    self.embedding_dropout = nn.Dropout2d(0.3)

    self.lstm1 = nn.LSTM(embed_size, LSTM_UNITS, bidirectional=True, batch_first=True)
    self.lstm2 = nn.LSTM(LSTM_UNITS * 2, LSTM_UNITS, bidirectional=True, batch_first=True)

    self.linear1 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
    self.linear2 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)

    self.linear_out = nn.Linear(DENSE_HIDDEN_UNITS, 1)
    self.linear_aux_out = nn.Linear(DENSE_HIDDEN_UNITS, num_aux_targets)
        
  def forward(self, x):
    """Map token ids of shape (N, T) to logits of shape (N, 1 + num_aux_targets)."""
    h_embedding = self.embedding(x)  # (N, T, K)

    # Spatial dropout: move the embedding dimension into the channel position so
    # Dropout2d zeroes whole embedding features across all time steps.
    embeddings = h_embedding.unsqueeze(2)    # (N, T, 1, K)
    embeddings = embeddings.permute(0, 3, 2, 1)  # (N, K, 1, T)
    embeddings = self.embedding_dropout(embeddings)  # (N, K, 1, T), some features are masked
    embeddings = embeddings.permute(0, 3, 2, 1)  # (N, T, 1, K)
    h_embedding = embeddings.squeeze(2)  # (N, T, K)

    h_lstm1, _ = self.lstm1(h_embedding)
    h_lstm2, _ = self.lstm2(h_lstm1)

    # Pool over the time axis.
    avg_pool = torch.mean(h_lstm2, 1)
    max_pool, _ = torch.max(h_lstm2, 1)

    h_conc = torch.cat((max_pool, avg_pool), 1)
    h_conc_linear1  = F.relu(self.linear1(h_conc))
    h_conc_linear2  = F.relu(self.linear2(h_conc))

    # Residual sum of the pooled features and both dense branches.
    hidden = h_conc + h_conc_linear1 + h_conc_linear2

    # Raw logits are returned; the sigmoid is applied inside BCEWithLogitsLoss.
    result = self.linear_out(hidden)
    aux_result = self.linear_aux_out(hidden)
    out = torch.cat([result, aux_result], 1)

    return out

In [None]:
def custom_loss(data, targets):
    """Sum of binary cross-entropy losses over the main and auxiliary heads.

    Args:
        data: Logits of shape (N, H). Column 0 is the main head, columns
            1..H-1 are auxiliary heads.
        targets: Tensor of shape (N, H + 1) laid out as
            [main label, sample weight, aux target 1, ..., aux target H-1].

    Returns:
        Scalar tensor: the sample-weighted mean BCE of the main head plus the
        unweighted mean BCE of every auxiliary head.
    """
    # Main head: per-sample weights come from target column 1.
    total = nn.BCEWithLogitsLoss(weight=targets[:, 1:2])(data[:, :1], targets[:, :1])

    # Auxiliary heads: output column k is scored against target column k + 1
    # (the weight column shifts the targets by one). Looping over the actual
    # number of output columns removes the hard-coded head count while adding
    # the terms in the same order as before.
    for k in range(1, data.shape[1]):
        total = total + nn.BCEWithLogitsLoss()(data[:, k:k + 1], targets[:, k + 1:k + 2])

    return total

In [None]:
# Instantiate a throwaway model just to print its architecture.
# The number of auxiliary heads is the number of aux target *columns*;
# len(y_aux_train) would be the number of training rows and would create an
# enormous output layer.
model_d = NeuralNet(embedding_matrix, y_aux_train.shape[-1])
print(model_d)
del model_d
gc.collect()

In [None]:
from sklearn.model_selection import StratifiedKFold

def sigmoid(x):
    """Element-wise logistic function 1 / (1 + e^-x) for a scalar or NumPy array."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def train_model(model_obj, x_train, y_train, x_test, seed, lr=0.001, batch_size=512, n_epochs=3, n_splits=5):
  """Train a copy of `model_obj` on each stratified fold; return OOF and test predictions.

  Args:
    model_obj: Template module. It is deep-copied for every fold and is never
      trained itself.
    x_train: Array of training inputs; converted to a long tensor per fold.
    y_train: 2-D target array whose rows are handed to `custom_loss`; column 0
      is also the stratification label.
    x_test: Array of test inputs.
    seed: `random_state` of the shuffled StratifiedKFold split.
    lr: Initial Adam learning rate; scaled by 0.6 ** epoch.
    batch_size: Mini-batch size of all data loaders.
    n_epochs: Number of epochs per fold.
    n_splits: Number of folds.

  Returns:
    (train_preds, test_preds). `train_preds` has shape (len(x_train),) and
    holds, for every sample, the sigmoid of output column 0 predicted while the
    sample was in the validation fold (after the last epoch). `test_preds` has
    shape (len(x_test), y_train.shape[-1] - 1) and is the average over folds of
    the sigmoid outputs on the test set.
  """
  # Use the GPU when one is available; otherwise run (slowly) on the CPU
  # instead of failing on `.cuda()`.
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  output_dim = y_train.shape[-1]-1
  x_test_torch = torch.tensor(x_test, dtype=torch.long).to(device)
  test_loader = torch.utils.data.DataLoader(x_test_torch, batch_size=batch_size, shuffle=False)

  avg_losses = []
  avg_val_losses = []

  train_preds = np.zeros((len(x_train)))
  test_preds = np.zeros((len(x_test), output_dim))

  splits = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed).split(x_train, y_train[:, 0]))
  
  for fold, (train_idx, valid_idx) in enumerate(splits):
    x_train_fold = torch.tensor(x_train[train_idx], dtype=torch.long).to(device)
    y_train_fold = torch.tensor(y_train[train_idx], dtype=torch.float32).to(device)
    x_val_fold = torch.tensor(x_train[valid_idx], dtype=torch.long).to(device)
    y_val_fold = torch.tensor(y_train[valid_idx], dtype=torch.float32).to(device)
      
    # Every fold starts from the same untrained template.
    model = copy.deepcopy(model_obj)
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)

    train_set = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
    valid_set = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)
      
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=batch_size, shuffle=False)

    # Defined up front so the code after the epoch loop is valid even when
    # n_epochs == 0 or a loader yields no batch.
    loss = None
    avg_loss = 0.
    avg_val_loss = 0.
    valid_preds_fold = np.zeros((x_val_fold.size(0)))

    print('Fold: ', fold)
    for epoch in range(n_epochs):
      start_time = time.time()

      model.train()
      avg_loss = 0.

      for x_batch, y_batch in tqdm(train_loader):
        y_pred = model(x_batch)
        loss = custom_loss(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        avg_loss += loss.item() / len(train_loader)
        
      model.eval()
      valid_preds_fold = np.zeros((x_val_fold.size(0)))
      avg_val_loss = 0.

      # no_grad: inference needs no autograd graph, which saves memory and time.
      with torch.no_grad():
        for batch_idx, (x_batch, y_batch) in enumerate(valid_loader):
          y_pred = model(x_batch)
          avg_val_loss += custom_loss(y_pred, y_batch).item() / len(valid_loader)
          valid_preds_fold[batch_idx * batch_size:(batch_idx+1) * batch_size] = sigmoid(y_pred.cpu().numpy()[:, 0])
  
      scheduler.step()
      
      elapsed_time = time.time() - start_time 
      print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t time={:.2f}s'.format(
          epoch + 1, n_epochs, avg_loss, avg_val_loss, elapsed_time))
      
    # Keep the metrics and out-of-fold predictions of the last epoch.
    avg_losses.append(avg_loss)
    avg_val_losses.append(avg_val_loss)
    train_preds[valid_idx] = valid_preds_fold

    # Predict the test set with the fully trained fold model.
    test_preds_fold = np.zeros((len(x_test), output_dim))
    model.eval()
    with torch.no_grad():
      for batch_idx, x_batch in enumerate(test_loader):
        y_pred = sigmoid(model(x_batch).cpu().numpy())
        test_preds_fold[batch_idx * batch_size:(batch_idx+1) * batch_size] = y_pred
          
    test_preds += test_preds_fold / len(splits)
    del model, optimizer, scheduler, loss
    if device.type == 'cuda':
      torch.cuda.empty_cache()

  print('All \t loss={:.4f} \t val_loss={:.4f}'.format(np.average(avg_losses),np.average(avg_val_losses)))
  return train_preds, test_preds

In [None]:
def seed_everything(seed=1234):
    """Seed the NumPy and PyTorch global RNGs and request deterministic cuDNN kernels."""
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(seed)
    np.random.seed(seed)

# Pack everything the loss needs into one target array, column-wise:
# [binary label | sample weight | auxiliary targets].
y_train_combined = np.column_stack([y_train, weights, y_aux_train])
print(y_train_combined.shape)

all_train_preds, all_test_preds = [], []

# Train NUM_MODELS independently seeded models and collect their predictions.
for model_idx in range(NUM_MODELS):
    print('Model ', model_idx)

    SEED = 1234 + model_idx
    seed_everything(SEED)
    model = NeuralNet(embedding_matrix, y_aux_train.shape[-1])

    train_preds, test_preds = train_model(model, x_train, y_train_combined, x_test, seed=SEED)

    all_train_preds.append(train_preds)
    all_test_preds.append(test_preds)

In [None]:
# Average the test predictions over all trained models and keep output column 0
# as the submitted score.
submission = pd.DataFrame({
    'id': test['id'],
    'prediction': np.stack(all_test_preds).mean(axis=0)[:, 0],
})
submission.to_csv('submission.csv', index=False)