<a href="https://colab.research.google.com/github/SaumilShah-7/Airbus-Ship-Detection-Challenge-Kaggle/blob/master/Toxic_Comment_Classification_(LSTM%2BGRU).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import os
import pickle
import gc
from tqdm.notebook import tqdm_notebook as tqdm

import tensorflow as tf
print(tf.__version__)

from tensorflow.keras.preprocessing import text, sequence

In [None]:
# Extract the competition archives into the current working directory
# (-q: quiet, -o: overwrite existing files without prompting, so re-running is safe).
!unzip -q -o '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
!unzip -q -o '../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'
!unzip -q -o '../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip'

In [None]:
# Read the extracted CSVs and report (rows, columns) for each, one per line.
train, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')

print(train.shape, test.shape, sep='\n')

In [None]:
import regex as re
!pip install Unidecode
from unidecode import unidecode

# Matches every character that is neither an ASCII letter nor an apostrophe.
words_only = re.compile(r'[^A-Za-z\']')


def clean_text(x):
    """Return `x` transliterated by unidecode, with each character matched by
    `words_only` replaced by one space (runs of spaces are not collapsed)."""
    return words_only.sub(' ', unidecode(x))

# Fill missing comments *before* cleaning: clean_text() hands its argument
# straight to unidecode, so a NaN (float) comment would not be valid input.
# 'something' is the same placeholder token the embedding fallback uses later.
# clean_text is passed directly; wrapping it in a lambda added nothing.
train['clean_text'] = train['comment_text'].fillna('something').apply(clean_text)
test['clean_text'] = test['comment_text'].fillna('something').apply(clean_text)

In [None]:
# Spot-check row 1: raw comment first, cleaned version second.
print(train.loc[1, 'comment_text'])
print(train.loc[1, 'clean_text'])

In [None]:
# Series.fillna returns a new Series; the result has to be assigned back,
# otherwise the fill is silently discarded.
train['clean_text'] = train['clean_text'].fillna('something')
print(train[train.clean_text=='something'])
test['clean_text'] = test['clean_text'].fillna('something')
print(test[test.clean_text=='something'])

In [None]:
# Vocabulary cap: passed to the Tokenizer as num_words and used as the upper
# bound on the number of rows in the embedding matrix.
max_features = 250000

In [None]:
# Fit a single vocabulary over the cleaned train and test comments together.
t = text.Tokenizer(num_words=max_features)
t.fit_on_texts(train['clean_text'].tolist() + test['clean_text'].tolist())

# Number of distinct tokens seen (not limited by num_words).
print(len(t.word_index))

In [None]:
word_index = t.word_index
# Show only the first few entries; echoing the entire vocabulary mapping
# floods the cell output and bloats the saved notebook.
dict(list(word_index.items())[:20])

In [None]:
# Convert each cleaned comment into its list of token ids (train first, then test).
X_train, X_test = (t.texts_to_sequences(frame['clean_text']) for frame in (train, test))

print(X_train[0])

In [None]:
# Token count per training comment; the summary below informs the choice of maxlen.
# (The name `l` is kept because a later cell deletes it explicitly.)
l = [len(tokens) for tokens in X_train]
print(
    'Min: %d, Mean: %d, Q3: %d, Max: %d'
    % (min(l), sum(l) / len(l), np.percentile(l, 75), max(l))
)

In [None]:
# Label columns = everything after the first two columns, minus the derived
# 'clean_text' column. Excluding it by name (instead of assuming it is the
# last column) keeps this correct if other derived columns are added later.
toxicity_columns = [col for col in train.columns[2:] if col != 'clean_text']
print(toxicity_columns)

In [None]:
# Every sequence is padded/truncated to exactly `maxlen` token ids.
maxlen = 900
x_train, x_test = (
    sequence.pad_sequences(token_ids, maxlen=maxlen) for token_ids in (X_train, X_test)
)

# Multi-label target matrix: one column per toxicity label.
y_train = train[toxicity_columns].values

print(x_train.shape, y_train.shape)
print(x_test.shape)
print(toxicity_columns)

In [None]:
# Persist the padded inputs and labels so they can be reloaded after the
# in-memory copies are deleted.
np.save('x_train.npy', x_train)
np.save('x_test.npy', x_test)
np.save('y_train.npy', y_train)

# The vocabulary is needed again when the embedding matrix is built.
with open('word_index.pickle', 'wb') as handle:
  pickle.dump(word_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Drop the large intermediates (the arrays and vocabulary were written to disk
# above) and force a collection before the embedding files are loaded.
del X_train, X_test, x_train, x_test, y_train, t, word_index, l

gc.collect()

In [None]:
# Pre-trained embedding files read when building the embedding matrix.
ft_path = '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'
# NOTE(review): the dataset directory name says "100d" but the file is the 200d
# vectors; the 300 + 200 = 500 column layout used downstream matches the 200d file.
gl_path = '../input/glovetwitter27b100dtxt/glove.twitter.27B.200d.txt'

In [None]:
def get_coefs(word, *arr):
    """Split one parsed embedding line into (token, vector).

    `word` is returned unchanged; the remaining positional values are
    converted to a 1-D float32 NumPy array.
    """
    vector = np.array(arr, dtype=np.float32)
    return word, vector

In [None]:
# import gensim
# model = gensim.models.KeyedVectors.load_word2vec_format(ft_path)

# words = model.index2word

# w_rank = {}
# for i,word in enumerate(words):
#     w_rank[word] = i

# WORDS = w_rank

# del model, words, w_rank
# gc.collect()

In [None]:
# def words(text): return re.findall(r'\w+', text.lower())

# def P(word): 
#     "Probability of `word`."
#     # use inverse of rank as proxy
#     # returns 0 if the word isn't in the dictionary
#     return - WORDS.get(word, 0)

# def correction(word): 
#     "Most probable spelling correction for word."
#     return max(candidates(word), key=P)

# def candidates(word): 
#     "Generate possible spelling corrections for word."
#     return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

# def known(words): 
#     "The subset of `words` that appear in the dictionary of WORDS."
#     return set(w for w in words if w in WORDS)

# def edits1(word):
#     "All edits that are one edit away from `word`."
#     letters    = 'abcdefghijklmnopqrstuvwxyz'
#     splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
#     deletes    = [L + R[1:]               for L, R in splits if R]
#     transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
#     replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
#     inserts    = [L + c + R               for L, R in splits for c in letters]
#     return set(deletes + transposes + replaces + inserts)

# def edits2(word): 
#     "All edits that are two edits away from `word`."
#     return (e2 for e1 in edits1(word) for e2 in edits1(e1))

In [None]:
# Reload the vocabulary written earlier (the in-memory copy was deleted).
with open('word_index.pickle', 'rb') as handle:
    word_index = pickle.load(handle)

# Keras Tokenizer ids start at 1 (0 is the padding id), so a matrix indexable by
# every kept word needs len(word_index) + 1 rows. Without the + 1 the highest id
# had no row whenever the vocabulary was smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)
# Columns 0-299 hold the fastText vector, columns 300-499 the GloVe vector.
embed_size = 500
# Strips everything except ASCII letters (apostrophes included) when looking up a word.
word_process = re.compile(r'[^A-Za-z]')

def getword(embeddings_keys, word):
    """Find a spelling of `word` that exists in `embeddings_keys`.

    Candidates are tried in this order and the first hit is returned:
    the word itself, lower-case, upper-case, capitalised, the word with all
    non-letters removed, and finally (for words of 2-15 characters) the output
    of the optional global spell corrector `correction`.

    Args:
        embeddings_keys: container supporting `in` (a dict or set keeps each
            lookup O(1); a list makes every lookup a linear scan).
        word: token to resolve.

    Returns:
        The matching key, or None when no candidate is present.
    """
    if word in embeddings_keys:
        return word

    for variant in (word.lower(), word.upper(), word.capitalize()):
        if variant in embeddings_keys:
            return variant

    # Compute the stripped form once instead of running the regex twice.
    stripped = word_process.sub('', word)
    if stripped in embeddings_keys:
        return stripped

    if 1 < len(word) <= 15:
        # `correction` is only defined when the optional spell-checker cell has
        # been run; look it up at call time so its absence yields "not found"
        # instead of a NameError in the middle of a long matrix build.
        spell_fix = globals().get('correction')
        if spell_fix is not None:
            candidate = spell_fix(word)
            if candidate in embeddings_keys:
                return candidate

    return None

def _load_embeddings(path):
    """Parse a whitespace-separated text embedding file into {token: float32 vector}."""
    vectors = {}
    # Context manager so the (multi-GB) file handle is closed deterministically.
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            fields = line.strip().split()
            # Skip blank lines and the "<count> <dim>" header line that fastText
            # .vec files begin with; a real entry has a token plus many values.
            if len(fields) <= 2:
                continue
            token, vector = get_coefs(*fields)
            vectors[token] = vector
    return vectors


def build_matrix(nb_words, embed_size):
    """Build the embedding matrix, resolving unknown words through `getword`.

    Row i holds the vectors of the word with index i in the global
    `word_index`: columns [:300] come from the fastText file (`ft_path`) and
    columns [300:] from the GloVe file (`gl_path`), so `embed_size` must equal
    300 plus the GloVe dimension. Words that cannot be resolved receive the
    vectors of the token "something".

    Returns:
        (matrix, corrected, words_not_found) where `corrected` is a list of
        (original, replacement) pairs and `words_not_found` a list of words.
    """
    embeddings_ft = _load_embeddings(ft_path)
    embeddings_gl = _load_embeddings(gl_path)
    # Loop-invariant fallback vectors for unresolved words.
    fallback_ft = embeddings_ft.get("something")
    fallback_gl = embeddings_gl.get("something")

    corrected = []
    words_not_found = []
    matrix = np.zeros((nb_words, embed_size))

    for word, i in tqdm(word_index.items()):
        if i >= nb_words:
            break

        # Pass the dict itself: `in` on a dict is a hash lookup, whereas the
        # previous list of keys forced a linear scan per candidate spelling.
        word2 = getword(embeddings_ft, word)
        if word2 is None:
            words_not_found.append(word)
            matrix[i, :300] = fallback_ft
            matrix[i, 300:] = fallback_gl
            continue

        matrix[i, :300] = embeddings_ft[word2]
        vec_gl = embeddings_gl.get(word2)
        if vec_gl is not None:
            matrix[i, 300:] = vec_gl
        if word2 != word:
            corrected.append((word, word2))

    return matrix, corrected, words_not_found


def build_matrix_1(nb_words, embed_size, correction_map):
    """Build the embedding matrix using a precomputed `correction_map`.

    Same layout and return value as `build_matrix`, but a word missing from
    fastText is resolved via `correction_map[word]` instead of `getword`.
    If the mapped replacement is itself missing from fastText the word is
    treated as not found (previously `None` was written into the row).
    """
    embeddings_ft = _load_embeddings(ft_path)
    embeddings_gl = _load_embeddings(gl_path)
    fallback_ft = embeddings_ft.get("something")
    fallback_gl = embeddings_gl.get("something")

    corrected = []
    words_not_found = []
    matrix = np.zeros((nb_words, embed_size))

    for word, i in tqdm(word_index.items()):
        if i >= nb_words:
            break

        # Prefer the word itself; otherwise fall back to its mapped correction.
        target = word if word in embeddings_ft else correction_map.get(word)
        vec_ft = embeddings_ft.get(target) if target is not None else None
        if vec_ft is None:
            words_not_found.append(word)
            matrix[i, :300] = fallback_ft
            matrix[i, 300:] = fallback_gl
            continue

        matrix[i, :300] = vec_ft
        vec_gl = embeddings_gl.get(target)
        if vec_gl is not None:
            matrix[i, 300:] = vec_gl
        if target != word:
            corrected.append((word, target))

    return matrix, corrected, words_not_found

In [None]:
# Precomputed {misspelled word: replacement} mapping consumed by build_matrix_1.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# this from a dataset you trust.
with open('../input/mapping/correction_map_final.pickle', 'rb') as handle:
    correction_map = pickle.load(handle)

print(len(correction_map))

In [None]:
# Two ways to build the matrix: build_matrix resolves unknown words through
# getword, build_matrix_1 uses the precomputed correction_map loaded above.
# embedding_matrix, corrected, words_not_found = build_matrix(nb_words, embed_size)
embedding_matrix, corrected, words_not_found = build_matrix_1(nb_words, embed_size, correction_map)

print(embedding_matrix.shape)

In [None]:
print(len(corrected))
# Print a sample only; the full list can be very long and floods the output.
print(corrected[:20])

In [None]:
print(len(words_not_found))
# Print a sample only; the full list can be very long and floods the output.
print(words_not_found[:20])

In [None]:
# Persist the matrix, then release it together with the bookkeeping lists.
np.save('embedding_matrix.npy', embedding_matrix)

del embedding_matrix, words_not_found, corrected
gc.collect()

In [None]:
# Reload the artifacts saved by the preprocessing and embedding cells.
x_train = np.load('x_train.npy')
x_test = np.load('x_test.npy')
y_train = np.load('y_train.npy')
embedding_matrix = np.load('embedding_matrix.npy')

In [None]:
import torch
from torch import nn
from torch.nn import functional as F

LSTM_UNITS = 40
# forward() concatenates three pooled views of the bidirectional GRU output
# (mean, last step, max), each 2 * LSTM_UNITS wide.
DENSE_HIDDEN_UNITS = 6 * LSTM_UNITS

class NeuralNet(nn.Module):
  """Frozen pre-trained embedding -> BiLSTM -> BiGRU -> pooled linear head.

  Args:
    embedding_matrix: 2-D array-like of shape (vocabulary rows, embedding dim).
    output_dim: number of output logits.

  forward() returns raw logits; no sigmoid is applied inside the model.
  """

  def __init__(self, embedding_matrix, output_dim):
    super(NeuralNet, self).__init__()
    weights = torch.tensor(embedding_matrix, dtype=torch.float32)
    embed_size = weights.shape[1]

    # Build the layer directly from the pre-trained weights. This sizes it from
    # the matrix itself (no dependency on the global max_features) and avoids
    # allocating a random table that was immediately thrown away.
    self.embedding = nn.Embedding.from_pretrained(weights, freeze=True)
    self.embedding_dropout = nn.Dropout2d(0.5)

    self.lstm = nn.LSTM(embed_size, LSTM_UNITS, bidirectional=True, batch_first=True)
    self.gru = nn.GRU(LSTM_UNITS * 2, LSTM_UNITS, bidirectional=True, batch_first=True)

    self.linear_out = nn.Linear(DENSE_HIDDEN_UNITS, output_dim)

  def forward(self, x):
    h_embedding = self.embedding(x)  # (N, T, K)

    # Dropout2d zeroes whole channels, so move the embedding dimension into the
    # channel position: entire embedding features are dropped across all time
    # steps rather than individual activations.
    embeddings = h_embedding.unsqueeze(2).permute(0, 3, 2, 1)  # (N, K, 1, T)
    embeddings = self.embedding_dropout(embeddings)
    h_embedding = embeddings.permute(0, 3, 2, 1).squeeze(2)  # back to (N, T, K)

    h_lstm, _ = self.lstm(h_embedding)
    h_gru, _ = self.gru(h_lstm)
    h_gru_last = h_gru[:, -1, :]

    avg_pool = torch.mean(h_gru, 1)
    max_pool, _ = torch.max(h_gru, 1)

    hidden = torch.cat((avg_pool, h_gru_last, max_pool), 1)

    # The sigmoid is applied by the loss (BCEWithLogitsLoss), not here.
    return self.linear_out(hidden)

In [None]:
# Print the layer summary of a throw-away instance; nothing is kept around.
print(NeuralNet(embedding_matrix, y_train.shape[-1]))

In [None]:
# from sklearn.model_selection import StratifiedKFold, train_test_split
# from sklearn.metrics import roc_auc_score
# import copy
# import time

# def sigmoid(x):
#     return 1 / (1 + np.exp(-x))

# def train_model(model_obj, x_train, y_train, x_test, output_dim, loss_fn, seed, lr=0.001, batch_size=32, 
#                 n_epochs=7):
    
#     x_tra, x_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=seed)
    
#     train = torch.utils.data.TensorDataset(torch.tensor(x_tra, dtype=torch.long).cuda(), torch.tensor(y_tra, dtype=torch.float32).cuda())
#     valid = torch.utils.data.TensorDataset(torch.tensor(x_val, dtype=torch.long).cuda(), torch.tensor(y_val, dtype=torch.float32).cuda())
      
#     train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=False)
#     valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)
    
#     test_loader = torch.utils.data.DataLoader(torch.tensor(x_test, dtype=torch.long).cuda(), batch_size=batch_size, shuffle=False)
    
#     best_score = 0.
#     wait_count = 0
#     test_preds = np.zeros((len(x_test), output_dim))
    
#     model = copy.deepcopy(model_obj)
#     model.cuda()
    
#     optimizer = torch.optim.Adam(model.parameters(), lr=lr)
#     scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)

#     for epoch in range(n_epochs):
#         start_time = time.time()
        
#         model.train()
#         avg_loss = 0.
        
#         for x_batch, y_batch in tqdm(train_loader):
#             y_pred = model(x_batch)
#             loss = loss_fn(y_pred, y_batch)
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()
#             avg_loss += loss.item() / len(train_loader)
            
#         model.eval()
#         valid_preds = np.zeros((len(y_val), output_dim))
#         avg_val_loss = 0.
        
#         for i, (x_batch, y_batch) in tqdm(enumerate(valid_loader), total=len(valid_loader), leave=False):
#             y_pred = model(x_batch).detach()
#             avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
#             valid_preds[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())
            
#         roc_auc_val = roc_auc_score(y_val, valid_preds)
        
#         if roc_auc_val > best_score:
#             print('Score improved from {:.4f} to {:.4f}'.format(best_score, roc_auc_val))
#             best_score = roc_auc_val
#             wait_count = 0
#             for i, (x_batch) in tqdm(enumerate(test_loader), total=len(test_loader), leave=False):
#                 y_pred = sigmoid(model(x_batch).detach().cpu().numpy())
#                 test_preds[i * batch_size:(i+1) * batch_size] = y_pred
#         else:
#             wait_count += 1
#             if wait_count > 3:
#                 print('Early stopping with score {:.4f}'.format(best_score))
#                 break
        
#         scheduler.step()
#         elapsed_time = time.time() - start_time
#         print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t ROC-AUC Val Score={:.4f} \t time={:.2f}s'.format(
#           epoch + 1, n_epochs, avg_loss, avg_val_loss, roc_auc_val, elapsed_time))
          
#     del model, optimizer, scheduler, loss
#     torch.cuda.empty_cache()
    
#     return test_preds

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import copy
import time

def sigmoid(x):
    """Element-wise logistic function 1 / (1 + e^-x) for scalars or NumPy arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def train_model(model_obj, x_train, y_train, x_test, output_dim, loss_fn, seed, lr=0.001, batch_size=32, 
                n_epochs=7, n_splits=10):
    """Train with stratified K-fold CV and return fold-averaged test probabilities.

    For every fold a fresh deep copy of `model_obj` is trained with Adam and a
    decaying LambdaLR schedule. The weights of the epoch with the best
    validation ROC-AUC are checkpointed to 'best_model.pt', reloaded, and used
    to predict `x_test`. Training of a fold stops early once the score has
    failed to improve more than 3 times in a row.

    Args:
        model_obj: template model; it is deep-copied, never trained in place.
        x_train, y_train: NumPy arrays of inputs (integer ids) and labels.
        x_test: NumPy array of test inputs.
        output_dim: number of model outputs / label columns.
        loss_fn: loss taking (logits, targets).
        seed: random_state for the fold split.
        lr, batch_size, n_epochs, n_splits: usual training hyper-parameters.

    Returns:
        NumPy array of shape (len(x_test), output_dim) holding the mean of the
        per-fold sigmoid outputs.
    """
    # Use the GPU when present, otherwise fall back to the CPU instead of
    # failing on a hard-coded .cuda().
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Inference keeps no gradients, so validation/test can use bigger batches.
    batch_size_1 = 4 * batch_size
    x_test_torch = torch.tensor(x_test, dtype=torch.long).to(device)
    test_loader = torch.utils.data.DataLoader(x_test_torch, batch_size=batch_size_1, shuffle=False)

    test_preds = np.zeros((len(x_test), output_dim))
    all_roc_auc = []
    # Stratify on "has at least one positive label" since the target is multi-label.
    splits = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed).split(x_train, y_train.sum(axis=1) > 0))

    # Distinct names for fold and batch counters: the inner loops previously
    # reused `i` and clobbered the fold index.
    for fold_idx, (train_idx, valid_idx) in enumerate(splits):
        x_train_fold = torch.tensor(x_train[train_idx], dtype=torch.long).to(device)
        y_train_fold = torch.tensor(y_train[train_idx], dtype=torch.float32).to(device)
        x_val_fold = torch.tensor(x_train[valid_idx], dtype=torch.long).to(device)
        y_val_fold = torch.tensor(y_train[valid_idx], dtype=torch.float32).to(device)
        y_val_np = y_train[valid_idx]  # host-side labels for the ROC-AUC computation

        model = copy.deepcopy(model_obj).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.7 ** (epoch/3))

        train = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
        valid = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)
        train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size_1, shuffle=False)

        best_score = 0.
        wait_count = 0
        checkpoint_saved = False  # guards against loading a previous fold's checkpoint
        loss = None  # keeps the cleanup `del` valid even if no batch was trained
        test_preds_fold = np.zeros((len(x_test), output_dim))

        print('Fold: ', fold_idx+1)
        for epoch in range(n_epochs):
            start_time = time.time()

            model.train()
            avg_loss = 0.

            for x_batch, y_batch in tqdm(train_loader):
                y_pred = model(x_batch)
                loss = loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                avg_loss += loss.item() / len(train_loader)

            model.eval()
            valid_preds_fold = np.zeros((x_val_fold.size(0), output_dim))
            avg_val_loss = 0.

            # no_grad: do not build autograd graphs during evaluation.
            with torch.no_grad():
                for batch_idx, (x_batch, y_batch) in tqdm(enumerate(valid_loader), total=len(valid_loader), leave=False):
                    y_pred = model(x_batch)
                    avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
                    valid_preds_fold[batch_idx * batch_size_1:(batch_idx+1) * batch_size_1] = sigmoid(y_pred.cpu().numpy())

            roc_auc_val = roc_auc_score(y_val_np, valid_preds_fold)

            if roc_auc_val > best_score:
                print('Score improved from {:.4f} to {:.4f}'.format(best_score, roc_auc_val))
                best_score = roc_auc_val
                wait_count = 0
                torch.save(model.state_dict(), 'best_model.pt')
                checkpoint_saved = True
            else:
                wait_count += 1
                if wait_count > 3:
                    print('Early stopping with score {:.4f}'.format(best_score))
                    break

            scheduler.step()
            elapsed_time = time.time() - start_time
            print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t ROC-AUC Val Score={:.4f} \t time={:.2f}s'.format(
                epoch + 1, n_epochs, avg_loss, avg_val_loss, roc_auc_val, elapsed_time))

        # Restore this fold's best epoch (only if this fold actually saved one).
        if checkpoint_saved:
            model.load_state_dict(torch.load('best_model.pt'))
        all_roc_auc.append(best_score)

        model.eval()  # explicit: dropout must be off for test predictions
        with torch.no_grad():
            for batch_idx, x_batch in tqdm(enumerate(test_loader), total=len(test_loader), leave=False):
                y_pred = sigmoid(model(x_batch).cpu().numpy())
                test_preds_fold[batch_idx * batch_size_1:(batch_idx+1) * batch_size_1] = y_pred

        test_preds += test_preds_fold / len(splits)

        # Release this fold's model and device tensors before the next fold
        # allocates its own copies.
        del model, optimizer, scheduler, loss
        del train, valid, train_loader, valid_loader
        del x_train_fold, y_train_fold, x_val_fold, y_val_fold
        torch.cuda.empty_cache()
        print('Latest ROC-AUC Stack: ', all_roc_auc)

    print('All folds done. Average ROC_AUC={:.4f}'.format(np.average(all_roc_auc)))
    return test_preds

In [None]:
def seed_everything(seed=1234):
  """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN behaviour."""
  import random  # local import: `random` is not imported at notebook level

  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  # Explicitly seed every CUDA device as well; silently ignored without CUDA.
  torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic = True
  # Autotuning may choose different kernels from run to run.
  torch.backends.cudnn.benchmark = False

NUM_MODELS = 1

all_test_preds = []

# Train NUM_MODELS models, each with its own seed, and collect their
# fold-averaged test predictions for the final blend.
for model_idx in range(NUM_MODELS):

    print('Model ', model_idx+1)

    # Seeds are spaced 7 apart, starting at 1241 for the first model.
    SEED = 1234 + 7 * (model_idx + 1)
    seed_everything(SEED)

    model = NeuralNet(embedding_matrix, y_train.shape[-1])
    test_preds = train_model(
        model,
        x_train,
        y_train,
        x_test,
        output_dim=y_train.shape[-1],
        loss_fn=nn.BCEWithLogitsLoss(reduction='mean'),
        seed=SEED,
    )

    all_test_preds.append(test_preds)

In [None]:
# Submission = test ids next to the per-label probabilities averaged over all models.
submid = test[['id']].copy()
submission = pd.concat(
    [submid, pd.DataFrame(np.mean(all_test_preds, axis=0), columns=toxicity_columns)],
    axis=1,
)
submission.to_csv('submission.csv', index=False)