In [None]:
# Put the attached pytorch-pretrained-BERT directory on sys.path so the
# `pytorch_pretrained_bert` import in the next cell can be resolved from it.
import sys
package_dir = "../input/ppbert/pytorch-pretrained-bert/pytorch-pretrained-BERT"
sys.path.append(package_dir)

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

%reload_ext autoreload
%autoreload 2
%matplotlib inline

import fastai
from fastai.train import Learner
from fastai.train import DataBunch
from fastai.callbacks import *
from fastai.basic_data import DatasetType
import fastprogress
from fastprogress import force_console_behavior
import numpy as np
from pprint import pprint
import pandas as pd
from pathlib import Path
import os
import time
import gc
import random
import pickle
from tqdm._tqdm_notebook import tqdm_notebook as tqdm

from keras.preprocessing import text, sequence
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F

# import torch.utils.data
from tqdm import tqdm
import warnings
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam
from pytorch_pretrained_bert import BertConfig
from nltk.tokenize.treebank import TreebankWordTokenizer

from gensim.models import KeyedVectors

In [None]:
def count_regexp_occ(regexp="", text=None):
    """Count the non-overlapping matches of a regular expression in a string.

    Args:
        regexp: Pattern passed unchanged to ``re.findall``.
        text: String to search. Must be a ``str``; the ``None`` default is
            only a placeholder and makes ``re.findall`` raise ``TypeError``.

    Returns:
        int: Number of matches found.
    """
    # reference: https://www.kaggle.com/coolcoder22/lightgbm-fast-compact-solution
    # Imported here because the notebook's import cell never imports `re`;
    # without this the function only works if some star-import leaks the name.
    import re

    return len(re.findall(regexp, text))

def normal_feature_engineering(text_list):
    """Build an ``(len(text_list), 11)`` float matrix of hand-crafted text features.

    Columns:
        0  number of pieces produced by splitting on a single space
        1  number of distinct pieces
        2  URL matches
        3  e-mail address matches
        4  match count of the "all upper" pattern (see note in the loop)
        5  how many characters of ``symbols_to_isolate`` occur in the text
        6  how many characters of ``symbols_to_delete`` occur in the text
        7-10  columns 1, 4, 5 and 6 divided by column 0

    Args:
        text_list: Integer-indexable collection of strings.

    Returns:
        numpy.ndarray: The feature matrix. Its shape is also printed.
    """
    n_texts = len(text_list)
    feats = np.zeros((n_texts, 11))

    for row in range(n_texts):
        comment = text_list[row]
        pieces = comment.split(' ')

        feats[row, 0] = len(pieces)
        feats[row, 1] = len(set(pieces))
        feats[row, 2] = count_regexp_occ(r'http[s]{0,1}://\S+', comment)
        feats[row, 3] = count_regexp_occ(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', comment)
        # NOTE(review): the pattern is anchored with ^...$ and applied to the whole
        # comment, so this is at most 1 (the entire text is upper-case letters), not
        # a per-word count. Left untouched: the trained weights presumably saw the
        # feature computed exactly this way -- confirm against the training code.
        feats[row, 4] = count_regexp_occ(r'^[A-Z]+[A-Z]$', comment)

        # Presence counts: each listed symbol contributes 1 if it appears at all.
        feats[row, 5] = sum(1 for symbol in symbols_to_isolate if symbol in comment)
        feats[row, 6] = sum(1 for symbol in symbols_to_delete if symbol in comment)

    # Ratio features. Column 0 is never zero because str.split(' ') always
    # yields at least one piece, even for the empty string.
    feats[:, 7:11] = feats[:, [1, 4, 5, 6]] / feats[:, [0]]

    print('normal feature enginnering done: {}'.format(feats.shape))
    return feats

In [None]:
def convert_lines(example, max_seq_length,tokenizer):
    """Encode texts as fixed-width rows of token ids framed by [CLS] / [SEP].

    Args:
        example: Iterable of strings.
        max_seq_length: Width of every output row, including the two markers.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.

    Returns:
        numpy.ndarray: One row per text; rows shorter than ``max_seq_length``
        are right-padded with 0.
    """
    # Two positions are reserved for the [CLS] and [SEP] markers.
    budget = max_seq_length - 2
    rows = []
    for comment in tqdm(example):
        pieces = tokenizer.tokenize(comment)
        if len(pieces) > budget:
            pieces = pieces[:budget]
        ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + pieces + ["[SEP]"])
        padding = [0] * (budget - len(pieces))
        rows.append(ids + padding)
    return np.array(rows)

def is_interactive():
    """Return True when the ``SHLVL`` variable is absent from the environment."""
    return os.environ.get('SHLVL') is None

def seed_everything(seed=123):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs with ``seed``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    
def get_coefs(word, *arr):
    """Pair ``word`` with its remaining arguments packed into a float32 array."""
    vector = np.asarray(arr, dtype='float32')
    return word, vector


def load_embeddings(path):
    """Unpickle and return the object stored at ``path``.

    Callers in this notebook index the result by word, so it is presumably a
    mapping from token to vector.
    """
    # SECURITY: pickle.load executes arbitrary code from the file; only point
    # this at embedding files from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)

def build_matrix(word_index, path):
    """Fill a ``(max_features + 1, 300)`` matrix with pretrained word vectors.

    For every word whose index is at most ``max_features`` the vector is looked
    up under the word as-is, then lower-cased, title-cased and upper-cased; the
    first hit wins. Rows without any hit stay zero.

    Args:
        word_index: Mapping of word -> integer row index.
        path: Pickle file handed to ``load_embeddings``.

    Returns:
        tuple: ``(embedding_matrix, unknown_words)`` where ``unknown_words``
        lists the words for which no variant was found.
    """
    vectors = load_embeddings(path)
    matrix = np.zeros((max_features + 1, 300))
    missing = []

    for word, row in word_index.items():
        if row > max_features:
            continue
        # Variants are produced lazily, in fallback order.
        for recase in (None, 'lower', 'title', 'upper'):
            key = word if recase is None else getattr(word, recase)()
            try:
                matrix[row] = vectors[key]
            except KeyError:
                continue
            break
        else:
            missing.append(word)

    return matrix, missing


class SequenceBucketCollator():
    """DataLoader ``collate_fn`` that trims left-padded sequences per batch.

    Every field of the batch is stacked into one tensor; the sequence tensor is
    then cut down to its trailing columns according to ``choose_length``.

    Args:
        choose_length: Callable mapping the stacked lengths tensor to the
            length used for trimming (e.g. ``lambda lengths: lengths.max()``).
        normal_feats_idx: Position of the extra-feature tensor in each sample.
        sequence_index: Position of the padded sequence in each sample.
        length_index: Position of the sequence length in each sample.
        label_index: Position of the label, or None when samples carry none.
    """

    def __init__(self, choose_length, normal_feats_idx, sequence_index, length_index, label_index=None):
        self.choose_length = choose_length
        self.sequence_index = sequence_index
        self.length_index = length_index
        self.label_index = label_index
        self.normal_feats_idx = normal_feats_idx

    def __call__(self, batch):
        """Collate ``batch`` (a list of per-sample tuples of tensors).

        Returns:
            The list of stacked fields with the sequence field trimmed; when
            ``label_index`` is set, ``(fields_without_label, label)`` instead.
        """
        batch = [torch.stack(field) for field in zip(*batch)]

        sequences = batch[self.sequence_index]
        length = self.choose_length(batch[self.length_index])

        # The padded width comes from the tensor itself rather than from the
        # notebook-global `maxlen`, which is rebound later in the notebook and
        # would silently desynchronise from the data.
        width = sequences.shape[1]

        # Column k (0-based) gets the value width - k, so the mask keeps the
        # trailing columns whose value is strictly below `length`.
        # NOTE(review): the strict `<` keeps length - 1 columns, i.e. one fewer
        # than `length`. Preserved deliberately because the pretrained weights
        # were presumably produced with this exact trimming -- confirm before
        # changing it to `<=`.
        mask = torch.arange(start=width, end=0, step=-1) < length
        batch[self.sequence_index] = sequences[:, mask]

        if self.label_index is not None:
            features = [x for i, x in enumerate(batch) if i != self.label_index]
            return features, batch[self.label_index]

        return batch
    
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, applied element-wise via NumPy."""
    decay = np.exp(-x)
    return 1 / (1 + decay)

class SpatialDropout(nn.Dropout2d):
    """Dropout2d applied to an ``(N, T, K)`` tensor along its last axis.

    The input is reshaped so that K takes the channel position expected by
    ``nn.Dropout2d``, then restored to its original layout.
    """

    def forward(self, x):
        # (N, T, K) -> (N, T, 1, K) -> (N, K, 1, T)
        channels_first = x.unsqueeze(2).permute(0, 3, 2, 1)
        dropped = super().forward(channels_first)
        # (N, K, 1, T) -> (N, T, 1, K) -> (N, T, K)
        return dropped.permute(0, 3, 2, 1).squeeze(2)
    
class NeuralNet(nn.Module):
    """Two stacked bidirectional LSTMs over frozen embeddings plus a dense branch
    for hand-crafted features.

    Layer sizes come from the notebook globals ``max_features``, ``LSTM_UNITS``,
    ``ADD_FEATS_UNITS`` and ``DENSE_HIDDEN_UNITS``.

    Args:
        embedding_matrix: 2-D array of pretrained vectors; its second dimension
            is the embedding size.
        num_aux_targets: Width of the auxiliary output head.
    """

    def __init__(self, embedding_matrix, num_aux_targets):
        super().__init__()
        embed_size = embedding_matrix.shape[1]

        # The freshly initialised weight is replaced by the pretrained matrix
        # and frozen.
        self.embedding = nn.Embedding(max_features, embed_size)
        pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
        self.embedding.weight = nn.Parameter(pretrained, requires_grad=False)
        self.embedding_dropout = SpatialDropout(0.3)

        self.lstm1 = nn.LSTM(embed_size, LSTM_UNITS, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(LSTM_UNITS * 2, LSTM_UNITS, bidirectional=True, batch_first=True)

        # Projects the hand-crafted features to the width of the pooled LSTM
        # output so the two can be summed in forward().
        self.normal_linear = nn.Linear(ADD_FEATS_UNITS, DENSE_HIDDEN_UNITS)

        self.linear1 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        self.linear2 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)

        self.linear_out = nn.Linear(DENSE_HIDDEN_UNITS, 1)
        self.linear_aux_out = nn.Linear(DENSE_HIDDEN_UNITS, num_aux_targets)

    def forward(self, x, normal_feats, lengths=None):
        """Return logits with the main target in column 0 and aux targets after it.

        Args:
            x: Token-id tensor; cast to long before the embedding lookup.
            normal_feats: Hand-crafted feature tensor; cast to float.
            lengths: Accepted for interface compatibility; not used.
        """
        embedded = self.embedding_dropout(self.embedding(x.long()))

        first_pass, _ = self.lstm1(embedded)
        lstm_out, _ = self.lstm2(first_pass)

        # Global max- and average-pooling over dimension 1, concatenated.
        pooled_avg = lstm_out.mean(dim=1)
        pooled_max = lstm_out.max(dim=1)[0]
        pooled = torch.cat((pooled_max, pooled_avg), 1)

        dense_a = F.relu(self.linear1(pooled))
        dense_b = F.relu(self.linear2(pooled))
        feats_branch = F.relu(self.normal_linear(normal_feats.float()))

        # Residual-style sum of the pooled state and the three dense branches.
        hidden = pooled + dense_a + dense_b + feats_branch

        main_logit = self.linear_out(hidden)
        aux_logits = self.linear_aux_out(hidden)
        return torch.cat([main_logit, aux_logits], 1)
    

def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to the narrowest dtype that fits.

    Object columns become ``category``. Columns whose dtype name starts with
    ``int`` go to the first of int8/16/32/64 whose limits strictly contain the
    column's min and max. Every other column goes to float16 or float32 under
    the same test, else float64. Memory use before and after is printed.

    Note that float16 keeps only a few significant digits, so values change.

    Returns:
        The same DataFrame object, modified in place.
    """
    bytes_per_mb = 1024**2
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # If no candidate fits, the column is left as it is.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Register tqdm's pandas integration; this is what provides the
# Series.progress_apply call used later in the notebook.
tqdm.pandas()

warnings.filterwarnings(action='once')
device = torch.device('cuda')
MAX_SEQUENCE_LENGTH = 220
SEED = 1234
batch_size = 512
BERT_MODEL_PATH = '../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'

# NOTE: seed_everything() at the end of this cell reseeds NumPy and torch with
# its default of 123, so these SEED-based calls are overridden.
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# bert_config = BertConfig('../input/bert-inference/bert/bert_config.json')
bert_config = BertConfig('../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'+'bert_config.json')

# NOTE: the global name `tokenizer` is rebound twice further down (Treebank
# tokenizer, then a Keras Tokenizer); this binding is the BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None,do_lower_case=True)

# When is_interactive() is False, replace progress bars with plain behaviour.
if not is_interactive():
    def nop(it, *a, **k):
        # Drop-in stand-in for tqdm that returns the iterable untouched.
        return it

    tqdm = nop

    fastprogress.fastprogress.NO_BAR = True
    master_bar, progress_bar = force_console_behavior()
    fastai.basic_train.master_bar, fastai.basic_train.progress_bar = master_bar, progress_bar

seed_everything()

**BERT Part**

In [None]:
# Load the competition test set and force the comment column to str.
# NOTE(review): after astype(str) a missing value is presumably the literal
# string 'nan', which would make the later fillna("DUMMY_VALUE") a no-op -- confirm.
test_df = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv")
test_df['comment_text'] = test_df['comment_text'].astype(str) 

In [None]:
# Encode every test comment as a row of MAX_SEQUENCE_LENGTH BERT token ids.
X_test = convert_lines(test_df["comment_text"].fillna("DUMMY_VALUE"), MAX_SEQUENCE_LENGTH, tokenizer)

In [None]:
# Wrap the token ids for batched, in-order inference (shuffle=False keeps row
# order aligned with test_df).
test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.long))
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=512, shuffle=False)

In [None]:
# File names of the fine-tuned BERT checkpoints in each weights directory.
# os.listdir returns entries in arbitrary order, so the listings are sorted:
# the first prediction loop stops after four checkpoints, and without a stable
# order the subset it uses would vary between runs and filesystems.
my_bert_list = sorted(os.listdir('../input/vanilla-epoch1/'))
bert_full_list = sorted(os.listdir('../input/bert-full-epoch1/'))

# test_dataset / test_loader are already built from X_test in the previous
# cell; the identical second construction that used to live here was removed.

In [None]:
# Directories holding the two sets of BERT checkpoints.
MY_BERT_PATH = Path('../input/vanilla-epoch1/')
BERT_FULL_PATH = Path('../input/bert-full-epoch1/')

# Collects one probability vector per BERT checkpoint; both prediction loops
# below append to it.
bert_test_preds = []

In [None]:
%%time

for i, weight in enumerate(my_bert_list):
    if i == 4:
        break
    print("{} bert model prediction starts".format(i+1))
    model = BertForSequenceClassification(bert_config, num_labels=7)
    model.load_state_dict(torch.load(MY_BERT_PATH / weight))
    model.to(device)
    for param in model.parameters():
        param.requires_grad = False
    model.eval()

    test_preds = np.zeros((len(X_test)))
    for i, (x_batch,) in enumerate(test_loader):
        pred = model(x_batch.to(device), attention_mask=(x_batch > 0).to(device), labels=None)
        test_preds[i * 512:(i + 1) * 512] = pred[:, 0].detach().cpu().squeeze().numpy()

    test_pred = torch.sigmoid(torch.tensor(test_preds)).numpy().ravel()
    bert_test_preds.append(test_pred)
    del test_pred, model
    
gc.collect()

In [None]:
# Predict the main-target probability with every checkpoint in BERT_FULL_PATH.
for i, weight in enumerate(bert_full_list):
    print("{} bert model prediction starts".format(i+1))
    model = BertForSequenceClassification(bert_config, num_labels=7)
    model.load_state_dict(torch.load(BERT_FULL_PATH / weight))
    model.to(device)
    for param in model.parameters():
        param.requires_grad = False
    model.eval()

    test_preds = np.zeros((len(X_test)))
    # Slice width follows the loader instead of a hard-coded 512.
    step = test_loader.batch_size
    # The batch counter has its own name; it previously reused `i` and
    # clobbered the checkpoint index inside the loop body.
    with torch.no_grad():
        for batch_idx, (x_batch,) in enumerate(test_loader):
            # Token id 0 is the padding value, so the attention mask is ids > 0.
            pred = model(x_batch.to(device), attention_mask=(x_batch > 0).to(device), labels=None)
            start = batch_idx * step
            # Column 0 is the logit kept for the submission.
            test_preds[start:start + step] = pred[:, 0].detach().cpu().squeeze().numpy()

    test_pred = torch.sigmoid(torch.tensor(test_preds)).numpy().ravel()
    bert_test_preds.append(test_pred)
    del test_pred, model

gc.collect()

In [None]:
# Element-wise mean over the per-checkpoint vectors (axis 0 runs over models).
# NOTE: this rebinds the list to an ndarray, so the cell is not safe to re-run.
bert_test_preds = np.mean(bert_test_preds, axis=0)

In [None]:
# BERT-only predictions. The CSV written here uses the same file name as the
# blended submission saved at the end of the notebook, which overwrites it.
submission_bert = pd.DataFrame.from_dict({
    'id': test_df['id'],
    'prediction': bert_test_preds
})
submission_bert.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the BERT-only predictions.
submission_bert.head()

In [None]:
# Drop the BERT dataset/loader; both names are re-created for the LSTM part.
del test_dataset
del test_loader
gc.collect()

**LSTM Part**

In [None]:
symbols_to_isolate = '.,?!-;*"…:—()%#$&_/@＼・ω+=”“[]^–>\\°<~•≠™ˈʊɒ∞§{}·τα❤☺ɡ|¢→̶`❥━┣┫┗Ｏ►★©―ɪ✔®\x96\x92●£♥➤´¹☕≈÷♡◐║▬′ɔː€۩۞†μ✒➥═☆ˌ◄½ʻπδηλσερνʃ✬ＳＵＰＥＲＩＴ☻±♍µº¾✓◾؟．⬅℅»Вав❣⋅¿¬♫ＣＭβ█▓▒░⇒⭐›¡₂₃❧▰▔◞▀▂▃▄▅▆▇↙γ̄″☹➡«φ⅓„✋：¥̲̅́∙‛◇✏▷❓❗¶˚˙）сиʿ✨。ɑ\x80◕！％¯−ﬂﬁ₁²ʌ¼⁴⁄₄⌠♭✘╪▶☭✭♪☔☠♂☃☎✈✌✰❆☙○‣⚓年∎ℒ▪▙☏⅛ｃａｓǀ℮¸ｗ‚∼‖ℳ❄←☼⋆ʒ⊂、⅔¨͡๏⚾⚽Φ×θ￦？（℃⏩☮⚠月✊❌⭕▸■⇌☐☑⚡☄ǫ╭∩╮，例＞ʕɐ̣Δ₀✞┈╱╲▏▕┃╰▊▋╯┳┊≥☒↑☝ɹ✅☛♩☞ＡＪＢ◔◡↓♀⬆̱ℏ\x91⠀ˤ╚↺⇤∏✾◦♬³の｜／∵∴√Ω¤☜▲↳▫‿⬇✧ｏｖｍ－２０８＇‰≤∕ˆ⚜☁'
symbols_to_delete = '\n🍕\r🐵😑\xa0\ue014\t\uf818\uf04a\xad😢🐶️\uf0e0😜😎👊\u200b\u200e😁عدويهصقأناخلىبمغر😍💖💵Е👎😀😂\u202a\u202c🔥😄🏻💥ᴍʏʀᴇɴᴅᴏᴀᴋʜᴜʟᴛᴄᴘʙғᴊᴡɢ😋👏שלוםבי😱‼\x81エンジ故障\u2009🚌ᴵ͞🌟😊😳😧🙀😐😕\u200f👍😮😃😘אעכח💩💯⛽🚄🏼ஜ😖ᴠ🚲‐😟😈💪🙏🎯🌹😇💔😡\x7f👌ἐὶήιὲκἀίῃἴξ🙄Ｈ😠\ufeff\u2028😉😤⛺🙂\u3000تحكسة👮💙فزط😏🍾🎉😞\u2008🏾😅😭👻😥😔😓🏽🎆🍻🍽🎶🌺🤔😪\x08‑🐰🐇🐱🙆😨🙃💕𝘊𝘦𝘳𝘢𝘵𝘰𝘤𝘺𝘴𝘪𝘧𝘮𝘣💗💚地獄谷улкнПоАН🐾🐕😆ה🔗🚽歌舞伎🙈😴🏿🤗🇺🇸мυтѕ⤵🏆🎃😩\u200a🌠🐟💫💰💎эпрд\x95🖐🙅⛲🍰🤐👆🙌\u2002💛🙁👀🙊🙉\u2004ˢᵒʳʸᴼᴷᴺʷᵗʰᵉᵘ\x13🚬🤓\ue602😵άοόςέὸתמדףנרךצט😒͝🆕👅👥👄🔄🔤👉👤👶👲🔛🎓\uf0b7\uf04c\x9f\x10成都😣⏺😌🤑🌏😯ех😲Ἰᾶὁ💞🚓🔔📚🏀👐\u202d💤🍇\ue613小土豆🏡❔⁉\u202f👠》कर्मा🇹🇼🌸蔡英文🌞🎲レクサス😛外国人关系Сб💋💀🎄💜🤢َِьыгя不是\x9c\x9d🗑\u2005💃📣👿༼つ༽😰ḷЗз▱ц￼🤣卖温哥华议会下降你失去所有的钱加拿大坏税骗子🐝ツ🎅\x85🍺آإشء🎵🌎͟ἔ油别克🤡🤥😬🤧й\u2003🚀🤴ʲшчИОРФДЯМюж😝🖑ὐύύ特殊作戦群щ💨圆明园קℐ🏈😺🌍⏏ệ🍔🐮🍁🍆🍑🌮🌯🤦\u200d𝓒𝓲𝓿𝓵안영하세요ЖљКћ🍀😫🤤ῦ我出生在了可以说普通话汉语好极🎼🕺🍸🥂🗽🎇🎊🆘🤠👩🖒🚪天一家⚲\u2006⚭⚆⬭⬯⏖新✀╌🇫🇷🇩🇪🇮🇬🇧😷🇨🇦ХШ🌐\x1f杀鸡给猴看ʁ𝗪𝗵𝗲𝗻𝘆𝗼𝘂𝗿𝗮𝗹𝗶𝘇𝗯𝘁𝗰𝘀𝘅𝗽𝘄𝗱📺ϖ\u2000үսᴦᎥһͺ\u2007հ\u2001ɩｙｅ൦ｌƽｈ𝐓𝐡𝐞𝐫𝐮𝐝𝐚𝐃𝐜𝐩𝐭𝐢𝐨𝐧Ƅᴨןᑯ໐ΤᏧ௦Іᴑ܁𝐬𝐰𝐲𝐛𝐦𝐯𝐑𝐙𝐣𝐇𝐂𝐘𝟎ԜТᗞ౦〔Ꭻ𝐳𝐔𝐱𝟔𝟓𝐅🐋ﬃ💘💓ё𝘥𝘯𝘶💐🌋🌄🌅𝙬𝙖𝙨𝙤𝙣𝙡𝙮𝙘𝙠𝙚𝙙𝙜𝙧𝙥𝙩𝙪𝙗𝙞𝙝𝙛👺🐷ℋ𝐀𝐥𝐪🚶𝙢Ἱ🤘ͦ💸ج패티Ｗ𝙇ᵻ👂👃ɜ🎫\uf0a7БУі🚢🚂ગુજરાતીῆ🏃𝓬𝓻𝓴𝓮𝓽𝓼☘﴾̯﴿₽\ue807𝑻𝒆𝒍𝒕𝒉𝒓𝒖𝒂𝒏𝒅𝒔𝒎𝒗𝒊👽😙\u200cЛ‒🎾👹⎌🏒⛸公寓养宠物吗🏄🐀🚑🤷操美𝒑𝒚𝒐𝑴🤙🐒欢迎来到阿拉斯ספ𝙫🐈𝒌𝙊𝙭𝙆𝙋𝙍𝘼𝙅ﷻ🦄巨收赢得白鬼愤怒要买额ẽ🚗🐳𝟏𝐟𝟖𝟑𝟕𝒄𝟗𝐠𝙄𝙃👇锟斤拷𝗢𝟳𝟱𝟬⦁マルハニチロ株式社⛷한국어ㄸㅓ니͜ʖ𝘿𝙔₵𝒩ℯ𝒾𝓁𝒶𝓉𝓇𝓊𝓃𝓈𝓅ℴ𝒻𝒽𝓀𝓌𝒸𝓎𝙏ζ𝙟𝘃𝗺𝟮𝟭𝟯𝟲👋🦊多伦🐽🎻🎹⛓🏹🍷🦆为和中友谊祝贺与其想象对法如直接问用自己猜本传教士没积唯认识基督徒曾经让相信耶稣复活死怪他但当们聊些政治题时候战胜因圣把全堂结婚孩恐惧且栗谓这样还♾🎸🤕🤒⛑🎁批判检讨🏝🦁🙋😶쥐스탱트뤼도석유가격인상이경제황을렵게만들지않록잘관리해야합다캐나에서대마초와화약금의품런성분갈때는반드시허된사용🔫👁凸ὰ💲🗯𝙈Ἄ𝒇𝒈𝒘𝒃𝑬𝑶𝕾𝖙𝖗𝖆𝖎𝖌𝖍𝖕𝖊𝖔𝖑𝖉𝖓𝖐𝖜𝖞𝖚𝖇𝕿𝖘𝖄𝖛𝖒𝖋𝖂𝕴𝖟𝖈𝕸👑🚿💡知彼百\uf005𝙀𝒛𝑲𝑳𝑾𝒋𝟒😦𝙒𝘾𝘽🏐𝘩𝘨ὼṑ𝑱𝑹𝑫𝑵𝑪🇰🇵👾ᓇᒧᔭᐃᐧᐦᑳᐨᓃᓂᑲᐸᑭᑎᓀᐣ🐄🎈🔨🐎🤞🐸💟🎰🌝🛳点击查版🍭𝑥𝑦𝑧ＮＧ👣\uf020っ🏉ф💭🎥Ξ🐴👨🤳🦍\x0b🍩𝑯𝒒😗𝟐🏂👳🍗🕉🐲چی𝑮𝗕𝗴🍒ꜥⲣⲏ🐑⏰鉄リ事件ї💊「」\uf203\uf09a\uf222\ue608\uf202\uf099\uf469\ue607\uf410\ue600燻製シ虚偽屁理屈Г𝑩𝑰𝒀𝑺🌤𝗳𝗜𝗙𝗦𝗧🍊ὺἈἡχῖΛ⤏🇳𝒙ψՁմեռայինրւդձ冬至ὀ𝒁🔹🤚🍎𝑷🐂💅𝘬𝘱𝘸𝘷𝘐𝘭𝘓𝘖𝘹𝘲𝘫کΒώ💢ΜΟΝΑΕ🇱♲𝝈↴💒⊘Ȼ🚴🖕🖤🥘📍👈➕🚫🎨🌑🐻𝐎𝐍𝐊𝑭🤖🎎😼🕷ｇｒｎｔｉｄｕｆｂｋ𝟰🇴🇭🇻🇲𝗞𝗭𝗘𝗤👼📉🍟🍦🌈🔭《🐊🐍\uf10aლڡ🐦\U0001f92f\U0001f92a🐡💳ἱ🙇𝗸𝗟𝗠𝗷🥜さようなら🔼'

In [None]:
from nltk.tokenize.treebank import TreebankWordTokenizer
# NOTE: rebinds the global `tokenizer` (previously the BERT tokenizer).
tokenizer = TreebankWordTokenizer()


# str.translate tables keyed by code point: characters to isolate are padded
# with a space on each side, characters to delete map to the empty string.
isolate_dict = {ord(symbol): ' ' + symbol + ' ' for symbol in symbols_to_isolate}
remove_dict = dict.fromkeys(map(ord, symbols_to_delete), '')


def handle_punctuation(x):
    """Delete the characters in ``remove_dict``, then space-pad those in ``isolate_dict``."""
    return x.translate(remove_dict).translate(isolate_dict)

# Dedicated Treebank tokenizer. The notebook-global name `tokenizer` is bound to
# the BERT tokenizer earlier and to a Keras Tokenizer later, so reading it from
# here made this function depend on which cell happened to run last.
_TREEBANK_TOKENIZER = TreebankWordTokenizer()


def handle_contractions(x):
    """Split the string ``x`` into a list of tokens with ``TreebankWordTokenizer``.

    Args:
        x: Text to tokenize.

    Returns:
        list: The tokens produced by ``TreebankWordTokenizer.tokenize``.
    """
    return _TREEBANK_TOKENIZER.tokenize(x)

def fix_quote(x):
    """Strip one leading apostrophe from each token and join the tokens with spaces."""
    return ' '.join(token[1:] if token[:1] == "'" else token for token in x)

def preprocess(x):
    """Run the full text clean-up: punctuation handling, tokenization, quote fix.

    Returns the cleaned text as a single space-joined string.
    """
    return fix_quote(handle_contractions(handle_punctuation(x)))

In [None]:
# Pickled pretrained embedding files.
CRAWL_EMBEDDING_PATH = '../input/pickled-crawl300d2m-for-kernel-competitions/crawl-300d-2M.pkl'
GLOVE_EMBEDDING_PATH = '../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl'
PARA_EMBEDDING_PATH = '../input/pickled-paragram-300-vectors-sl999/paragram_300_sl999.pkl'

# Layer sizes read as globals by NeuralNet.
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
# Padded sequence length for the first LSTM ensemble; rebound to 300 further
# down for the second one.
maxlen = 220


In [None]:
# Training frame, downcast to save memory. In this notebook it is only used to
# select the auxiliary target columns.
train_df = reduce_mem_usage(pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv'))

# Training texts stored as a pickle; they are fed to the Keras tokenizer
# together with the test texts.
# SECURITY: pickle.load executes code from the file -- trusted inputs only.
with open('../input/toxic2019-preprocessed/x_train.pkl', 'rb') as f:    
    x_train = pickle.load(f)

In [None]:
# Clean every test comment. This must run before the Keras tokenizer cell
# rebinds the global `tokenizer` that preprocess() relies on.
x_test = test_df['comment_text'].progress_apply(lambda x:preprocess(x))
# Auxiliary target columns; only the column count is used (for the model head).
y_aux_train = train_df[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]

# Not referenced anywhere else in the visible part of this notebook.
identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']

# Vocabulary cap shared by the Keras tokenizer, the embedding matrices and the models.
max_features = 400000

In [None]:
%%time
# Hand-crafted features are computed on the raw comment text, not on x_test.
test_add_feats_matrix = normal_feature_engineering(test_df["comment_text"].values)

# Input width of NeuralNet.normal_linear.
ADD_FEATS_UNITS = test_add_feats_matrix.shape[1]

test_normal_feats = torch.from_numpy(test_add_feats_matrix)

del test_add_feats_matrix
gc.collect()

In [None]:
# NOTE: rebinds the global `tokenizer` again, this time to a Keras Tokenizer.
# No character filtering and no lower-casing: the text is already cleaned.
tokenizer = text.Tokenizer(num_words = max_features, filters='',lower=False)

# The vocabulary is fitted on training and test texts together.
tokenizer.fit_on_texts(list(x_train) + list(x_test))

In [None]:
def predict_test(model, test_loader):
    """Run ``model`` over ``test_loader`` and return main-target probabilities.

    Relies on the notebook globals ``x_test`` (for the output length) and
    ``batch_size`` (for slice offsets), and requires CUDA.

    Args:
        model: Network called as ``model(sequences, feats, lengths)``.
        test_loader: Loader yielding lists whose first three items are tensors.

    Returns:
        numpy.ndarray: Sigmoid of output column 0, one value per test row.
    """
    model.eval()
    n_rows = len(x_test)
    accumulated = np.zeros(n_rows)
    current = np.zeros(n_rows)

    with torch.no_grad():
        for step, batch in enumerate(test_loader):
            # Move the three input tensors to the GPU, in place in the batch list.
            for slot in range(3):
                batch[slot] = batch[slot].cuda()

            logits = model(batch[0], batch[1], batch[2]).detach()
            lo = step * batch_size
            current[lo:lo + batch_size] = sigmoid(logits.cpu().numpy())[:, 0]

        accumulated += current

    return accumulated


def fold_load_model(path):
    """Build a ``NeuralNet`` and load the checkpoint at ``path`` into it.

    The checkpoint's ``embedding.weight`` entry is replaced by the current
    global ``embedding_matrix`` before loading.
    """
    net = NeuralNet(embedding_matrix, y_aux_train.shape[-1])
    state = torch.load(path)
    state.update({'embedding.weight': torch.tensor(embedding_matrix)})
    net.load_state_dict(state)
    return net

In [None]:
# One 300-d matrix per pretrained embedding, indexed by the Keras word index.
crawl_matrix, unknown_words_crawl = build_matrix(tokenizer.word_index, CRAWL_EMBEDDING_PATH)
print('n unknown words (crawl): ', len(unknown_words_crawl))

glove_matrix, unknown_words_glove = build_matrix(tokenizer.word_index, GLOVE_EMBEDDING_PATH)
print('n unknown words (glove): ', len(unknown_words_glove))

In [None]:
# Concatenate along the feature axis: each word gets crawl and GloVe vectors
# side by side.
embedding_matrix = np.concatenate([crawl_matrix, glove_matrix], axis=-1)
print(embedding_matrix.shape)

del crawl_matrix, unknown_words_crawl
del glove_matrix, unknown_words_glove
gc.collect()

In [None]:
# Token-id sequences, padded to maxlen columns, plus the raw (unclipped)
# length of each sequence.
x_test_1 = tokenizer.texts_to_sequences(x_test)
x_test_padded = torch.from_numpy(sequence.pad_sequences(x_test_1, maxlen=maxlen))
test_lengths = torch.from_numpy(np.array([len(x) for x in x_test_1]))

In [None]:
batch_size = 512
# Field order per sample: 0 = padded sequence, 1 = hand-crafted features, 2 = length.
test_dataset = data.TensorDataset(x_test_padded, test_normal_feats, test_lengths)
# Each batch is trimmed according to the longest sequence it contains.
test_collator = SequenceBucketCollator(lambda lengths: lengths.max(), sequence_index=0, length_index=2, normal_feats_idx=1)

test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=test_collator)

In [None]:
# Drop the notebook-level names only; test_loader still references the
# dataset and the collator.
del test_lengths
del x_test_padded
del test_dataset
del test_collator
gc.collect()

In [None]:
# glove_crawl prediction
# One dict per training seed (10, 20, ..., 100), mapping 'fold1'..'fold5' to the
# checkpoint file of that fold, named "<seed>_<fold>fold.pt5".
seed_list = [
    {
        'fold{}'.format(fold): Path('../input/toxic-glove-crawl/{}_{}fold.pt5'.format(seed, fold))
        for fold in range(1, 6)
    }
    for seed in range(10, 101, 10)
]

# Keep the individual per-seed names available.
(glove_crawl_seed1, glove_crawl_seed2, glove_crawl_seed3, glove_crawl_seed4, glove_crawl_seed5,
 glove_crawl_seed6, glove_crawl_seed7, glove_crawl_seed8, glove_crawl_seed9, glove_crawl_seed10) = seed_list

In [None]:
%%time

# One entry per seed; each entry is the SUM (not the mean) of that seed's
# fold predictions.
all_test_preds = []

for i, seed in enumerate(seed_list):
    
    print("glove_crawl seed {} prediction starts".format(i+1))
    
    # for each loop adds 5 predictions
    fold_preds = np.zeros((len(x_test)))

    # fold predictions for each seed
    for fold, path in seed.items():
#         print("{}".format(fold))
        model = fold_load_model(path)
        model.cuda()

        test_preds = predict_test(model, test_loader)
        fold_preds += test_preds
        del test_preds, model
        gc.collect()
        
    all_test_preds.append(fold_preds)    

In [None]:
# Add up the per-seed totals; the result is a sum over every seed and fold
# model, still un-normalised at this point.
submission_predict = np.zeros((len(x_test)))
for _ in all_test_preds:
    submission_predict += _

In [None]:
# `submission_predict` is the sum over every seed AND every fold model, so the
# mean needs the total model count as its denominator. Dividing by the number
# of seeds alone left the values on a 0..5 scale, which over-weighted this
# ensemble when it is later averaged 50/50 with the other LSTM ensemble.
n_glove_crawl_models = sum(len(fold_paths) for fold_paths in seed_list)

submission_chris = pd.DataFrame.from_dict({
    'id': test_df['id'],
    'prediction': submission_predict / n_glove_crawl_models
})
# submission_glove_crawl.to_csv('submission.csv', index=False)

## Omer_lstms

In [None]:
def load_model(path, lstm_units=128):
    """Build an ``omer_lstm`` with 6 auxiliary outputs and load ``path`` into it.

    The checkpoint's ``embedding.weight`` entry is replaced by the current
    global ``embedding_matrix``. The model is returned on the GPU in eval mode.

    Args:
        path: Checkpoint file readable by ``torch.load``.
        lstm_units: Hidden size of each LSTM direction.
    """
    net = omer_lstm(embedding_matrix, 6, lstm_units)
    weights = torch.load(path)
    weights.update({'embedding.weight': torch.tensor(embedding_matrix)})
    net.load_state_dict(weights)
    # cuda() and eval() both return the module itself.
    return net.cuda().eval()

In [None]:
def predict(model):
    """Run ``model`` over the global ``test_loader`` and return its probabilities.

    Relies on the notebook globals ``x_test``, ``test_loader`` and
    ``batch_size``, and requires CUDA.

    Args:
        model: Network called with the batch's first tensor; it must output
            7 columns per row.

    Returns:
        list: A one-element list holding a ``(len(x_test), 7)`` array of
        sigmoid outputs (the list wrapper is kept for existing callers).
    """
    test_preds = np.zeros((len(x_test), 7)) # output_dim=7

    # Inference only: apart from the frozen embedding, the model's weights
    # still require grad, so without no_grad every forward pass would build
    # and retain an autograd graph.
    with torch.no_grad():
        for i, x_batch in enumerate(test_loader):
            X = x_batch[0].cuda()
            test_preds[i * batch_size:(i+1) * batch_size, :] = sigmoid(model(X).detach().cpu().numpy())

    # The former `del model` only removed the local name and freed nothing;
    # callers drop their own reference.
    gc.collect()
    return [test_preds]

In [None]:
def build_matrix_1(word_index, path):
    """Fill a ``(max_features + 1, 300)`` matrix with pretrained word vectors.

    Same as ``build_matrix`` but with a shorter fallback chain: the word as-is,
    then lower-cased, then title-cased (no upper-case attempt).

    Args:
        word_index: Mapping of word -> integer row index.
        path: Pickle file handed to ``load_embeddings``.

    Returns:
        tuple: ``(embedding_matrix, unknown_words)``.
    """
    vectors = load_embeddings(path)
    matrix = np.zeros((max_features + 1, 300))
    missing = []

    for word, row in word_index.items():
        if row > max_features:
            continue
        # Variants are produced lazily, in fallback order.
        for recase in (None, 'lower', 'title'):
            key = word if recase is None else getattr(word, recase)()
            try:
                matrix[row] = vectors[key]
            except KeyError:
                continue
            break
        else:
            missing.append(word)

    return matrix, missing

In [None]:
class omer_lstm(nn.Module):
    """Two stacked bidirectional LSTMs over frozen embeddings with two dense
    residual branches; takes no hand-crafted features.

    Args:
        embedding_matrix: 2-D array of pretrained vectors; its second dimension
            is the embedding size.
        num_aux_targets: Width of the auxiliary output head.
        lstm_units: Hidden size of each LSTM direction; dense layers are
            ``4 * lstm_units`` wide.
    """

    def __init__(self, embedding_matrix, num_aux_targets, lstm_units):
        super().__init__()
        embed_size = embedding_matrix.shape[1]
        self.DENSE_HIDEN_UNITS = lstm_units * 4
        dense_width = self.DENSE_HIDEN_UNITS

        # The freshly initialised weight is replaced by the pretrained matrix
        # and frozen.
        self.embedding = nn.Embedding(max_features, embed_size)
        pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
        self.embedding.weight = nn.Parameter(pretrained, requires_grad=False)
        self.embedding_dropout = SpatialDropout(0.3)

        self.lstm1 = nn.LSTM(embed_size, lstm_units, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(lstm_units * 2, lstm_units, bidirectional=True, batch_first=True)

        self.linear1 = nn.Linear(dense_width, dense_width)
        self.linear2 = nn.Linear(dense_width, dense_width)

        self.linear_out = nn.Linear(dense_width, 1)
        self.linear_aux_out = nn.Linear(dense_width, num_aux_targets)

    def forward(self, x, lengths=None):
        """Return logits with the main target in column 0 and aux targets after it.

        Args:
            x: Token-id tensor; cast to long before the embedding lookup.
            lengths: Accepted for interface compatibility; not used.
        """
        embedded = self.embedding_dropout(self.embedding(x.long()))

        first_pass, _ = self.lstm1(embedded)
        lstm_out, _ = self.lstm2(first_pass)

        # Global max- and average-pooling over dimension 1, concatenated.
        pooled_avg = lstm_out.mean(dim=1)
        pooled_max = lstm_out.max(dim=1)[0]
        pooled = torch.cat((pooled_max, pooled_avg), 1)

        dense_a = F.relu(self.linear1(pooled))
        dense_b = F.relu(self.linear2(pooled))

        # Residual-style sum of the pooled state and both dense branches.
        hidden = pooled + dense_a + dense_b

        main_logit = self.linear_out(hidden)
        aux_logits = self.linear_aux_out(hidden)
        return torch.cat([main_logit, aux_logits], 1)

In [None]:
# Settings for the second LSTM ensemble; both globals are rebound here.
maxlen = 300
batch_size = 128

# NOTE: x_test changes from cleaned strings to token-id lists, so this cell is
# not safe to re-run. len(x_test) is unchanged, which later cells rely on.
x_test = tokenizer.texts_to_sequences(x_test)
test_lengths = torch.from_numpy(np.array([len(x) for x in x_test]))
x_test_padded = torch.from_numpy(sequence.pad_sequences(x_test, maxlen=maxlen))
test_dataset = data.TensorDataset(x_test_padded, test_lengths)
# Plain loader without a collate_fn: every batch keeps the full maxlen columns.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

del test_lengths
del x_test_padded
del test_dataset
gc.collect()

In [None]:
# Rebuild the embedding matrices with the shorter case-fallback chain of
# build_matrix_1.
crawl_matrix, unknown_words_crawl = build_matrix_1(tokenizer.word_index, CRAWL_EMBEDDING_PATH)
print('n unknown words (crawl): ', len(unknown_words_crawl))

glove_matrix, unknown_words_glove = build_matrix_1(tokenizer.word_index, GLOVE_EMBEDDING_PATH)
print('n unknown words (glove): ', len(unknown_words_glove))

# max_features is already a non-zero int here, so the `or` fallback never fires.
max_features = max_features or len(tokenizer.word_index) + 1
print("max features :",max_features)

# Overwrites the global embedding_matrix used by the first ensemble.
embedding_matrix = np.concatenate([crawl_matrix, glove_matrix], axis=-1)
embedding_matrix.shape

del crawl_matrix
del glove_matrix
gc.collect()

In [None]:
%%time 

# lstm 128 prediction
omer_lstm_preds=[]
# The directory's entry count is used as the number of checkpoints, which
# are expected to be named modellstm0.pt, modellstm1.pt, ...
lstm128_num = len(os.listdir('../input/omer-lstm-128'))

for _ in range(lstm128_num):
    PATH = '../input/omer-lstm-128/modellstm'+str(_)+'.pt'
    model = load_model(PATH)
    # NOTE: reuses the name all_test_preds from the glove_crawl cell.
    all_test_preds = predict(model)
    # This assignment is overwritten by the next line.
    test_preds = all_test_preds[-1]
    # predict() returns a single-element list, so the mean over axis 0 is that
    # element; column 0 is the main target.
    test_preds  = np.mean(all_test_preds, axis=0)[:, 0]
    omer_lstm_preds.append(test_preds)
    del model
    gc.collect()

In [None]:
%%time 

# lstm 256 prediction

# DENSE_HIDDEN_UNITS = 1024

# The directory's entry count is used as the number of checkpoints, which
# are expected to be named lstm256_0.pt, lstm256_1.pt, ...
lstm256_num = len(os.listdir('../input/toxic-lstm256'))

for _ in range(lstm256_num):
    PATH = '../input/toxic-lstm256/lstm256_'+str(_)+'.pt'
    model = load_model(PATH, 256)
    all_test_preds = predict(model)
    # This assignment is overwritten by the next line.
    test_preds = all_test_preds[-1]
    # predict() returns a single-element list, so the mean over axis 0 is that
    # element; column 0 is the main target.
    test_preds  = np.mean(all_test_preds, axis=0)[:, 0]
    # Appended to the same list as the 128-unit models.
    omer_lstm_preds.append(test_preds)
    del model
    gc.collect()

In [None]:
# Element-wise mean over all 128- and 256-unit models.
# NOTE: rebinds the list to an ndarray, so this cell is not safe to re-run.
omer_lstm_preds = np.mean(omer_lstm_preds, axis=0)

In [None]:
# Predictions of the second LSTM ensemble, keyed by test id.
submission_omer = pd.DataFrame.from_dict({
    'id': test_df['id'],
    'prediction': omer_lstm_preds
})

In [None]:
# ensemble mine and Omer's lstms
# Equal-weight average of the two LSTM ensembles.
lstm_ensemble_prediction = submission_chris.prediction*0.5 + submission_omer.prediction*0.5

**Blending part**

In [None]:
# The sample submission supplies the id column; predictions are assigned by
# index. NOTE(review): this assumes its row order matches test_df -- confirm.
submission = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv')
# Algebraically this is 0.25 * LSTM ensemble + 0.75 * BERT.
submission['prediction'] = ((lstm_ensemble_prediction + submission_bert.prediction) / 2 + submission_bert.prediction)/2 # => weighting more to berts
# submission['prediction'] = ((lstm_ensemble_prediction + submission_bert.prediction) / 2 + lstm_ensemble_prediction)/2
# submission['prediction'] = (lstm_ensemble_prediction + submission_bert.prediction) / 2
# Overwrites the BERT-only submission.csv written earlier in the notebook.
submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the final blended submission.
submission.head()