In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import fastai
from fastai.train import Learner
from fastai.train import DataBunch
from fastai.basic_data import DatasetType
import fastprogress
from fastprogress import force_console_behavior
import numpy as np
from pprint import pprint
import pandas as pd
import os
import time
import gc
import random
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from keras.preprocessing import text, sequence
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F

import torch.utils.data
from tqdm import tqdm
from nltk.tokenize.treebank import TreebankWordTokenizer
from scipy.stats import rankdata

from gensim.models import KeyedVectors

import warnings

from nltk.tokenize import TweetTokenizer
import datetime
import lightgbm as lgb
import string
import re
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
pd.set_option('max_colwidth',400)
pd.set_option('max_columns', 50)
import json
import gc
import os

import copy

import pickle

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted

In [None]:
def convert_lines(example, max_seq_length, tokenizer):
    """Turn raw texts into fixed-width rows of BERT-style token ids.

    Each text is tokenized with ``tokenizer.tokenize``, truncated to
    ``max_seq_length - 2`` tokens (leaving room for the two special tokens),
    wrapped in ``[CLS]`` ... ``[SEP]``, mapped through
    ``tokenizer.convert_tokens_to_ids`` and right-padded with ``0``.

    Args:
        example: iterable of strings.
        max_seq_length: target row width, including ``[CLS]`` and ``[SEP]``.
        tokenizer: object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.

    Returns:
        ``np.ndarray`` with one row per text. Rows have ``max_seq_length``
        entries provided the tokenizer yields one id per token.
    """
    # Token budget once the two special tokens are accounted for.
    budget = max_seq_length - 2
    rows = []
    for text in tqdm(example):
        # Slicing is a no-op for texts that already fit.
        tokens = tokenizer.tokenize(text)[:budget]
        ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"])
        rows.append(ids + [0] * (budget - len(tokens)))
    return np.array(rows)

def is_interactive():
    """Return True when the ``SHLVL`` environment variable is not set.

    The notebook uses this as its heuristic for "running in an interactive
    session" (progress bars are disabled when it returns False).
    """
    return os.environ.get('SHLVL') is None

def seed_everything(seed=123):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs with ``seed``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for seeder in seeders:
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    
def get_coefs(word, *arr):
    """Pair ``word`` with its remaining arguments packed as a float32 array."""
    vector = np.asarray(arr, dtype='float32')
    return word, vector


def load_embeddings(path):
    """Load a saved gensim ``KeyedVectors`` object from ``path`` and return it."""
    return KeyedVectors.load(path)

def build_matrix(word_index, path, dim=300):
    """Build an embedding matrix for ``word_index`` from the vectors at ``path``.

    Row ``i`` holds the vector of the word whose index is ``i``. Words are
    looked up as-is, then lower-cased, then title-cased; the first hit wins.
    Words with an index above the module-level ``max_features`` are ignored.

    Args:
        word_index: mapping of word -> integer row index.
        path: location passed to ``load_embeddings``.
        dim: width of each embedding vector.

    Returns:
        ``(matrix, missing)``: a ``(max_features + 1, dim)`` float array (rows
        without a vector stay zero) and the list of words found under none of
        the three spellings.
    """
    vectors = load_embeddings(path)
    matrix = np.zeros((max_features + 1, dim))
    missing = []

    # Spellings are generated lazily, in fallback order.
    spellings = (lambda w: w, lambda w: w.lower(), lambda w: w.title())

    for token, row in word_index.items():
        if not i_in_range(row):
            continue
        for spell in spellings:
            try:
                matrix[row] = vectors[spell(token)]
            except KeyError:
                continue
            break
        else:
            # No spelling was present in the embedding vocabulary.
            missing.append(token)
    return matrix, missing


def i_in_range(row):
    """Return True when ``row`` fits inside the ``max_features`` budget."""
    return row <= max_features

def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + e^-x)``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

class SpatialDropout(nn.Dropout2d):
    """``nn.Dropout2d`` applied to ``(N, T, K)`` sequence tensors.

    The input is rearranged to ``(N, K, 1, T)`` so that ``Dropout2d`` treats
    each of the ``K`` features as a channel, then restored to ``(N, T, K)``.
    """

    def forward(self, x):
        # (N, T, K) -> (N, T, 1, K) -> (N, K, 1, T)
        channels_first = x.unsqueeze(2).transpose(1, 3)
        dropped = super().forward(channels_first)
        # (N, K, 1, T) -> (N, T, 1, K) -> (N, T, K)
        return dropped.transpose(1, 3).squeeze(2)

def handle_punctuation(x):
    """Apply the ``remove_dict`` translation table, then ``isolate_dict``."""
    return x.translate(remove_dict).translate(isolate_dict)

def handle_contractions(x):
    """Return ``x`` split by the module-level ``tokenizer``'s ``tokenize`` method."""
    return tokenizer.tokenize(x)

def fix_quote(x):
    """Drop one leading apostrophe from each token and join tokens with spaces."""
    return ' '.join(
        token[1:] if token.startswith("'") else token
        for token in x
    )

def preprocess(x):
    """Clean one comment string: punctuation, tokenization, then quote fixing.

    NOTE: a later cell defines another ``preprocess`` that takes a DataFrame;
    once that cell has run, this string version is no longer reachable by name.
    """
    return fix_quote(handle_contractions(handle_punctuation(x)))

class SequenceBucketCollator():
    """Collate function that trims a batch of padded sequences to a shorter width.

    Each dataset item is a tuple of tensors. The tensors at every position are
    stacked, then the stacked sequence tensor is cut down to its trailing
    columns according to a length chosen from the batch's length tensor.

    Args:
        choose_length: callable mapping the stacked lengths tensor to the
            length used for trimming (e.g. ``lambda lengths: lengths.max()``).
        sequence_index: position of the padded sequence tensor in each item.
        length_index: position of the length tensor in each item.
        label_index: optional position of the label tensor. When given,
            ``__call__`` returns ``(inputs, labels)`` instead of a flat list.
    """

    def __init__(self, choose_length, sequence_index, length_index, label_index=None):
        self.choose_length = choose_length
        self.sequence_index = sequence_index
        self.length_index = length_index
        self.label_index = label_index

    def __call__(self, batch):
        """Stack ``batch`` and trim its sequence tensor.

        Returns:
            The list of stacked tensors with the sequence tensor trimmed, or
            ``(inputs, labels)`` when ``label_index`` was provided.
        """
        batch = [torch.stack(x) for x in list(zip(*batch))]

        sequences = batch[self.sequence_index]
        lengths = batch[self.length_index]

        length = self.choose_length(lengths)
        # Use the batch's own padded width rather than a module-level `maxlen`,
        # so the collator does not depend on a global defined in a later cell.
        padded_width = sequences.shape[1]
        # Counts down padded_width..1, so the mask selects trailing columns.
        # NOTE(review): the strict `<` keeps `length - 1` columns, not `length`;
        # kept as-is because the checkpoints were presumably trained with it.
        mask = torch.arange(start=padded_width, end=0, step=-1) < length
        padded_sequences = sequences[:, mask]

        batch[self.sequence_index] = padded_sequences

        if self.label_index is not None:
            return [x for i, x in enumerate(batch) if i != self.label_index], batch[self.label_index]

        return batch

    
class SoftmaxPooling(nn.Module):
    """Pool a tensor along ``dim`` using its own softmax as weights.

    Computes ``sum(x * softmax(x, dim), dim)``, which removes dimension ``dim``.

    Args:
        dim: dimension to pool over (default ``1``).
    """

    def __init__(self, dim=1):
        # `super(self.__class__, self)` recurses forever when this class is
        # subclassed; the zero-argument form resolves correctly.
        super().__init__()
        self.dim = dim

    def forward(self, x):
        return (x * x.softmax(dim=self.dim)).sum(dim=self.dim)


class NeuralNet(nn.Module):
    """Two-layer bidirectional LSTM classifier over frozen pretrained embeddings.

    The second LSTM's outputs are pooled three ways over dimension 1 (max,
    mean, softmax-weighted) and concatenated. Each pooling yields
    ``2 * LSTM_UNITS`` features, so the module-level ``DENSE_HIDDEN_UNITS``
    must equal ``6 * LSTM_UNITS`` for the output heads to accept the result.

    Args:
        embedding_matrix: 2-D array of pretrained vectors, one row per token id.
        num_aux_targets: number of auxiliary outputs next to the main one.
    """

    def __init__(self, embedding_matrix, num_aux_targets):
        super().__init__()
        embed_size = embedding_matrix.shape[1]

        # The embedding weights are replaced by the pretrained matrix and frozen.
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            requires_grad=False,
        )
        self.embedding_dropout = SpatialDropout(0.3)

        self.lstm1 = nn.LSTM(embed_size, LSTM_UNITS, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(LSTM_UNITS * 2, LSTM_UNITS, bidirectional=True, batch_first=True)

        def make_head(out_features):
            # Dropout -> BatchNorm -> Linear, shared layout of both output heads.
            return nn.Sequential(
                nn.Dropout(0.5),
                nn.BatchNorm1d(DENSE_HIDDEN_UNITS),
                nn.Linear(DENSE_HIDDEN_UNITS, out_features),
            )

        self.linear_out = make_head(1)
        self.linear_aux_out = make_head(num_aux_targets)

        self.softmaxpool = SoftmaxPooling()

    def forward(self, x, lengths=None):
        """Return raw scores with the main output in column 0, aux outputs after.

        ``lengths`` is accepted but not used.
        """
        embedded = self.embedding_dropout(self.embedding(x.long()))

        first_pass, _ = self.lstm1(embedded)
        sequence_out, _ = self.lstm2(first_pass)

        avg_pool = sequence_out.mean(dim=1)
        max_pool, _ = sequence_out.max(dim=1)
        soft_pool = self.softmaxpool(sequence_out)

        pooled = torch.cat((max_pool, avg_pool, soft_pool), dim=1)

        return torch.cat([self.linear_out(pooled), self.linear_aux_out(pooled)], dim=1)
    
def custom_loss(data, targets):
    """Weighted BCE on the main output plus plain BCE on the auxiliary outputs.

    Column 0 of ``data`` is scored against column 0 of ``targets`` using
    column 1 of ``targets`` as per-row weights. The remaining ``data`` columns
    are scored against ``targets[:, 2:]``. The main term is scaled by the
    module-level ``loss_weight``.
    """
    main_logits, aux_logits = data[:, :1], data[:, 1:]
    main_target, row_weight, aux_target = targets[:, :1], targets[:, 1:2], targets[:, 2:]

    main_loss = nn.BCEWithLogitsLoss(weight=row_weight)(main_logits, main_target)
    aux_loss = nn.BCEWithLogitsLoss()(aux_logits, aux_target)
    return (main_loss * loss_weight) + aux_loss

def reduce_mem_usage(df):
    """Downcast ``df``'s columns in place to smaller dtypes and report savings.

    Integer columns get the first of int8/int16/int32/int64 whose range
    strictly contains the column's min and max. Other non-object columns get
    float16, float32 or float64 by the same rule. Object columns become
    ``category``. Memory usage before and after is printed.

    Returns:
        The same ``df`` object, modified in place.
    """
    bytes_per_mb = 1024**2
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type).startswith('int'):
                # Integers: stop at the first type that fits; leave as-is if none does.
                for candidate in int_candidates:
                    limits = np.iinfo(candidate)
                    if c_min > limits.min and c_max < limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            else:
                # Everything else is treated as float; float64 is the fallback.
                for candidate in float_candidates:
                    limits = np.finfo(candidate)
                    if c_min > limits.min and c_max < limits.max:
                        df[col] = df[col].astype(candidate)
                        break
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

def ensemble_predictions(predictions, weights, type_="linear", axis=1):
    """Blend several models' predictions with the given weights.

    Args:
        predictions: collection of model predictions. ``"linear"`` passes it
            straight to ``np.average``; the other modes iterate over it first.
            NOTE(review): iterating a DataFrame yields column labels, not
            columns, so non-linear modes presumably expect a list of arrays.
        weights: one weight per model; must sum to 1.
        type_: ``"linear"``, ``"harmonic"``, ``"geometric"`` or ``"rank"``.
        axis: axis handed to ``np.average``.

    Returns:
        The blended predictions.

    Raises:
        AssertionError: if ``weights`` do not sum to 1.
        ValueError: if ``type_`` is not one of the supported modes.
    """
    assert np.isclose(np.sum(weights), 1.0)
    if type_ == "linear":
        return np.average(predictions, weights=weights, axis=axis)
    # CAREFUL WITH WHAT IS BELOW, IT IS DIFFERENT FOR LSTM BLENDING AND LSTM/BERT BLENDING
    if type_ == "harmonic":
        res = np.average([1 / p for p in predictions], weights=weights, axis=axis)
        return 1 / res
    if type_ == "geometric":
        numerator = np.average(
            [np.log(p) for p in predictions], weights=weights, axis=axis
        )
        return np.exp(numerator / sum(weights))
    if type_ == "rank":
        res = np.average([rankdata(p) for p in predictions], weights=weights, axis=axis)
        return res / (len(res) + 1)
    # Previously an unknown mode fell through to `return res` with `res`
    # unbound, surfacing as a confusing UnboundLocalError.
    raise ValueError(
        "Unknown ensemble type_ {!r}; expected 'linear', 'harmonic', "
        "'geometric' or 'rank'".format(type_)
    )

In [None]:
# Global run configuration: warnings, device, seeds, embedding paths, progress bars.
warnings.filterwarnings(action='once')
device = torch.device('cuda')
SEED = 1234
# NOTE: BATCH_SIZE is not referenced in the visible cells; the loaders below
# use a separate lowercase `batch_size`.
BATCH_SIZE = 512
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Registers `progress_apply` on pandas objects (used when cleaning the test comments).
tqdm.pandas()
CRAWL_EMBEDDING_PATH = '../input/gensim-embeddings-dataset/crawl-300d-2M.gensim'
NUMBERBATCH_EMBEDDING_PATH = '../input/gensim-embeddings-dataset/numberbatch-en.gensim'
PARAGRAM_EMBEDDING_PATH = '../input/gensim-embeddings-dataset/paragram_300_sl999.gensim'
TWITTER_EMBEDDING_PATH = '../input/gensim-embeddings-dataset/glove.twitter.27B.200d.gensim'
# Outside an interactive session, silence every progress bar.
if not is_interactive():
    # Drop-in replacement for tqdm that returns the iterable unchanged.
    def nop(it, *a, **k):
        return it

    tqdm = nop

    fastprogress.fastprogress.NO_BAR = True
    master_bar, progress_bar = force_console_behavior()
    fastai.basic_train.master_bar, fastai.basic_train.progress_bar = master_bar, progress_bar

# NOTE: called without arguments, so this re-seeds everything with
# seed_everything's default (123), overriding the SEED-based seeding above.
seed_everything()

## BiLSTM Crawl + Paragram

In [None]:
# Sizes for the Crawl + Paragram checkpoints only; later sections redefine both.
# NeuralNet concatenates three poolings of a bidirectional LSTM output, so
# DENSE_HIDDEN_UNITS must stay equal to 6 * LSTM_UNITS.
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 768

In [None]:
# Treebank tokenizer used by handle_contractions(); the name `tokenizer` is
# rebound to a Keras tokenizer a few cells further down.
tokenizer = TreebankWordTokenizer()

# Load the competition test set and force every comment to be a string.
test_df = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv")
test_df['comment_text'] = test_df['comment_text'].astype(str) 

In [None]:
symbols_to_isolate = '.,?!-;*"…:—()%#$&_/@＼・ω+=”“[]^–>\\°<~•≠™ˈʊɒ∞§{}·τα❤☺ɡ|¢→̶`❥━┣┫┗Ｏ►★©―ɪ✔®\x96\x92●£♥➤´¹☕≈÷♡◐║▬′ɔː€۩۞†μ✒➥═☆ˌ◄½ʻπδηλσερνʃ✬ＳＵＰＥＲＩＴ☻±♍µº¾✓◾؟．⬅℅»Вав❣⋅¿¬♫ＣＭβ█▓▒░⇒⭐›¡₂₃❧▰▔◞▀▂▃▄▅▆▇↙γ̄″☹➡«φ⅓„✋：¥̲̅́∙‛◇✏▷❓❗¶˚˙）сиʿ✨。ɑ\x80◕！％¯−ﬂﬁ₁²ʌ¼⁴⁄₄⌠♭✘╪▶☭✭♪☔☠♂☃☎✈✌✰❆☙○‣⚓年∎ℒ▪▙☏⅛ｃａｓǀ℮¸ｗ‚∼‖ℳ❄←☼⋆ʒ⊂、⅔¨͡๏⚾⚽Φ×θ￦？（℃⏩☮⚠月✊❌⭕▸■⇌☐☑⚡☄ǫ╭∩╮，例＞ʕɐ̣Δ₀✞┈╱╲▏▕┃╰▊▋╯┳┊≥☒↑☝ɹ✅☛♩☞ＡＪＢ◔◡↓♀⬆̱ℏ\x91⠀ˤ╚↺⇤∏✾◦♬³の｜／∵∴√Ω¤☜▲↳▫‿⬇✧ｏｖｍ－２０８＇‰≤∕ˆ⚜☁'
symbols_to_delete = '\n🍕\r🐵😑\xa0\ue014\t\uf818\uf04a\xad😢🐶️\uf0e0😜😎👊\u200b\u200e😁عدويهصقأناخلىبمغر😍💖💵Е👎😀😂\u202a\u202c🔥😄🏻💥ᴍʏʀᴇɴᴅᴏᴀᴋʜᴜʟᴛᴄᴘʙғᴊᴡɢ😋👏שלוםבי😱‼\x81エンジ故障\u2009🚌ᴵ͞🌟😊😳😧🙀😐😕\u200f👍😮😃😘אעכח💩💯⛽🚄🏼ஜ😖ᴠ🚲‐😟😈💪🙏🎯🌹😇💔😡\x7f👌ἐὶήιὲκἀίῃἴξ🙄Ｈ😠\ufeff\u2028😉😤⛺🙂\u3000تحكسة👮💙فزط😏🍾🎉😞\u2008🏾😅😭👻😥😔😓🏽🎆🍻🍽🎶🌺🤔😪\x08‑🐰🐇🐱🙆😨🙃💕𝘊𝘦𝘳𝘢𝘵𝘰𝘤𝘺𝘴𝘪𝘧𝘮𝘣💗💚地獄谷улкнПоАН🐾🐕😆ה🔗🚽歌舞伎🙈😴🏿🤗🇺🇸мυтѕ⤵🏆🎃😩\u200a🌠🐟💫💰💎эпрд\x95🖐🙅⛲🍰🤐👆🙌\u2002💛🙁👀🙊🙉\u2004ˢᵒʳʸᴼᴷᴺʷᵗʰᵉᵘ\x13🚬🤓\ue602😵άοόςέὸתמדףנרךצט😒͝🆕👅👥👄🔄🔤👉👤👶👲🔛🎓\uf0b7\uf04c\x9f\x10成都😣⏺😌🤑🌏😯ех😲Ἰᾶὁ💞🚓🔔📚🏀👐\u202d💤🍇\ue613小土豆🏡❔⁉\u202f👠》कर्मा🇹🇼🌸蔡英文🌞🎲レクサス😛外国人关系Сб💋💀🎄💜🤢َِьыгя不是\x9c\x9d🗑\u2005💃📣👿༼つ༽😰ḷЗз▱ц￼🤣卖温哥华议会下降你失去所有的钱加拿大坏税骗子🐝ツ🎅\x85🍺آإشء🎵🌎͟ἔ油别克🤡🤥😬🤧й\u2003🚀🤴ʲшчИОРФДЯМюж😝🖑ὐύύ特殊作戦群щ💨圆明园קℐ🏈😺🌍⏏ệ🍔🐮🍁🍆🍑🌮🌯🤦\u200d𝓒𝓲𝓿𝓵안영하세요ЖљКћ🍀😫🤤ῦ我出生在了可以说普通话汉语好极🎼🕺🍸🥂🗽🎇🎊🆘🤠👩🖒🚪天一家⚲\u2006⚭⚆⬭⬯⏖新✀╌🇫🇷🇩🇪🇮🇬🇧😷🇨🇦ХШ🌐\x1f杀鸡给猴看ʁ𝗪𝗵𝗲𝗻𝘆𝗼𝘂𝗿𝗮𝗹𝗶𝘇𝗯𝘁𝗰𝘀𝘅𝗽𝘄𝗱📺ϖ\u2000үսᴦᎥһͺ\u2007հ\u2001ɩｙｅ൦ｌƽｈ𝐓𝐡𝐞𝐫𝐮𝐝𝐚𝐃𝐜𝐩𝐭𝐢𝐨𝐧Ƅᴨןᑯ໐ΤᏧ௦Іᴑ܁𝐬𝐰𝐲𝐛𝐦𝐯𝐑𝐙𝐣𝐇𝐂𝐘𝟎ԜТᗞ౦〔Ꭻ𝐳𝐔𝐱𝟔𝟓𝐅🐋ﬃ💘💓ё𝘥𝘯𝘶💐🌋🌄🌅𝙬𝙖𝙨𝙤𝙣𝙡𝙮𝙘𝙠𝙚𝙙𝙜𝙧𝙥𝙩𝙪𝙗𝙞𝙝𝙛👺🐷ℋ𝐀𝐥𝐪🚶𝙢Ἱ🤘ͦ💸ج패티Ｗ𝙇ᵻ👂👃ɜ🎫\uf0a7БУі🚢🚂ગુજરાતીῆ🏃𝓬𝓻𝓴𝓮𝓽𝓼☘﴾̯﴿₽\ue807𝑻𝒆𝒍𝒕𝒉𝒓𝒖𝒂𝒏𝒅𝒔𝒎𝒗𝒊👽😙\u200cЛ‒🎾👹⎌🏒⛸公寓养宠物吗🏄🐀🚑🤷操美𝒑𝒚𝒐𝑴🤙🐒欢迎来到阿拉斯ספ𝙫🐈𝒌𝙊𝙭𝙆𝙋𝙍𝘼𝙅ﷻ🦄巨收赢得白鬼愤怒要买额ẽ🚗🐳𝟏𝐟𝟖𝟑𝟕𝒄𝟗𝐠𝙄𝙃👇锟斤拷𝗢𝟳𝟱𝟬⦁マルハニチロ株式社⛷한국어ㄸㅓ니͜ʖ𝘿𝙔₵𝒩ℯ𝒾𝓁𝒶𝓉𝓇𝓊𝓃𝓈𝓅ℴ𝒻𝒽𝓀𝓌𝒸𝓎𝙏ζ𝙟𝘃𝗺𝟮𝟭𝟯𝟲👋🦊多伦🐽🎻🎹⛓🏹🍷🦆为和中友谊祝贺与其想象对法如直接问用自己猜本传教士没积唯认识基督徒曾经让相信耶稣复活死怪他但当们聊些政治题时候战胜因圣把全堂结婚孩恐惧且栗谓这样还♾🎸🤕🤒⛑🎁批判检讨🏝🦁🙋😶쥐스탱트뤼도석유가격인상이경제황을렵게만들지않록잘관리해야합다캐나에서대마초와화약금의품런성분갈때는반드시허된사용🔫👁凸ὰ💲🗯𝙈Ἄ𝒇𝒈𝒘𝒃𝑬𝑶𝕾𝖙𝖗𝖆𝖎𝖌𝖍𝖕𝖊𝖔𝖑𝖉𝖓𝖐𝖜𝖞𝖚𝖇𝕿𝖘𝖄𝖛𝖒𝖋𝖂𝕴𝖟𝖈𝕸👑🚿💡知彼百\uf005𝙀𝒛𝑲𝑳𝑾𝒋𝟒😦𝙒𝘾𝘽🏐𝘩𝘨ὼṑ𝑱𝑹𝑫𝑵𝑪🇰🇵👾ᓇᒧᔭᐃᐧᐦᑳᐨᓃᓂᑲᐸᑭᑎᓀᐣ🐄🎈🔨🐎🤞🐸💟🎰🌝🛳点击查版🍭𝑥𝑦𝑧ＮＧ👣\uf020っ🏉ф💭🎥Ξ🐴👨🤳🦍\x0b🍩𝑯𝒒😗𝟐🏂👳🍗🕉🐲چی𝑮𝗕𝗴🍒ꜥⲣⲏ🐑⏰鉄リ事件ї💊「」\uf203\uf09a\uf222\ue608\uf202\uf099\uf469\ue607\uf410\ue600燻製シ虚偽屁理屈Г𝑩𝑰𝒀𝑺🌤𝗳𝗜𝗙𝗦𝗧🍊ὺἈἡχῖΛ⤏🇳𝒙ψՁմեռայինրւդձ冬至ὀ𝒁🔹🤚🍎𝑷🐂💅𝘬𝘱𝘸𝘷𝘐𝘭𝘓𝘖𝘹𝘲𝘫کΒώ💢ΜΟΝΑΕ🇱♲𝝈↴💒⊘Ȼ🚴🖕🖤🥘📍👈➕🚫🎨🌑🐻𝐎𝐍𝐊𝑭🤖🎎😼🕷ｇｒｎｔｉｄｕｆｂｋ𝟰🇴🇭🇻🇲𝗞𝗭𝗘𝗤👼📉🍟🍦🌈🔭《🐊🐍\uf10aლڡ🐦\U0001f92f\U0001f92a🐡💳ἱ🙇𝗸𝗟𝗠𝗷🥜さようなら🔼'

In [None]:
# str.translate tables: pad each "isolate" symbol with spaces, delete each "delete" symbol.
isolate_dict = {ord(symbol): ' ' + symbol + ' ' for symbol in symbols_to_isolate}
remove_dict = {ord(symbol): '' for symbol in symbols_to_delete}

In [None]:
# Clean every test comment with the string-level preprocess() defined above.
# This must run before the TF-IDF section redefines `preprocess` for DataFrames.
x_test = test_df['comment_text'].progress_apply(lambda x:preprocess(x))

# Multiplier for the main-target term in custom_loss().
loss_weight = 3.209226860170181

# Highest word index given a row in the embedding matrices (see build_matrix).
max_features = 400000

In [None]:
# NOTE: this fresh Keras tokenizer is replaced in the next cell by the one
# loaded from tokenizer.pickle. It also relies on `text` still being the
# keras.preprocessing module, which the TF-IDF section later rebinds.
tokenizer = text.Tokenizer(num_words = max_features, filters='',lower=False)

In [None]:
# Load the fitted tokenizer.
# NOTE(security): pickle.load executes arbitrary code; only load trusted files.
with open('../input/bilstm-crawl-paragram-0/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

crawl_matrix, unknown_words_crawl = build_matrix(tokenizer.word_index, CRAWL_EMBEDDING_PATH)
print('n unknown words (crawl): ', len(unknown_words_crawl))

paragram_matrix, unknown_words_paragram = build_matrix(tokenizer.word_index, PARAGRAM_EMBEDDING_PATH)
print('n unknown words (paragram): ', len(unknown_words_paragram))

# max_features is already set to a non-zero value above, so the `or` fallback
# is not taken here.
max_features = max_features or len(tokenizer.word_index) + 1
# Bare expression in the middle of a cell: evaluated but not displayed.
max_features

# Concatenate the two embeddings along the feature axis.
embedding_matrix = np.concatenate([crawl_matrix, paragram_matrix], axis=-1)
# Bare expression in the middle of a cell: evaluated but not displayed.
embedding_matrix.shape

# Free the per-embedding matrices now that the concatenated copy exists.
del crawl_matrix
del paragram_matrix
gc.collect()

In [None]:
# Convert the cleaned comments to sequences of word indices with the loaded tokenizer.
x_test_seq = tokenizer.texts_to_sequences(x_test)

In [None]:
# Padded width of every sequence; SequenceBucketCollator reads this global.
maxlen = 300
# Unpadded length of each sequence (may exceed maxlen for long comments).
test_lengths = torch.from_numpy(np.array([len(x) for x in x_test_seq]))

# Pad / truncate every sequence to exactly `maxlen` ids.
x_test_padded = torch.from_numpy(sequence.pad_sequences(x_test_seq, maxlen=maxlen))

In [None]:
# Batch size of the test loader; the test_model_* functions also read this
# global to place each batch's predictions.
batch_size = 512
test_dataset = data.TensorDataset(x_test_padded, test_lengths)
# Trim each batch according to the longest sequence it contains.
test_collator = SequenceBucketCollator(lambda lenghts: lenghts.max(), sequence_index=0, length_index=1)
# shuffle=False keeps predictions aligned with the rows of test_df.
test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=test_collator)

# The test loader is passed as both the train and validation loader.
databunch = DataBunch(train_dl=test_loader, valid_dl=test_loader, collate_fn=test_collator)

In [None]:
def test_model_cp(model_num, model_epoch,output_dim=7):
    """Predict the test set with one Crawl + Paragram BiLSTM checkpoint.

    Builds a ``NeuralNet`` on the current module-level ``embedding_matrix``,
    loads ``../input/bilstm-crawl-paragram-{model_num}/model_1_{model_epoch}.pth``
    and runs it over ``test_loader`` on the GPU.

    Args:
        model_num: index used in the checkpoint directory name.
        model_epoch: epoch number used in the checkpoint file name.
        output_dim: number of output columns (main target + auxiliary targets).

    Returns:
        ``np.ndarray`` of shape ``(len(x_test_seq), output_dim)`` holding
        sigmoid-activated predictions in test-set order.
    """
    model = NeuralNet(embedding_matrix, 6).cuda()
    checkpoint = torch.load('../input/bilstm-crawl-paragram-{}/model_1_{}.pth'.format(model_num, model_epoch))
    model.load_state_dict(checkpoint['model'])
    model.eval()

    test_preds = np.zeros((len(x_test_seq), output_dim))
    # Track the write position from actual batch sizes instead of assuming
    # the global `batch_size` matches the loader's batch size.
    offset = 0
    with torch.no_grad():
        for x_batch in test_loader:
            X = x_batch[0].cuda()
            y_pred = sigmoid(model(X).detach().cpu().numpy())
            test_preds[offset:offset + len(y_pred), :] = y_pred
            offset += len(y_pred)
    return test_preds

In [None]:
# One column per model; only the main-target prediction (column 0) is kept.
# Column-name prefixes matter: custom_predict() groups models by prefix.
test_model_preds = pd.DataFrame()
test_model_preds['lstm_cp_0_3'] = test_model_cp(0, 3)[:, 0]
test_model_preds['lstm_cp_1_3'] = test_model_cp(1, 3)[:, 0]
test_model_preds['lstm_cp_2_3'] = test_model_cp(2, 3)[:, 0]

## BiLSTM clip target [0.05, 0.95]

In [None]:
# since this one - like this
LSTM_UNITS = 256
DENSE_HIDDEN_UNITS = 1536

In [None]:
def test_model_clip(model_num, model_epoch,output_dim=7):
    """Predict the test set with one clipped-target BiLSTM checkpoint.

    Builds a ``NeuralNet`` on the current module-level ``embedding_matrix``,
    loads ``../input/lstm-clip-{model_num}/model_{model_epoch}.pth`` and runs
    it over ``test_loader`` on the GPU.

    Args:
        model_num: index used in the checkpoint directory name.
        model_epoch: epoch number used in the checkpoint file name.
        output_dim: number of output columns (main target + auxiliary targets).

    Returns:
        ``np.ndarray`` of shape ``(len(x_test_seq), output_dim)`` holding
        sigmoid-activated predictions in test-set order.
    """
    model = NeuralNet(embedding_matrix, 6).cuda()
    checkpoint = torch.load('../input/lstm-clip-{}/model_{}.pth'.format(model_num, model_epoch))
    model.load_state_dict(checkpoint['model'])
    model.eval()

    test_preds = np.zeros((len(x_test_seq), output_dim))
    # Track the write position from actual batch sizes instead of assuming
    # the global `batch_size` matches the loader's batch size.
    offset = 0
    with torch.no_grad():
        for x_batch in test_loader:
            X = x_batch[0].cuda()
            y_pred = sigmoid(model(X).detach().cpu().numpy())
            test_preds[offset:offset + len(y_pred), :] = y_pred
            offset += len(y_pred)
    return test_preds

In [None]:
# Main-target predictions of the three clipped-target checkpoints.
test_model_preds['lstm_clip_0_2'] = test_model_clip(0, 2)[:, 0]
test_model_preds['lstm_clip_1_3'] = test_model_clip(1, 3)[:, 0]
test_model_preds['lstm_clip_2_3'] = test_model_clip(2, 3)[:, 0]

## BiLSTM Numberbatch

In [None]:
# Swap the module-level embedding matrix for the Numberbatch vectors; every
# NeuralNet built after this cell uses them.
numberbatch_matrix, unknown_words_numberbatch = build_matrix(tokenizer.word_index, NUMBERBATCH_EMBEDDING_PATH)
print('n unknown words (numberbatch): ', len(unknown_words_numberbatch))

embedding_matrix = numberbatch_matrix
print(embedding_matrix.shape)

# Only removes the extra name; the array stays alive through embedding_matrix.
del numberbatch_matrix
gc.collect()

In [None]:
def test_model_nb(model_num, model_epoch,output_dim=7):
    """Predict the test set with one Numberbatch BiLSTM checkpoint.

    Builds a ``NeuralNet`` on the current module-level ``embedding_matrix``,
    loads ``../input/lstm-numberbatch-{model_num}/model_{model_epoch}.pth`` and
    runs it over ``test_loader`` on the GPU.

    Args:
        model_num: index used in the checkpoint directory name.
        model_epoch: epoch number used in the checkpoint file name.
        output_dim: number of output columns (main target + auxiliary targets).

    Returns:
        ``np.ndarray`` of shape ``(len(x_test_seq), output_dim)`` holding
        sigmoid-activated predictions in test-set order.
    """
    model = NeuralNet(embedding_matrix, 6).cuda()
    checkpoint = torch.load('../input/lstm-numberbatch-{}/model_{}.pth'.format(model_num, model_epoch))
    model.load_state_dict(checkpoint['model'])
    model.eval()

    test_preds = np.zeros((len(x_test_seq), output_dim))
    # Track the write position from actual batch sizes instead of assuming
    # the global `batch_size` matches the loader's batch size.
    offset = 0
    with torch.no_grad():
        for x_batch in test_loader:
            X = x_batch[0].cuda()
            y_pred = sigmoid(model(X).detach().cpu().numpy())
            test_preds[offset:offset + len(y_pred), :] = y_pred
            offset += len(y_pred)
    return test_preds

In [None]:
# Main-target predictions of the three Numberbatch checkpoints.
test_model_preds['lstm_nb_0_4'] = test_model_nb(0, 4)[:, 0]
test_model_preds['lstm_nb_1_4'] = test_model_nb(1, 4)[:, 0]
test_model_preds['lstm_nb_2_4'] = test_model_nb(2, 4)[:, 0]

## BiLSTM Twitter

In [None]:
# Swap the module-level embedding matrix for the Twitter GloVe vectors, which
# are 200-dimensional (hence dim=200).
twitter_matrix, unknown_words_twitter = build_matrix(tokenizer.word_index, TWITTER_EMBEDDING_PATH, dim=200)
print('n unknown words (twitter): ', len(unknown_words_twitter))

embedding_matrix = twitter_matrix
print(embedding_matrix.shape)

# Only removes the extra name; the array stays alive through embedding_matrix.
del twitter_matrix
gc.collect()

In [None]:
def test_model_tw(model_num, model_epoch,output_dim=7):
    """Predict the test set with one Twitter-GloVe BiLSTM checkpoint.

    Builds a ``NeuralNet`` on the current module-level ``embedding_matrix``,
    loads ``../input/lstm-twitter-{model_num}/model_{model_epoch}.pth`` and
    runs it over ``test_loader`` on the GPU.

    Args:
        model_num: index used in the checkpoint directory name.
        model_epoch: epoch number used in the checkpoint file name.
        output_dim: number of output columns (main target + auxiliary targets).

    Returns:
        ``np.ndarray`` of shape ``(len(x_test_seq), output_dim)`` holding
        sigmoid-activated predictions in test-set order.
    """
    model = NeuralNet(embedding_matrix, 6).cuda()
    checkpoint = torch.load('../input/lstm-twitter-{}/model_{}.pth'.format(model_num, model_epoch))
    model.load_state_dict(checkpoint['model'])
    model.eval()

    test_preds = np.zeros((len(x_test_seq), output_dim))
    # Track the write position from actual batch sizes instead of assuming
    # the global `batch_size` matches the loader's batch size.
    offset = 0
    with torch.no_grad():
        for x_batch in test_loader:
            X = x_batch[0].cuda()
            y_pred = sigmoid(model(X).detach().cpu().numpy())
            test_preds[offset:offset + len(y_pred), :] = y_pred
            offset += len(y_pred)
    return test_preds

In [None]:
# Main-target predictions of the three Twitter-GloVe checkpoints.
test_model_preds['lstm_tw_0_3'] = test_model_tw(0, 3)[:, 0]
test_model_preds['lstm_tw_1_3'] = test_model_tw(1, 3)[:, 0]
test_model_preds['lstm_tw_2_3'] = test_model_tw(2, 3)[:, 0]

## TF-IDF

In [None]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Token -> replacement map used by the DataFrame-level preprocess().
# The original literal repeated 18 keys; Python silently keeps only the last
# value for a repeated key, so the shadowed entries were dead. They are removed
# here and each key carries the value that was actually in effect. Key order
# and contents of the resulting dict are unchanged.
repl = {
    # Emoticons.
    "&lt;3": " heart ",
    ":d": " smile ",
    ":dd": " smile ",
    ":p": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    ":/": " worry ",
    ":&gt;": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",
    # NOTE: preprocess() looks tokens up by exact string equality, so these
    # regex-looking keys only match a token spelled literally like the key.
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
    r"\bi'm\b": "i am",
    # Abbreviations and contractions.
    "m": "am",
    "r": "are",
    "u": "you",
    "haha": "ha",
    "hahaha": "ha",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
    "i'm": "i am",
    "i'll" : "i will",
    "its" : "it is",
    "it's" : "it is",
    "'s" : " is",
    "that's" : "that is",
    "weren't" : "were not",
}

# Matches any single ASCII punctuation character or one of the listed symbols.
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s):
    """Split ``s`` on whitespace after padding every punctuation mark with spaces.

    NOTE: a later cell redefines ``tokenize`` to return a string instead of a list.
    """
    spaced = re_tok.sub(r' \1 ', s)
    return spaced.split()

# (regex, replacement) byte-string pairs used to expand English contractions.
# Raw byte literals are used so that `\w` and `\g<1>` are not parsed as
# (invalid) string escape sequences; the resulting bytes are unchanged.
# NOTE(review): prepare_for_char_n_gram lower-cases its input before applying
# these, so the upper-case alternatives ('US', 'IT', '(W|w)', ...) presumably
# never match there.
cont_patterns = [
        (rb'US', rb'United States'),
        (rb'IT', rb'Information Technology'),
        (rb"(W|w)on't", rb'will not'),
        (rb"(C|c)an't", rb'can not'),
        (rb"(I|i)'m", rb'i am'),
        (rb"(A|a)in't", rb'is not'),
        (rb"(\w+)'ll", rb'\g<1> will'),
        (rb"(\w+)n't", rb'\g<1> not'),
        (rb"(\w+)'ve", rb'\g<1> have'),
        (rb"(\w+)'s", rb'\g<1> is'),
        (rb"(\w+)'re", rb'\g<1> are'),
        (rb"(\w+)'d", rb'\g<1> would'),
    ]
# Pre-compiled form of cont_patterns. The loop variable is named `replacement`
# so it cannot be confused with the module-level `repl` dict.
patterns = [(re.compile(regex), replacement) for (regex, replacement) in cont_patterns]

def prepare_for_char_n_gram(text):
    """Normalize a comment into ``#word# #word#`` form for char n-gram features.

    Steps, all performed on UTF-8 bytes: lower-case, turn newline / tab /
    backspace / carriage return into spaces, expand contractions using the
    module-level ``patterns``, strip ASCII punctuation from every token,
    replace digit runs with a space, collapse whitespace, drop trailing
    whitespace, then surround every space-separated word with ``#``.

    Args:
        text: the raw comment string.

    Returns:
        The cleaned string, e.g. ``"my name"`` -> ``"#my# #name#"``.
    """
    # 1. Lower-case and switch to bytes.
    clean = bytes(text.lower(), encoding="utf-8")
    # 2. Control characters become spaces (b"\b" is the backspace byte 0x08).
    for control in (b"\n", b"\t", b"\b", b"\r"):
        clean = clean.replace(control, b" ")
    # 3. Replace english contractions.
    for (pattern, replacement) in patterns:
        clean = re.sub(pattern, replacement, clean)
    # 4. Drop ASCII punctuation inside every token.
    exclude = re.compile(b'[%s]' % re.escape(bytes(string.punctuation, encoding='utf-8')))
    clean = b" ".join([exclude.sub(b'', token) for token in clean.split()])
    # 5. Drop numbers. Raw byte patterns avoid invalid `\d` / `\s` escapes.
    clean = re.sub(rb"\d+", b" ", clean)
    # 6. Collapse the extra spaces introduced above, then trim the end.
    clean = re.sub(rb'\s+', b' ', clean)
    clean = re.sub(rb'\s+$', b'', clean)
    # 7. Surround words with # signs: "my name" -> "#my# #name#".
    clean = re.sub(b" ", b"# #", clean)  # Replace space
    clean = b"#" + clean + b"#"  # add leading and trailing #

    return str(clean, 'utf-8')

def count_regexp_occ(regexp="", text=None):
    """Count the non-overlapping matches of ``regexp`` in ``text``."""
    return sum(1 for _ in re.finditer(regexp, text))

def get_indicators_and_clean_comments(df):
    """Add hand-crafted indicator columns and a cleaned comment column to ``df``.

    Works in place on ``df`` (which must have a ``comment_text`` column) and
    returns ``None``. Adds raw length counts, one regex-occurrence count per
    entry of the table below, ``clean_comment`` (via
    ``prepare_for_char_n_gram``) and length / character statistics of the
    cleaned text.
    """
    # Length in words and characters of the raw comment.
    df["raw_word_len"] = df["comment_text"].apply(lambda x: len(x.split()))
    df["raw_char_len"] = df["comment_text"].apply(lambda x: len(x))

    # (column, regex) pairs; each column is the number of regex matches.
    regex_indicators = [
        # Upper-case letters.
        ("nb_upper", r"[A-Z]"),
        # f..k / s..k style words (also catches e.g. folk, fork).
        ("nb_fk", r"[Ff]\S{2}[Kk]"),
        ("nb_sk", r"[Ss]\S{2}[Kk]"),
        ("nb_dk", r"[dD]ick"),
        # Direct address and specific vocabulary.
        ("nb_you", r"\W[Yy]ou\W"),
        ("nb_mother", r"\Wmother\W"),
        ("nb_ng", r"\Wnigger\W"),
        # Leading colons.
        ("start_with_columns", r"^\:+"),
        # Time stamps and dates such as "18:44, 8 December 2010".
        ("has_timestamp", r"\d{2}|:\d{2}"),
        ("has_date_long", r"\D\d{2}:\d{2}, \d{1,2} \w+ \d{4}"),
        ("has_date_short", r"\D\d{1,2} \w+ \d{4}"),
        # E-mail addresses.
        ("has_mail", r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'),
        # Words wrapped in == ... == or """" ... """".
        ("has_emphasize_equal", r"\={2}.+\={2}"),
        ("has_emphasize_quotes", r"\"{4}\S+\"{4}"),
    ]
    for column, pattern in regex_indicators:
        df[column] = df["comment_text"].apply(lambda x, p=pattern: count_regexp_occ(p, x))

    # Cleaned text used for the n-gram features.
    df["clean_comment"] = df["comment_text"].apply(lambda x: prepare_for_char_n_gram(x))

    # Length in words and characters after cleaning.
    df["clean_word_len"] = df["clean_comment"].apply(lambda x: len(x.split()))
    df["clean_char_len"] = df["clean_comment"].apply(lambda x: len(x))
    # Number of distinct characters, and its ratio to the (capped) length.
    distinct_chars = df["clean_comment"].apply(lambda x: len(set(x)))
    capped_len = df["clean_comment"].apply(lambda x: 1 + min(99, len(x)))
    df["clean_chars"] = distinct_chars
    df["clean_chars_ratio"] = distinct_chars / capped_len

# Names of the numeric indicator columns created by
# get_indicators_and_clean_comments(); the same list is redefined in a later cell.
fts = ["raw_word_len", "raw_char_len", "nb_upper", "nb_fk", "nb_sk", "nb_dk", "nb_you", "nb_mother", "nb_ng", "start_with_columns",
       "has_timestamp", "has_date_long", "has_date_short", "has_mail", "has_emphasize_equal", "has_emphasize_quotes", "clean_word_len",
       "clean_char_len", "clean_chars", "clean_chars_ratio"]
    
def preprocess(df):
    """Normalize ``df['comment_text']`` in place and add the indicator features.

    For every comment: split on whitespace, lower-case each token, drop tokens
    starting with ``http`` or ``www``, substitute tokens found in the
    module-level ``repl`` dict, re-join with a space after every token, and
    finally delete every character that is not an ASCII letter, space, ``?``
    or ``!``. ``get_indicators_and_clean_comments`` is then run on the result.

    NOTE: this definition replaces the earlier string-level ``preprocess(x)``;
    cells that need that version must run before this one.

    Args:
        df: DataFrame with a ``comment_text`` column. Modified in place.

    Returns:
        The same ``df``.
    """
    normalized = []
    for comment in tqdm(df["comment_text"].tolist()):
        kept = []
        for token in str(comment).split():
            token = str(token).lower()
            if token[:4] == 'http' or token[:3] == 'www':
                continue
            # Direct dict lookup instead of scanning a list of keys per token.
            kept.append(repl.get(token, token))
        # Every token, including the last, is followed by one space; the
        # length features computed downstream depend on this.
        joined = "".join(token + " " for token in kept)
        normalized.append(re.sub('[^a-zA-Z ?!]+', '', joined.lower()))
    df["comment_text"] = normalized

    get_indicators_and_clean_comments(df)

    return df

In [None]:
# DataFrame-level preprocess: rewrites comment_text in place and adds the
# indicator and clean_comment columns. Not idempotent: re-running this cell
# cleans the already-cleaned text again.
test_df = preprocess(test_df)

In [None]:
# Indicator columns fed to the GBM next to the TF-IDF features
# (same list as the one defined with the feature functions).
fts = ["raw_word_len", "raw_char_len", "nb_upper", "nb_fk", "nb_sk", "nb_dk", "nb_you", "nb_mother", "nb_ng", "start_with_columns",
       "has_timestamp", "has_date_long", "has_date_short", "has_mail", "has_emphasize_equal", "has_emphasize_quotes", "clean_word_len",
       "clean_char_len", "clean_chars", "clean_chars_ratio"]

# Strip the '#' word markers added by prepare_for_char_n_gram.
test_text = test_df['clean_comment'].apply(lambda x: re.sub('#', '', x)).fillna('')

# NOTE: this freshly configured vectorizer is replaced right below by the
# fitted one loaded from disk.
word_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        strip_accents='unicode',
        analyzer='word',
        min_df=5,
        ngram_range=(1, 2),
        max_features=60000)

# NOTE(security): pickle.load executes arbitrary code; only load trusted files.
with open('../input/jigsaw-tfidf-models/word_vectorizer.pickle', 'rb') as handle:
    word_vectorizer = pickle.load(handle)

test_word_features = word_vectorizer.transform(test_text)
# Indicator columns first, then the TF-IDF columns.
test_features = hstack([test_df[fts], test_word_features]).tocsr()
del test_word_features

with open('../input/jigsaw-tfidf-models/gbm_model.pickle', 'rb') as handle:
    gbm = pickle.load(handle)
    
test_model_preds['tfidf_gbm'] = gbm.predict(test_features)

# Same punctuation regex as the earlier `re_tok`. It was previously bound to
# the name `text`, which shadowed the imported `keras.preprocessing.text`
# module and broke re-running the cell that calls `text.Tokenizer(...)`.
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s):
    """Return ``s`` with every punctuation mark padded by a space on each side.

    Unlike the earlier ``tokenize`` this returns a string, not a list of tokens.
    """
    return re_tok.sub(r' \1 ', s)

# NOTE: this freshly configured vectorizer is replaced right below by the
# fitted one loaded from disk.
word_vectorizer = TfidfVectorizer(ngram_range=(1,2),
               min_df=5, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1, max_features=50000)

# NOTE(security): pickle.load executes arbitrary code; only load trusted files.
with open('../input/simple-tfidf-models/word_vectorizer.pickle', 'rb') as handle:
    word_vectorizer = pickle.load(handle)

# The "simple" models use the normalized comment_text, not clean_comment.
test_tfidf = word_vectorizer.transform(test_df['comment_text'].fillna(''))

# NOTE: this unfitted estimator is replaced right below by the loaded model.
lr = LogisticRegression(solver='lbfgs', random_state=13)
with open('../input/simple-tfidf-models/lr_model.pickle', 'rb') as handle:
    lr = pickle.load(handle)

# Column 1 of predict_proba is the probability of the positive class.
test_model_preds['tfidf_lr_simple'] = lr.predict_proba(test_tfidf)[:, 1]

with open('../input/simple-tfidf-models/gbm_model.pickle', 'rb') as handle:
    gbm = pickle.load(handle)
    
test_model_preds['tfidf_gbm_simple'] = gbm.predict(test_tfidf)

class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    """Logistic regression on features scaled by Naive Bayes log-count ratios.

    ``fit`` computes, per feature, the log of the ratio between its smoothed
    mean count in class 1 and in class 0, multiplies the features by that
    ratio and fits a ``LogisticRegression`` on the result. Prediction applies
    the same scaling before delegating to the fitted classifier.

    Args:
        C: inverse regularization strength passed to ``LogisticRegression``.
        dual: passed to ``LogisticRegression``.
            NOTE(review): confirm the default solver of the installed
            scikit-learn accepts ``dual=True`` before refitting.
        n_jobs: passed to ``LogisticRegression``.
    """

    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, x):
        """Return class labels for the sparse feature matrix ``x``."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        """Return class probabilities for the sparse feature matrix ``x``."""
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        """Fit the log-count ratio and the underlying logistic regression.

        Args:
            x: feature matrix (sparse input is accepted).
            y: binary labels, 0 or 1.

        Returns:
            ``self``.
        """
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            # Smoothed per-feature sum over the rows of class y_i.
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        # `csr_matrix` is the name this file imports from scipy.sparse; the
        # previous `sparse.csr_matrix` raised NameError (`sparse` is undefined).
        self._r = csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        return self

# NOTE: this unfitted instance is replaced right below by the loaded model.
# The NbSvmClassifier class must be defined before unpickling.
NbSvm = NbSvmClassifier(C=1.5, dual=True, n_jobs=-1)
# NOTE(security): pickle.load executes arbitrary code; only load trusted files.
with open('../input/simple-tfidf-nbsvm/nbsvm_model.pickle', 'rb') as handle:
    NbSvm = pickle.load(handle)
    
# Column 1 of predict_proba is the probability of the positive class.
test_model_preds['tfidf_nbsvm_simple'] = NbSvm.predict_proba(test_tfidf)[:, 1]

## BERT

In [None]:
! md5sum ../input/jigsaw2019code/*.py
! ls ../input/
! find ../input/ -name config.json | grep -v deps

In [None]:
deps_path = '../input/kagglejigsaw2019deps/kaggle-jigsaw-2019-deps/kaggle-jigsaw-2019-deps/'
! pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" $deps_path/apex
! pip install $deps_path/pytorch-pretrained-BERT/

In [None]:
# Forwarded to bert.py as `--test-size` by the shell cells below.
test_size = 0

In [None]:
! python ../input/jigsaw2019code/bert.py \
  _runs/bert-base-uncased-pretrained-ep1 \
  --model ../input/jigsaw2019bertbaseuncasedpretrainedep1/bert-base-uncased-pretrained-ep1/bert-base-uncased-pretrained-ep1 \
  --submission --test-size $test_size

In [None]:
! python ../input/jigsaw2019code/bert.py \
  _runs/gpt2-ep2-lr8e-5 \
  --bucket 0 \
  --model ../input/gpt2ep2lr8e5/gpt2-ep2-lr8e-5/gpt2-ep2-lr8e-5/ \
  --submission --test-size $test_size

In [None]:
! python ../input/jigsaw2019code/bert.py \
  _runs/bert-base-cased-pretrained-ep1 \
  --model ../input/bertbasecasedpretrainedep1/bert-base-cased-pretrained-ep1/bert-base-cased-pretrained-ep1/ \
  --submission --test-size $test_size

In [None]:
! python ../input/jigsaw2019code/bert.py \
  _runs/resume-lr0.1e-5-from-bert-large-pretrained-uncased-ep1-lr0.5e-5-as16 \
  --model ../input/bertluncased1redo/ \
  --submission --test-size $test_size

In [None]:
! python ../input/jigsaw2019code/bert.py \
  _runs/bert-base-uncased-fresh-ep1 \
  --model ../input/bertbaseuncasedfreshep1/bert-base-uncased-fresh-ep1/bert-base-uncased-fresh-ep1/ \
  --submission --test-size $test_size

In [None]:
# Read the 'prediction' column of each submission file written by the bert.py runs above.
bert_uncased1 = pd.read_csv('_runs/bert-base-uncased-pretrained-ep1/submission.csv')['prediction']
bert_cased1 = pd.read_csv('_runs/bert-base-cased-pretrained-ep1/submission.csv')['prediction']
bert_uncased_fresh1 = pd.read_csv('_runs/bert-base-uncased-fresh-ep1/submission.csv')['prediction']
bert_large1 = pd.read_csv('_runs/resume-lr0.1e-5-from-bert-large-pretrained-uncased-ep1-lr0.5e-5-as16/submission.csv')['prediction']

In [None]:
# Column order matters: custom_predict() pairs the 'bert*' columns, in frame
# order, with the weights in best_weights['bert'].
test_model_preds['bert_uncased1'] = bert_uncased1
test_model_preds['bert_cased1'] = bert_cased1
test_model_preds['bert_uncased_fresh1'] = bert_uncased_fresh1
test_model_preds['bert_large1'] = bert_large1

## GPT2

In [None]:
# Read the 'prediction' column written by the GPT-2 run above.
gpt22 = pd.read_csv('_runs/gpt2-ep2-lr8e-5/submission.csv')['prediction']

In [None]:
# custom_predict() reads this column by its exact name.
test_model_preds['gpt22'] = gpt22

In [None]:
# Persist the per-model predictions next to the final submission.
test_model_preds.to_csv('test_model_preds.csv', index=False)

In [None]:
! cp _runs/bert-base-cased-pretrained-ep1/submission.csv bert-base-cased-pretrained-ep1.csv
! cp _runs/bert-base-uncased-pretrained-ep1/submission.csv bert-base-uncased-pretrained-ep1.csv
! cp _runs/bert-base-uncased-fresh-ep1/submission.csv bert-base-uncased-fresh-ep1.csv
! cp _runs/gpt2-ep2-lr8e-5/submission.csv gpt2-ep2-lr8e-5.csv
! cp _runs/resume-lr0.1e-5-from-bert-large-pretrained-uncased-ep1-lr0.5e-5-as16/submission.csv \
  resume-lr0.1e-5-from-bert-large-pretrained-uncased-ep1-lr0.5e-5-as16.csv

## Blend

In [None]:
# Blend weights consumed by custom_predict().
# Every key except 'blend' holds one weight per model column of that family,
# in the order the columns were added to test_model_preds; custom_predict
# normalizes each list by its sum.
# 'blend' holds one weight per family, in the order
# cp, clip, nb, tw, tfidf, bert, gpt. Some weights are negative.
best_weights = {
    'cp': [50, 40, 40],
    'clip': [40, 40, 44],
    'nb': [40, 39, 44],
    'tw': [40, 39, 40],
    'tfidf': [42, 44, 41, 37],
    'bert': [59, 30, -28, 65],
    'blend': [17, 33, -13, -1, -3, 132, 42]
}


def custom_predict(X_models, weights=best_weights, type_='linear'):
    """Two-level blend of the per-model predictions in ``X_models``.

    First, the columns of each model family (selected by column-name prefix)
    are combined with ``ensemble_predictions`` using that family's weights,
    normalized by their sum. Then the six family blends plus the ``gpt22``
    column are combined with ``weights['blend']``, again normalized by the sum.

    Args:
        X_models: DataFrame with one column per model.
        weights: dict with keys ``cp``, ``clip``, ``nb``, ``tw``, ``tfidf``,
            ``bert`` and ``blend``.
        type_: blending mode forwarded to ``ensemble_predictions``.

    Returns:
        The blended prediction for every row of ``X_models``.
    """
    def blend_family(prefix, key):
        # Weighted blend of every column whose name starts with `prefix`.
        family_columns = [col for col in X_models.columns if col.startswith(prefix)]
        family_weights = (np.array(weights[key]) + 1e-15) / (sum(weights[key]) + 1e-15)
        return ensemble_predictions(X_models[family_columns], weights=family_weights, type_=type_)

    families = [
        ('lstm_cp', 'cp'),
        ('lstm_clip', 'clip'),
        ('lstm_nb', 'nb'),
        ('lstm_tw', 'tw'),
        ('tfidf', 'tfidf'),
        ('bert', 'bert'),
    ]
    models = [blend_family(prefix, key) for prefix, key in families]
    model_bert = models[-1]
    models.append(X_models['gpt22'])

    # Second level: weighted sum of the family blends, then normalization.
    model_blend = np.zeros_like(model_bert) + 1e-15
    for i, model in enumerate(models):
        model_blend += weights['blend'][i] * model
    model_blend /= (sum(weights['blend']) + 1e-15)
    return model_blend

In [None]:
# Final submission: one blended prediction per test id, using the default
# best_weights and linear blending.
submission = pd.DataFrame.from_dict({
    'id': test_df['id'],
    'prediction': custom_predict(test_model_preds)
})
submission.to_csv("submission.csv", index=False)

submission.head()

In [None]:
! rm -rf _runs