# Install & Load Library

In [None]:
%load_ext autoreload
%autoreload 2

* ignore warnings

In [None]:
import warnings
# Suppress every warning category for the rest of the session. This keeps the
# cell output clean, but it also hides deprecation notices raised by the
# libraries imported below.
warnings.filterwarnings('ignore')

* load library

In [None]:
import os
import gc
import re
import sys
import pickle
import random

import pandas as pd
import numpy as np

from tqdm import tqdm, tqdm_notebook

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils import data
from torch.nn import functional as F

from nltk.tokenize.treebank import TreebankWordTokenizer

# Util functions

* seed-setting function

In [None]:
# set seed
def seed_everything(seed=42):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (``torch.manual_seed`` and
    ``torch.cuda.manual_seed``), stores the seed in the ``PYTHONHASHSEED``
    environment variable and turns on deterministic cuDNN kernels.

    Args:
        seed: Integer seed passed to every generator. Defaults to 42.
    """
    # The environment variable only influences interpreters started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)

    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    torch.backends.cudnn.deterministic = True

## Preprocessing

* preprocessing of public kernel

In [None]:
# preprocessing of public kernel
class Preprocess_pb_kernel:
    
    def __init__(self):
        """Create the tokenizer and the two character translation tables.

        ``isolate_dict`` maps the code point of every character in
        ``symbols_to_isolate`` to that character padded with one space on each
        side. ``remove_dict`` maps the code point of every character in
        ``symbols_to_delete`` to the empty string. Both tables are keyed by
        ``ord(c)``, which is the form ``str.translate`` expects.
        """
        self.tokenizer_preprocess = TreebankWordTokenizer()
   
        # Characters that are kept but surrounded by spaces so they become separate tokens.
        symbols_to_isolate = '.,?!-;*"…:—()%#$&_/@＼・ω+=”“[]^–>\\°<~•≠™ˈʊɒ∞§{}·τα❤☺ɡ|¢→̶`❥━┣┫┗Ｏ►★©―ɪ✔®\x96\x92●£♥➤´¹☕≈÷♡◐║▬′ɔː€۩۞†μ✒➥═☆ˌ◄½ʻπδηλσερνʃ✬ＳＵＰＥＲＩＴ☻±♍µº¾✓◾؟．⬅℅»Вав❣⋅¿¬♫ＣＭβ█▓▒░⇒⭐›¡₂₃❧▰▔◞▀▂▃▄▅▆▇↙γ̄″☹➡«φ⅓„✋：¥̲̅́∙‛◇✏▷❓❗¶˚˙）сиʿ✨。ɑ\x80◕！％¯−ﬂﬁ₁²ʌ¼⁴⁄₄⌠♭✘╪▶☭✭♪☔☠♂☃☎✈✌✰❆☙○‣⚓年∎ℒ▪▙☏⅛ｃａｓǀ℮¸ｗ‚∼‖ℳ❄←☼⋆ʒ⊂、⅔¨͡๏⚾⚽Φ×θ￦？（℃⏩☮⚠月✊❌⭕▸■⇌☐☑⚡☄ǫ╭∩╮，例＞ʕɐ̣Δ₀✞┈╱╲▏▕┃╰▊▋╯┳┊≥☒↑☝ɹ✅☛♩☞ＡＪＢ◔◡↓♀⬆̱ℏ\x91⠀ˤ╚↺⇤∏✾◦♬³の｜／∵∴√Ω¤☜▲↳▫‿⬇✧ｏｖｍ－２０８＇‰≤∕ˆ⚜☁'
        # Characters that are removed from the text entirely.
        symbols_to_delete = '\n🍕\r🐵😑\xa0\ue014\t\uf818\uf04a\xad😢🐶️\uf0e0😜😎👊\u200b\u200e😁عدويهصقأناخلىبمغر😍💖💵Е👎😀😂\u202a\u202c🔥😄🏻💥ᴍʏʀᴇɴᴅᴏᴀᴋʜᴜʟᴛᴄᴘʙғᴊᴡɢ😋👏שלוםבי😱‼\x81エンジ故障\u2009🚌ᴵ͞🌟😊😳😧🙀😐😕\u200f👍😮😃😘אעכח💩💯⛽🚄🏼ஜ😖ᴠ🚲‐😟😈💪🙏🎯🌹😇💔😡\x7f👌ἐὶήιὲκἀίῃἴξ🙄Ｈ😠\ufeff\u2028😉😤⛺🙂\u3000تحكسة👮💙فزط😏🍾🎉😞\u2008🏾😅😭👻😥😔😓🏽🎆🍻🍽🎶🌺🤔😪\x08‑🐰🐇🐱🙆😨🙃💕𝘊𝘦𝘳𝘢𝘵𝘰𝘤𝘺𝘴𝘪𝘧𝘮𝘣💗💚地獄谷улкнПоАН🐾🐕😆ה🔗🚽歌舞伎🙈😴🏿🤗🇺🇸мυтѕ⤵🏆🎃😩\u200a🌠🐟💫💰💎эпрд\x95🖐🙅⛲🍰🤐👆🙌\u2002💛🙁👀🙊🙉\u2004ˢᵒʳʸᴼᴷᴺʷᵗʰᵉᵘ\x13🚬🤓\ue602😵άοόςέὸתמדףנרךצט😒͝🆕👅👥👄🔄🔤👉👤👶👲🔛🎓\uf0b7\uf04c\x9f\x10成都😣⏺😌🤑🌏😯ех😲Ἰᾶὁ💞🚓🔔📚🏀👐\u202d💤🍇\ue613小土豆🏡❔⁉\u202f👠》कर्मा🇹🇼🌸蔡英文🌞🎲レクサス😛外国人关系Сб💋💀🎄💜🤢َِьыгя不是\x9c\x9d🗑\u2005💃📣👿༼つ༽😰ḷЗз▱ц￼🤣卖温哥华议会下降你失去所有的钱加拿大坏税骗子🐝ツ🎅\x85🍺آإشء🎵🌎͟ἔ油别克🤡🤥😬🤧й\u2003🚀🤴ʲшчИОРФДЯМюж😝🖑ὐύύ特殊作戦群щ💨圆明园קℐ🏈😺🌍⏏ệ🍔🐮🍁🍆🍑🌮🌯🤦\u200d𝓒𝓲𝓿𝓵안영하세요ЖљКћ🍀😫🤤ῦ我出生在了可以说普通话汉语好极🎼🕺🍸🥂🗽🎇🎊🆘🤠👩🖒🚪天一家⚲\u2006⚭⚆⬭⬯⏖新✀╌🇫🇷🇩🇪🇮🇬🇧😷🇨🇦ХШ🌐\x1f杀鸡给猴看ʁ𝗪𝗵𝗲𝗻𝘆𝗼𝘂𝗿𝗮𝗹𝗶𝘇𝗯𝘁𝗰𝘀𝘅𝗽𝘄𝗱📺ϖ\u2000үսᴦᎥһͺ\u2007հ\u2001ɩｙｅ൦ｌƽｈ𝐓𝐡𝐞𝐫𝐮𝐝𝐚𝐃𝐜𝐩𝐭𝐢𝐨𝐧Ƅᴨןᑯ໐ΤᏧ௦Іᴑ܁𝐬𝐰𝐲𝐛𝐦𝐯𝐑𝐙𝐣𝐇𝐂𝐘𝟎ԜТᗞ౦〔Ꭻ𝐳𝐔𝐱𝟔𝟓𝐅🐋ﬃ💘💓ё𝘥𝘯𝘶💐🌋🌄🌅𝙬𝙖𝙨𝙤𝙣𝙡𝙮𝙘𝙠𝙚𝙙𝙜𝙧𝙥𝙩𝙪𝙗𝙞𝙝𝙛👺🐷ℋ𝐀𝐥𝐪🚶𝙢Ἱ🤘ͦ💸ج패티Ｗ𝙇ᵻ👂👃ɜ🎫\uf0a7БУі🚢🚂ગુજરાતીῆ🏃𝓬𝓻𝓴𝓮𝓽𝓼☘﴾̯﴿₽\ue807𝑻𝒆𝒍𝒕𝒉𝒓𝒖𝒂𝒏𝒅𝒔𝒎𝒗𝒊👽😙\u200cЛ‒🎾👹⎌🏒⛸公寓养宠物吗🏄🐀🚑🤷操美𝒑𝒚𝒐𝑴🤙🐒欢迎来到阿拉斯ספ𝙫🐈𝒌𝙊𝙭𝙆𝙋𝙍𝘼𝙅ﷻ🦄巨收赢得白鬼愤怒要买额ẽ🚗🐳𝟏𝐟𝟖𝟑𝟕𝒄𝟗𝐠𝙄𝙃👇锟斤拷𝗢𝟳𝟱𝟬⦁マルハニチロ株式社⛷한국어ㄸㅓ니͜ʖ𝘿𝙔₵𝒩ℯ𝒾𝓁𝒶𝓉𝓇𝓊𝓃𝓈𝓅ℴ𝒻𝒽𝓀𝓌𝒸𝓎𝙏ζ𝙟𝘃𝗺𝟮𝟭𝟯𝟲👋🦊多伦🐽🎻🎹⛓🏹🍷🦆为和中友谊祝贺与其想象对法如直接问用自己猜本传教士没积唯认识基督徒曾经让相信耶稣复活死怪他但当们聊些政治题时候战胜因圣把全堂结婚孩恐惧且栗谓这样还♾🎸🤕🤒⛑🎁批判检讨🏝🦁🙋😶쥐스탱트뤼도석유가격인상이경제황을렵게만들지않록잘관리해야합다캐나에서대마초와화약금의품런성분갈때는반드시허된사용🔫👁凸ὰ💲🗯𝙈Ἄ𝒇𝒈𝒘𝒃𝑬𝑶𝕾𝖙𝖗𝖆𝖎𝖌𝖍𝖕𝖊𝖔𝖑𝖉𝖓𝖐𝖜𝖞𝖚𝖇𝕿𝖘𝖄𝖛𝖒𝖋𝖂𝕴𝖟𝖈𝕸👑🚿💡知彼百\uf005𝙀𝒛𝑲𝑳𝑾𝒋𝟒😦𝙒𝘾𝘽🏐𝘩𝘨ὼṑ𝑱𝑹𝑫𝑵𝑪🇰🇵👾ᓇᒧᔭᐃᐧᐦᑳᐨᓃᓂᑲᐸᑭᑎᓀᐣ🐄🎈🔨🐎🤞🐸💟🎰🌝🛳点击查版🍭𝑥𝑦𝑧ＮＧ👣\uf020っ🏉ф💭🎥Ξ🐴👨🤳🦍\x0b🍩𝑯𝒒😗𝟐🏂👳🍗🕉🐲چی𝑮𝗕𝗴🍒ꜥⲣⲏ🐑⏰鉄リ事件ї💊「」\uf203\uf09a\uf222\ue608\uf202\uf099\uf469\ue607\uf410\ue600燻製シ虚偽屁理屈Г𝑩𝑰𝒀𝑺🌤𝗳𝗜𝗙𝗦𝗧🍊ὺἈἡχῖΛ⤏🇳𝒙ψՁմեռայինրւդձ冬至ὀ𝒁🔹🤚🍎𝑷🐂💅𝘬𝘱𝘸𝘷𝘐𝘭𝘓𝘖𝘹𝘲𝘫کΒώ💢ΜΟΝΑΕ🇱♲𝝈↴💒⊘Ȼ🚴🖕🖤🥘📍👈➕🚫🎨🌑🐻𝐎𝐍𝐊𝑭🤖🎎😼🕷ｇｒｎｔｉｄｕｆｂｋ𝟰🇴🇭🇻🇲𝗞𝗭𝗘𝗤👼📉🍟🍦🌈🔭《🐊🐍\uf10aლڡ🐦\U0001f92f\U0001f92a🐡💳ἱ🙇𝗸𝗟𝗠𝗷🥜さようなら🔼'

        # str.translate tables keyed by code point.
        self.isolate_dict = {ord(c):f' {c} ' for c in symbols_to_isolate}
        self.remove_dict = {ord(c):f'' for c in symbols_to_delete}
    
    def _handle_punctuation(self, x):
        x = x.translate(self.remove_dict)
        x = x.translate(self.isolate_dict)
        return x

    def _handle_contractions(self, x):
        x = self.tokenizer_preprocess.tokenize(x)
        return x

    def _fix_quote(self, x):
        x = [x_[1:] if x_.startswith("'") else x_ for x_ in x]
        x = ' '.join(x)
        return x
    
    def preprocess(self, text):
        text = self._handle_punctuation(text)
        text = self._handle_contractions(text)
        text = self._fix_quote(text)

        return text

* preprocessing of sogna kernel

In [None]:
# preprocessing of sogna kernel
class Preprocess_sogna_kernel:
    
    def __init__(self):
        """Build the lookup tables used by the preprocessing methods.

        Sets four attributes:

        * ``contraction_mapping`` - token -> expansion table, merged with a copy
          whose keys and values have their first character upper-cased.
        * ``spaces`` - characters that ``_remove_space`` turns into a plain space.
        * ``special_punc_mappings`` - substring -> replacement table used by
          ``_clean_special_punctuations``.
        * ``punct`` - characters that ``_spacing_punctuations`` pads with spaces.
        """
        # Each contraction is listed with several separator characters (' , ; ´ ’).
        # _clean_contractions rewrites ’ ‘ ´ ` to ' before the lookup, so keys that
        # contain ´ or ’ cannot match there.
        contraction_mapping = {
            "'cause": 'because',
            ',cause': 'because',
            ';cause': 'because',
            '´cause': 'because',
            '’cause': 'because',

            "ain't": 'am not',
            'ain,t': 'am not', 
            'ain;t': 'am not',
            'ain´t': 'am not',
            'ain’t': 'am not',

            "aren't": 'are not', "arn't": "are not",
            'aren,t': 'are not', "arn,t": "are not",
            'aren;t': 'are not', "arn;t": "are not",
            'aren´t': 'are not', "arn´t": "are not",
            'aren’t': 'are not', "arn’t": "are not",

            "isn't": 'is not', "is'nt": "is not",
            'isn,t': 'is not', "is,nt": "is not",
            'isn;t': 'is not', "is;nt": "is not",
            'isn´t': 'is not', "is´nt": "is not",
            'isn’t': 'is not', "is’nt": "is not",

            "wasn't": 'was not', "was'nt": "was not",
            'wasn,t': 'was not', "was,nt": "was not",
            'wasn;t': 'was not', "was;nt": "was not",
            'wasn´t': 'was not', "was´nt": "was not",
            'wasn’t': 'was not', "was’nt": "was not",

            "weren't": 'were not',
            'weren,t': 'were not',
            'weren;t': 'were not',
            'weren´t': 'were not',
            'weren’t': 'were not',

            "didn't": 'did not', "d'int": "did not", "did'nt": "did not", "din't": "did not",
            'didn,t': 'did not', "d,int": "did not", "did,nt": "did not", "din,t": "did not",
            'didn;t': 'did not', "d;int": "did not", "did;nt": "did not", "din;t": "did not",
            'didn´t': 'did not', "d´int": "did not", "did´nt": "did not", "din´t": "did not",
            'didn’t': 'did not', "d’int": "did not", "did’nt": "did not", "din’t": "did not",

            "doesn't": 'does not', "doens't": "does not", "dosen't": "does not", "dosn't": "does not",
            'doesn,t': 'does not', "doens,t": "does not", "dosen,t": "does not", "dosn,t": "does not",
            'doesn;t': 'does not', "doens;t": "does not", "dosen;t": "does not", "dosn;t": "does not",
            'doesn´t': 'does not', "doens´t": "does not", "dosen´t": "does not", "dosn´t": "does not",
            'doesn’t': 'does not', "doens’t": "does not", "dosen’t": "does not", "dosn’t": "does not",

            "don't": 'do not', "dont't": "do not",
            'don,t': 'do not', "dont,t": "do not",
            'don;t': 'do not', "dont;t": "do not",
            'don´t': 'do not', "dont´t": "do not",
            'don’t': 'do not', "dont’t": "do not",

            'don""t': "do not",

            "hadn't": 'had not', "hadn't've": 'had not have', "hasn't": 'has not', "haven't": 'have not', "havn't": 'have not',
            'hadn,t': 'had not', 'hadn,t,ve': 'had not have', 'hasn,t': 'has not', 'haven,t': 'have not', "havn,t": 'have not',
            'hadn;t': 'had not', 'hadn;t;ve': 'had not have', 'hasn;t': 'has not', 'haven;t': 'have not', "havn;t": 'have not',
            'hadn´t': 'had not', 'hadn´t´ve': 'had not have', 'hasn´t': 'has not', 'haven´t': 'have not', "havn´t": 'have not',
            'hadn’t': 'had not', 'hadn’t’ve': 'had not have', 'hasn’t': 'has not', 'haven’t': 'have not', "havn’t": 'have not',

            "won't": 'will not', "will've": "will have", "won't've": "will not have",
            'won,t': 'will not', "will,ve": "will have", "won,t,ve": "will not have",
            'won;t': 'will not', "will;ve": "will have", "won;t;ve": "will not have",
            'won´t': 'will not', "will´ve": "will have", "won´t´ve": "will not have",
            'won’t': 'will not', "will’ve": "will have", "won’t’ve": "will not have",

            "wouldn't": 'would not', "would've": "would have", "wouldn't've": "would not have",
            'wouldn,t': 'would not', "would,ve": "would have", "wouldn,t,ve": "would not have",
            'wouldn;t': 'would not', "would;ve": "would have", "wouldn;t;ve": "would not have",
            'wouldn´t': 'would not', "would´ve": "would have", "wouldn´t´ve": "would not have",
            'wouldn’t': 'would not', "would’ve": "would have", "wouldn’t’ve": "would not have",

            "can't": 'cannot',"can't've": 'cannot have',
            'can,t': 'cannot','can,t,ve': 'cannot have',
            'can;t': 'cannot','can;t;ve': 'cannot have',
            'can´t': 'cannot','can´t´ve': 'cannot have',
            'can’t': 'cannot','can’t’ve': 'cannot have',

            "could've": 'could have', "couldn't": 'could not', "couldn't've": 'could not have', "could'nt": 'could not',
            'could,ve': 'could have', 'couldn,t': 'could not', 'couldn,t,ve': 'could not have', "could,nt": 'could not',
            'could;ve': 'could have', 'couldn;t': 'could not', 'couldn;t;ve': 'could not have', "could;nt": 'could not',
            'could´ve': 'could have', 'couldn´t': 'could not', 'couldn´t´ve': 'could not have', "could´nt": 'could not',
            'could’ve': 'could have', 'couldn’t': 'could not', 'couldn’t’ve': 'could not have', "could’nt": 'could not',

            "sha'n't": 'shall not', "shan't": 'shall not', "shan't've": "shall not have", 
            'sha,n,t': 'shall not', 'shan,t': 'shall not', "shan,t,ve": "shall not have", 
            'sha;n;t': 'shall not', 'shan;t': 'shall not', "shan;t;ve": "shall not have", 

            "should've": 'should have', "shouldn't": 'should not', "shouldn't've": "should not have", "shoudn't": "should not",
            'should,ve': 'should have', 'shouldn,t': 'should not', "shouldn,t,ve": "should not have", "shoudn,t": "should not",
            'should;ve': 'should have', 'shouldn;t': 'should not', "shouldn;t;ve": "should not have", "shoudn;t": "should not",

            "mayn't": 'may not',
            'mayn,t': 'may not',
            'mayn;t': 'may not',
            'mayn´t': 'may not',
            'mayn’t': 'may not',

            "might've": 'might have', "mightn't": 'might not', "mightn't've": "might not have", 
            'might,ve': 'might have', 'mightn,t': 'might not', "mightn,t,ve": "might not have", 
            'might;ve': 'might have', 'mightn;t': 'might not', "mightn;t;ve": "might not have", 

            "must've": 'must have', "mustn't": 'must not', "mustn't've": "must not have",  
            'must,ve': 'must have', 'mustn,t': 'must not', "mustn,t,ve": "must not have", 
            'must;ve': 'must have', 'mustn;t': 'must not', "mustn;t;ve": "must not have", 

            "needn't": 'need not', "needn't've": "need not have",
            'needn,t': 'need not', "needn,t,ve": "need not have",
            'needn;t': 'need not', "needn;t;ve": "need not have",

            "oughtn't": 'ought not', "oughtn't've": "ought not have",
            'oughtn,t': 'ought not', "oughtn,t,ve": "ought not have",
            'oughtn;t': 'ought not', "oughtn;t;ve": "ought not have",

            "he'd": 'he would', "he'd've": 'he would have', "he'll": 'he will', "he's": 'he is', "he'll've": "he will have",
            'he,d': 'he would', 'he,d,ve': 'he would have', 'he,ll': 'he will', 'he,s': 'he is', "he,ll,ve": "he will have",
            'he;d': 'he would', 'he;d;ve': 'he would have', 'he;ll': 'he will', 'he;s': 'he is', "he;ll;ve": "he will have",
            'he´d': 'he would', 'he´d´ve': 'he would have', 'he´ll': 'he will', 'he´s': 'he is', "he´ll´ve": "he will have",
            'he’d': 'he would', 'he’d’ve': 'he would have', 'he’ll': 'he will', 'he’s': 'he is', "he’ll’ve": "he will have",

            "she'd": 'she would', "she'll": 'she will' ,"she's": 'she is', "she'd've": "she would have", "she'll've": "she will have",
            'she,d': 'she would', 'she,ll': 'she will', 'she,s': 'she is', "she,d,ve": "she would have", "she,ll,ve": "she will have",
            'she;d': 'she would', 'she;ll': 'she will', 'she;s': 'she is', "she;d;ve": "she would have", "she;ll;ve": "she will have",
            'she´d': 'she would', 'she´ll': 'she will', 'she´s': 'she is', "she´d´ve": "she would have", "she´ll´ve": "she will have",
            'she’d': 'she would', 'she’ll': 'she will', 'she’s': 'she is', "she’d’ve": "she would have", "she’ll’ve": "she will have",

            "i'd": 'i would', "i'll": 'i will', "i'm": 'i am', "i've": 'i have', "i'd've": "i would have", "i'll've": "i will have", "i'ma": "i am", "i'am": 'i am', "i'l": "i will", "i'v": 'i have',
            'i,d': 'i would', 'i,ll': 'i will', 'i,m': 'i am', 'i,ve': 'i have', "i,d,ve": "i would have", "i,ll,ve": "i will have", "i,ma": "i am", "i,am": 'i am', "i,l": "i will", "i,v": 'i have',
            'i;d': 'i would', 'i;ll': 'i will', 'i;m': 'i am', 'i;ve': 'i have', "i;d;ve": "i would have", "i;ll;ve": "i will have", "i;ma": "i am", "i;am": 'i am', "i;l": "i will", "i;v": 'i have',
            'i´d': 'i would', 'i´ll': 'i will', 'i´m': 'i am', 'i´ve': 'i have', "i´d´ve": "i would have", "i´ll´ve": "i will have", "i´ma": "i am", "i´am": 'i am', "i´l": "i will", "i´v": 'i have',
            'i’d': 'i would', 'i’ll': 'i will', 'i’m': 'i am', 'i’ve': 'i have', "i’d’ve": "i would have", "i’ll’ve": "i will have", "i’ma": "i am", "i’am": 'i am', "i’l": "i will", "i’v": 'i have',

            'i""m': 'i am',

            "we'd": 'we would', "we'll": 'we will', "we're": 'we are', "we've": 'we have', "we'd've": "we would have", "we'll've": "we will have",
            'we,d': 'we would', 'we,ll': 'we will', 'we,re': 'we are', 'we,ve': 'we have', "we,d,ve": "we would have", "we,ll,ve": "we will have",
            'we;d': 'we would', 'we;ll': 'we will', 'we;re': 'we are', 'we;ve': 'we have', "we;d;ve": "we would have", "we;ll;ve": "we will have",
            'we´d': 'we would', 'we´ll': 'we will', 'we´re': 'we are', 'we´ve': 'we have', "we´d´ve": "we would have", "we´ll´ve": "we will have",
            'we’d': 'we would', 'we’ll': 'we will', 'we’re': 'we are', 'we’ve': 'we have', "we’d’ve": "we would have", "we’ll’ve": "we will have",

            "you'd": 'you would', "you'll": 'you will', "you're": 'you are', "you've": "you have", "your'e": "you are", "u're": "you are", "ya'll": "you all", "you'r": "you are",
            'you,d': 'you would', 'you,ll': 'you will', 'you,re': 'you are', "you,ve": "you have", "your,e": "you are", "u,re": "you are", "ya,ll": "you all", "you,r": "you are", 
            'you;d': 'you would', 'you;ll': 'you will', 'you;re': 'you are', "you;ve": "you have", "your;e": "you are", "u;re": "you are", "ya;ll": "you all", "you;r": "you are",
            'you´d': 'you would', 'you´ll': 'you will', 'you´re': 'you are', "you´ve": "you have", "your´e": "you are", "u´re": "you are", "ya´ll": "you all", "you´r": "you are",
            'you’d': 'you would', 'you’ll': 'you will', 'you’re': 'you are', "you’ve": "you have", "your’e": "you are", "u’re": "you are", "ya’ll": "you all", "you’r": "you are",

            "y'all": "you all", "y'know": "you know", "y'all'd": "you all would", "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
            "y,all": "you all", "y,know": "you know", "y,all,d": "you all would", "y,all,d,ve": "you all would have","y,all,re": "you all are","y,all,ve": "you all have", 
            "y;all": "you all", "y;know": "you know", "y;all;d": "you all would", "y;all;d;ve": "you all would have","y;all;re": "you all are","y;all;ve": "you all have", 
            "y´all": "you all", "y´know": "you know", "y´all´d": "you all would", "y´all´d´ve": "you all would have","y´all´re": "you all are","y´all´ve": "you all have", 
            "y’all": "you all", "y’know": "you know", "y’all’d": "you all would", "y’all’d’ve": "you all would have","y’all’re": "you all are","y’all’ve": "you all have", 

            "you.i": "you i",

            "they'd": 'they would', "they'll": 'they will', "they're": 'they are', "they've": 'they have', "they'd've": "they would have", "they'll've": "they will have",
            'they,d': 'they would', 'they,ll': 'they will', 'they,re': 'they are', 'they,ve': 'they have', "they,d,ve": "they would have", "they,ll,ve": "they will have",
            'they;d': 'they would', 'they;ll': 'they will', 'they;re': 'they are', 'they;ve': 'they have', "they;d;ve": "they would have", "they;ll;ve": "they will have",
            'they´d': 'they would', 'they´ll': 'they will', 'they´re': 'they are', 'they´ve': 'they have', "they´d´ve": "they would have", "they´ll´ve": "they will have",
            'they’d': 'they would', 'they’ll': 'they will', 'they’re': 'they are', 'they’ve': 'they have', "they’d’ve": "they would have", "they’ll’ve": "they will have",

            "it'd": 'it would', "it'll": 'it will', "it's": 'it is', "it'd've": "it would have", "it'll've": "it will have",
            'it,d': 'it would', 'it,ll': 'it will', 'it,s': 'it is', "it,d,ve": "it would have", "it,ll,ve": "it will have", 
            'it;d': 'it would', 'it;ll': 'it will', 'it;s': 'it is', "it;d;ve": "it would have", "it;ll;ve": "it will have",
            'it´d': 'it would', 'it´ll': 'it will', 'it´s': 'it is', "it´d´ve": "it would have", "it´ll´ve": "it will have",
            'it’d': 'it would', 'it’ll': 'it will', 'it’s': 'it is', "it’d’ve": "it would have", "it’ll’ve": "it will have",

            "this'll": "this all", "this's": "this is",
            "this,ll": "this all", "this,s": "this is",
            "this;ll": "this all", "this;s": "this is",

            "that'd": 'that would', "that's": 'that is', "that'll": "that will", "that'd've": "that would have",
            'that,d': 'that would', 'that,s': 'that is', "that,ll": "that will", "that,d,ve": "that would have",
            'that;d': 'that would', 'that;s': 'that is', "that;ll": "that will", "that;d;ve": "that would have",
            'that´d': 'that would', 'that´s': 'that is', "that´ll": "that will", "that´d´ve": "that would have",
            'that’d': 'that would', 'that’s': 'that is', "that’ll": "that will", "that’d’ve": "that would have",

            "there'd": 'there had', "there's": 'there is', "there'll": "there will","there're": "there are", "there'd've": "there would have",
            'there,d': 'there had', 'there,s': 'there is', "there,ll": "there will","there,re": "there are", "there,d,ve": "there would have",
            'there;d': 'there had', 'there;s': 'there is', "there;ll": "there will","there;re": "there are", "there;d;ve": "there would have",
            'there´d': 'there had', 'there´s': 'there is', "there´ll": "there will","there´re": "there are", "there´d´ve": "there would have",
            'there’d': 'there had', 'there’s': 'there is', "there’ll": "there will","there’re": "there are", "there’d’ve": "there would have",

            "here's": "here is", "here're": "here are",
            "here,s": "here is", "here,re": "here are",
            "here;s": "here is", "here;re": "here are",

            # NOTE(review): the next two rows repeat the apostrophe keys verbatim, so
            # they add nothing to the dict. Comma/semicolon variants (as in the
            # neighbouring groups) look intended - confirm against the training-time
            # preprocessing before changing.
            "when's": "when is", "when've": "when have", "when're": "when are",
            "when's": "when is", "when've": "when have", "when're": "when are",
            "when's": "when is", "when've": "when have", "when're": "when are",

            "where'd": 'where did', "where's": 'where is', "where've": "where have", "where're": "where are",
            'where,d': 'where did', 'where,s': 'where is', "where,ve": "where have", "where,re": "where are",
            'where;d': 'where did', 'where;s': 'where is', "where;ve": "where have", "where;re": "where are",

            "who'll": 'who will', "who's": 'who is', "who'd": "who would", "who're": "who are","who've": "who have", "who'll've": "who will have",
            'who,ll': 'who will', 'who,s': 'who is', "who,d": "who would", "who,re": "who are","who,ve": "who have", "who,ll,ve": "who will have",
            'who;ll': 'who will', 'who;s': 'who is', "who;d": "who would", "who;re": "who are","who;ve": "who have", "who;ll;ve": "who will have",

            "how'd": 'how did', "how'll": 'how will', "how's": 'how is', "how'd'y": "how do you",
            'how,d': 'how did', 'how,ll': 'how will', 'how,s': 'how is', "how,d,y": "how do you",
            'how;d': 'how did', 'how;ll': 'how will', 'how;s': 'how is', "how;d;y": "how do you",

            "what'll": 'what will', "what're": 'what are', "what's": 'what is', "what've": 'what have', "what'll've": "what will have",
            'what,ll': 'what will', 'what,re': 'what are', 'what,s': 'what is', 'what,ve': 'what have', "what,ll,ve": "what will have",
            'what;ll': 'what will', 'what;re': 'what are', 'what;s': 'what is', 'what;ve': 'what have', "what;ll;ve": "what will have",

            "why'd": "why would", "why'll": "why will", "why're": "why are", "why's": "why is", "why've": "why have",
            "why,d": "why would", "why,ll": "why will", "why,re": "why are", "why,s": "why is", "why,ve": "why have",
            "why;d": "why would", "why;ll": "why will", "why;re": "why are", "why;s": "why is", "why;ve": "why have",

            "let's": 'let us', 'let,s': 'let us', 'let;s': 'let us',

            "ma'am": 'madam', 'ma,am': 'madam', 'ma;am': 'madam',

            "wan't": 'want', "wan,t": 'want', "wan;t": 'want',

            "agains't": "against", "agains,t": "against", "agains;t": "against",

            "c'mon": "common", "c,mon": "common", "c;mon": "common",

            "gov't": "government", "gov,t": "government", "gov;t": "government",

            # Words written with small-capital Unicode letters mapped to plain ASCII.
            # Some keys occur more than once in the rows below; the last occurrence wins.
            'ᴀɴᴅ':'and','ᴛʜᴇ':'the','ʜᴏᴍᴇ':'home','ᴜᴘ':'up','ʙʏ':'by','ᴀᴛ':'at', 'ᴄʜᴇᴄᴋ':'check','ғᴏʀ':'for','ᴛʜɪs':'this','ᴄᴏᴍᴘᴜᴛᴇʀ':'computer',
            'ᴍᴏɴᴛʜ':'month','ᴡᴏʀᴋɪɴɢ':'working','ᴊᴏʙ':'job','ғʀᴏᴍ':'from','Sᴛᴀʀᴛ':'start','CO₂':'carbon dioxide','ғɪʀsᴛ':'first','ᴇɴᴅ':'end',
            'ᴄᴀɴ':'can','ʜᴀᴠᴇ':'have','ᴛᴏ':'to','ʟɪɴᴋ':'link','ᴏғ':'of','ʜᴏᴜʀʟʏ':'hourly','ᴡᴇᴇᴋ':'week','ᴇɴᴅ':'end','ᴇxᴛʀᴀ':'extra','Gʀᴇᴀᴛ':'great',
            'sᴛᴜᴅᴇɴᴛs':'student','sᴛᴀʏ':'stay','ᴍᴏᴍs':'mother','ᴏʀ':'or','ᴀɴʏᴏɴᴇ':'anyone','ɴᴇᴇᴅɪɴɢ':'needing','ᴀɴ':'an','ɪɴᴄᴏᴍᴇ':'income',
            'ʀᴇʟɪᴀʙʟᴇ':'reliable','ғɪʀsᴛ':'first','ʏᴏᴜʀ':'your','sɪɢɴɪɴɢ':'signing','ʙᴏᴛᴛᴏᴍ':'bottom','ғᴏʟʟᴏᴡɪɴɢ':'following','Mᴀᴋᴇ':'make',
            'ᴄᴏɴɴᴇᴄᴛɪᴏɴ':'connection','ɪɴᴛᴇʀɴᴇᴛ':'internet', 'ʜaᴠᴇ':' have ', 'ᴄaɴ':' can ', 'Maᴋᴇ':' make ', 'ʀᴇʟɪaʙʟᴇ':' reliable ', 
            'ɴᴇᴇᴅ':' need ','ᴏɴʟʏ':' only ', 'ᴇxᴛʀa':' extra ', 'aɴ':' an ', 'aɴʏᴏɴᴇ':' anyone ', 'sᴛaʏ':' stay ', 'Sᴛaʀᴛ':' start',
        }
        
        # Add a variant of every alphabetic-initial entry with its first character upper-cased.
        self.contraction_mapping = {**contraction_mapping, **self._get_upper_contraction(contraction_mapping)}
    
    
        # Invisible / control / non-breaking characters that are replaced by a plain space.
        self.spaces = ['\u200b', '\u200e', '\u202a', '\u202c', '\ufeff', '\uf0d8', '\u2061', '\x10', '\x7f', '\x9d', '\xad', '\xa0']

        # Substring replacements applied in dict order. Several keys are repeated
        # with the same value, which has no effect on the resulting dict.
        self.special_punc_mappings = {
            "—": "-", "–": "-", "_": "-", '−': '-', '•': '-', "—": "-", "–": "-", "_": "-", '−': '-', 

            '”': '"', '″': '"', '“': '"', '“': '"', '”': '"', '“': '"', 
            "’": "'", "‘": "'", "´": "'", "`": "'", "‘": "'", "´": "'", "’": "'", "`": "'",

            "₹": "e", "€": "e", "™": "tm",  "×": "x", "²": "2", "£": "e", 'à': 'a', '³': '3', 
            "√": " sqrt ", 'α': 'alpha', 'β': 'beta', 'θ': 'theta', 'π': 'pi', '∞': 'infinity', '÷': '/', '∅': '',

            '،':'', '„':'', "°": "", 'करना': '', 'है': '',
            '…': ' ... '
        }

        # Characters padded with spaces by _spacing_punctuations. The string contains
        # repeated characters, and '\×' is not a recognised escape, so the backslash
        # is kept literally (Python emits an invalid-escape warning for it).
        self.punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'

    def _get_upper_contraction(self, mapping):
        result_dict = {}
        for key, value in mapping.items():
            if key[0].isalpha():
                key = key[0].upper() + key[1:]
                value = value[0].upper() + value[1:]
                result_dict[key] = value
        return result_dict
    

    def _clean_contractions(self, text, mapping):
        specials = ["’", "‘", "´", "`"]
        for s in specials:
            text = text.replace(s, "'")
        text = ' '.join([mapping[t] if t in mapping else t for t in text.split()])
        return text
    
    
    # remove space
    def _remove_space(self, text):
        for space in self.spaces:
            text = text.replace(space, ' ')

        text = text.strip()
        text = re.sub('\s+', ' ', text)

        return text
    
    # remove special punctuations
    def _clean_special_punctuations(self, text):
        for punc in self.special_punc_mappings:
            text = text.replace(punc, self.special_punc_mappings[punc])

        return text
    
    def _spacing_punctuations(self, text):
        for p in self.punct:
            text = text.replace(p, f' {p} ')

        return text
    
    def _preprocess_punctuations(self, text):
        text = self._remove_space(text)
        text = self._clean_special_punctuations(text)
        text = self._spacing_punctuations(text)
        text = self._remove_space(text)

        return text
    
    def preprocess(self, text):
        text = self._clean_contractions(text, self.contraction_mapping)
        text = self._preprocess_punctuations(text)
        
        return text

## Model

### LSTM

* spatial dropout

In [None]:
class SpatialDropout(nn.Dropout2d):
    """Dropout that masks whole feature channels of a 3-D tensor.

    The input is reshaped so that its last axis sits in the channel position
    expected by ``nn.Dropout2d``, dropout is applied, and the original layout
    is restored.
    """

    def forward(self, x):
        """Apply channel-wise dropout to ``x`` and return a tensor of the same shape."""
        # (N, T, K) -> (N, T, 1, K) -> (N, K, 1, T): last axis moves to the channel axis.
        channels_first = x.unsqueeze(2).permute(0, 3, 2, 1)
        dropped = super().forward(channels_first)
        # (N, K, 1, T) -> (N, T, 1, K) -> (N, T, K)
        return dropped.permute(0, 3, 2, 1).squeeze(2)

* embedding

In [None]:
class Embedding_LSTM(nn.Module):
    """Frozen embedding lookup followed by spatial dropout.

    Args:
        embedding_matrix: 2-D array-like of pre-trained vectors, one row per
            token id. The embedding dimension is taken from its second axis.
    """

    def __init__(self, embedding_matrix):
        super(Embedding_LSTM, self).__init__()

        weights = torch.tensor(embedding_matrix, dtype=torch.float32)
        # Build the layer straight from the pre-trained weights: the width comes
        # from the matrix itself (not from a module-level constant) and no
        # throw-away random weight matrix is allocated first.
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=True)
        self.embedding_dropout = SpatialDropout(0.3)

    def forward(self, x):
        """Look up embeddings for the token ids in ``x`` and apply dropout."""
        x = self.embedding(x)
        x = self.embedding_dropout(x)
        return x

* encoder

In [None]:
class Encoder_LSTM(nn.Module):
    """Two stacked bidirectional LSTMs with pooled, residual dense heads.

    Layer sizes come from the module-level ``EMBED_SIZE``, ``HIDDEN_SIZE`` and
    ``DENSE_HIDDEN_UNITS`` constants.

    Args:
        num_aux_targets: Number of auxiliary outputs produced next to the
            single main output.
    """

    def __init__(self, num_aux_targets):
        super().__init__()

        # Attribute names and creation order are kept as-is so that saved
        # state dicts keep loading.
        lstm_options = dict(bidirectional=True, batch_first=True)
        self.cell1 = nn.LSTM(EMBED_SIZE * 2, HIDDEN_SIZE, **lstm_options)
        self.cell2 = nn.LSTM(HIDDEN_SIZE * 2, HIDDEN_SIZE, **lstm_options)

        self.linear1 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        self.linear2 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)

        self.linear_out = nn.Linear(DENSE_HIDDEN_UNITS, 1)
        self.linear_aux_out = nn.Linear(DENSE_HIDDEN_UNITS, num_aux_targets)

    def forward(self, x):
        """Encode ``x`` and return main and auxiliary outputs concatenated on dim 1."""
        sequence, _ = self.cell1(x)
        sequence, _ = self.cell2(sequence)

        # Max- and mean-pool over dim 1, then concatenate both summaries.
        pooled = torch.cat((sequence.max(dim=1)[0], sequence.mean(dim=1)), 1)

        # Residual sum of the pooled vector and two ReLU-activated projections.
        hidden = pooled + F.relu(self.linear1(pooled)) + F.relu(self.linear2(pooled))

        return torch.cat([self.linear_out(hidden), self.linear_aux_out(hidden)], 1)

* lstm model

In [None]:
class Model_LSTM(nn.Module):
    """Full LSTM model: ``Embedding_LSTM`` followed by ``Encoder_LSTM``.

    Args:
        embedding_matrix: Pre-trained embedding matrix passed to the embedding block.
        num_aux_targets: Number of auxiliary outputs passed to the encoder.
    """

    def __init__(self, embedding_matrix, num_aux_targets):
        super().__init__()

        self.embedding = Embedding_LSTM(embedding_matrix)
        self.encoder = Encoder_LSTM(num_aux_targets)

    def forward(self, x):
        """Embed the token ids in ``x`` and return the encoder output."""
        return self.encoder(self.embedding(x))

## Inference

In [None]:
class InferenceTemplate:
    """Shared helpers for the inference classes: text loading and a sigmoid.

    Args:
        preprocess_funcs: Optional sequence of objects exposing a
            ``preprocess(text)`` method; they are applied to every comment in
            the given order.
        debug: When true, only the first 10000 rows of the loaded text are kept.
    """

    def __init__(self, preprocess_funcs=None, debug=False):
        self.debug = debug
        self.preprocess_funcs = preprocess_funcs

    # Data Related Functions
    def _load_text(self, category="test"):
        """Read ``comment_text`` from ``<category>.csv`` and preprocess it.

        Args:
            category: File stem inside the competition input directory
                (e.g. ``"test"`` or ``"train"``).

        Returns:
            A pandas Series of strings.
        """
        print(f"Load {category} Text ...")
        frame = pd.read_csv(f'../input/jigsaw-unintended-bias-in-toxicity-classification/{category}.csv')
        text = frame['comment_text'].astype(str)

        if self.debug:
            print("Debug Mode ...")
            text = text.head(10000)

        if not self.preprocess_funcs:
            print("No preprocessing ...")
        else:
            print("Preprocessing ...")
            for step, preprocessor in enumerate(self.preprocess_funcs):
                print(f"---- Preprocessing {step}'th")
                text = text.apply(preprocessor.preprocess)

        print('\n')
        return text

    # Inference Related Functions
    def _sigmoid(self, x):
        """Element-wise logistic function ``1 / (1 + exp(-x))``."""
        return 1 / (1 + np.exp(-x))

* lstm inference

In [None]:
class InferenceLstm(InferenceTemplate):

    def make_data(self, preprocess_type=None):
        """Fit a tokenizer on train + test text and encode the test text.

        Args:
            preprocess_type: ``'sogna'``, ``'pb'`` or ``'sogna_pb'`` to read the
                matching pre-processed train CSV; any other value loads the
                raw train text through ``_load_text('train')``.

        Returns:
            Tuple ``(test_sequences, tokenizer)``: the test comments as lists
            of token ids, and the fitted Keras ``Tokenizer``.
        """
        # (preprocess_type, log message, csv path, column name)
        precomputed_sources = (
            ('sogna',
             "Train Text Preprocessing ... [sogna]",
             '../input/toxic-preprocessed-train-text/train_text_preprocessed_sogna.csv',
             'comment_text_sogna'),
            ('pb',
             "Train Text Preprocessing ... [pb]",
             '../input/toxic-preprocessed-train-text/train_text_preprocessed_pb.csv',
             'comment_text_pb'),
            ('sogna_pb',
             "Train Text Preprocessing ... [sogna + pb]",
             '../input/toxic-preprocessed-train-text/train_text_preprocessed_sogna_pb.csv',
             'comment_text_sogna_pb'),
        )

        for key, message, csv_path, column in precomputed_sources:
            if preprocess_type == key:
                print(message)
                train_text = pd.read_csv(csv_path)[column].astype(str)
                break
        else:
            train_text = self._load_text('train')

        if self.debug:
            train_text = train_text[:10000]

        test_text = self._load_text('test')

        # Vocabulary covers both splits; case is preserved and no characters are filtered.
        tokenizer = Tokenizer(lower=False, filters="")
        tokenizer.fit_on_texts(list(train_text) + list(test_text))

        test_sequences = tokenizer.texts_to_sequences(list(test_text))

        return test_sequences, tokenizer
    
    
    # Embedding Related Functions
    def _load_embeddings(self, path):
        with open(path,'rb') as f:
            emb_arr = pickle.load(f)
        return emb_arr
    
    def _build_matrix(self, word_index, path):
        embedding_index = self._load_embeddings(path)
        embedding_matrix = np.zeros((len(word_index) + 1, EMBED_SIZE))
        unknown_words = []

        for word, i in word_index.items():
            try: embedding_matrix[i] = embedding_index[word]
            except KeyError:
                try: embedding_matrix[i] = embedding_index[word.lower()]
                except KeyError:
                    try: embedding_matrix[i] = embedding_index[word.title()]
                    except KeyError: unknown_words.append(word)

        return embedding_matrix, unknown_words
    
    def make_embedding(self, tokenizer):
        """Build the combined embedding matrix for the tokenizer vocabulary.

        One matrix is built from ``CRAWL_EMBEDDING_PATH`` and one from
        ``GLOVE_EMBEDDING_PATH``; they are concatenated along the last axis.

        Args:
            tokenizer: Fitted tokenizer exposing ``word_index``.

        Returns:
            The concatenated embedding matrix.
        """
        # (log prefix, embedding file) in concatenation order
        sources = (
            ('n unknown words (crawl): ', CRAWL_EMBEDDING_PATH),
            ('n unknown words (glove): ', GLOVE_EMBEDDING_PATH),
        )

        matrices = []
        for log_prefix, embedding_path in sources:
            matrix, unknown_words = self._build_matrix(tokenizer.word_index, embedding_path)
            print(log_prefix, len(unknown_words))
            matrices.append(matrix)

        embedding_matrix = np.concatenate(matrices, axis=-1)
        print("Embedding Matrix Shape: ", embedding_matrix.shape)

        # Drop the per-source matrices before returning to release their memory.
        del matrices, matrix, unknown_words
        gc.collect()

        return embedding_matrix
    
    # Model Related Functions
    def make_model(self, embedding_matrix, path):
        """Build a ``Model_LSTM`` and load trained encoder weights into it.

        Args:
            embedding_matrix: Embedding matrix handed to ``Model_LSTM``.
            path: Checkpoint file containing a ``'model_state_dict'`` entry
                for the encoder.

        Returns:
            The model with its encoder weights restored.
        """
        # 7 auxiliary targets next to the main one.
        model = Model_LSTM(embedding_matrix, 7)

        # Load onto the CPU: the freshly built model lives there, and this avoids
        # failures when the checkpoint was saved from a device that is not
        # available here. NOTE: torch.load unpickles the file - only load
        # trusted checkpoints.
        checkpoint = torch.load(path, map_location='cpu')
        model.encoder.load_state_dict(checkpoint['model_state_dict'])
        return model
    
    # Inference Related Functions
    def _get_padding_size(self, lengths):
        padding_size = 0
        _max = np.max(lengths)
        _threshold = np.ceil(np.quantile(lengths, 0.95))

        if _max >= MAX_LEN:
            padding_size = MAX_LEN

        else:
            padding_size = _max

        return int(padding_size)
    
    def inference(self, test, model):
        """Run the model over ``test`` in batches and return sigmoid scores.

        Args:
            test: Sliceable sequence of token-id sequences.
            model: Model whose output has 8 values per example.

        Returns:
            ``np.ndarray`` of shape ``(len(test), 8)`` with sigmoid-activated outputs.
        """
        model.cuda()
        model.eval()

        total_batch = int(len(test) / BATCH_SIZE)
        # 8 columns: the main output plus the 7 auxiliary ones configured in make_model.
        test_preds = np.zeros((len(test), 8))

        print("total batch: {}, test size: {}, batch size: {}".format(total_batch, len(test), BATCH_SIZE))

        # No gradients are needed at inference time; this avoids building the autograd graph.
        with torch.no_grad():
            for start in range(0, len(test), BATCH_SIZE):
                stop = start + BATCH_SIZE

                # get batch
                batch_X = test[start:stop]

                # adaptive padding size
                padding_size = self._get_padding_size([len(v) for v in batch_X])
                batch_X = pad_sequences(batch_X, maxlen=padding_size, padding='post')

                # torch tensor
                batch_X = torch.tensor(batch_X, dtype=torch.long).cuda()

                # prediction
                pred_y = model(batch_X).detach().squeeze(dim=-1)

                # predict test target
                test_preds[start:stop, :] = self._sigmoid(pred_y.cpu().numpy())

                del batch_X, pred_y
                torch.cuda.empty_cache()

        return test_preds

* bert inference

In [None]:
class InferenceBert(InferenceTemplate):
    """Inference helpers for the PyTorch BERT sequence classifiers."""

    # Data Related Functions
    def _convert_lines(self, example, max_seq_length, tokenizer):
        """Turn texts into fixed-length rows of BERT token ids.

        Each row is ``[CLS] + tokens + [SEP]`` converted to ids and
        right-padded with 0 so that it has ``max_seq_length`` entries; token
        lists longer than ``max_seq_length - 2`` are truncated.

        Args:
            example: iterable of strings.
            max_seq_length: total row length, including [CLS] and [SEP].
            tokenizer: object exposing ``tokenize`` and ``convert_tokens_to_ids``.

        Returns:
            np.ndarray with one row per input text.
        """
        body_budget = max_seq_length - 2  # two slots are taken by [CLS] / [SEP]
        rows = []
        truncated = 0

        for text in tqdm_notebook(example):
            pieces = tokenizer.tokenize(text)
            if len(pieces) > body_budget:
                pieces = pieces[:body_budget]
                truncated += 1

            ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + pieces + ["[SEP]"])
            padding = [0] * (body_budget - len(pieces))
            rows.append(ids + padding)

        print("Num Of Lines Over Max Sequences: {}/{}".format(truncated, len(rows)))
        return np.array(rows)

    def make_data(self):
        """Load the test text and convert it to BERT token-id rows.

        Reads the module-level ``BERT_MODEL_PATH`` and ``MAX_SEQUENCE_LENGTH``.
        """
        raw_text = self._load_text('test')

        bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=True)

        # missing texts are replaced by a placeholder before tokenizing
        return self._convert_lines(raw_text.fillna("DUMMY_VALUE"), MAX_SEQUENCE_LENGTH, bert_tokenizer)

    # Model Related Functions
    def make_model(self, path):
        """Rebuild a ``BertForSequenceClassification`` from a checkpoint.

        The checkpoint must provide ``'bert_config'`` and
        ``'model_state_dict'``; tensors are mapped to ``cuda:0`` on load.
        """
        checkpoint = torch.load(f'{path}', map_location='cuda:0')

        classifier = BertForSequenceClassification(checkpoint['bert_config'], num_labels=8)
        classifier.load_state_dict(checkpoint['model_state_dict'])
        return classifier

    # Inference Related Functions
    def inference(self, test, model):
        """Predict with ``model`` over ``test`` in batches of ``BATCH_SIZE``.

        Only output column 0 of the model is kept; it is passed through
        ``self._sigmoid``. All model parameters are frozen
        (``requires_grad=False``) as a side effect.

        Returns:
            1-D np.ndarray with one value per row of ``test``.
        """
        model = model.to(device)

        test_preds = np.zeros(len(test))
        loader = data.DataLoader(
            data.TensorDataset(torch.tensor(test, dtype=torch.long)),
            batch_size=BATCH_SIZE,
            shuffle=False,
        )

        for param in model.parameters():
            param.requires_grad = False

        model.eval()

        for batch_idx, (x_batch,) in enumerate(tqdm_notebook(loader)):
            first_row = batch_idx * BATCH_SIZE

            # positions holding id 0 are padding and are masked out
            logits = model(
                x_batch.to(device),
                attention_mask=(x_batch > 0).to(device),
                labels=None,
            )

            column_zero = logits[:, 0].detach().cpu().squeeze().numpy()
            test_preds[first_row:first_row + BATCH_SIZE] = self._sigmoid(column_zero)

            del x_batch, logits
            torch.cuda.empty_cache()

        return test_preds

* gpt2 inference

In [None]:
class InferenceGPT2(InferenceTemplate):
    """Inference helpers for the GPT-2 classification-head model."""

    # Data Related Functions
    def _convert_lines(self, example, max_seq_length, tokenizer):
        """Turn texts into fixed-length rows of GPT-2 token ids.

        Token lists longer than ``max_seq_length`` are truncated; shorter ones
        are right-padded with id 0. No special tokens are added.

        Args:
            example: iterable of strings.
            max_seq_length: row length.
            tokenizer: object exposing ``tokenize`` and ``convert_tokens_to_ids``.

        Returns:
            np.ndarray with one row per input text.
        """
        rows = []
        truncated = 0

        for text in tqdm_notebook(example):
            pieces = tokenizer.tokenize(text)
            if len(pieces) > max_seq_length:
                pieces = pieces[:max_seq_length]
                truncated += 1

            padding = [0] * (max_seq_length - len(pieces))
            rows.append(tokenizer.convert_tokens_to_ids(pieces) + padding)

        print("Num Of Lines Over Max Sequences: {}/{}".format(truncated, len(rows)))
        return np.array(rows)

    def make_data(self):
        """Load the test text and convert it to GPT-2 token-id rows.

        Reads the module-level ``GPT2_MODEL_PATH`` and ``MAX_SEQUENCE_LENGTH``.
        """
        raw_text = self._load_text('test')

        gpt2_tokenizer = GPT2Tokenizer.from_pretrained(GPT2_MODEL_PATH, cache_dir=None)

        # missing texts are replaced by a placeholder before tokenizing
        return self._convert_lines(raw_text.fillna("DUMMY_VALUE"), MAX_SEQUENCE_LENGTH, gpt2_tokenizer)

    # Model Related Functions
    def make_model(self, path):
        """Rebuild a ``GPT2ClassificationHeadModel`` from a checkpoint.

        The model config is stored in the checkpoint under the key
        ``'bert_config'`` (despite the name, it is what this GPT-2 model is
        constructed from); the weights are under ``'model_state_dict'``.
        """
        checkpoint = torch.load(f'{path}')

        classifier = GPT2ClassificationHeadModel(checkpoint['bert_config'])
        classifier.load_state_dict(checkpoint['model_state_dict'])
        return classifier

    # Inference Related Functions
    def inference(self, test, model):
        """Predict with ``model`` over ``test`` in batches of ``BATCH_SIZE``.

        Only output column 0 of the model is kept; it is passed through
        ``self._sigmoid``. All model parameters are frozen
        (``requires_grad=False``) and the model is moved to the GPU as side
        effects.

        Returns:
            1-D np.ndarray with one value per row of ``test``.
        """
        test_preds = np.zeros(len(test))
        loader = data.DataLoader(
            data.TensorDataset(torch.tensor(test, dtype=torch.long)),
            batch_size=BATCH_SIZE,
            shuffle=False,
        )

        for param in model.parameters():
            param.requires_grad = False

        model.cuda()
        model.eval()

        for batch_idx, (x_batch,) in enumerate(tqdm_notebook(loader)):
            first_row = batch_idx * BATCH_SIZE

            logits = model(x_batch.cuda())

            column_zero = logits[:, 0].detach().cpu().squeeze().numpy()
            test_preds[first_row:first_row + BATCH_SIZE] = self._sigmoid(column_zero)

            del x_batch, logits
            torch.cuda.empty_cache()

        return test_preds

# Main

In [None]:
# Forwarded as `debug=` to every Inference* constructor in the cells below.
debug = False

In [None]:
# Seed python, numpy and torch (CPU + CUDA) RNGs via seed_everything (defined above).
random_state = 42
seed_everything(random_state)

* preprocess pipeline

In [None]:
# One instance of each preprocessor; the per-model `preprocess_pipe` lists below are built from them.
preprocess_pb = Preprocess_pb_kernel()
preprocess_sogna = Preprocess_sogna_kernel()

## LSTM

In [None]:
# The LSTM pipeline uses only the "sogna" preprocessor (GPT-2 and BERT below use both).
preprocess_pipe = [preprocess_sogna]

* constants

In [None]:
# constants
# Module-level settings for the LSTM section. BATCH_SIZE and MAX_LEN are read
# as globals by the LSTM inference methods; BATCH_SIZE is re-bound in the
# GPT-2 and BERT sections below.
CRAWL_EMBEDDING_PATH = '../input/pickled-crawl300d2m/pickled-crawl300d2m/crawl-300d-2M.pkl'
GLOVE_EMBEDDING_PATH = '../input/pickled-glove840b300d/pickled-glove840b300d/glove.840B.300d.pkl'
# NOTE(review): EMBED_SIZE / HIDDEN_SIZE / DENSE_HIDDEN_UNITS are presumably
# consumed by the embedding and model code defined earlier -- confirm.
EMBED_SIZE = 300
HIDDEN_SIZE = 128
DENSE_HIDDEN_UNITS = 4 * HIDDEN_SIZE
BATCH_SIZE = 512 * 8
MAX_LEN = 220  # upper bound on the per-batch padding length (see _get_padding_size)

* inference class

In [None]:
# LSTM inference helper, configured with the LSTM preprocessing pipeline.
inference_lstm = InferenceLstm(
    preprocess_funcs=preprocess_pipe, 
    debug=debug
)

* make data and tokenizer

In [None]:
# `tokenizer` is passed to make_embedding in the next step; `test_text` is fed to inference().
test_text, tokenizer = inference_lstm.make_data(preprocess_type='sogna')

* make embedding

In [None]:
# Built once and shared by every LSTM checkpoint loaded in the inference cell below.
embedding_matrix = inference_lstm.make_embedding(tokenizer)

* inference

In [None]:
%%time
all_test_preds = []

for model_idx in range(2):
    test_preds = []
    checkpoint_weights = [2 ** epoch for epoch in range(4)]

    for epoch in range(4):  
        path = f'../input/sub-toxic-lstm-weight/lstm_wh_model_idx_{model_idx}_epcoh_{epoch}.pt'
        model = inference_lstm.make_model(embedding_matrix, path=path)
        test_preds.append(inference_lstm.inference(test_text, model))

    all_test_preds.append(np.average(test_preds, weights=checkpoint_weights, axis=0))

test_preds_lstm = np.mean(all_test_preds, axis=0)[:, 0]

In [None]:
# The embedding matrix is only needed to construct the LSTM models; release it
# before the transformer models are loaded.
del embedding_matrix
gc.collect()

## GPT2

* load library

In [None]:
# This notebook uses two different vendored copies of `pytorch_pretrained_bert`
# (this one for GPT-2, another one in the BERT section). Put this copy first on
# sys.path so the import below resolves to it.
PYTORCH_BERT_DIR = "../input/gpt2source/gpt2-pytorch/pytorch-pretrained-BERT-master/"
sys.path.insert(0, PYTORCH_BERT_DIR)

from pytorch_pretrained_bert import GPT2Tokenizer, GPT2ClassificationHeadModel

* preprocess pipeline

In [None]:
# GPT-2 pipeline: "sogna" preprocessing first, then the public-kernel preprocessing.
preprocess_pipe = [preprocess_sogna, preprocess_pb]

* constants

In [None]:
# constants
# MAX_SEQUENCE_LENGTH and BATCH_SIZE are read as globals by InferenceGPT2;
# BATCH_SIZE replaces the value set in the LSTM section.
GPT2_MODEL_PATH = '../input/gpt2-pretrained-models/gpt2-models/'
MAX_SEQUENCE_LENGTH = 260
BATCH_SIZE = 256

In [None]:
# GPT-2 inference helper, configured with the GPT-2 preprocessing pipeline.
inference_gpt2 = InferenceGPT2(
    preprocess_funcs=preprocess_pipe, 
    debug=debug
)

In [None]:
# Re-binds `test_text`: from here on it holds the GPT-2 token-id array, not the LSTM sequences.
test_text = inference_gpt2.make_data()

In [None]:
# Single GPT-2 checkpoint; its predictions enter the final blend as `gpt2_941`.
model = inference_gpt2.make_model('../input/sub-toxic-gpt2-weight/gpt2_dis_260len_2epoch_32batch_4accum_8e-05lr_0.005warmup_0.2dropout_all_full.pt')
test_preds_gpt2 = inference_gpt2.inference(test_text, model)

* delete library

In [None]:
# Forget the GPT-2 copy of pytorch_pretrained_bert so that the BERT section can
# import a different copy under the same package name.
# Module names are collected first so sys.modules is not mutated while iterated.
lib_modules_list = [name for name in list(sys.modules) if "pytorch_pretrained_bert" in name]

del GPT2Tokenizer, GPT2ClassificationHeadModel
for m in lib_modules_list:
    sys.modules.pop(m)

sys.path.remove(PYTORCH_BERT_DIR)
sys.path

## BERT

* load library

In [None]:
# Second vendored copy of `pytorch_pretrained_bert`, used for the BERT models.
# It is placed first on sys.path so the import below resolves to this copy.
PYTORCH_BERT_DIR = "../input/ppbert/pytorch-pretrained-bert/pytorch-pretrained-BERT/"
sys.path.insert(0, PYTORCH_BERT_DIR)

from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification

* preprocess pipeline

In [None]:
# BERT pipeline: "sogna" preprocessing first, then the public-kernel preprocessing.
preprocess_pipe = [preprocess_sogna, preprocess_pb]

* constants

In [None]:
# `device`, BERT_MODEL_PATH, MAX_SEQUENCE_LENGTH and BATCH_SIZE are read as
# globals by InferenceBert (tokenizer loading, sequence length, batching).
device = torch.device('cuda')

BERT_MODEL_PATH = '../input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'
MAX_SEQUENCE_LENGTH = 300
BATCH_SIZE = 256

* inference class

In [None]:
# BERT inference helper, configured with the BERT preprocessing pipeline.
inference_bert = InferenceBert(
    preprocess_funcs=preprocess_pipe, 
    debug=debug
)

* make data

In [None]:
# Re-binds `test_text` to the BERT token-id array; it is shared by the three
# BERT checkpoints below and later used for the row count of `meta`.
test_text = inference_bert.make_data()

In [None]:
# First of three BERT checkpoints; it enters the final blend as `bert_94560`.
model = inference_bert.make_model('../input/sub-toxic-bert-weight/model_1_1_300len_2epoch_64batch_2accum_2e-05lr_all_full_0.01warm_1.3weight.pt')
test_preds_bert_94560 = inference_bert.inference(test_text, model)

In [None]:
# Second BERT checkpoint; it enters the final blend as `bert_94501`.
# Re-binding `model` drops the reference to the previous checkpoint.
model = inference_bert.make_model('../input/sub-toxic-bert-weight/model_1_1_pretrained_300len_2epoch_64batch_2accum_2e-05lr_all_full_0.01warm_94501.pt')
test_preds_bert_94501 = inference_bert.inference(test_text, model)

In [None]:
# Third BERT checkpoint; it enters the final blend as `bert_94427`.
model = inference_bert.make_model('../input/sub-toxic-bert-weight/model_1_1_300len_2epoch_32batch_1accum_2e-05lr_full.pt')
test_preds_bert_94427 = inference_bert.inference(test_text, model)

* delete library

In [None]:
# Forget the BERT copy of pytorch_pretrained_bert now that the PyTorch BERT
# models are done; the remaining section uses a different BERT implementation.
# Module names are collected first so sys.modules is not mutated while iterated.
lib_modules_list = [name for name in list(sys.modules) if "pytorch_pretrained_bert" in name]

del BertTokenizer, BertForSequenceClassification
for m in lib_modules_list:
    sys.modules.pop(m)

sys.path.remove(PYTORCH_BERT_DIR)
sys.path

## BERT - KERAS

In [None]:
import pprint
import time
import json
import os
import sys
import collections
import csv
import pandas as pd
import numpy as np
import tensorflow as tf
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm_notebook

# NOTE(review): presumably read by keras-bert at import time to select the
# tf.keras backend; keras_bert is imported further down -- confirm.
os.environ['TF_KERAS'] = '1'

In [None]:
# Offline install of keras-bert from the attached dataset (--no-index: no network).
# os.system only reports failure through its return value, so check it here
# instead of failing later with a less obvious ImportError.
pip_status = os.system('pip install --no-index --find-links="../input/kerasbert/keras-bert-lib/" keras-bert')
if pip_status != 0:
    raise RuntimeError("pip install of keras-bert failed with exit status {}".format(pip_status))

In [None]:
# Make the BERT reference scripts importable as top-level modules.
sys.path.insert(0, '../input/pretrained-bert-including-scripts/master/bert-master')

# import python modules defined by BERT
# (in the cells below only `tokenization` is referenced)
import run_classifier
import modeling
import optimization
import tokenization

In [None]:
# DATA_PATH holds test.csv; BERT_PRETRAINED_PATH holds the fine-tuned Keras
# weights loaded below. BERT_MODEL_PATH (vocab, config and checkpoint
# location) re-binds the constant of the same name from the PyTorch BERT section.
DATA_PATH = '../input/jigsaw-unintended-bias-in-toxicity-classification/'
BERT_PRETRAINED_PATH = '../input/bert-seq-360/'
BERT_MODEL_PATH = '../input/pretrained-bert-including-scripts/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'

In [None]:
# NOTE(review): `model_list` is not referenced again in the cells below; the
# weight file is selected by the hard-coded MODEL_NAME instead.
model_list = os.listdir(BERT_PRETRAINED_PATH)

In [None]:
# Weight file inside BERT_PRETRAINED_PATH that model.load_weights() reads below.
MODEL_NAME = 'keras-bert-toxic-vocab-360-02.h5'

In [None]:
# parameter
# MAX_SEQUENCE_LENGTH: token length used for tokenizing and as the model's seq_len.
# LR: learning rate given to the optimizer in model.compile().
# loss_weight: scalar factor applied inside custom_target_loss.
MAX_SEQUENCE_LENGTH = 360
LR = 2e-5
loss_weight = 3.2092275837114372

In [None]:
# Files of the pretrained BERT release: vocabulary, config and checkpoint.
VOCAB_FILE = os.path.join(BERT_MODEL_PATH, 'vocab.txt')
CONFIG_FILE = os.path.join(BERT_MODEL_PATH, 'bert_config.json')
checkpoint_file = os.path.join(BERT_MODEL_PATH, 'bert_model.ckpt')
# Re-binds `tokenizer` (previously the LSTM tokenizer) to the BERT tokenizer
# from the reference `tokenization` module.
tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=True)

In [None]:
symbols_to_isolate = '.,?!-;*"…:—()%#$&_/@＼・ω+=”“[]^–>\\°<~•≠™ˈʊɒ∞§{}·τα❤☺ɡ|¢→̶`❥━┣┫┗Ｏ►★©―ɪ✔®\x96\x92●£♥➤´¹☕≈÷♡◐║▬′ɔː€۩۞†μ✒➥═☆ˌ◄½ʻπδηλσερνʃ✬ＳＵＰＥＲＩＴ☻±♍µº¾✓◾؟．⬅℅»Вав❣⋅¿¬♫ＣＭβ█▓▒░⇒⭐›¡₂₃❧▰▔◞▀▂▃▄▅▆▇↙γ̄″☹➡«φ⅓„✋：¥̲̅́∙‛◇✏▷❓❗¶˚˙）сиʿ✨。ɑ\x80◕！％¯−ﬂﬁ₁²ʌ¼⁴⁄₄⌠♭✘╪▶☭✭♪☔☠♂☃☎✈✌✰❆☙○‣⚓年∎ℒ▪▙☏⅛ｃａｓǀ℮¸ｗ‚∼‖ℳ❄←☼⋆ʒ⊂、⅔¨͡๏⚾⚽Φ×θ￦？（℃⏩☮⚠月✊❌⭕▸■⇌☐☑⚡☄ǫ╭∩╮，例＞ʕɐ̣Δ₀✞┈╱╲▏▕┃╰▊▋╯┳┊≥☒↑☝ɹ✅☛♩☞ＡＪＢ◔◡↓♀⬆̱ℏ\x91⠀ˤ╚↺⇤∏✾◦♬³の｜／∵∴√Ω¤☜▲↳▫‿⬇✧ｏｖｍ－２０８＇‰≤∕ˆ⚜☁'
symbols_to_delete = '\n🍕\r🐵😑\xa0\ue014\t\uf818\uf04a\xad😢🐶️\uf0e0😜😎👊\u200b\u200e😁عدويهصقأناخلىبمغر😍💖💵Е👎😀😂\u202a\u202c🔥😄🏻💥ᴍʏʀᴇɴᴅᴏᴀᴋʜᴜʟᴛᴄᴘʙғᴊᴡɢ😋👏שלוםבי😱‼\x81エンジ故障\u2009🚌ᴵ͞🌟😊😳😧🙀😐😕\u200f👍😮😃😘אעכח💩💯⛽🚄🏼ஜ😖ᴠ🚲‐😟😈💪🙏🎯🌹😇💔😡\x7f👌ἐὶήιὲκἀίῃἴξ🙄Ｈ😠\ufeff\u2028😉😤⛺🙂\u3000تحكسة👮💙فزط😏🍾🎉😞\u2008🏾😅😭👻😥😔😓🏽🎆🍻🍽🎶🌺🤔😪\x08‑🐰🐇🐱🙆😨🙃💕𝘊𝘦𝘳𝘢𝘵𝘰𝘤𝘺𝘴𝘪𝘧𝘮𝘣💗💚地獄谷улкнПоАН🐾🐕😆ה🔗🚽歌舞伎🙈😴🏿🤗🇺🇸мυтѕ⤵🏆🎃😩\u200a🌠🐟💫💰💎эпрд\x95🖐🙅⛲🍰🤐👆🙌\u2002💛🙁👀🙊🙉\u2004ˢᵒʳʸᴼᴷᴺʷᵗʰᵉᵘ\x13🚬🤓\ue602😵άοόςέὸתמדףנרךצט😒͝🆕👅👥👄🔄🔤👉👤👶👲🔛🎓\uf0b7\uf04c\x9f\x10成都😣⏺😌🤑🌏😯ех😲Ἰᾶὁ💞🚓🔔📚🏀👐\u202d💤🍇\ue613小土豆🏡❔⁉\u202f👠》कर्मा🇹🇼🌸蔡英文🌞🎲レクサス😛外国人关系Сб💋💀🎄💜🤢َِьыгя不是\x9c\x9d🗑\u2005💃📣👿༼つ༽😰ḷЗз▱ц￼🤣卖温哥华议会下降你失去所有的钱加拿大坏税骗子🐝ツ🎅\x85🍺آإشء🎵🌎͟ἔ油别克🤡🤥😬🤧й\u2003🚀🤴ʲшчИОРФДЯМюж😝🖑ὐύύ特殊作戦群щ💨圆明园קℐ🏈😺🌍⏏ệ🍔🐮🍁🍆🍑🌮🌯🤦\u200d𝓒𝓲𝓿𝓵안영하세요ЖљКћ🍀😫🤤ῦ我出生在了可以说普通话汉语好极🎼🕺🍸🥂🗽🎇🎊🆘🤠👩🖒🚪天一家⚲\u2006⚭⚆⬭⬯⏖新✀╌🇫🇷🇩🇪🇮🇬🇧😷🇨🇦ХШ🌐\x1f杀鸡给猴看ʁ𝗪𝗵𝗲𝗻𝘆𝗼𝘂𝗿𝗮𝗹𝗶𝘇𝗯𝘁𝗰𝘀𝘅𝗽𝘄𝗱📺ϖ\u2000үսᴦᎥһͺ\u2007հ\u2001ɩｙｅ൦ｌƽｈ𝐓𝐡𝐞𝐫𝐮𝐝𝐚𝐃𝐜𝐩𝐭𝐢𝐨𝐧Ƅᴨןᑯ໐ΤᏧ௦Іᴑ܁𝐬𝐰𝐲𝐛𝐦𝐯𝐑𝐙𝐣𝐇𝐂𝐘𝟎ԜТᗞ౦〔Ꭻ𝐳𝐔𝐱𝟔𝟓𝐅🐋ﬃ💘💓ё𝘥𝘯𝘶💐🌋🌄🌅𝙬𝙖𝙨𝙤𝙣𝙡𝙮𝙘𝙠𝙚𝙙𝙜𝙧𝙥𝙩𝙪𝙗𝙞𝙝𝙛👺🐷ℋ𝐀𝐥𝐪🚶𝙢Ἱ🤘ͦ💸ج패티Ｗ𝙇ᵻ👂👃ɜ🎫\uf0a7БУі🚢🚂ગુજરાતીῆ🏃𝓬𝓻𝓴𝓮𝓽𝓼☘﴾̯﴿₽\ue807𝑻𝒆𝒍𝒕𝒉𝒓𝒖𝒂𝒏𝒅𝒔𝒎𝒗𝒊👽😙\u200cЛ‒🎾👹⎌🏒⛸公寓养宠物吗🏄🐀🚑🤷操美𝒑𝒚𝒐𝑴🤙🐒欢迎来到阿拉斯ספ𝙫🐈𝒌𝙊𝙭𝙆𝙋𝙍𝘼𝙅ﷻ🦄巨收赢得白鬼愤怒要买额ẽ🚗🐳𝟏𝐟𝟖𝟑𝟕𝒄𝟗𝐠𝙄𝙃👇锟斤拷𝗢𝟳𝟱𝟬⦁マルハニチロ株式社⛷한국어ㄸㅓ니͜ʖ𝘿𝙔₵𝒩ℯ𝒾𝓁𝒶𝓉𝓇𝓊𝓃𝓈𝓅ℴ𝒻𝒽𝓀𝓌𝒸𝓎𝙏ζ𝙟𝘃𝗺𝟮𝟭𝟯𝟲👋🦊多伦🐽🎻🎹⛓🏹🍷🦆为和中友谊祝贺与其想象对法如直接问用自己猜本传教士没积唯认识基督徒曾经让相信耶稣复活死怪他但当们聊些政治题时候战胜因圣把全堂结婚孩恐惧且栗谓这样还♾🎸🤕🤒⛑🎁批判检讨🏝🦁🙋😶쥐스탱트뤼도석유가격인상이경제황을렵게만들지않록잘관리해야합다캐나에서대마초와화약금의품런성분갈때는반드시허된사용🔫👁凸ὰ💲🗯𝙈Ἄ𝒇𝒈𝒘𝒃𝑬𝑶𝕾𝖙𝖗𝖆𝖎𝖌𝖍𝖕𝖊𝖔𝖑𝖉𝖓𝖐𝖜𝖞𝖚𝖇𝕿𝖘𝖄𝖛𝖒𝖋𝖂𝕴𝖟𝖈𝕸👑🚿💡知彼百\uf005𝙀𝒛𝑲𝑳𝑾𝒋𝟒😦𝙒𝘾𝘽🏐𝘩𝘨ὼṑ𝑱𝑹𝑫𝑵𝑪🇰🇵👾ᓇᒧᔭᐃᐧᐦᑳᐨᓃᓂᑲᐸᑭᑎᓀᐣ🐄🎈🔨🐎🤞🐸💟🎰🌝🛳点击查版🍭𝑥𝑦𝑧ＮＧ👣\uf020っ🏉ф💭🎥Ξ🐴👨🤳🦍\x0b🍩𝑯𝒒😗𝟐🏂👳🍗🕉🐲چی𝑮𝗕𝗴🍒ꜥⲣⲏ🐑⏰鉄リ事件ї💊「」\uf203\uf09a\uf222\ue608\uf202\uf099\uf469\ue607\uf410\ue600燻製シ虚偽屁理屈Г𝑩𝑰𝒀𝑺🌤𝗳𝗜𝗙𝗦𝗧🍊ὺἈἡχῖΛ⤏🇳𝒙ψՁմեռայինրւդձ冬至ὀ𝒁🔹🤚🍎𝑷🐂💅𝘬𝘱𝘸𝘷𝘐𝘭𝘓𝘖𝘹𝘲𝘫کΒώ💢ΜΟΝΑΕ🇱♲𝝈↴💒⊘Ȼ🚴🖕🖤🥘📍👈➕🚫🎨🌑🐻𝐎𝐍𝐊𝑭🤖🎎😼🕷ｇｒｎｔｉｄｕｆｂｋ𝟰🇴🇭🇻🇲𝗞𝗭𝗘𝗤👼📉🍟🍦🌈🔭《🐊🐍\uf10aლڡ🐦\U0001f92f\U0001f92a🐡💳ἱ🙇𝗸𝗟𝗠𝗷🥜さようなら🔼'

In [None]:
from nltk.tokenize.treebank import TreebankWordTokenizer
tokenizer_preprocess = TreebankWordTokenizer()

# Translation tables for str.translate, keyed by code point:
#   isolate_dict pads each symbol with a space on both sides,
#   remove_dict maps each symbol to the empty string (i.e. deletes it).
isolate_dict = {ord(symbol): ' ' + symbol + ' ' for symbol in symbols_to_isolate}
remove_dict = dict.fromkeys(map(ord, symbols_to_delete), '')

def handle_punctuation(x):
    """Delete the unwanted symbols from ``x``, then space-isolate the others.

    Uses the module-level ``remove_dict`` and ``isolate_dict`` translation
    tables, in that order.
    """
    return x.translate(remove_dict).translate(isolate_dict)

def handle_contractions(x):
    """Tokenize ``x`` with the module-level ``tokenizer_preprocess``.

    Returns whatever ``tokenizer_preprocess.tokenize`` returns for ``x``.
    """
    return tokenizer_preprocess.tokenize(x)

def fix_quote(x):
    """Join tokens with single spaces, dropping one leading apostrophe per token.

    Args:
        x: iterable of string tokens.

    Returns:
        str: the tokens joined by ``' '``; a token that starts with ``'``
        loses that first character (so a lone ``'`` becomes an empty token).
    """
    return ' '.join(
        token[1:] if token.startswith("'") else token
        for token in x
    )

def preprocess(x):
    """Run the full text clean-up on one string.

    Steps, in order: ``handle_punctuation`` -> ``handle_contractions`` ->
    ``fix_quote``; the result of ``fix_quote`` is returned.
    """
    tokens = handle_contractions(handle_punctuation(x))
    return fix_quote(tokens)

# Converting the lines to BERT format
# Thanks to https://www.kaggle.com/httpwwwfszyc/bert-in-keras-taming
def convert_lines(example, max_seq_length, tokenizer):
    """Turn texts into fixed-length rows of BERT token ids.

    Each row is ``[CLS] + tokens + [SEP]`` converted to ids and right-padded
    with 0 so that it has ``max_seq_length`` entries; token lists longer than
    ``max_seq_length - 2`` are truncated. The number of truncated texts is
    printed.

    Args:
        example: iterable of strings.
        max_seq_length: total row length, including [CLS] and [SEP].
        tokenizer: object exposing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        np.ndarray with one row per input text.
    """
    body_budget = max_seq_length - 2  # two slots are taken by [CLS] / [SEP]
    rows = []
    truncated = 0

    for text in tqdm_notebook(example):
        pieces = tokenizer.tokenize(text)
        if len(pieces) > body_budget:
            pieces = pieces[:body_budget]
            truncated += 1

        ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + pieces + ["[SEP]"])
        padding = [0] * (body_budget - len(pieces))
        rows.append(ids + padding)

    print(truncated)
    return np.array(rows)

In [None]:
df_test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))

# cast to str, then run the punctuation / contraction / quote clean-up
df_test['comment_text'] = df_test['comment_text'].astype(str).apply(preprocess)

# NOTE(review): after astype(str) a missing comment is already the text 'nan',
# so this fillna cannot match anything. It is kept as-is so the inputs stay
# identical; confirm against the training-time preprocessing before changing it.
X_test = convert_lines(df_test['comment_text'].fillna("DUMMY_VALUE"), MAX_SEQUENCE_LENGTH, tokenizer)

# segment input: all zeros; mask input: all ones (same shape as each other)
X_seg_input = np.zeros((len(X_test), MAX_SEQUENCE_LENGTH))
X_mask_input = np.ones_like(X_seg_input)

In [None]:
from keras_bert import load_trained_model_from_checkpoint

# Pretrained BERT graph built from the config and checkpoint files.
# NOTE(review): training=True is presumably required so that the layer layout
# (three inputs, 'Extract' layer) matches the fine-tuned weights -- confirm.
base_model = load_trained_model_from_checkpoint(CONFIG_FILE, checkpoint_file, training=True, seq_len=MAX_SEQUENCE_LENGTH)

In [None]:
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras import backend as K

def custom_target_loss(y_true, y_pred):
    """Weighted binary cross-entropy for the 'target' output.

    Column 0 of ``y_true`` is the label (reshaped to ``(-1, 1)``); column 1 is
    a per-row multiplier applied to the cross-entropy. The result is scaled
    by the module-level ``loss_weight``.
    """
    labels = K.reshape(y_true[:, 0], shape=(-1, 1))
    per_row_bce = binary_crossentropy(labels, y_pred)
    weighted_bce = per_row_bce * y_true[:, 1]
    return weighted_bce * loss_weight

def custom_aux_loss(y_true, y_pred):
    """Plain (unweighted) binary cross-entropy, used for the auxiliary output."""
    return binary_crossentropy(y_true, y_pred)

In [None]:
from tensorflow.python import keras
# NOTE(review): calc_train_steps is not referenced in the cells below.
from keras_bert import calc_train_steps

# Classification head on top of the output of base_model's 'Extract' layer
# (presumably the [CLS] token representation -- confirm in keras-bert).
# Layers are created in a fixed order; keep it, since the saved weights are
# loaded into this exact architecture below.
extract = base_model.get_layer('Extract').output

final_layer = keras.layers.Dense(256, activation='relu')(extract)
final_layer = keras.layers.Dropout(0.25)(final_layer)

# two sigmoid heads sharing `final_layer`: 1 target unit and 6 auxiliary units
target_layer = keras.layers.Dense(1, activation='sigmoid', name='target_layer')(final_layer)
aux_layer = keras.layers.Dense(6, activation='sigmoid', name='aux_layer')(final_layer)

# output order matters: predict() returns [target, aux]
model = keras.models.Model(inputs=base_model.input, outputs=[target_layer, aux_layer])

In [None]:
# Restore the fine-tuned weights, then compile with one loss per output
# (target, aux). Only predict() is called on this model in the cells below.
model.load_weights(os.path.join(BERT_PRETRAINED_PATH, MODEL_NAME))
model.compile(loss=[custom_target_loss, custom_aux_loss], optimizer=tf.train.AdamOptimizer(learning_rate=LR))

In [None]:
# predict() yields one array per model output; [0] keeps the first output
# (target_layer, a single sigmoid unit) and discards the auxiliary one.
test_preds_bert_939 = model.predict([X_test, X_seg_input, X_mask_input], batch_size=128, verbose=1, use_multiprocessing=True)[0]

## Prediction

In [None]:
# Display the LSTM ensemble predictions as a visual check before blending.
test_preds_lstm

In [None]:
# Display the GPT-2 predictions as a visual check before blending.
test_preds_gpt2

In [None]:
# Display the predictions of the first PyTorch BERT checkpoint.
test_preds_bert_94560

In [None]:
# Display the predictions of the second PyTorch BERT checkpoint.
test_preds_bert_94501

In [None]:
# Display the predictions of the third PyTorch BERT checkpoint.
test_preds_bert_94427

In [None]:
# Display the Keras BERT target-output predictions.
test_preds_bert_939

In [None]:
# One column per model, one row per test sample. `test_text` is used only for
# its length here (it was last bound to the PyTorch BERT token-id array).
meta = pd.DataFrame(index=range(len(test_text)))
meta['lstm_939'] = test_preds_lstm
meta['gpt2_941'] = test_preds_gpt2
meta['bert_94560'] = test_preds_bert_94560
meta['bert_94501'] = test_preds_bert_94501
meta['bert_94427'] = test_preds_bert_94427
# NOTE(review): this is the raw predict() output of the single-unit target
# layer, presumably shaped (n, 1) rather than (n,) -- confirm it is stored as
# one ordinary column.
meta['bert_939'] = test_preds_bert_939

meta.head()

In [None]:
submission = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv')

# Blend: every model's prediction is squared, then combined with fixed
# weights (0.4 + 0.17 + 0.12 + 0.11 + 0.1 + 0.1 = 1.0).
final_preds = (
    (meta.bert_94560 ** 2) * 0.4
    + (meta.bert_94501 ** 2) * 0.17
    + (meta.bert_94427 ** 2) * 0.12
    + (meta.gpt2_941 ** 2) * 0.11
    + (meta.lstm_939 ** 2) * 0.1
    + (meta.bert_939 ** 2) * 0.1
)

# Item assignment always writes a real column. Attribute assignment
# (`submission.prediction = ...`) only does so when the column already exists
# and otherwise just sets a Python attribute that never reaches the CSV.
submission['prediction'] = final_preds
submission.to_csv('submission.csv', index=False)