In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import *
from keras.models import *
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.initializers import *
from keras.optimizers import *
import keras.backend as K
from keras.callbacks import *
import os
import time
import gc
import re
from unidecode import unidecode

In [None]:
# Read both competition splits from the relative input directory and report
# their (rows, columns) shapes.
train, test = pd.read_csv("../input/train.csv"), pd.read_csv("../input/test.csv")
print("Train shape : ", train.shape)
print("Test shape : ", test.shape)

In [None]:
# Lower-case every question in both splits before any further cleaning.
train["question_text"], test["question_text"] = (
    train["question_text"].str.lower(),
    test["question_text"].str.lower(),
)

# Characters that clean_text() isolates as stand-alone tokens: ASCII
# punctuation plus assorted Unicode symbols, accented letters and box-drawing
# glyphs. Order matters only in that replacements are applied in list order.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]


def clean_text(x):
    """Return ``str(x)`` with every character from ``puncts`` padded by spaces.

    Each occurrence of a listed character ``c`` becomes ``' c '`` so that a
    whitespace-splitting tokenizer sees it as its own token. The padding is
    not collapsed afterwards, so neighbouring symbols (or a symbol next to an
    existing space) end up separated by more than one space.
    """
    padded = str(x)
    for symbol in puncts:
        padded = padded.replace(symbol, ' ' + symbol + ' ')
    return padded


# Pad punctuation in every question of both splits.
train["question_text"] = train["question_text"].apply(clean_text)
test["question_text"] = test["question_text"].apply(clean_text)

In [None]:
# (token, replacement) pairs applied to the lower-cased, punctuation-padded
# question text. Most entries swap a rare or misspelled token for a more
# common word or a short description. Multi-word keys and replacements are
# written with the same space-padded punctuation that clean_text() produces
# (e.g. "naruto ' s son", 'high - speed train').
# NOTE: when a token is listed twice, the later pair wins once the list is
# turned into mispell_dict below ('shibpur' and 'whydo' are listed twice).
mispell = [('quorans', 'quoran'),
 ('brexit', 'exit'),
 ('cryptocurrencies', 'cryptocurrency'),
 ('redmi', 'xiaomi'),
 ('paytm', 'paypal'),
 ('kvpy', 'fellowship program'),
 ('iiser', 'indian institutes of science education and research'),
 ('ethereum', 'bitcoins'),
 ('iisc', 'a public institute for research and higher education in science'),
 ('₹', 'inr'),
 ('jinping', 'jintao'),
 ('viteee', 'Entrance Test'),
 ('iocl', 'oil corporation'),
 ('nmims', 'Institute of Management Studies'),
 ('rohingya', 'Kashmiri'),
 ('upes', 'University of Petroleum and Energy Studies'),
 ('fortnite', 'csgo'),
 ('coinbase', 'bitcoin'),
 ('nsit', 'institute of technology'),
 ('cpec', 'china – pakistan economic corridor'),
 ('iitians', 'students graduated from an indian institute'),
 ('oneplus', 'huawei'),
 ('jadavpur', ''),
 ('udemy', 'coursera'),
 ('lyft', 'uber'),
 ('bahubali', 'deadpool'),
 ('uceed', 'Undergraduate common entrance examination for design'),
 ('afcat', 'Air Force Common Admission Test'),
 ('coep', 'College of Engineering'),

 ('bhakts', 'something with great influence'),
 ('demonetisation', 'demonetization'),
 ('upwork', 'odesk'),
 ('loy machedo', 'Personal Branding Strategist'),
 ('gdpr', 'general data protection regulation'),
 ('nlu', 'national law university'),
 ('yogi adityanath', 'current Chief Minister'),
 ('upsee', 'entrance exam'),
 ('boruto', "naruto ' s son"),
 ('bnbr', 'moderation'),
 ('ssc chsl', 'Staff Selection commission Combined Higher Secondary Level'),
 ('sean kernan', 'writer'),
 ('amcat', 'Aspiring Minds Computer Adaptive Test'),
 ('udacity', 'coursera'),
 ('josaa', 'joint seat allocation authority'),
 ('kylo ren', 'fictional character'),
 ('ali alshamsi', 'Entrepreneur'),
 ('balaji vishwanathan', 'CEO of Invento Robotics'),
 ('iitian', 'students graduated from an indian institute'),
 ('dceu', 'american media franchise'),
 ('litecoin', 'bitcoin'),
 ('unacademy', 'indian largest learning platform'),
 ('iiest', 'Indian Institute of Engineering Science and Technology'),
 ('laravel', 'php Framework'),
 ('sjws', 'social justice warrior'),
 ('rvce', 'College of Engineering'),
 ('qoura', 'quora'),
 ('vjti', 'Technological Institute'),
 ('zerodha', 'Indian financial service company'),
 ('jeremy corbyn', 'british politician'),
 ('xlri', 'School of Management'),
 ('msrit', 'Institute of Technology'),
 ('iitb', 'Indian Institute of Technology'),
 ('tensorflow', 'scala'),
 ('intps', 'intp'),
 ('ctmu', 'Cognitive - Theoretic Model of the Universe'),
 ('jiit', 'Institute of Information Technology'),
 ('gitam', 'Institute of Technology and Management'),
 ('doklam', 'china indian border'),
 ('hyperloop', 'high - speed train'),
 ('gopal kavalireddi', 'Maverick'),
 ('lnmiit', 'Institute of Information Technology'),
 ('myntra', 'indian fashion e - commerce company'),
 ('intjs', 'intj'),
 ('muoet', 'entrance exam'),
 ('mnnit', 'indian institute'),

 ('xamarin', 'html5'),
 ('nitk', 'National Institute of Technology'),
 ('tywin', 'fictional character'),
 ('kotlin', 'java'),
 ('nicmar', 'National Institute of Construction Management and Research'),
 ('shibpur', 'Howrah'),  # shadowed by the later ('shibpur', "hindi") entry
 ('nptel', 'National Programme on Technology Enhanced Learning'),
 ('niser', 'National Institute of Science Education and Research'),
 ('ggsipu', 'Indraprastha University'),
 ('moocs', 'coursera'),
 ('banasthali vidyapeeth', "women ' s university"),
 ('modiji', 'modi'),
 ('vajiram and ravi', 'institute for exam preparation'),
 ('adhaar', 'id'),
 ('manafort', 'political consultant'),
 ('duterte', 'filipino politician'),
 ('zebpay', 'bitcoins'),
 ('elitmus', 'assessment and recruitment company'),
 ('infjs', 'infj'),
 ('srmjee', 'Joint Entrance Exam'),
 ('aurangzeb', 'sixth mughal emperor'),
 ('biharis', 'kashmiris'),
 ('pubg', 'csgo'),
 ('altcoins', 'bitcoin'),
 ('altcoin', 'bitcoin'),
 ('hackerrank', 'code website'),
 ('awdhesh', 'Educator'),
 ('jiren', 'goku'),
 ('wakanda', 'fictional country'),
 ('draupadi', 'Daughter of Drupada'),
 ('vnit', 'National Institute of Technology'),
 ('nitie', 'indian institute'),
 ('sibm', 'indian business institute'),
 ('aiq', 'all indian quota'),
 ('crispr', 'gene editing'),
 ('mamc', 'indian medical college'),
 ('pichai', 'google ceo'),
 ('ryzen', 'intel'),
 ('nit hamirpur', 'National Institute of Technology'),
 ('duolingo', 'language learning app'),
 ('rohingyas', 'kashmirs'),
 ('hpcl', 'Petroleum Corporation'),
 ('baahubali', 'deadpool'),
 ('koinex', 'bitcoin company'),
 ('tifr', 'indian institute'),
 ('srcc', 'indian institute'),
 ('tennesseans', 'tennessean'),
 ('binance', 'bitcoin'),
 ('mhcet', 'entrance exam'),
 ('nabard', 'an apex development financial institution'),
 ('byju', 'the learning app'),
 ('snoke', 'fictional character'),
 ('tillerson', 'former American government official'),
 ('franklin veaux', 'writer'),
 ('srmjeee', 'entrance exam'),
 ('zomato', 'swiggy'),
 ('beerus', 'gogeta'),
 ('sgsits', 'indian institute'),
 ('skripal', 'former russian military intelligence officer'),
 ('ximb', 'indian institute'),
 ('ftre', 'talent reward exam'),
 ('littlefinger', 'fictional character'),
 ('mindtree', 'infosys'),
 ('kulbhushan', 'Indian national'),
 ('nanodegree', 'the certificate of the bachelor degree'),
 ('gurugram', 'gurgaony'),
 ('hotstar', 'youtube'),
 ('mhtcet', 'entrance exam'),
 ('sadhguru', 'indian yogi'),
 ('bmsce', 'indian institute'),
 ('sindri', 'character'),
 ('ramanujan', 'an indian mathematician'),
 ('npat', 'national political awareness test'),
 ('pdpu', 'indian college'),
 ('bipc', 'biology physics chemistry'),
 ('jiofi', 'wifi routers'),
 ('rhaegar', 'fictional character'),
 ('dangal', '2016 indian hindi - language biographical sports drama film'),
 ('microservices', 'micro services'),
 ('kaggle', 'online community of data scientists and machine learners'),
 ('travis kalanick', 'an american billionaire businessman , '),
 ('ramapuram', 'institute'),
 ('iima', 'indian public business school '),
 ('kathua', 'person studied at master of science in computer science'),
 ('jbims', 'indian college'),
 ('azerbaijanis', 'azerbaijani'),
 ('surathkal', 'indian city'),
 ('cbit', 'an indian college'),
 ('obor', 'a development strategy'),
 ('swachh bharat', 'cleanliness campaign'),
 ('ramaiah', 'Belgaum'),
 ('truecaller', 'whatsapp'),
 ('rera', 'an act of the parliament of india in the real estate industry'),
 ('usict', 'indian college'),
 ('gionee', 'nokia'),
 ('sarahah', 'a social networking service'),
 ('clickbait', 'attention - grabbing headlines'),
 ('zenfone', 'vivo'),
 ('afsb', 'Air force selection board'),
 ('bams', 'bachelor of ayurvedic medicine and surgery'),
 ('bhms', 'bachelor of homeopathic medicine and surgery'),
 ('patreon', 'kickstarter'),
 ('lbsnaa', 'research and training institute'),
 ('lyanna', 'fictional character'),
 ('manit', 'National Institute of Technology '),
 ('altucher', 'american hedge fund manager , entrepreneur and podcaster'),
 ('dushka zapata', 'Amateur writer'),
 ('reactjs', 'javascript library'),
 ('quara', 'quora'),
 ('daiict', 'indian college'),
 ('gofundme', 'kickstarter'),
 ('pesit', 'indian college'),
 ('bpcl', 'government of india controlled maharatna oil and gas company'),
 ('nchmct', 'entrance exam'),
 ('cdse', 'center for development of security excellence'),
 ('chromecast', 'ipod'),
 ('bittrex', 'us - based bitcoin exchange'),
 ('dream11', 'xbox'),
 ('iitk', 'indian college'),
 ('codechef', 'competitive programming website'),
 ('bosniaks', ' bosnian'),
 ('kasol', 'hamlet in the district of indian'),
 ('sjce', 'indian college'),
 ('infps', 'infp'),
 ('milo yiannopoulos', 'british polemicist , political commentator , public speaker and writer'),
 ('pessat', 'online exam'),
 ('pcod', 'Polycystic Ovarian Syndrome'),
 ('nimhans', 'indian college'),
 ('msit', 'master of science in information technology'),
 ('demonitisation', 'demonetization'),
 ('iisers', 'indian college'),
 ('gujaratis', 'gujarati people'),

 ('bhubaneshwar', 'capital of Odisha'),
 ('ahmadabad', 'ahmedabad'),
 ('jungkook', 'South Korean singer'),
 ('bs4', '50 ppm emission standards'),
 ('kj somaiya', 'indian college'),
 ('aktu', 'indian college'),
 ('rosenstein', 'American attorney'),
 ('acio', 'Recruitment Exam'),
 ('psir', 'political science and international relations'),
 ('mppsc', 'public service commission'),
 ('ucms', 'University College of Medical Sciences'),
 ('euron', 'fictional character'),
 ('eecs', 'electrical engineering and computer sciences'),
 ('lstm', 'units of a recurrent neural network'),
 ('bitconnect', 'cryptocurrency'),
 ('kalpit veerwal', 'computer science sophomore'),
 ('marathis', 'marathi people'),
 ('deepmind', 'british artificial intelligence company'),
 ('monero', 'cryptocurrency'),
 #('galgotia', 'indian college'),
 ('tapmi', 'indian management institute'),
 ('alabamians', 'alabama people'),
 ('svnit', 'National Institute of Technology'),
 ('venmo', 'paypal'),
 ('trumpcare', 'obamacare'),
 ('aiats', 'All India Aakash Test Series'),
 ('davv', 'University'),
 ('wbcs', ' Civil Service'),
 ('fyers', 'brokerage company'),
 ('iiits', 'Indian Institutes of Information Technology'),
 ('wannacry', 'ransomware worm'),
 ('iimc', 'indian college'),
 ('kellyanne conway', ' american pollster , political consultant'),
 ('capf', 'Central Armed Police Forces'),
 ('mmmut', 'University of Technology'),
 ('airpods', 'headphones'),
 ('owaisi', 'indian politician'),
 ('xxxtentacion', 'american rapper'),
 #('narsee', 'location'),
 #('hbtu', 'government technical university'),
 ('vssut', 'University of Technology'),
 ('nlus', 'national law universities'),
 ('enfps', 'enfp'),
 ('kmno4', 'h2so4'),
 ('hkust', 'Hong Kong University of Science and Technology'),
 ('keam', 'Engineering Architecture Medical'),
 ('rlwl', 'remote location waiting list'),
 ('despacito', 'song'),
 ('epfo', 'Employees Provident Fund Organisation'),
 ('csab', 'central seat allocation board'),
 ('raghuram rajan', 'Indian economist'),
 ('onedrive', 'skydrive'),
 ('ropar', 'guwahati'),
 ('lnct', 'College of Technology'),
 ('trumpers', 'current president supporters'),
 ('sambhaji', 'second ruler of the Maratha kingdom'),
 ('codeforces', 'competitive programming contests website'),
 ('cbcs', 'choice based credit system'),
 ('arrowverse', 'superhero'),
 ('gamora', 'fictional character'),
 ('spjimr', 'Institute of Management And Research'),
 ('grammarly', 'grammar check app'),
 ('ftii', 'Film and Television Institute of India'),
 ('mohajirs', 'Muhajir people'),
 ('fz25', 'bike type'),
 ('zamasu', 'gogeta'),
 ('electroneum', 'cryptocurrency'),
 ('irodov', 'physics'),
 ('jorah', 'fictional character'),
 ('nlsiu', 'indian college'),
 ('mgtow', 'men going their own way'),
 ('thaad', 'missile defense system'),
 ('kingsman', 'film'),
 ('internshala', 'training platform'),
 ('epfl', 'research institute and university'),
 ('nimcet', 'entrance exam'),
 ('hno3', 'h2so4'),
 ('simpliv', 'coursera'),
 ('iiith', 'International Institute of Information Technology'),
 ('telugus', 'Telugu people'),
 ('kovind', '14th President of India'),
 ('genderfluid', 'intersex'),
 ('monjee', 'affiliated and constituent college'),
 ('eflu', 'English and Foreign Languages University'),
 ('baelish', 'fictional character'),
 ('g5s', ''),
 ('jallikattu', 'bull'),
 ('nepalis', 'Nepali'),
 ('whydo', 'why do'),
 ('chapterwise', 'chapter wise'),
 ('gauri lankesh', 'indian journalist - turned - activist'),
 ('tarly', 'fictional character'),
 ('ncerts', 'National Council of Educational Research and Training'),
 ('wework', 'shared workspace company'),
 ('pqwl', 'pooled quota waiting list'),
 ('icbms', 'intercontinental ballistic missile'),
 ('isil', 'isis'),
 ('enfj', 'entp'),
 ('ivan tregear', 'popular teenage writers on Quora'),
 ('entj', 'intj'),
 ('igdtuw', 'Technical University for Women'),
 ('gorkhaland', 'gurkha'),
 ('aricent', 'global design and engineering company'),
 ('icj', 'international court of justice'),
 ('satya nadella', 'microsoft ceo'),
 #('ravindrababu', 'the online teacher'),
 ('twinflame', 'soulmate'),
 #('chitkara', 'indian college'),
 ('iiitd', 'Institute of Information Technology'),
 ('hbti', 'institution'),
 ('padmaavat', 'padmavati'),
 ('kubernetes', 'open - source system for automating deployment'),
 ('tissnet', 'national Entrance Test'),
 ('magoosh', 'online test preparation'),
 ('xiomi', 'xiaomi'),
 ('blockchains', 'block chain'),
 ('jcpoa', 'iran nuclear deal'),
 ('undergraduation', 'Undergraduate education'),
 ('amdocs', 'cisco'),
 ('incels', 'involuntary celibates'),
 ('nsep', 'national security education program'),
 ('iitd', 'indian college'),
 ('kanhaiya', 'leader of the All India Students Federation'),
 ('dgca', 'directorate general of civil aviation '),
 #('tezpur', 'city and urban agglomeration'),
 ('schizoids', 'schizoid'),
 ('byjus', 'online learning platform'),
 ('pitampura', 'Pitam Pura'),
 ('madheshi', 'nepal'),
 ('hackerearth', 'coding platform'),
 ('sicsr', 'Institute of Computer Studies and Research'),
 ('swach bharat abhiyan', 'nation - wide campaign in india'),
 ('lgbtqi', 'lesbian , gay , bisexual , transgender , queer , intersex'),
 ('ra - apist', 'rapist'),
 ('pmay', 'housing benefits'),
 ('topcoder', 'coding platform'),
 ('iitm', 'Indian Institute of Technology'),
 ('odoo', 'enterprise management system'),
 ('vitee', 'entrance exam'),
 ('mscs', 'Mathematics Statistics Computer Science'),
 ('ecil', 'electronics corporation of india limited'),
 ('buet', 'university of engineering and technology'),
 ('ddr4', 'gddr5'),
 ('galgotias', 'indian college'),
 ('martin shkreli', 'former american businessman'),
 ('ahca', 'American Health Care Act'),
 ('zhihu', 'ask and question website'),
 ('veerwal', 'computer science sophomore'),
 ('wikitribune', 'news platform'),
 ('duryodhana', 'major character'),
 ('kiitee', 'entrance exam'),
 ('ipmat', 'Integrated Program in Management Aptitude Test'),
 ('moviepass', 'movie ticketing service'),
 ('extc', 'electronics and telecommunication engineering'),
 ('czechia', 'czech'),
 ('bookmyshow', 'flipkart'),
 ('dhinchak pooja', 'pop singer'),
 ('bhagwad', 'Bhagavad'),
 ('kaneki', 'comic character'),
 ('undertale', 'minecraft'),
 ('peter strzok', 'former united states federal bureau of investigation ( fbi ) agent'),
 ('siddaramaiah', 'indian politician'),
 ('wbut', 'University of Technology'),
 ('arryn', 'fictional character'),
 ('inmo', 'Indian National Mathematics Olympiad'),
 ('gpsc', 'public service commission'),
 ('steemit', 'reddit'),
 ('istp', 'intp'),
 ('chandragupta', 'founder of the Maurya Empire'),
 ('iimb', 'indian college'),
 ('sscbs', 'college of business studies'),
 ('dps bokaro', 'Delhi Public School'),
 ('iitg', 'indian institute of technology'),
 ('quikr', 'advertising platform'),
 ('yourquote', 'microblogging platform'),
 ('jimin', 'singer'),
 ('dima vorobiev', 'one of the pioneers of sports medicine in russia'),
 ('vidyamandir classes', 'Coaching Institute'),
 #('ecosport', 'subcompact crossover suv'),
 ('scmhrd', 'Symbiosis Centre for Management and Human Resource Development'),
 ('cardano', 'cryptocurrency'),
 ('gorsuch', 'associate justice of the supreme court of the united states'),
 ('zootopia', 'american comedy film'),
 ('varc', 'Verbal Ability and Reading Comprehension'),
 ('remainers', 'the group in favour of the united kingdom remaining in the european union'),
 ('bachelore', "bachelor"),
 ('shibpur', "hindi"),  # duplicate key: this value overrides 'Howrah' above
 ('pcmb', "Physics chemistry maths and biology together"),
 ('mnit', "National Institute of Technology"),
 ('what\u200b', 'what'),
 #('rapper,', "rapper"),
 ('whydo', "why do"),  # duplicate of the identical earlier entry
 ('overbrace', '+ -'),
 #('apist', "rapist"),
 ('\ufeff', " "),
 ('how\u200b', "how"),
 ('broglie', 'physicist'),
 #('drumpf', "trump"),
 ('istj', 'intj'),
 ('fullform', 'full form'),
 ('etherium', 'bitcoin'),
 ('nahco3', 'caco3'),
 ('qidian', 'sina'),
 ('na2co3', "caco3"),
 ('cos2x', 'cosx'),
 ('don´t', 'do not'),
 ('2k17', '2017'),
 ('cacl2', 'caco3'),
 ('whyis', 'why is')
]

# Tokens that are also rewritten when nothing precedes them (typically the
# first word of a question), in addition to the usual space-delimited form.
_LEADING_TOKENS = ('whyis', 'whydo', 'how\u200b')

# Lookup table used by the replacement regex. Every key is the token wrapped
# in single spaces so only whole, space-delimited tokens are rewritten; the
# replacement is wrapped the same way to keep neighbouring tokens separated.
mispell_dict = {}
for words in mispell:
    # BUG FIX: the original tested `words in [...]`, comparing the whole
    # (token, replacement) tuple against strings, so this branch never ran.
    if words[0] in _LEADING_TOKENS:
        mispell_dict[words[0]+' '] = words[1]+' '

    mispell_dict[' '+words[0]+' '] = ' '+words[1]+' '

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

# Build the lookup table and its matching regex once, at module level.
mispellings, mispellings_re = _get_mispell(mispell_dict)


def replace_typical_misspell(text):
    """Return ``text`` with every regex match swapped for its table entry.

    Each substring matched by ``mispellings_re`` is looked up verbatim in
    ``mispellings`` and replaced by the mapped value; a match that is not a
    key of the table raises ``KeyError``.
    """
    return mispellings_re.sub(lambda match: mispellings[match.group(0)], text)

# Rewrite the listed tokens in every question of both splits.
train["question_text"] = train["question_text"].apply(replace_typical_misspell)
test["question_text"] = test["question_text"].apply(replace_typical_misspell)

In [None]:
## Configuration
embed_size = 300     # size of a single word vector
max_features = None  # vocabulary cap handed to the Tokenizer (None = no cap)
maxlen = 72          # tokens kept per question (original note: "99.99%" — presumably the share of questions that fit; TODO confirm)

## Substitute a placeholder for missing questions and take the raw arrays
X = train["question_text"].fillna("_na_").values
X_test = test["question_text"].fillna("_na_").values

## Fit one tokenizer on train + test text; filters='' leaves every character
## in place because punctuation was already space-separated upstream
tokenizer = Tokenizer(num_words=max_features, filters='')
tokenizer.fit_on_texts([*X, *X_test])

## Encode each question as word indices, then pad/truncate to maxlen
X = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=maxlen)
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=maxlen)

## Training labels
Y = train['target'].values

## Keep the test question ids
sub = test[['qid']]

In [None]:
# Everything needed from the raw frames has been extracted above; drop them
# and trigger a collection pass to release the memory.
del train
del test
gc.collect()

In [None]:
word_index = tokenizer.word_index
max_features = len(word_index)+1


def _build_embedding_matrix(embedding_file, word_index, min_line_len=None, **open_kwargs):
    """Read a text embedding file and build a matrix indexed by ``word_index``.

    Shared implementation behind ``load_glove``, ``load_fasttext`` and
    ``load_para``.

    Args:
        embedding_file: path of a text file with one ``word v1 v2 ...`` entry
            per line, fields separated by single spaces.
        word_index: mapping word -> integer row index; only words present in
            it are kept from the file.
        min_line_len: when given, lines whose length is not greater than this
            are skipped (used to drop short header lines).
        **open_kwargs: forwarded to ``open`` (e.g. ``encoding``, ``errors``).

    Returns:
        Array of shape ``(max_features, vector_size)``, where ``max_features``
        is the module-level global. Rows for words found in the file hold
        their vectors; all other rows keep a random draw from a normal
        distribution with the mean/std of the loaded vectors.

    Raises:
        ValueError: if no word of ``word_index`` occurs in the file.
    """
    embeddings_index = {}
    # `with` guarantees the (potentially multi-GB) file handle is closed.
    with open(embedding_file, **open_kwargs) as handle:
        for line in handle:
            if min_line_len is not None and len(line) <= min_line_len:
                continue
            # Split on a single space exactly once per line; the first field
            # is the word, the rest are the vector components.
            word, *coefs = line.split(" ")
            if word in word_index:
                embeddings_index[word] = np.asarray(coefs, dtype='float32')

    if not embeddings_index:
        raise ValueError("no word from word_index was found in %s" % embedding_file)

    # np.stack needs a real sequence, not a dict view.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, embed_size))
    for word, embedding_vector in embeddings_index.items():
        i = word_index[word]
        if i >= max_features:
            continue
        embedding_matrix[i] = embedding_vector

    return embedding_matrix


def load_glove(word_index):
    """Build the embedding matrix from the GloVe 840B 300d text file."""
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    # Explicit encoding so the result does not depend on the platform default.
    return _build_embedding_matrix(EMBEDDING_FILE, word_index, encoding="utf8")


def load_fasttext(word_index):
    """Build the embedding matrix from the fastText wiki-news 300d .vec file.

    Lines of 100 characters or fewer are skipped.
    """
    EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    return _build_embedding_matrix(EMBEDDING_FILE, word_index, min_line_len=100,
                                   encoding="utf8")


def load_para(word_index):
    """Build the embedding matrix from the Paragram SL999 300d text file.

    Lines of 100 characters or fewer are skipped, and undecodable bytes are
    ignored while reading.
    """
    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    return _build_embedding_matrix(EMBEDDING_FILE, word_index, min_line_len=100,
                                   encoding="utf8", errors='ignore')

In [None]:
# Load the GloVe and Paragram matrices and join them side by side, so each
# word row carries both vectors. fastText is currently left out.
embedding_matrix_1 = load_glove(word_index)
#embedding_matrix_2 = load_fasttext(word_index)
embedding_matrix_3 = load_para(word_index)
embedding_matrix = np.concatenate([embedding_matrix_1, embedding_matrix_3], axis=1)

# The per-source matrices are no longer needed once concatenated.
del embedding_matrix_1
del embedding_matrix_3
gc.collect()
embedding_matrix.shape

In [None]:
class AttentionWeightedAverage(Layer):
    """
    Computes a weighted average of the different channels across timesteps.
    Uses 1 parameter pr. channel to compute the attention value for a single timestep.

    The input must be a 3-D tensor. Each timestep is scored by a dot product
    with a learned vector, the scores are turned into weights that sum to one
    over the timestep axis, and the input is averaged with those weights.

    Args:
        return_attention: when True, ``call`` returns ``[result, weights]``
            instead of only ``result``.
    """

    def __init__(self, return_attention=False, **kwargs):
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(** kwargs)

    def build(self, input_shape):
        """Create the single scoring weight of shape ``(channels, 1)``."""
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        # add_weight registers W with the layer, so the original extra
        # `self.trainable_weights = [self.W]` assignment is not needed (and
        # fails where that attribute is read-only).
        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        # epsilon guards against division by zero when every weight is zero
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        """Alias that forwards to ``compute_output_shape``."""
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        """Output drops the timestep axis; the weights drop the channel axis."""
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        """No mask is propagated past this layer (one ``None`` per incoming mask)."""
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None

    def get_config(self):
        """Include ``return_attention`` so the layer can be re-created from its config."""
        config = {'return_attention': self.return_attention}
        base_config = super(AttentionWeightedAverage, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [None]:
class AdamW(Optimizer):
    """Adam-style optimizer with a decoupled weight-decay term.

    In addition to the Adam step, every parameter is shrunk by
    ``lr * weight_decay * p`` on each update (see ``get_updates``).

    Args:
        lr: learning rate.
        beta_1: decay rate of the first-moment (mean) estimate.
        beta_2: decay rate of the second-moment (squared gradient) estimate.
        weight_decay: coefficient of the decoupled weight-decay term.
        epsilon: constant added to the denominator of the Adam step.
        decay: when > 0, the learning rate is scaled by
            ``1 / (1 + decay * iterations)``.
        **kwargs: forwarded to the base ``Optimizer``.
    """

    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4,  # decoupled weight decay (1/4)
                 epsilon=1e-8, decay=0., **kwargs):
        super(AdamW, self).__init__(**kwargs)
        # Hyper-parameters are stored as backend variables under a name scope
        # named after the class.
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.lr = K.variable(lr, name='lr')
            self.beta_1 = K.variable(beta_1, name='beta_1')
            self.beta_2 = K.variable(beta_2, name='beta_2')
            self.decay = K.variable(decay, name='decay')
            self.wd = K.variable(weight_decay, name='weight_decay') # decoupled weight decay (2/4)
        self.epsilon = epsilon
        # Plain Python copy of `decay`, used to decide whether to build the
        # learning-rate schedule at all.
        self.initial_decay = decay

    @interfaces.legacy_get_updates_support
    def get_updates(self, loss, params):
        """Build and return the list of update ops for one optimization step.

        The list contains the iteration-counter increment followed, for each
        parameter, by the updates of its two moment accumulators and of the
        parameter itself.
        """
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]
        wd = self.wd # decoupled weight decay (3/4)

        lr = self.lr
        if self.initial_decay > 0:
            # inverse-time learning-rate decay
            lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

        # bias-corrected step size for step t (1-based)
        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        # one first-moment (ms) and one second-moment (vs) accumulator per parameter
        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            # Adam step plus the decay term; the decay uses `lr` (not the
            # bias-corrected `lr_t`) and is applied to the parameter directly
            # rather than being added to the gradient.
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p # decoupled weight decay (4/4)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        """Return the hyper-parameters (as Python floats) merged into the base config."""
        config = {'lr': float(K.get_value(self.lr)),
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'decay': float(K.get_value(self.decay)),
                  'weight_decay': float(K.get_value(self.wd)),
                  'epsilon': self.epsilon}
        base_config = super(AdamW, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [None]:
def squash(x, axis=-1):
    """Divide ``x`` by its L2 norm along ``axis``.

    ``K.epsilon()`` is added to the squared norm before the square root so an
    all-zero vector does not cause a division by zero.
    """
    # An earlier variant scaled by sqrt(n) / (0.5 + n), with n the squared
    # norm; it was dropped with the note "s_squared_norm is really small".
    norm = K.sqrt(K.sum(K.square(x), axis, keepdims=True) + K.epsilon())
    return x / norm

# A Capsule Implement with Pure Keras
# A Capsule Implement with Pure Keras
class Capsule(Layer):
    """Capsule layer with iterative routing.

    Projects the input vectors into ``num_capsule * dim_capsule`` features,
    then runs ``routings`` rounds in which coupling coefficients (a softmax
    over the output capsules) weight the projected inputs and are refined
    from the agreement between the outputs and the projections.

    Args:
        num_capsule: number of output capsules.
        dim_capsule: size of each output capsule vector.
        routings: number of routing rounds; must be at least 1.
        kernel_size: stored and reported by ``get_config`` but not used by
            the computation.
        share_weights: when True one projection is shared by all input
            positions; otherwise each position gets its own.
        activation: ``'default'`` uses ``squash``; any other value is passed
            to ``Activation``.
    """

    def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Capsule, self).__init__(**kwargs)
        if routings < 1:
            # With zero rounds `call` would have no output to return.
            raise ValueError('routings must be >= 1, got %r' % (routings,))
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        # Keep the constructor argument itself so get_config can report it.
        self._activation_arg = activation
        if activation == 'default':
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        """Create the projection kernel (shared, or one per input position)."""
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(1, input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     # shape=self.kernel_size,
                                     initializer='glorot_uniform',
                                     trainable=True)
        else:
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(input_num_capsule,
                                            input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)

    def call(self, u_vecs):
        """Project ``u_vecs`` and run the routing rounds; returns the capsule outputs."""
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
        for i in range(self.routings):
            # Softmax runs over the last axis, so move num_capsule there,
            # normalise, and move it back; `b` itself is left untouched.
            c = K.softmax(K.permute_dimensions(b, (0, 2, 1)))
            c = K.permute_dimensions(c, (0, 2, 1))
            outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.routings - 1:
                # Refine the routing logits from output/projection agreement.
                b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        """Output shape is ``(None, num_capsule, dim_capsule)``."""
        return (None, self.num_capsule, self.dim_capsule)

    def get_config(self):
        """Report the constructor arguments so the layer can be re-created from its config."""
        config = {'num_capsule': self.num_capsule,
                  'dim_capsule': self.dim_capsule,
                  'routings': self.routings,
                  'kernel_size': self.kernel_size,
                  'share_weights': self.share_weights,
                  'activation': self._activation_arg}
        base_config = super(Capsule, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [None]:
#config
# Width of one row of the combined embedding matrix. Derived from the matrix
# itself rather than hard-coded (previously 600), so the Embedding layers
# stay consistent if the set of concatenated embeddings changes.
embed_size = embedding_matrix.shape[1]

In [None]:
def singleGRU():
    """Build and compile a bidirectional-GRU classifier with global max pooling.

    Architecture: frozen pre-trained embedding -> spatial dropout ->
    bidirectional CuDNNGRU (120 units, full sequence) -> global max pooling
    -> single sigmoid unit. Compiled with binary cross-entropy and
    ``AdamW(weight_decay=0.02)``.

    Reads the module-level ``maxlen``, ``max_features``, ``embed_size`` and
    ``embedding_matrix``, and clears the Keras session before building.

    Returns:
        The compiled Keras ``Model``.
    """
    K.clear_session()
    inp = Input(shape=(maxlen,))
    # BUG FIX: the original passed the undefined name `embed_siz`, which
    # raised NameError as soon as the function was called.
    x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)
    x = SpatialDropout1D(rate=0.22, seed=1024)(x)
    x = Bidirectional(CuDNNGRU(120, return_sequences=True, 
                                kernel_initializer=glorot_normal(seed=12300), 
                               recurrent_initializer=orthogonal(gain=1.0, seed=10000)))(x)

    x = GlobalMaxPool1D()(x)
    x = Dense(1, activation="sigmoid",kernel_initializer=glorot_normal(seed=12300))(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer=AdamW(weight_decay=0.02),)
    return model

In [None]:
def singleGRU_II():
    """Build and compile a bidirectional-GRU classifier with max + average pooling.

    Architecture: frozen pre-trained embedding -> spatial dropout ->
    bidirectional CuDNNGRU (120 units, full sequence) -> global max pooling
    and global average pooling, concatenated -> single sigmoid unit.
    Compiled with binary cross-entropy and ``AdamW(weight_decay=0.06)``.

    Reads the module-level ``maxlen``, ``max_features``, ``embed_size`` and
    ``embedding_matrix``, and clears the Keras session before building.

    Returns:
        The compiled Keras ``Model``.
    """
    K.clear_session()
    inp = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)
    embedded = SpatialDropout1D(rate=0.22, seed=1024)(embedded)

    encoder = Bidirectional(CuDNNGRU(120, return_sequences=True, return_state=True,
                                     kernel_initializer=glorot_normal(seed=12300),
                                     recurrent_initializer=orthogonal(gain=1.0, seed=10000)))
    # return_state=True yields two extra tensors besides the sequence output
    # (presumably the final states of the two directions); neither is used.
    sequence, _state_1, _state_2 = encoder(embedded)

    pooled = concatenate([GlobalMaxPool1D()(sequence), GlobalAvgPool1D()(sequence)], axis=-1)
    prediction = Dense(1, activation="sigmoid", kernel_initializer=glorot_normal(seed=12300))(pooled)

    model = Model(inputs=inp, outputs=prediction)
    model.compile(loss='binary_crossentropy', optimizer=AdamW(weight_decay=0.06),)
    return model

In [None]:
def GRU_Attention():
    """Build and compile a bidirectional-GRU classifier with an attention head.

    Graph: non-trainable embedding initialised from ``embedding_matrix`` ->
    spatial dropout -> bidirectional CuDNNGRU (120 units per direction, full
    sequence returned) -> ``AttentionWeightedAverage`` -> one sigmoid unit.

    Relies on the notebook-level names ``maxlen``, ``max_features``,
    ``embed_size``, ``embedding_matrix`` and ``AttentionWeightedAverage``.

    Returns:
        A compiled Keras ``Model`` using binary cross-entropy loss and
        ``AdamW(weight_decay=0.02)``.
    """
    K.clear_session()
    tokens = Input(shape=(maxlen,))
    embedded = Embedding(
        max_features, embed_size, weights=[embedding_matrix], trainable=False
    )(tokens)
    embedded = SpatialDropout1D(rate=0.22, seed=1024)(embedded)

    gru = CuDNNGRU(
        120,
        return_sequences=True,
        kernel_initializer=glorot_normal(seed=12300),
        recurrent_initializer=orthogonal(gain=1.0, seed=10000),
    )
    sequence = Bidirectional(gru)(embedded)

    # Collapse the sequence into a single vector with the attention layer.
    summary = AttentionWeightedAverage()(sequence)
    prob = Dense(
        1, activation="sigmoid", kernel_initializer=glorot_normal(seed=12300)
    )(summary)

    model = Model(inputs=tokens, outputs=prob)
    model.compile(loss='binary_crossentropy', optimizer=AdamW(weight_decay=0.02),)
    return model

In [None]:
#epoch=5
def parallelRNN():
    """Build and compile a classifier with parallel BiGRU and BiLSTM branches.

    Graph: non-trainable embedding initialised from ``embedding_matrix`` ->
    spatial dropout -> two branches reading the same embedded sequence
    (bidirectional CuDNNGRU and bidirectional CuDNNLSTM, 64 units per
    direction each) -> concatenation along the feature axis -> global max
    pooling -> one sigmoid unit.

    Relies on the notebook-level names ``maxlen``, ``max_features``,
    ``embed_size`` and ``embedding_matrix``.

    Returns:
        A compiled Keras ``Model`` using binary cross-entropy loss and
        ``AdamW(weight_decay=0.06)``.
    """
    K.clear_session()
    inp = Input(shape=(maxlen,))
    embedding_layer = Embedding(max_features,
                                embed_size,
                                weights=[embedding_matrix],
                                input_length=maxlen,
                                trainable=False)(inp)
    embedding_layer = SpatialDropout1D(0.2, seed=1024)(embedding_layer)

    # Both recurrent branches consume the same dropped-out embeddings.
    x = Bidirectional(CuDNNGRU(64, return_sequences=True,
                               kernel_initializer=glorot_uniform(seed=125422),
                               recurrent_initializer=Orthogonal(gain=1.0, seed=123000)))(embedding_layer)
    y = Bidirectional(CuDNNLSTM(64, return_sequences=True,
                                kernel_initializer=glorot_uniform(seed=111000),
                                recurrent_initializer=Orthogonal(gain=1.0, seed=123000)))(embedding_layer)
    c = concatenate([x, y], axis=2)
    c = GlobalMaxPooling1D()(c)

    # NOTE(review): unlike the other model builders, this output layer does not
    # pass a seeded kernel_initializer, so its initial weights are not pinned.
    output_layer = Dense(1, activation="sigmoid")(c)
    model = Model(inputs=inp, outputs=output_layer)
    model.compile(loss='binary_crossentropy', optimizer=AdamW(weight_decay=0.06))
    return model

In [None]:
#epoch=7
def poolrnn():
    """Build and compile a bidirectional-GRU classifier with max + average pooling.

    Graph: non-trainable embedding initialised from ``embedding_matrix`` ->
    spatial dropout -> bidirectional CuDNNGRU (120 units per direction, full
    sequence returned) -> concatenation of global max pooling and global
    average pooling -> one sigmoid unit.

    Relies on the notebook-level names ``maxlen``, ``max_features``,
    ``embed_size`` and ``embedding_matrix``.

    Returns:
        A compiled Keras ``Model`` using binary cross-entropy loss and
        ``AdamW(weight_decay=0.02)``.
    """
    K.clear_session()
    tokens = Input(shape=(maxlen,))
    embedded = Embedding(
        max_features,
        embed_size,
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=False,
    )(tokens)
    embedded = SpatialDropout1D(0.22, seed=1024)(embedded)

    gru = CuDNNGRU(
        120,
        return_sequences=True,
        kernel_initializer=glorot_uniform(seed=111000),
        recurrent_initializer=Orthogonal(gain=1.0, seed=123000),
    )
    sequence = Bidirectional(gru)(embedded)

    # Two fixed-size summaries of the same GRU output, joined feature-wise.
    pooled_max = GlobalMaxPooling1D()(sequence)
    pooled_avg = GlobalAveragePooling1D()(sequence)
    features = concatenate([pooled_max, pooled_avg], axis=1)

    prob = Dense(
        1, activation="sigmoid", kernel_initializer=glorot_uniform(seed=111000)
    )(features)
    model = Model(inputs=tokens, outputs=prob)
    model.compile(loss='binary_crossentropy', optimizer=AdamW(weight_decay=0.02))
    return model

In [None]:
def doubleRNN():
    """Build and compile a two-layer stacked bidirectional-GRU classifier.

    Graph: non-trainable embedding initialised from ``embedding_matrix`` ->
    spatial dropout -> first bidirectional CuDNNGRU (64 units per direction)
    -> second bidirectional CuDNNGRU fed by the first -> concatenation of both
    GRU output sequences -> global max pooling -> one sigmoid unit.

    Relies on the notebook-level names ``maxlen``, ``max_features``,
    ``embed_size`` and ``embedding_matrix``.

    Returns:
        A compiled Keras ``Model`` using binary cross-entropy loss and
        ``AdamW(weight_decay=0.06)``.
    """
    K.clear_session()
    tokens = Input(shape=(maxlen,))

    embedded = Embedding(
        max_features,
        embed_size,
        weights=[embedding_matrix],
        trainable=False,
        name='Embedding',
    )(tokens)
    embedded = SpatialDropout1D(0.22, seed=11110000)(embedded)

    lower_gru = CuDNNGRU(
        64,
        return_sequences=True,
        kernel_initializer=glorot_uniform(seed=111100),
        recurrent_initializer=Orthogonal(gain=1.0, seed=123000),
    )
    lower_seq = Bidirectional(lower_gru)(embedded)

    upper_gru = CuDNNGRU(
        64,
        return_sequences=True,
        kernel_initializer=glorot_uniform(seed=111000),
        recurrent_initializer=Orthogonal(gain=1.0, seed=1203000),
    )
    upper_seq = Bidirectional(upper_gru)(lower_seq)

    # Pool over the outputs of both recurrent layers, not just the top one.
    stacked = concatenate([lower_seq, upper_seq])
    pooled = GlobalMaxPooling1D()(stacked)
    prob = Dense(
        1, activation='sigmoid', kernel_initializer=glorot_uniform(seed=111100)
    )(pooled)

    model = Model(inputs=tokens, outputs=prob)
    model.compile(loss='binary_crossentropy', optimizer=AdamW(weight_decay=0.06),)
    return model

In [None]:
def f1_smart(y_true, y_pred):
    """Find the decision threshold that maximises F1 with a single sort.

    Samples are ordered by predicted score. For every cut between two
    neighbouring scores, everything above the cut is treated as predicted
    positive and F1 = 2*TP / (actual positives + predicted positives) is
    evaluated; the best cut wins.

    Args:
        y_true: binary labels (0/1); any array-like, flattened to 1-D.
        y_pred: predicted scores; any array-like of the same size, flattened
            to 1-D.

    Returns:
        ``(best_f1, threshold)`` where ``threshold`` is the midpoint between
        the two neighbouring scores at the best cut.

    Raises:
        ValueError: if the inputs differ in size or hold fewer than two
            samples (no cut can be placed).
    """
    # Accept lists and (n, 1) column vectors as well as flat arrays.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    n = y_true.shape[0]
    if y_pred.shape[0] != n:
        raise ValueError("y_true and y_pred must contain the same number of samples")
    if n < 2:
        raise ValueError("f1_smart needs at least two samples to place a threshold")

    args = np.argsort(y_pred)
    tp = y_true.sum()
    # Cut k keeps samples args[k+1:] as predicted positive: the true positives
    # are the positives not yet passed, and n-1-k samples remain positive.
    true_pos = tp - np.cumsum(y_true[args[:-1]])
    # Always n-1 entries, independent of the dtype of tp.
    denom = tp + np.arange(n - 1, 0, -1)
    fs = true_pos / denom
    res_idx = np.argmax(fs)
    return 2 * fs[res_idx], (y_pred[args[res_idx]] + y_pred[args[res_idx + 1]]) / 2

In [None]:
# Cross-validated training of a small ensemble: one architecture per fold.
# Only the first N_FOLDS_TO_RUN of the stratified folds are trained; the test
# prediction is the plain average over those models.
N_FOLDS_TO_RUN = 3
# Fold i is trained with MODEL_BUILDERS[i].
MODEL_BUILDERS = [singleGRU_II, doubleRNN, parallelRNN, GRU_Attention, singleGRU_II]

kfold = StratifiedKFold(n_splits=7, random_state=10, shuffle=True)
assert N_FOLDS_TO_RUN <= len(MODEL_BUILDERS), "need one model builder per trained fold"
assert N_FOLDS_TO_RUN <= kfold.get_n_splits(), "cannot train more folds than the splitter yields"

bestscore = []  # per-fold F1-optimal decision threshold (averaged by later cells)
logloss = []    # per-fold minimum validation loss
y_test = np.zeros((X_test.shape[0], ))
oof = np.zeros((X.shape[0], ))
# Marks the rows of `oof` that actually received an out-of-fold prediction.
oof_filled = np.zeros((X.shape[0], ), dtype=bool)
# NOTE(review): `epochs` is not consulted below - fit() always runs 7 epochs and
# the checkpoint keeps the best one. Confirm whether per-model epochs were intended.
epochs = [4, 4, 5, 5, 5]

# zip() with the range first stops after N_FOLDS_TO_RUN folds without pulling
# an extra split from the generator.
for i, (train_index, valid_index) in zip(range(N_FOLDS_TO_RUN), kfold.split(X, Y)):
    print('Fold %s'%(i+1))
    X_train, X_val = X[train_index], X[valid_index]
    Y_train, Y_val = Y[train_index], Y[valid_index]

    filepath = "weights_best.h5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=2, save_best_only=True, mode='min')
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=1, min_lr=0.0001, verbose=2)
    # NOTE(review): constructed but not included in `callbacks`, so early
    # stopping is inactive - confirm this is intentional.
    earlystopping = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=2, verbose=2, mode='auto')
    callbacks = [checkpoint, reduce_lr]

    model = MODEL_BUILDERS[i]()
    model.summary()  # prints the summary itself; wrapping it in print() added a stray "None"
    history = model.fit(X_train, Y_train, batch_size=512, epochs=7, validation_data=(X_val, Y_val), verbose=2,
                        callbacks=callbacks, class_weight={0:1, 1:1.25})

    # Score with the best-val_loss weights saved by the checkpoint.
    model.load_weights(filepath)
    y_pred = np.squeeze(model.predict([X_val], batch_size=1024, verbose=2))
    # Divisor is tied to the number of trained folds so the average stays correct.
    y_test += np.squeeze(model.predict([X_test], batch_size=1024, verbose=2)) / N_FOLDS_TO_RUN
    oof[valid_index] = y_pred
    oof_filled[valid_index] = True

    f1, threshold = f1_smart(np.squeeze(Y_val), y_pred)
    print('Optimal F1: {:.4f} at threshold: {:.4f}'.format(f1, threshold))
    bestscore.append(threshold)
    logloss.append(np.min(history.history['val_loss']))
    print('*'*50)
    print('\n')

# Score only the rows that were predicted above. Rows from untrained folds are
# still zero in `oof` and would otherwise be counted as negative predictions.
f1, threshold = f1_smart(np.squeeze(Y)[oof_filled], oof[oof_filled])
print('Optimal F1: {:.4f} at threshold: {:.4f}'.format(f1, threshold))

In [None]:
# Averages over the trained folds: best validation loss and F1-optimal threshold.
mean_val_loss = np.mean(logloss)
mean_threshold = np.mean(bestscore)
mean_val_loss, mean_threshold

In [None]:
# Binarise the averaged test predictions with the mean per-fold threshold and
# write the submission file.
final_threshold = np.mean(bestscore)
y_test = y_test.reshape((-1, 1))
pred_test_y = (y_test > final_threshold).astype(int)
sub['prediction'] = pred_test_y
sub.to_csv("submission.csv", index=False)