In [1]:
#import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
import re
import time
import gc
import random
import os
import math

import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from textblob import TextBlob
import spacy
NLP = spacy.load('en')

import torch
import torch.nn as nn
import torch.utils.data
from torch.utils.data import Dataset, DataLoader
from torch.optim.optimizer import Optimizer

In [3]:
def seed_torch(seed=1029):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [4]:
embed_size = 300 # how big is each word vector
max_features = 98000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 45 # max number of words in a question to use
batch_size = 2048
train_epochs = 4 #5

SEED = 1029

In [5]:
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean_text(x):
    x = str(x)
    for punct in puncts:
        if punct in x: #speed up
            x = x.replace(punct, ' {} '.format(punct))
    return x

def Sp_Tokenizer(text): 
    #Tokenize the text
    return [tok.text for tok in NLP.tokenizer(text)]

def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    x = re.sub(u'\u200b', '', x) # \u200b
    x = re.sub(r'([A-Za-z])\1{3,}', r'\1', x) # same char more than 3 times
    return x

mispell_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

mispellings, mispellings_re = _get_mispell(mispell_dict)
def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]
    return mispellings_re.sub(replace, text)

In [6]:
def add_features(df):
    
    df['question_text'] = df['question_text'].progress_apply(lambda x:str(x))
    df['total_length'] = df['question_text'].progress_apply(len)
    df['capitals'] = df['question_text'].progress_apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['caps_vs_length'] = df.progress_apply(lambda row: float(row['capitals'])/float(row['total_length']),
                                axis=1)
    df['num_words'] = df.question_text.str.count('\S+')
    df['num_unique_words'] = df['question_text'].progress_apply(lambda comment: len(set(w for w in comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']  

    return df

In [7]:
def load_and_prec():
    ## reading
    train_df = pd.read_csv("../input/train.csv")
    test_df = pd.read_csv("../input/test.csv")
    ##test_df = pd.concat([test_df]*7) # 压力测试
    train_df["question_text"] = train_df["question_text"].progress_apply(lambda x: str(x))
    test_df["question_text"] = test_df["question_text"].progress_apply(lambda x: str(x))
    print("Train shape : ",train_df.shape)
    print("Test shape : ",test_df.shape)
    
    ## aux before token
    train_df['init_len'] = train_df["question_text"].progress_apply(len) # initial total len
    test_df['init_len'] = test_df["question_text"].progress_apply(len)
    
    train_df['init_words'] = train_df["question_text"].progress_apply(lambda x: len(x.split(' '))) # initial total words
    test_df['init_words'] = test_df["question_text"].progress_apply(lambda x: len(x.split(' ')))
    
    train_df['caps'] = train_df['question_text'].progress_apply(lambda comment: sum(1 for c in comment if c.isupper())) # initial upper
    test_df['caps'] = test_df['question_text'].progress_apply(lambda comment: sum(1 for c in comment if c.isupper()))

    train_df['nums'] = train_df['question_text'].progress_apply(lambda comment: sum(1 for c in comment if c.isdecimal())) # initial num
    test_df['nums'] = test_df['question_text'].progress_apply(lambda comment: sum(1 for c in comment if c.isdecimal()))
    
    train_df['symbols'] = train_df['question_text'].progress_apply(lambda x: len(''.join(e for e in x if not e.isalnum())) - len(''.join(e for e in x if x==' '))) # initial symbols
    test_df['symbols'] = test_df['question_text'].progress_apply(lambda x: len(''.join(e for e in x if not e.isalnum())) - len(''.join(e for e in x if x==' '))) 
    
    train_df['caps_ratio'] = train_df.progress_apply(lambda row: float(row['caps'])/float(row['init_len']), axis=1) # upper ratio
    test_df['caps_ratio'] = test_df.progress_apply(lambda row: float(row['caps'])/float(row['init_len']), axis=1)
    
    train_df['nums_ratio'] = train_df.progress_apply(lambda row: float(row['nums'])/float(row['init_len']), axis=1) # upper ratio
    test_df['nums_ratio'] = test_df.progress_apply(lambda row: float(row['nums'])/float(row['init_len']), axis=1)

    train_df['symbol_ratio'] = train_df.progress_apply(lambda row: float(row['symbols'])/float(row['init_len']), axis=1) # symbols ratio
    test_df['symbol_ratio'] = test_df.progress_apply(lambda row: float(row['symbols'])/float(row['init_len']), axis=1)

    train_df['word_len_max'] = train_df['question_text'].progress_apply(lambda x: max(len(c) for c in x)) # longest word
    test_df['word_len_max'] = test_df['question_text'].progress_apply(lambda x: max(len(c) for c in x))

    train_df['len_ratio'] = train_df.progress_apply(lambda row: float(row['word_len_max'])/float(row['init_len']), axis=1) # longest word ratio
    test_df['len_ratio'] = test_df.progress_apply(lambda row: float(row['word_len_max'])/float(row['init_len']), axis=1)
    
    features = train_df[['init_len', 'init_words', 'caps', 'nums', 'symbols', 'caps_ratio',
                         'nums_ratio', 'symbol_ratio', 'word_len_max', 'len_ratio']].fillna(0)
    test_features = test_df[['init_len', 'init_words', 'caps', 'nums', 'symbols', 'caps_ratio',
                             'nums_ratio', 'symbol_ratio', 'word_len_max', 'len_ratio']].fillna(0)
    
    # scale
    ss = StandardScaler()
    for i in features.columns:
        ss.fit(np.vstack((features[[i]], test_features[[i]])))
        features[i] = ss.transform(features[[i]].values)
        test_features[i] = ss.transform(test_features[[i]].values)
    
    features = features.values
    test_features = test_features.values
    
    ## lower    
    train_df["question_text"] = train_df["question_text"].progress_apply(lambda x: x.lower())
    test_df["question_text"] = test_df["question_text"].progress_apply(lambda x: x.lower())
    
    ## Clean the text
    train_df["question_text"] = train_df["question_text"].progress_apply(lambda x: clean_text(x))
    test_df["question_text"] = test_df["question_text"].progress_apply(lambda x: clean_text(x))
    
    ## Clean numbers
    train_df["question_text"] = train_df["question_text"].progress_apply(lambda x: clean_numbers(x))
    test_df["question_text"] = test_df["question_text"].progress_apply(lambda x: clean_numbers(x))
    
    ## Clean speelings
    train_df["question_text"] = train_df["question_text"].progress_apply(lambda x: replace_typical_misspell(x))
    test_df["question_text"] = test_df["question_text"].progress_apply(lambda x: replace_typical_misspell(x))
    
    ## Fill up the missing values
    train_X = train_df["question_text"].fillna("_##_")
    test_X = test_df["question_text"].fillna("_##_")    

    ## Tokenize the sentences
    train_vocab = train_X.apply(Sp_Tokenizer)
    tokenizer = Tokenizer(filters='', lower=False, num_words=max_features)
    tokenizer.fit_on_texts(train_vocab)
    del train_vocab
    gc.collect()

    train_X = train_X.values
    test_X = test_X.values
    
    #tokenizer = Tokenizer(num_words=max_features)
    #tokenizer.fit_on_texts(list(train_X))
    train_X = tokenizer.texts_to_sequences(train_X)
    test_X = tokenizer.texts_to_sequences(test_X)

    ## Pad the sentences 
    train_X = pad_sequences(train_X, maxlen=maxlen)
    test_X = pad_sequences(test_X, maxlen=maxlen)

    ## Get the target values
    train_y = train_df['target'].values
    
    #shuffling the data
    np.random.seed(SEED)
    trn_idx = np.random.permutation(len(train_X))

    train_X = train_X[trn_idx]
    train_y = train_y[trn_idx]
    features = features[trn_idx] #
    #return train_X, test_X, train_y, tokenizer.word_index
    return train_X, test_X, train_y, features, test_features, tokenizer.word_index
    
    

In [8]:
def load_glove(word_index):
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index)) + 1 # keras word_index start from 1
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    
    embedding_matrix[0] = np.zeros(embed_size,)
    return embedding_matrix 

def load_para(word_index):
    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore') if len(o)>100)

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index)) + 1 # keras word_index start from 1
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    
    embedding_matrix[0] = np.zeros(embed_size,)
    return embedding_matrix

def load_fasttext(word_index):    
    EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore') if len(o)>100)

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index)) + 1 # keras word_index start from 1
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    
    embedding_matrix[0] = np.zeros(embed_size,)
    return embedding_matrix  

In [9]:
from tqdm import tqdm
tqdm.pandas()

start_time = time.time()

train_X, test_X, train_y, features, test_features, word_index = load_and_prec()
embedding_matrix_1 = load_glove(word_index)
embedding_matrix_2 = load_para(word_index)
###embedding_matrix_3 = load_fasttext(word_index)
total_time = (time.time() - start_time) / 60
print("Took {:.2f} minutes".format(total_time))

#embedding_matrix = np.mean([embedding_matrix_1, embedding_matrix_2], axis=0)
###embedding_matrix = np.concatenate((embedding_matrix_1, embedding_matrix_2,embedding_matrix_3), axis=1)
embedding_matrix = np.concatenate((embedding_matrix_1, embedding_matrix_2), axis=1)###
#embedding_matrix = 0.6*embedding_matrix_1+0.4*embedding_matrix_2
print(np.shape(embedding_matrix))

del embedding_matrix_1, embedding_matrix_2###,embedding_matrix_3
gc.collect()

100%|██████████| 1306122/1306122 [00:01<00:00, 783878.59it/s]
100%|██████████| 56370/56370 [00:00<00:00, 775904.74it/s]
  4%|▍         | 54843/1306122 [00:00<00:02, 548425.29it/s]

Train shape :  (1306122, 3)
Test shape :  (56370, 2)


100%|██████████| 1306122/1306122 [00:01<00:00, 812116.08it/s] 
100%|██████████| 56370/56370 [00:00<00:00, 804760.21it/s]
100%|██████████| 1306122/1306122 [00:02<00:00, 493348.43it/s]
100%|██████████| 56370/56370 [00:00<00:00, 451762.21it/s]
100%|██████████| 1306122/1306122 [00:09<00:00, 142541.30it/s]
100%|██████████| 56370/56370 [00:00<00:00, 141389.23it/s]
100%|██████████| 1306122/1306122 [00:08<00:00, 146593.69it/s]
100%|██████████| 56370/56370 [00:00<00:00, 144683.21it/s]
100%|██████████| 1306122/1306122 [00:13<00:00, 95522.52it/s]
100%|██████████| 56370/56370 [00:00<00:00, 92977.05it/s]
100%|██████████| 1306122/1306122 [00:34<00:00, 37610.79it/s]
100%|██████████| 56370/56370 [00:01<00:00, 39353.98it/s]
100%|██████████| 1306122/1306122 [00:34<00:00, 38236.17it/s]
100%|██████████| 56370/56370 [00:01<00:00, 37271.07it/s]
100%|██████████| 1306122/1306122 [00:34<00:00, 38080.61it/s]
100%|██████████| 56370/56370 [00:01<00:00, 38238.91it/s]
100%|██████████| 1306122/1306122 [00:13<00:00, 

Took 13.53 minutes
(98001, 600)


24

In [10]:
class Attention(nn.Module):
    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)
        
        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0
        
        weight = torch.zeros(feature_dim, 1)
        nn.init.xavier_uniform_(weight)
        self.weight = nn.Parameter(weight)
        
        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))
        
    def forward(self, x, mask=None):
        feature_dim = self.feature_dim
        step_dim = self.step_dim

        eij = torch.mm(
            x.contiguous().view(-1, feature_dim), 
            self.weight
        ).view(-1, step_dim)
        
        if self.bias:
            eij = eij + self.b
            
        eij = torch.tanh(eij)
        a = torch.exp(eij)
        
        if mask is not None:
            a = a * mask

        a = a / torch.sum(a, 1, keepdim=True) + 1e-10

        weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input, 1)

In [11]:
class CyclicLR(object):
    def __init__(self, optimizer, base_lr=1e-3, max_lr=6e-3,
                 step_size=2000, mode='triangular', gamma=1.,
                 scale_fn=None, scale_mode='cycle', last_batch_iteration=-1):

        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer

        if isinstance(base_lr, list) or isinstance(base_lr, tuple):
            if len(base_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} base_lr, got {}".format(
                    len(optimizer.param_groups), len(base_lr)))
            self.base_lrs = list(base_lr)
        else:
            self.base_lrs = [base_lr] * len(optimizer.param_groups)

        if isinstance(max_lr, list) or isinstance(max_lr, tuple):
            if len(max_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} max_lr, got {}".format(
                    len(optimizer.param_groups), len(max_lr)))
            self.max_lrs = list(max_lr)
        else:
            self.max_lrs = [max_lr] * len(optimizer.param_groups)

        self.step_size = step_size

        if mode not in ['triangular', 'triangular2', 'exp_range'] \
                and scale_fn is None:
            raise ValueError('mode is invalid and scale_fn is None')

        self.mode = mode
        self.gamma = gamma

        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = self._triangular_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = self._triangular2_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = self._exp_range_scale_fn
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode

        self.batch_step(last_batch_iteration + 1)
        self.last_batch_iteration = last_batch_iteration

    def batch_step(self, batch_iteration=None):
        if batch_iteration is None:
            batch_iteration = self.last_batch_iteration + 1
        self.last_batch_iteration = batch_iteration
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr

    def _triangular_scale_fn(self, x):
        return 1.

    def _triangular2_scale_fn(self, x):
        return 1 / (2. ** (x - 1))

    def _exp_range_scale_fn(self, x):
        return self.gamma**(x)

    def get_lr(self):
        step_size = float(self.step_size)
        cycle = np.floor(1 + self.last_batch_iteration / (2 * step_size))
        x = np.abs(self.last_batch_iteration / step_size - 2 * cycle + 1)

        lrs = []
        param_lrs = zip(self.optimizer.param_groups, self.base_lrs, self.max_lrs)
        for param_group, base_lr, max_lr in param_lrs:
            base_height = (max_lr - base_lr) * np.maximum(0, (1 - x))
            if self.scale_mode == 'cycle':
                lr = base_lr + base_height * self.scale_fn(cycle)
            else:
                lr = base_lr + base_height * self.scale_fn(self.last_batch_iteration)
            lrs.append(lr)
        return lrs

In [12]:
splits = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED).split(train_X, train_y)) #5

In [13]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

'''
def sigmoid(x):
    return .5 * (1 + np.tanh(.5 * x))
'''

'\ndef sigmoid(x):\n    return .5 * (1 + np.tanh(.5 * x))\n'

In [14]:
'''
def threshold_search(y_true, y_proba):
    best_threshold = 0
    best_score = 0
    for threshold in tqdm([i * 0.01 for i in range(100)]):
        score = f1_score(y_true=y_true, y_pred=y_proba > threshold)
        if score > best_score:
            best_threshold = threshold
            best_score = score
    search_result = {'threshold': best_threshold, 'f1': best_score}
    return search_result
'''

def threshold_search(y_true, y_proba, plot=False):
    precision, recall, thresholds = precision_recall_curve(y_true, y_proba)
    thresholds = np.append(thresholds, 1.001) 
    F = 2 / (1/precision + 1/recall)
    best_score = np.max(F)
    best_th = thresholds[np.argmax(F)]
    if plot:
        plt.plot(thresholds, F, '-b')
        plt.plot([best_th], [best_score], '*r')
        plt.show()
    search_result = {'threshold': best_th , 'f1': best_score}
    return search_result 

In [15]:
class MyDataset(Dataset):
    def __init__(self,dataset):
        self.dataset = dataset

    def __getitem__(self, index):
        data, target = self.dataset[index]

        return data, target, index
    def __len__(self):
        return len(self.dataset)

In [16]:
class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        
        hidden_size = 120
        
        self.embedding = nn.Embedding(max_features, embed_size*2)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False

        self.embedding_dropout = nn.Dropout2d(0.1)
        self.lstm = nn.LSTM(embed_size*2, hidden_size, bidirectional=True, batch_first=True)
        #for weight in self.lstm.parameters():##
            #if len(weight.size()) > 1:
                #torch.nn.init.xavier_normal_(weight.data) ##
        self.gru = nn.GRU(hidden_size*2, hidden_size, bidirectional=True, batch_first=True)
        #for weight in self.gru.parameters():##
            #if len(weight.size()) > 1:##
                #torch.nn.init.xavier_normal_(weight.data) ##orthogonal_
        
        self.lstm_attention = Attention(hidden_size*2, maxlen)
        self.gru_attention = Attention(hidden_size*2, maxlen)
        
        self.linear = nn.Linear(10*hidden_size+10, 16) #
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.batchnorm = nn.BatchNorm1d(16)##
        self.out = nn.Linear(16, 1)
        
    def forward(self, x):
        h_embedding = self.embedding(x[0])#
        h_embedding = torch.squeeze(self.embedding_dropout(torch.unsqueeze(h_embedding, 0)))
        
        h_lstm, _ = self.lstm(h_embedding)
        h_gru, _ = self.gru(h_lstm)
        
        h_lstm_atten = self.lstm_attention(h_lstm)
        h_gru_atten = self.gru_attention(h_gru)
        
        avg_pool = torch.mean(h_gru, 1)
        max_pool, _ = torch.max(h_gru, 1)
        m2_pool = torch.topk(h_gru, 2, dim=1)[0][:,1]
        f = torch.tensor(x[1], dtype=torch.float).cuda()##
        
        conc = torch.cat((h_lstm_atten, h_gru_atten, avg_pool, max_pool, m2_pool, f), 1)#
        conc = self.relu(self.linear(conc))
        conc = self.dropout(conc)
        conc = self.batchnorm(conc)##
        out = self.out(conc)
        
        return out

In [17]:
train_preds = np.zeros((len(train_X)))
test_preds = np.zeros((len(test_X)))
features = np.array(features)##

seed_torch(SEED)

x_test_cuda = torch.tensor(test_X, dtype=torch.long).cuda()
test = torch.utils.data.TensorDataset(x_test_cuda)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

#class_weight = torch.FloatTensor([2.5]).cuda()##

for i, (train_idx, valid_idx) in enumerate(splits):
    x_train_fold = torch.tensor(train_X[train_idx], dtype=torch.long).cuda()
    y_train_fold = torch.tensor(train_y[train_idx, np.newaxis], dtype=torch.float32).cuda()
    kfold_X_features = features[train_idx.astype(int)]##
    kfold_X_valid_features = features[valid_idx.astype(int)]##
    x_val_fold = torch.tensor(train_X[valid_idx], dtype=torch.long).cuda()
    y_val_fold = torch.tensor(train_y[valid_idx, np.newaxis], dtype=torch.float32).cuda()
    
    model = NeuralNet()
    model.cuda()
    
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction="sum").cuda()# pos_weight , pos_weight=class_weight
    #optimizer = torch.optim.Adam(model.parameters(), lr=2.4e-3)#, weight_decay=1e-6
    #scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[3, 4], gamma=0.1) ##
    
    ################################################################################################
    step_size = 256*2
    base_lr, max_lr = 0.00023, 0.0024   
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), 
                             lr=max_lr)
    
    scheduler = CyclicLR(optimizer, base_lr=base_lr, max_lr=max_lr,
               step_size=step_size, mode='exp_range',
               gamma=0.99994)
    ################################################################################################
    
    train = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
    valid = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)

    train = MyDataset(train)##
    valid = MyDataset(valid)##
    
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)
    
    print('Fold {}'.format(i + 1))
    
    for epoch in range(train_epochs):
        start_time = time.time()
        #scheduler.step() ##
        model.train()
        avg_loss = 0.
        for i, (x_batch, y_batch, index) in enumerate(train_loader):
        #for x_batch, y_batch in tqdm(train_loader, disable=True):
            f = kfold_X_features[index]##
            y_pred = model([x_batch,f])
            #y_pred = model(x_batch)
            if scheduler:
                scheduler.batch_step()
                
            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad() 
            loss.backward()
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)
        
        model.eval()
        valid_preds_fold = np.zeros((x_val_fold.size(0)))
        test_preds_fold = np.zeros(len(test_X))
        avg_val_loss = 0.
        for i, (x_batch, y_batch, index) in enumerate(valid_loader):
        #for i, (x_batch, y_batch) in enumerate(valid_loader):
            f = kfold_X_valid_features[index]##
            y_pred = model([x_batch,f]).detach()
            #y_pred = model(x_batch).detach()
            avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
            valid_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
        
        elapsed_time = time.time() - start_time 
        print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t time={:.2f}s'.format(
            epoch + 1, train_epochs, avg_loss, avg_val_loss, elapsed_time))
        
    for i, (x_batch,) in enumerate(test_loader):
    #for i, (x_batch,) in enumerate(test_loader):
        f = test_features[i * batch_size:(i+1) * batch_size]##
        y_pred = model([x_batch,f]).detach()
        #y_pred = model(x_batch).detach()

        test_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

    train_preds[valid_idx] = valid_preds_fold
    test_preds += test_preds_fold / len(splits)    

Fold 1
Epoch 1/4 	 loss=648.1626 	 val_loss=221.9109 	 time=292.91s
Epoch 2/4 	 loss=231.9831 	 val_loss=197.8455 	 time=294.53s
Epoch 3/4 	 loss=214.9310 	 val_loss=203.8569 	 time=294.82s
Epoch 4/4 	 loss=203.9398 	 val_loss=194.8260 	 time=292.76s
Fold 2
Epoch 1/4 	 loss=649.2590 	 val_loss=230.6950 	 time=294.46s
Epoch 2/4 	 loss=231.9146 	 val_loss=200.9555 	 time=293.92s
Epoch 3/4 	 loss=216.1642 	 val_loss=202.2483 	 time=294.00s
Epoch 4/4 	 loss=205.1613 	 val_loss=195.7174 	 time=294.67s
Fold 3
Epoch 1/4 	 loss=604.9511 	 val_loss=241.4589 	 time=294.74s
Epoch 2/4 	 loss=234.9155 	 val_loss=203.0195 	 time=294.00s
Epoch 3/4 	 loss=219.5371 	 val_loss=207.6238 	 time=295.03s
Epoch 4/4 	 loss=209.7984 	 val_loss=196.3482 	 time=294.26s
Fold 4


  


Epoch 1/4 	 loss=626.1130 	 val_loss=224.9655 	 time=294.90s
Epoch 2/4 	 loss=232.3809 	 val_loss=202.7327 	 time=295.45s
Epoch 3/4 	 loss=217.1527 	 val_loss=204.8001 	 time=294.41s
Epoch 4/4 	 loss=206.3008 	 val_loss=198.6197 	 time=293.85s
Fold 5
Epoch 1/4 	 loss=595.7119 	 val_loss=218.8594 	 time=294.73s
Epoch 2/4 	 loss=233.5230 	 val_loss=198.5901 	 time=294.28s
Epoch 3/4 	 loss=216.7403 	 val_loss=200.9458 	 time=293.92s
Epoch 4/4 	 loss=205.7086 	 val_loss=193.1808 	 time=293.78s


In [18]:
search_result = threshold_search(train_y, train_preds)
search_result



{'threshold': 0.3100835680961609, 'f1': 0.6934346861092263}

In [19]:
sub = pd.read_csv("../input/test.csv")
sub = sub[['qid']]
##test_preds = test_preds[:56370]##压力测试
sub['prediction'] = test_preds > search_result['threshold']
sub.to_csv("submission.csv", index=False)

In [20]:
!head submission.csv

qid,prediction
00014894849d00ba98a9,False
000156468431f09b3cae,False
000227734433360e1aae,False
0005e06fbe3045bd2a92,False
00068a0f7f41f50fc399,False
000a2d30e3ffd70c070d,False
000b67672ec9622ff761,False
000b7fb1146d712c1105,False
000d665a8ddc426a1907,False
