Greetings! This is my first Kaggle competition.
I've taken an awesome notebook https://www.kaggle.com/sergeifironov/bowl-stabilize-coefs-cntrs-all5 by https://www.kaggle.com/sergeifironov and tried adding a few features.
The misses count feature from https://www.kaggle.com/bhavikapanara/2019-dsb-with-more-features-qwk-0-549 by https://www.kaggle.com/bhavikapanara worked pretty well and increased the private score to 0.563.

In [None]:
# %%writefile code_counts.py
import pandas as pd
import gc
from tqdm import tqdm
def create_counters(fout):
    """Count events per event code for every game session and save them as CSV.

    Reads ``/kaggle/input/data-science-bowl-2019/<fout>.csv``, sorts the rows
    by installation and timestamp, numbers the sessions with a running count
    of ``event_code == 2000`` rows and writes one row per session (its number
    plus one ``event_code_<code>`` counter per known code) to
    ``<fout>_code_counts.csv`` in the working directory.

    Args:
        fout: base name of the competition file, e.g. ``'train'`` or ``'test'``.
    """
    source_path = '/kaggle/input/data-science-bowl-2019/' + fout + '.csv'
    events = pd.read_csv(source_path)[['timestamp', 'installation_id', 'event_code']]

    #cut off 90% for test purpose
    #if fout == 'train': events = events[int(len(events)*0.9):]

    events['timestamp'] = pd.to_datetime(events['timestamp'])
    events = events.sort_values(['installation_id', 'timestamp']).reset_index(drop=True)
    # The running total of 2000 events acts as a global session number.
    events['event_global_enc'] = (events['event_code'] == 2000).astype(int)
    events['event_global_enc'] = events.event_global_enc.cumsum()

    per_code = (
        events.groupby(['event_global_enc', 'event_code'])
        .agg({'timestamp': 'count'})
        .reset_index()
    )
    per_code.columns = ['event_global_enc', 'event_code', 'event_code_count']
    del events
    gc.collect()

    event_codes = ['2000','3010','3110','4020','4021','4030','4035','4070',
                   '4090','2020','2030','2040','2050','2080','2083','3020',
                   '3021','3120','3121','4010','2060','2070','4031','4025',
                   '5000','5010','2081','2025','4022','2010','2035','4040',
                   '4100','4110','4045','4095','4220','2075','4230','4235',
                   '4080','4050']

    rows = []
    for session_id, session_counts in tqdm(per_code.groupby('event_global_enc')):
        counts_by_code = session_counts.set_index('event_code')['event_code_count'].to_dict()
        row = {'event_global_enc': session_id}
        # Codes that never occurred in this session get an explicit zero.
        row.update(
            ('event_code_' + code, counts_by_code.get(int(code), 0))
            for code in event_codes
        )
        rows.append(row)

    pd.DataFrame(rows).to_csv(fout + '_code_counts.csv', index=False)

In [None]:
# %%writefile create_code_counters_train.py
# from code_counts import create_counters
# Build train_code_counts.csv (per-session event-code counts) from the
# competition's train.csv.
create_counters('train')

In [None]:
# %%writefile create_code_counters_test.py
# from code_counts import create_counters
# Build test_code_counts.csv (per-session event-code counts) from the
# competition's test.csv.
create_counters('test')

In [None]:
# %%time 
# !python create_code_counters_train.py
# !python create_code_counters_test.py

In [None]:
def get_miss(x):
    """Return the ``misses`` value stored in a JSON-encoded event payload.

    Args:
        x: the raw ``event_data`` value, normally a JSON string.

    Returns:
        The value under the top-level ``"misses"`` key, or ``0`` when ``x`` is
        not a string (e.g. NaN/None), is not valid JSON, is not a JSON object,
        or has no ``"misses"`` key.
    """
    # Imported here because this cell does not import json itself; with the
    # previous bare ``except`` a missing import was silently turned into 0.
    import json

    try:
        return json.loads(x)['misses']
    except (TypeError, ValueError, KeyError):
        # TypeError: non-string input or non-object JSON; ValueError: invalid
        # JSON (json.JSONDecodeError); KeyError: no "misses" field.
        return 0

In [None]:
# Names of the miss-based feature columns. load_csv and create_text_file read
# this module-level list when selecting their output columns, and the
# modelling cells append it to their numeric feature lists.
miss_cols = ['cum_misses']


In [None]:
# %%writefile preproc.py

import numpy as np 
import pandas as pd 
import json
from pandas.io.json import json_normalize
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit, KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
import functools
from multiprocessing import Pool
import logging
import gc
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import pickle

def load_csv(filename, fout):
    """Load a raw event log and derive per-event and per-session features.

    Args:
        filename: path of the CSV to read. The code uses the columns
            ``timestamp``, ``installation_id``, ``game_session``,
            ``event_data``, ``event_code``, ``title`` and ``type``.
        fout: dataset tag (``'train'``/``'test'``). It is only referenced by
            the commented-out sub-sampling line and is otherwise unused.

    Returns:
        A DataFrame with one row per event, sorted by installation and
        timestamp, restricted to the identifier columns, session aggregates,
        cumulative per-assessment attempt counters and the columns named in
        the module-level ``miss_cols`` list.
    """
    df = pd.read_csv(filename)
    
    #if fout == 'train': df = df[int(len(df)*0.9):]
    
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values(['installation_id', 'timestamp']).reset_index(drop=True)
    
    # Raw substring flags; they are restricted to real assessment attempts below.
    df['correct'] = df['event_data'].str.contains('"correct":true').astype(int)
    df['incorrect'] = df['event_data'].str.contains('"correct":false').astype(int)
    
    def get_miss(x):
        """Return the "misses" field of a JSON payload, or 0 if unavailable."""
        try:
            return json.loads(x)['misses']
        except (TypeError, ValueError, KeyError):
            # Non-string input, invalid JSON, non-object JSON or missing key.
            # Narrowed from a bare ``except`` so unrelated errors still surface.
            return 0
        
    # we add misses counter 
    df['misses'] = df['event_data'].apply(get_miss)
    grp = df[['installation_id', 'game_session', 'misses']].groupby(['installation_id', 'game_session'])
    # Running total of misses inside each (installation, game_session) pair.
    df['cum_misses'] = grp['misses'].transform(pd.Series.cumsum)

    
    df.drop(['event_data', 'misses'], axis=1, inplace=True)
    gc.collect()
    # One token per event: title + event code + correct flag + incorrect flag,
    # with spaces and dashes removed.
    df['super_token'] = df['title'] + df['event_code'].astype(str)+df['correct'].astype(str)+df['incorrect'].astype(str)
    df['super_token'] = df['super_token'].str.replace(' ','').str.replace('-','')
    
    # An attempt is event 4100 in an Assessment (4110 for Bird Measurer).
    df['attempt'] = (((df.event_code == 4100) & (df.title != 'Bird Measurer (Assessment)')) |\
                     ((df.event_code == 4110) & (df.title == 'Bird Measurer (Assessment)'))) &\
                    (df['type'] == 'Assessment')
    df['attempt'] = df['attempt'].astype(int)
    # Keep the correct/incorrect flags only on attempt rows.
    df['correct'] = df['correct'] * df['attempt']
    df['incorrect'] = df['incorrect'] * df['attempt']
    
    # A 2000 event opens a session; the running count of such events numbers
    # the sessions globally across the whole frame.
    df['start_event'] = (df['event_code'] == 2000).astype(int)  
    df['start_assessment'] = (df['type'] == 'Assessment').astype(int) * df['start_event']
    # NOTE: end_event is not part of the returned columns.
    df['end_event'] = df.start_event.shift(-1, fill_value=1)
    df['event_global_enc'] = df.start_event.cumsum()
    gc.collect()
    
    agg_df = df.groupby('event_global_enc').agg({'installation_id':'first', 'correct': 'sum', 'incorrect': 'sum', 'timestamp': ['min','max']}).reset_index()
    agg_df.columns = ['event_global_enc', 'installation_id', 'correct_attempts', 'incorrect_attempts', 'ts_min','ts_max']
    # Seconds between this session's first event and the previous session's
    # first event, assembled from the day/second/microsecond components.
    agg_df['game_duration'] = (agg_df['ts_min'] - agg_df['ts_min'].shift(1)).dt.days*3600*24 +\
    (agg_df['ts_min'] - agg_df['ts_min'].shift(1)).dt.seconds +\
    (agg_df['ts_min'] - agg_df['ts_min'].shift(1)).dt.microseconds / 1e6
    # gs numbers the sessions of each installation; the first one has no
    # predecessor, so its duration is forced to 0 before the log transform.
    agg_df['gs'] = 1
    agg_df['gs'] = agg_df.groupby('installation_id')['gs'].transform(pd.Series.cumsum)
    agg_df.loc[agg_df.gs==1,'game_duration'] = 0
    agg_df['game_duration'] = np.log1p(agg_df['game_duration'])
        
    # Broadcast the session aggregates back onto every event row.
    aggcols = ['event_global_enc', 'correct_attempts', 'incorrect_attempts', 'game_duration']
    df = df.merge(agg_df[aggcols], on='event_global_enc', how='left')
    del agg_df
    gc.collect()
    
    assessments = ['Bird Measurer (Assessment)', 'Cart Balancer (Assessment)', 
                   'Cauldron Filler (Assessment)', 'Chest Sorter (Assessment)', 'Mushroom Sorter (Assessment)']
    assessment_fts = []
    for a in assessments:
        feat1 = a.replace(' ','')+'_correct'
        feat2 = a.replace(' ','')+'_incorrect'
        assessment_fts.append(feat1)
        assessment_fts.append(feat2)
        df[feat1] = 0
        df[feat2] = 0
        # Cumulative attempt counts per installation, filled only on rows of
        # this assessment title; rows of other titles keep 0.
        df.loc[df.title == a,feat1] = df.loc[df.title==a].groupby(['installation_id'])['correct'].transform(pd.Series.cumsum)
        df.loc[df.title == a,feat2] = df.loc[df.title==a].groupby(['installation_id'])['incorrect'].transform(pd.Series.cumsum)
    
    # metric_point: start row of an Assessment session with at least one attempt.
    df['metric_point'] = df['start_assessment'] * (df.correct_attempts + df.incorrect_attempts > 0).astype(int)
    # metric_point_inference: last row of each installation (the next row
    # belongs to another installation, or there is no next row).
    df['metric_point_inference'] = df.installation_id != df.installation_id.shift(-1, fill_value='')
    gc.collect()
    
    
    ret_columns = ['installation_id', 'title', 'super_token', 'event_code', 'game_duration',
                   'correct_attempts', 'incorrect_attempts', 
                   'metric_point', 'metric_point_inference', 'event_global_enc'
                  ] + assessment_fts + miss_cols
    df = df[ret_columns]
    gc.collect()
    
    return df

def create_text_file(fname, fout):
    """Build token sequences and a per-session feature table for one dataset.

    Writes, in the working directory:
      * ``txt_labels.pickle`` - token -> integer id map (written when
        ``fout == 'train'``, read otherwise; unseen tokens map to 0);
      * ``<fout>_mpi.npy`` / ``<fout>_mp.npy`` - token-id histories up to each
        ``metric_point_inference`` / ``metric_point`` row;
      * ``<fout>.csv`` - one row per session-start event with numeric
        features, event-code counts and the ``label`` column.

    Args:
        fname: path of the raw competition CSV passed on to ``load_csv``.
        fout: dataset tag, ``'train'`` or anything else for the test branch;
            also the prefix of every output file and of the
            ``<fout>_code_counts.csv`` file that is merged in.
    """
    train = load_csv(fname, fout)
    
    #if fout == 'test':
    #    train['mp'] = train['metric_point']
    #    train['mp'] = train.groupby(['installation_id'])['mp'].transform(pd.Series.cumsum)
    #    train.loc[(train.metric_point == 1) & (train.mp > 1),'metric_point'] = 0
    #    train.drop(['mp'], axis=1, inplace=True)
    
    # 1-based position of each event inside its installation.
    train['event_idx'] = 1
    train['event_idx'] = train.groupby(['installation_id'])['event_idx'].transform(pd.Series.cumsum)

    if fout == 'train':
        labels = {q: i + 1 for i, q in enumerate(train.super_token.unique())}
        with open('txt_labels.pickle', 'wb') as handle:
            pickle.dump(labels,handle)
    else:
        with open('txt_labels.pickle', 'rb') as handle:
            labels = pickle.load(handle)
    train['super_token'] = train['super_token'].map(lambda x: labels.get(x, 0)).astype(np.int16)

    # Plain arrays avoid materialising a whole row with ``train.iloc[ix]`` for
    # every point. The original code already used the index labels as
    # positions, so the slices below select exactly the same rows.
    event_idx = train['event_idx'].values
    tokens = train['super_token'].values

    def _histories(points):
        """Token ids from the installation's first event up to each point."""
        return [tokens[(ix - event_idx[ix] + 1):(ix + 1)].tolist() for ix in tqdm(points)]

    # The histories have different lengths; dtype=object keeps np.array from
    # raising on ragged input with recent NumPy releases.
    texts = _histories(train.loc[train.metric_point_inference==1].index)
    np.save(fout + '_mpi', np.array(texts, dtype=object))
    
    texts = _histories(train.loc[train.metric_point==1].index)
    np.save(fout + '_mp', np.array(texts, dtype=object))
        
    assessments = ['Bird Measurer (Assessment)', 'Cart Balancer (Assessment)', 
               'Cauldron Filler (Assessment)', 'Chest Sorter (Assessment)', 'Mushroom Sorter (Assessment)']
    assessment_fts = []
    for a in assessments:
        feat1 = a.replace(' ','')+'_correct'
        feat2 = a.replace(' ','')+'_incorrect'
        feat3 = a.replace(' ','')+'_rate'
        assessment_fts.append(feat1)
        assessment_fts.append(feat2)
        assessment_fts.append(feat3)
        # Float initialiser: the rate is fractional, and writing floats into an
        # integer column relies on a deprecated implicit upcast in pandas.
        train[feat3] = 0.0
        attempted = train[feat1] + train[feat2] > 0
        train.loc[attempted, feat3] = train.loc[attempted, feat1] / \
        (train.loc[attempted, feat1] + train.loc[attempted, feat2])
        
    train['assessment_rate'] = 0.0
    train.loc[train.correct_attempts + train.incorrect_attempts > 0, 'assessment_rate'] = \
                  train.correct_attempts / (train.correct_attempts + train.incorrect_attempts)
    
    usefull_fts = ['installation_id', 'title', 'assessment_rate', 
                   'game_duration', 'correct_attempts', 'incorrect_attempts',
                   'metric_point', 'metric_point_inference', 'event_global_enc'] + assessment_fts + miss_cols

    # Keep only the session-start rows: one row per session from here on.
    train = train.loc[train.event_code == 2000, usefull_fts].reset_index(drop=True)
    gc.collect()
    
    ecc = pd.read_csv(fout + '_code_counts.csv')
    train = train.merge(ecc, on='event_global_enc', how='left').drop(['event_global_enc'], axis=1)
    del ecc
    gc.collect()
    
    # Recompute event_idx, now as the 1-based session number per installation.
    train['event_idx'] = 1
    train['event_idx'] = train.groupby(['installation_id'])['event_idx'].transform(pd.Series.cumsum)
    
    # label: 3 = correct with no incorrect attempt, 2 = one incorrect attempt,
    # 1 = two or more incorrect attempts, 0 = no correct attempt.
    train['label'] = 0
    train.loc[(train.incorrect_attempts >= 2) & (train.correct_attempts > 0),'label'] = 1
    train.loc[(train.incorrect_attempts == 1) & (train.correct_attempts > 0),'label'] = 2
    train.loc[(train.incorrect_attempts == 0) & (train.correct_attempts > 0),'label'] = 3
    train['all_attempts'] = train.incorrect_attempts + train.correct_attempts
    
    train.to_csv(fout+'.csv', index=False)

In [None]:
# Names of the miss-based feature columns; read as a module-level name by
# load_csv, create_text_file and the modelling cells.
miss_cols = ['cum_misses']

In [None]:
# %%writefile text_processing_train.py

# from preproc import create_text_file

# Produce train.csv, train_mp.npy, train_mpi.npy and txt_labels.pickle in the
# working directory. Must run before the test call, which reads the pickle.
create_text_file('/kaggle/input/data-science-bowl-2019/train.csv', 'train')

In [None]:
# %%writefile text_processing_test.py

# from preproc import create_text_file

# Produce test.csv, test_mp.npy and test_mpi.npy, reusing the token ids stored
# in txt_labels.pickle by the train run.
create_text_file('/kaggle/input/data-science-bowl-2019/test.csv', 'test')


In [None]:
# %%time
# !python text_processing_train.py
# !python text_processing_test.py

In [None]:
# %%writefile train_tfidf.py

import numpy as np 
import gc
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import pickle
from scipy.sparse import save_npz

# Fit the TF-IDF vocabulary on the full token history of every installation
# (train and test), rendering each token id as the word "q<id>".
texts = np.load('train_mpi.npy',allow_pickle=True).tolist() + np.load('test_mpi.npy',allow_pickle=True).tolist()
for i,t in tqdm(enumerate(texts)):
    texts[i] = ' '.join(['q' + str(q) for q in t])
print(len(texts), len(texts[0]), texts[0][:10])

# token_pattern is a raw string: '\S' in a normal literal is an invalid escape
# sequence. The pattern itself (runs of non-whitespace) is unchanged.
vectorizer = TfidfVectorizer(input='content', encoding='utf-8', decode_error='strict', strip_accents=None, 
                             lowercase=False, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, 
                             token_pattern=r'\S+', ngram_range=(1, 3), max_df=0.99, min_df=100, max_features=None, 
                             vocabulary=None, binary=False, norm='l2', use_idf=True, 
                             smooth_idf=True, sublinear_tf=False)
vectorizer.fit(texts)
del texts
gc.collect()

with open('vectorizer.pickle', 'wb') as handle:
    pickle.dump(vectorizer,handle)

# Transform each saved sequence set and store it as <name>.npz.
for f in ['train_mp','test_mp','test_mpi']:
    texts = np.load(f + '.npy',allow_pickle=True).tolist()
    for i,t in tqdm(enumerate(texts)):
        texts[i] = ' '.join(['q' + str(q) for q in t])
    texts = vectorizer.transform(texts)
    save_npz(f, texts)
    del texts
    gc.collect()


In [None]:
# %%time
# !python train_tfidf.py

In [None]:
from __future__ import absolute_import

import numpy as np 
import pandas as pd 
import json
from pandas.io.json import json_normalize
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit, KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
import functools
from multiprocessing import Pool
import logging
import gc
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm_notebook
import scipy
import tensorflow as tf
import keras
import math
import random
import os

# Seed every random number generator used below.
SEED = 239
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# TF-IDF matrices saved by the TF-IDF step: the train and test metric-point
# rows are stacked into one training matrix; test_mpi is kept for inference.
q_train = scipy.sparse.load_npz('train_mp.npz')
q_train_tst = scipy.sparse.load_npz('test_mp.npz')

q_train = scipy.sparse.vstack([q_train, q_train_tst])
del q_train_tst
gc.collect()
q_test = scipy.sparse.load_npz('test_mpi.npz')

# Per-session tables read from the working directory (the files written by
# create_text_file, not the raw competition CSVs).
test = pd.read_csv('test.csv')
train2 = pd.read_csv('train.csv')

print(train2.shape, test.shape, q_train.shape, q_test.shape,      train2.loc[train2.metric_point==1].shape, test.loc[test.metric_point==1].shape)

train_ids = train2.installation_id.unique()
test_ids = test.loc[test.metric_point_inference==1,'installation_id'].values
# Append the test sessions to the training frame, in the same order as the
# vstack of q_train above.
train2 = pd.concat([train2,test], sort=False, ignore_index=True).reset_index(drop=True)
del test
gc.collect()

# 1-based session position inside each installation.
train2['event_idx'] = 1
train2['event_idx'] = train2.groupby(['installation_id'])['event_idx'].transform(pd.Series.cumsum)

# Event codes whose per-session counts are used as model features. Compared
# with the list inside create_counters, codes 2000, 4070, 3020 and 3121 are
# left out here.
event_codes = ['3010','3110','4020','4021','4030','4035',
               '4090','2020','2030','2040','2050','2080','2083',
               '3021','3120','4010','2060','2070','4031','4025',
               '5000','5010','2081','2025','4022','2010','2035','4040',
               '4100','4110','4045','4095','4220','2075','4230','4235',
               '4080','4050']
#event_codes = ['4070','3020','3121','4020','3120','2030','4035','4030']
code_counts_cols = ['event_code_' + q for q in event_codes]

assessments = ['Bird Measurer (Assessment)', 'Cart Balancer (Assessment)', 
               'Cauldron Filler (Assessment)', 'Chest Sorter (Assessment)', 'Mushroom Sorter (Assessment)']
# Three columns (correct, incorrect, rate) per assessment, in the same order
# as `assessments`: later code picks entries 3*i, 3*i+1 and 3*i+2 for the
# i-th assessment, so the ordering must be kept.
assessment_fts = ['BirdMeasurer(Assessment)_correct',
 'BirdMeasurer(Assessment)_incorrect',
 'BirdMeasurer(Assessment)_rate',
 'CartBalancer(Assessment)_correct',
 'CartBalancer(Assessment)_incorrect',
 'CartBalancer(Assessment)_rate',
 'CauldronFiller(Assessment)_correct',
 'CauldronFiller(Assessment)_incorrect',
 'CauldronFiller(Assessment)_rate',
 'ChestSorter(Assessment)_correct',
 'ChestSorter(Assessment)_incorrect',
 'ChestSorter(Assessment)_rate',
 'MushroomSorter(Assessment)_correct',
 'MushroomSorter(Assessment)_incorrect',
 'MushroomSorter(Assessment)_rate']

from sklearn.preprocessing import MinMaxScaler

# Integer-encode the categorical columns; id 0 is reserved for values that are
# not in the mapping.
labelers = {}
cat_cols = ['title']

print("Process categorical features:")
for col_name in cat_cols:
    labelers[col_name] = {x:i+1 for i, x in enumerate(train2.loc[:, col_name].unique())}
    train2[col_name] = train2[col_name].apply(lambda x: labelers[col_name].get(x, 0))
    
# Number of distinct ids per categorical column, including the reserved 0.
cat_sizes_map = {col_name: len(labeler)+1 for col_name, labeler in labelers.items()}
cat_sizes_map

#'correct_attempts',
# NOTE(review): in-place fillna on a selected column may not write back to
# train2 under pandas copy-on-write - confirm for the pandas version in use.
train2['correct_attempts'].fillna(0, inplace=True)
# correct_attempts is deliberately absent from this list, so it stays unscaled.
num_cols = ['incorrect_attempts','all_attempts','game_duration'] + assessment_fts + code_counts_cols + miss_cols

# Per feature: cap at the 99th percentile measured on metric-point rows
# (larger values become cap + 1), then min-max scale to [-1, 1] with missing
# values treated as 0. Caps and scalers are kept for reuse on other frames.
feat_scalers = {}
high_cups = {}
for f in num_cols:
    feat_scaler = MinMaxScaler(feature_range=(-1,1))
    high_cup = np.percentile(train2.loc[train2.metric_point==1,f].values, 99)
    high_cups[f] = high_cup
    print(f, high_cup)
    train2.loc[train2[f] > high_cup, f] = high_cup + 1
    feat_scaler.fit(train2[f].fillna(0).astype("float32").values.reshape(-1,1))
    train2[f] = feat_scaler.transform(train2[f].fillna(0).astype("float32").values.reshape(-1,1))
    feat_scalers[f] = feat_scaler
print(high_cups)


In [None]:
# In[15]:


from tqdm import tqdm_notebook

# Build fixed-length sequence inputs: one sample per metric-point row, holding
# the last `seq_len` sessions of that installation, right-aligned and
# zero-padded on the left.
first_dim = train2.loc[train2.metric_point==1,:].shape[0]
seq_len = 64
# The last three names are not DataFrame columns: those slots are filled from
# the rate/correct/incorrect columns of the sample's own assessment title.
num_cols = ['correct_attempts', 'incorrect_attempts', 'assessment_rate'] + code_counts_cols + miss_cols + ['game_duration', 
            'current_assessment_correct', 'current_assessment_incorrect', 'current_assessment_rate'] 

matrix_titles = np.zeros((first_dim, seq_len))
matrix_numericals = np.zeros((first_dim, seq_len, len(num_cols)))

instids = train2.loc[train2.metric_point==1,'installation_id'].values
j = 0
for ix in tqdm_notebook(train2.loc[train2.metric_point==1].index):
    # Index labels are used as positions here (train2 was built with
    # ignore_index=True / reset_index).
    point_idx = train2.iloc[ix].event_idx
    cur_title = train2.iloc[ix].title
    # Number of sessions available for this sample, capped at seq_len.
    f1 = np.min([point_idx, seq_len])
    matrix_titles[j,(seq_len - f1):] = train2.iloc[(ix-f1+1):(ix+1)]['title'].values
    for k, f in enumerate(num_cols[:-3]):
        matrix_numericals[j,(seq_len - f1):,k] = train2.iloc[(ix-f1+1):(ix+1)][f].values
    # Index of the assessment this sample belongs to; stays 0 if the title
    # matches none of the known assessments.
    assessment_idx = 0
    for ia,a in enumerate(assessments):
        if labelers['title'][a] == cur_title:
            assessment_idx = ia
    cur_fts = [assessment_fts[3*assessment_idx], assessment_fts[3*assessment_idx+1],assessment_fts[3*assessment_idx+2]]
    matrix_numericals[j,(seq_len - f1):,-3:] = train2.iloc[(ix-f1+1):(ix+1)][cur_fts].values
    j += 1
# Zero the attempt/rate/event-count features of the final time step (the
# session being predicted) - presumably to avoid leaking its outcome; confirm.
to_zero_cols_count = len(['correct_attempts', 'incorrect_attempts', 'assessment_rate'] + code_counts_cols)
matrix_numericals[:,-1,:to_zero_cols_count] = 0 #[[0]*to_zero_cols_count]*matrix_numericals.shape[0]
#matrix_numericals[:,-6:,3:to_zero_cols_count] = 0 #[[0]*to_zero_cols_count]*matrix_numericals.shape[0]


In [None]:
# %%writefile train_model.py



# In[16]:


# Reload the per-session test table and apply the encoders, caps and scalers
# that were fitted on train2.
test = pd.read_csv('test.csv')

cat_cols = ['title']
for col_name in cat_cols:
    test[col_name] = test[col_name].apply(lambda x: labelers[col_name].get(x, 0))

# NOTE(review): in-place fillna on a selected column may not write back to
# `test` under pandas copy-on-write - confirm for the pandas version in use.
test['correct_attempts'].fillna(0, inplace=True)
# Unlike the list used when fitting the scalers, 'all_attempts' is not
# included here.
num_cols = ['incorrect_attempts','game_duration']  + assessment_fts + code_counts_cols + miss_cols
for f in num_cols:
    high_cup = high_cups[f]
    test.loc[test[f] > high_cup, f] = high_cup + 1
    test[f] = feat_scalers[f].transform(test[f].fillna(0).astype("float32").values.reshape(-1,1))


# In[17]:


# One sample per installation: the row flagged metric_point_inference, with
# up to seq_len preceding sessions, right-aligned and zero-padded on the left.
first_dim = test.loc[test.metric_point_inference==1,:].shape[0]
seq_len = 64
# The last three names are placeholders filled from the current assessment's
# own columns, not DataFrame columns.
num_cols = ['correct_attempts', 'incorrect_attempts', 'assessment_rate'] + code_counts_cols +  miss_cols+['game_duration', 
            'current_assessment_correct', 'current_assessment_incorrect', 'current_assessment_rate'] 

matrix_titles_test = np.zeros((first_dim, seq_len))
matrix_numericals_test = np.zeros((first_dim, seq_len, len(num_cols)))

j = 0
for ix in tqdm_notebook(test.loc[test.metric_point_inference==1,:].index):
    # event_idx is read from test.csv; index labels are used as positions.
    point_idx = test.iloc[ix].event_idx
    cur_title = test.iloc[ix].title
    f1 = np.min([point_idx, seq_len])
    matrix_titles_test[j,(seq_len - f1):] = test.iloc[(ix-f1+1):(ix+1)]['title'].values
    for k, f in enumerate(num_cols[:-3]):
        matrix_numericals_test[j,(seq_len - f1):,k] = test.iloc[(ix-f1+1):(ix+1)][f].values
    # Stays 0 if the title matches none of the known assessments.
    assessment_idx = 0
    for ia,a in enumerate(assessments):
        if labelers['title'][a] == cur_title:
            assessment_idx = ia
    cur_fts = [assessment_fts[3*assessment_idx], assessment_fts[3*assessment_idx+1],assessment_fts[3*assessment_idx+2]]
    matrix_numericals_test[j,(seq_len - f1):,-3:] = test.iloc[(ix-f1+1):(ix+1)][cur_fts].values
    j += 1

# Same masking of the final time step as for the training matrices;
# to_zero_cols_count comes from the training-matrix cell.
matrix_numericals_test[:,-1,:to_zero_cols_count] = 0 #[[0]*to_zero_cols_count]*matrix_numericals_test.shape[0]
#matrix_numericals_test[:,-6:,3:to_zero_cols_count] = 0
print(matrix_numericals_test.shape, matrix_titles_test.shape)

del test
gc.collect()


In [None]:
# In[18]:


from sklearn.metrics import confusion_matrix
from numba import jit 
from functools import partial


@jit
def qwk3(a1, a2):
    """Quadratic weighted kappa of two equally long label sequences.

    Labels are cast to int and used directly as indices into four-slot
    histograms, so they must lie in 0..3.

    Returns:
        ``1 - observed / expected`` where both terms use squared label
        distance as the disagreement weight.
    """
    assert(len(a1) == len(a2))
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)

    n = a1.shape[0]
    counts1 = np.zeros((4, ))
    counts2 = np.zeros((4, ))

    # Observed disagreement, accumulated while filling both histograms.
    observed = 0
    for idx in range(n):
        u = a1[idx]
        v = a2[idx]
        counts1[u] += 1
        counts2[v] += 1
        observed += (u - v) * (u - v)

    # Disagreement expected from the two marginal histograms.
    expected = 0
    for u in range(4):
        for v in range(4):
            expected += counts1[u] * counts2[v] * (u - v) * (u - v)

    expected = expected / n

    return 1 - observed / expected


In [None]:
# In[19]:



import itertools
from keras import Model
from keras.preprocessing.image import img_to_array, load_img
from keras.layers import *
from keras.models import *
from keras.callbacks import *
from keras import regularizers
from keras import optimizers
from keras import losses
from keras import backend as K
from keras.utils import Sequence

from keras.backend.tensorflow_backend import set_session
from sklearn.metrics import mean_squared_error, log_loss, mean_absolute_error

import scipy as sp

class FeatureSequence(Sequence):
    """Batch generator over several aligned input arrays and target arrays.

    Args:
        Xs: list of input arrays sharing the same first dimension.
        Ys: list of target arrays, or ``None`` when there are no targets.
        batch_size: number of rows per batch.
        shuffle: when true, the row order is shuffled on construction and
            again after every epoch.
    """

    def __init__(self, Xs, Ys, batch_size, shuffle=False):
        self.Xs = Xs
        self.Ys = Ys
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Row order used for batching; reshuffled in place when enabled.
        self.inx = np.arange(self.Xs[0].shape[0])
        self._reshuffle()

    def _reshuffle(self):
        """Shuffle the row order in place if shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.inx)

    def __len__(self):
        """Number of batches per epoch (the last one may be smaller)."""
        return math.ceil(self.inx.shape[0] / self.batch_size)

    def __getitem__(self, i):
        """Return ``(inputs, targets)`` for batch ``i``; targets may be None."""
        start = i * self.batch_size
        rows = self.inx[start:start + self.batch_size]
        inputs = [X[rows] for X in self.Xs]
        targets = None if self.Ys is None else [Y[rows] for Y in self.Ys]
        return inputs, targets

    def on_epoch_end(self):
        self._reshuffle()
            
class OptimizedRounder(object):
    """Search for three cut points that turn regression outputs into 0..3.

    The objective is the negated median of ``qwk3`` over a set of index
    samples, minimised with Nelder-Mead.

    Args:
        samples: 2-D collection of integer row indices; each row selects the
            subset of predictions one kappa score is computed on.
    """

    def __init__(self, samples):
        self.coef_ = 0
        self.samples = np.array(samples).astype(np.int32)

    @staticmethod
    def _bucketize(X, coef):
        """Map each value to 0, 1, 2 or 3 according to three cut points.

        The conditions are tested in order, so the result matches an
        if/elif chain even when ``coef`` is not sorted; values matching no
        condition (including NaN) fall into class 3. The input's dtype is
        kept and a new array is returned.
        """
        values = np.asarray(X)
        conditions = [
            values < coef[0],
            (values >= coef[0]) & (values < coef[1]),
            (values >= coef[1]) & (values < coef[2]),
        ]
        return np.select(conditions, [0, 1, 2], default=3).astype(values.dtype)

    def _kappa_loss(self, coef, X, y):
        """Negated median kappa of the thresholded predictions over all samples."""
        labels = self._bucketize(X, coef)
        truth = np.asarray(y)  # converted once instead of once per sample
        scores = [qwk3(truth[idx], labels[idx]) for idx in self.samples]
        return -np.median(scores)

    def fit(self, X, y):
        """Optimise the cut points for predictions ``X`` against labels ``y``.

        The optimiser result is stored in ``self.coef_``.
        """
        # Explicit import: ``import scipy as sp`` alone does not guarantee
        # that the ``optimize`` submodule is loaded.
        from scipy.optimize import minimize

        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [1.12232214,1.73925866,2.22506454]
        self.coef_ = minimize(loss_partial, initial_coef, method='nelder-mead',
                              options = {'maxiter':1e7, 'xatol':1e-4}
                             )

    def predict(self, X, coef):
        """Return ``X`` thresholded into classes 0..3 using ``coef``."""
        return self._bucketize(X, coef)

    def coefficients(self):
        """Return the fitted cut points; only valid after ``fit``."""
        return self.coef_['x']
            
class KappaEvaluationSeq(Callback):
    """Callback that reports kappa and MAE on a held-out sequence.

    Every ``interval`` epochs it predicts on ``X_seq``, fits an
    ``OptimizedRounder`` on those predictions, and stores the median kappa
    over ``samples`` and the MAE in ``logs`` under ``<name>_kappa`` and
    ``<name>_mae``.

    Args:
        X_seq: batch sequence passed to ``model.predict_generator``.
        Y: true labels, aligned with the predictions on ``X_seq``.
        Y2: stored on the instance; not used by this class itself.
        samples: 2-D array of row indices, one row per kappa evaluation.
        name: prefix for the log keys and the printed line.
        interval: evaluate on epochs where ``epoch % interval == 0``.
    """

    def __init__(self, X_seq, Y, Y2, samples, name, interval=1):
        # Start the MRO lookup at this class so Callback.__init__ actually
        # runs; super(Callback, self) skipped it.
        super(KappaEvaluationSeq, self).__init__()

        self.X_seq = X_seq
        self.Y, self.Y2 = Y, Y2
        self.samples = samples.astype(np.int32)
        self.name = name
        self.interval = interval
    
    def on_epoch_end(self, epoch, logs=None):
        if epoch % self.interval != 0:
            return
        if logs is None:
            # Avoid a mutable default argument shared between calls.
            logs = {}

        y_pred1 = self.model.predict_generator(self.X_seq, steps=len(self.X_seq), 
                                               use_multiprocessing=False, workers=1, 
                                               max_queue_size=2*4)
        y_pred1 = y_pred1.ravel()
        vmae = mean_absolute_error(self.Y, y_pred1)     

        # Cut points are re-fitted on this epoch's predictions.
        optR = OptimizedRounder(self.samples)
        optR.fit(y_pred1, self.Y)
        coefficients = optR.coefficients()
        y_pred = optR.predict(y_pred1, coefficients)     

        truth = np.array(self.Y)
        rounded = np.array(y_pred.astype(int))
        tts = [qwk3(truth[idx], rounded[idx]) for idx in self.samples]
        kapa = np.median(tts) 
        #kapa = qwk3(self.Y, y_pred.astype(int))

        logs[self.name+"_kappa"] = kapa
        logs[self.name+"_mae"] = vmae
        coefs = ("[{:.4f},{:.4f},{:.4f}]".format(coefficients[0],coefficients[1],coefficients[2]))
        print((self.name+"_kappa: {:.4f}; "+self.name+"_mae: {:.4f}; "+coefs).format(kapa,vmae))

In [None]:
from keras import backend as K
from keras import initializers
from keras import constraints
from keras import regularizers
from keras.engine import InputSpec, Layer

class Attention(Layer):
    """Keras layer implementing an attention mechanism for temporal data.

    Supports masking.
    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    Put it on top of an RNN layer (GRU/LSTM/SimpleRNN) with
    return_sequences=True; the dimensions are inferred from the output shape
    of the RNN.
    Note: The layer has been tested with Keras 2.0.6

    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention())
        # next add a Dense layer (for classification/regression) or whatever...

    # Arguments
        W_regularizer, b_regularizer: regularizers for the weight vector and
            the per-step bias (resolved with `regularizers.get`).
        W_constraint, b_constraint: constraints for the same two weights
            (resolved with `constraints.get`).
        bias: whether to add a learned bias per time step.
        **kwargs: forwarded to the base `Layer` constructor.
    """

    def __init__(self,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        # The shape is passed by keyword (as AttentionWeightedAverage does)
        # rather than positionally, so the call does not depend on the
        # positional order of add_weight's parameters.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            # One bias per time step, so the number of steps must be fixed.
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        # Score per time step: projection of the features onto W.
        eij = K.squeeze(K.dot(x, K.expand_dims(self.W)), axis=-1)

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Weighted sum of the time steps.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]
    
class AttentionWeightedAverage(Layer):
    """
    Computes a weighted average of the different channels across timesteps.
    Uses 1 parameter pr. channel to compute the attention value for a single timestep.

    # Arguments
        return_attention: when true, `call` returns `[result, att_weights]`
            instead of only the weighted average.
        **kwargs: forwarded to the base `Layer` constructor.
    """

    def __init__(self, return_attention=False, **kwargs):
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(** kwargs)

    def build(self, input_shape):
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        # add_weight registers W with the layer, so the former explicit
        # `self.trainable_weights = [self.W]` assignment was redundant (and
        # fails where `trainable_weights` is a read-only property).
        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        # Alias that simply delegates to compute_output_shape.
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        # The mask is consumed here and not propagated to later layers.
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None



In [None]:
# In[21]:


from keras.callbacks import *
import math
class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each
        cycle iteration.
    For more detail, please see paper.

    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```

    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode parameter is ignored
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.

    # Raises
        ValueError: if `scale_fn` is None and `mode` is not one of the
            built-in policies.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                # Without this, scale_fn/scale_mode would stay undefined and
                # the first call to clr() would fail with an AttributeError.
                raise ValueError(
                    "Unknown mode {!r}: expected 'triangular', 'triangular2' "
                    "or 'exp_range' when scale_fn is not given".format(self.mode))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.

    def clr(self):
        """Return the learning rate for the current `clr_iterations` value."""
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        # The scaling function is fed either the cycle number or the raw
        # iteration counter, depending on scale_mode.
        scale_arg = cycle if self.scale_mode == 'cycle' else self.clr_iterations
        return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(scale_arg)

    def on_train_begin(self, logs=None):
        """Set the optimizer's learning rate before the first batch."""
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.learning_rate, self.base_lr)
        else:
            K.set_value(self.model.optimizer.learning_rate, self.clr())

    def on_batch_end(self, epoch, logs=None):
        """Record the lr and batch logs in `history`, then set the next lr."""
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('learning_rate', []).append(K.get_value(self.model.optimizer.learning_rate))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        K.set_value(self.model.optimizer.learning_rate, self.clr())

In [None]:
from typing import Optional, Callable, List, Dict, MutableSequence
from collections import OrderedDict
from keras.optimizers import Optimizer

# Used by StochasticEnsembling's predict_generator calls as `workers`, and to
# size `max_queue_size` (2 * CPU_CORES + 2).
CPU_CORES = 30

class StochasticEnsembling(Callback):
    """Callback that snapshots predictions, averages weights (SWA) and drives LR schedules.

    At selected points of training it predicts on every sequence in
    ``seqs_dict`` and stores the result in ``probs_dict``; it optionally keeps
    a running average of the model weights and restores it at the end of
    training. The per-batch / per-epoch learning rate is set according to
    ``lr_schedule_mode``.
    """

    def __init__(self, seqs_dict: Dict[str, Sequence], cycle_len: int, iter_per_epoch: int,
                 alpha1: float = 0.0, alpha2: float = 0.0, lr_schedule_mode: Optional[str] = None,
                 swa_cycle_start_inx: int = -1, encoder_layers_out: Optional[List[str]] = None,
                 save_swa_model: bool = False, save_se_weights: bool = False,
                 folder: str = "", model_name: str = "", verbose: int = 1):
        """
        General implementation of Stochastic Weight Averaging (SWA) and Snapshot Ensembling (SE) with variety
        of available LR schedules modes.
        SWA   https://arxiv.org/abs/1803.05407
        FGE   https://arxiv.org/abs/1802.10026
        SE    https://arxiv.org/abs/1704.00109
        CLR   https://arxiv.org/abs/1506.01186
        CALR  https://arxiv.org/abs/1811.00641 (adapted)
        :param seqs_dict: Dict of sequences and their names for predicting
        :param cycle_len: length of the cycle (in epochs)
        :param iter_per_epoch: iterations per epoch
        :param alpha1: alpha1 param, usually max lr
        :param alpha2: alpha2 param, usually min lr
        :param lr_schedule_mode: one of the [None, "swa", "fge", "se", "clr", "clr2", "calr"]
        :param swa_cycle_start_inx: after which cycle start making snapshots and update SWA weights.
                                If equals 0, makes it instantly before training
        :param encoder_layers_out: list of encoder layers names to use as additional models outputs
                                (None, the default, means no additional outputs)
        :param save_se_weights: should weights of each snapshot be saved
        :param save_swa_model: should SWA final model be saved
        :param folder: saving directory
        :param model_name: model name
        :param verbose: verbose
        """
        super(StochasticEnsembling, self).__init__()
        self.alpha1 = alpha1
        self.alpha2 = alpha2
        self.cycle_len = cycle_len
        self.iter_per_epoch = iter_per_epoch
        self.iter_per_cycle = self.cycle_len * self.iter_per_epoch
        self.cycle_num = 0
        self.clr_iterations = 0
        self.current_epoch = 0
        self.lr_schedule_mode = lr_schedule_mode
        self.swa_cycle_start_inx = swa_cycle_start_inx
        # None instead of a shared mutable [] default.
        self.encoder_layers_out = [] if encoder_layers_out is None else encoder_layers_out
        self.model_wfo = None

        self.model_counts = 0
        self.seqs_dict = seqs_dict
        self.probs_dict = {k: [] for k in self.seqs_dict.keys()}
        self.features_dict = {k: [] for k in self.seqs_dict.keys()}

        self.save_se_weights = save_se_weights
        self.save_swa_model = save_swa_model
        self.folder = folder
        self.model_name = model_name
        self.verbose = verbose

        self.swa_weights = []

    def on_train_begin(self, logs=None):
        """Optionally save the initial weights and take the very first snapshot."""
        if self.save_se_weights:
            self.model.save_weights(self.folder + self.model_name + "_se_weights_init.h5")

        if self.swa_cycle_start_inx == 0:
            self.snapshot_predict()
            self.swa_weights = self.model.get_weights()
            self.model_counts += 1

    def on_train_end(self, logs=None):
        """Restore the averaged weights, take the final prediction and stack all snapshots."""
        # Only restore when at least one snapshot was averaged; set_weights([])
        # would otherwise be called when training ended before any snapshot.
        if self.swa_cycle_start_inx >= 0 and len(self.swa_weights) > 0:
            self.model.set_weights(self.swa_weights)
        if self.save_swa_model:
            self.model.save(self.folder + self.model_name + "_swa_model.h5")

        if len(self.encoder_layers_out) > 0:
            # Model "with feature outputs": the original outputs followed by
            # the outputs of the requested layers.
            self.model_wfo = Model(inputs=self.model.inputs,
                                   outputs=self.model.outputs + [self.model.get_layer(name=layer_name).output
                                                                 for layer_name in self.encoder_layers_out])

            for seq_name, seq in self.seqs_dict.items():
                pred_outs = self.model_wfo.predict_generator(seq, steps=len(seq),
                                                             use_multiprocessing=False, workers=CPU_CORES,
                                                             max_queue_size=2 * CPU_CORES + 2,
                                                             verbose=0)
                self.probs_dict[seq_name].append(pred_outs[0])
                self.features_dict[seq_name] = pred_outs[1:]
        else:
            self.snapshot_predict()
        self.model_counts += 1

        # Turn each list of per-snapshot predictions into one array, joined
        # along the last axis.
        for seq_name, probs in self.probs_dict.items():
            self.probs_dict[seq_name] = np.concatenate(probs, axis=-1)

    def on_epoch_begin(self, epoch, logs=None):
        """Track the epoch index and, in "se" mode, set the per-epoch learning rate."""
        self.current_epoch = epoch
        if self.lr_schedule_mode == "se":
            lr = self._se_schedule()
            K.set_value(self.model.optimizer.learning_rate, lr)
            if self.verbose > 0:
                print("Modifying learning rate to {}".format(str(lr)))

    def on_epoch_end(self, epoch, logs=None):
        """Take a snapshot when the epoch ends exactly on the mode's snapshot point."""
        # "fge" snapshots in the middle of the cycle, every other mode at its
        # end. The previous if/elif chain returned for every value of
        # _t_cycle() in "fge" mode, so that mode never produced a snapshot.
        snapshot_point = 0.5 if self.lr_schedule_mode == "fge" else 1.0
        if self._t_cycle() != snapshot_point:
            return

        self.cycle_num += 1
        if self.verbose > 0:
            print("Latest lr: {:.5f}".format(K.get_value(self.model.optimizer.learning_rate)))
            if self.lr_schedule_mode == "fge":
                print("Reached half of {} cycle".format(str(self.cycle_num)))
            else:
                print("Reached {} cycle".format(str(self.cycle_num)))

        if self.save_se_weights:
            self.model.save_weights(self.folder + self.model_name + "_se_weights_" + str(self.model_counts) + ".h5")

        if (self.swa_cycle_start_inx >= 0) and (self.cycle_num >= self.swa_cycle_start_inx):
            self.snapshot_predict()
            self.swa_weights_update()
            self.model_counts += 1

    def on_batch_begin(self, batch, logs=None):
        """Advance the iteration counter and set the per-batch learning rate.

        :raises ValueError: if ``lr_schedule_mode`` is not a known mode
        """
        self.clr_iterations += 1

        # No schedule, or a per-epoch one handled in on_epoch_begin.
        if (self.lr_schedule_mode is None) or (self.lr_schedule_mode == "se"):
            return

        if self.lr_schedule_mode == "clr":
            lr = self._clr_schedule()
        elif self.lr_schedule_mode == "clr2":
            lr = self._clr2_schedule()
        elif self.lr_schedule_mode == "calr":
            lr = self._calr_schedule()
        elif self.lr_schedule_mode == "fge":
            lr = self._fge_schedule()
        elif self.lr_schedule_mode == "swa":
            lr = self._swa_schedule()
        else:
            raise ValueError("Unknown schedule mode: " + str(self.lr_schedule_mode))
        # Same attribute name as every other optimizer access in this class.
        K.set_value(self.model.optimizer.learning_rate, lr)

    def _swa_schedule(self):
        """Linear interpolation from alpha1 to alpha2 over the cycle."""
        t = self._t_cycle()
        return (1 - t) * self.alpha1 + t * self.alpha2

    def _fge_schedule(self):
        """Triangle from alpha1 down to alpha2 at mid-cycle and back to alpha1."""
        t = self._t_cycle()
        if t <= 0.5:
            return ((1.0 - 2.0 * t) * self.alpha1) + (2.0 * t * self.alpha2)
        else:
            return ((2.0 - 2.0 * t) * self.alpha2) + ((2.0 * t - 1.0) * self.alpha1)

    def _se_schedule(self):
        """Cosine decay from alpha1 within each cycle, evaluated per epoch."""
        lr = math.pi * (self.current_epoch % self.cycle_len) / self.cycle_len
        lr = self.alpha1 / 2 * (math.cos(lr) + 1)
        return lr

    def _clr_schedule(self):
        """Triangle from alpha2 up to alpha1 at mid-cycle and back to alpha2."""
        t = self._t_cycle()
        if t <= 0.5:
            return ((1.0 - 2.0 * t) * self.alpha2) + (2.0 * t * self.alpha1)
        else:
            return ((2.0 - 2.0 * t) * self.alpha1) + ((2.0 * t - 1.0) * self.alpha2)

    def _clr2_schedule(self):
        """Like ``_clr_schedule`` but the alpha1 term is halved after every completed cycle."""
        decay = 1 / (2 ** self.cycle_num)
        t = self._t_cycle()
        if t <= 0.5:
            return ((1.0 - 2.0 * t) * self.alpha2) + (2.0 * t * self.alpha1) * decay
        else:
            return ((2.0 - 2.0 * t) * self.alpha1) * decay + ((2.0 * t - 1.0) * self.alpha2)

    def _calr_schedule(self):
        """Per-epoch triangle whose alpha1 term is scaled by an epoch-in-cycle dependent factor."""
        decay = ((self.cycle_len + 1) / 10) ** (self.current_epoch % self.cycle_len)  # TODO find something better
        t = self._t_epoch()
        if t <= 0.5:
            return ((1.0 - 2.0 * t) * self.alpha2) + (2.0 * t * self.alpha1) * decay
        else:
            return ((2.0 - 2.0 * t) * self.alpha1) * decay + ((2.0 * t - 1.0) * self.alpha2)

    def _t_cycle(self):
        """Position inside the current cycle, in (0, 1]; 1.0 on the cycle's last iteration."""
        return (((self.clr_iterations - 1) % self.iter_per_cycle) + 1) / self.iter_per_cycle

    def _t_epoch(self):
        """Position inside the current epoch, in (0, 1]; 1.0 on the epoch's last iteration."""
        return (((self.clr_iterations - 1) % self.iter_per_epoch) + 1) / self.iter_per_epoch

    def snapshot_predict(self):
        """Predict every sequence with the current model and append the result to ``probs_dict``."""
        for seq_name, seq in self.seqs_dict.items():
            self.probs_dict[seq_name].append(self.model.predict_generator(seq, steps=len(seq),
                                                                          use_multiprocessing=False, workers=CPU_CORES,
                                                                          max_queue_size=2 * CPU_CORES + 2,
                                                                          verbose=0))

    def swa_weights_update(self):
        """Fold the current model weights into the running average ``swa_weights``."""
        weights = self.model.get_weights()

        if len(self.swa_weights) == 0:
            self.swa_weights = weights
            return

        # Incremental mean: model_counts snapshots are already in the average.
        n = self.model_counts
        self.swa_weights = [(avg * n + w) / (n + 1) for avg, w in zip(self.swa_weights, weights)]

In [None]:
# Boolean row mask of the scored rows (metric_point == 1); built once and
# reused below instead of being recomputed for every statement / iteration.
is_metric_row = train2.metric_point == 1

# Keep only the installation ids from train_ids that own at least one scored row.
train_ids_list = train_ids.tolist()
train_ids_list = train2.loc[is_metric_row & (train2.installation_id.isin(train_ids_list))].installation_id.unique().tolist()
print(len(train_ids_list))

instids_unique = list(train_ids_list)
# Installation id of every scored row, over all of train2. The earlier
# train_ids-restricted assignment to `instids` was overwritten here before
# being read, so it has been dropped.
instids = train2.loc[is_metric_row, 'installation_id'].values

instids2_unique = train2.installation_id.unique()

# Per installation: sum of metric_point over its scored rows, i.e. the number
# of scored rows, since metric_point is 1 on every selected row.
seqlens = []
for qid in tqdm_notebook(train_ids_list):
    seqlens.append(train2.loc[is_metric_row & (train2.installation_id == qid), 'metric_point'].sum())
print(len(seqlens), q_train.shape)

In [None]:
# In[28]:


# Keep only the ids (from the current train_ids_list) that have at least one
# metric_point == 1 row. No .tolist() here, so the name is bound to whatever
# .unique() returns.
train_ids_list = train2.loc[(train2.metric_point==1)&(train2.installation_id.isin(train_ids_list))].installation_id.unique()


In [None]:
# In[29]:


from sklearn.model_selection import KFold, StratifiedKFold

NFOLDS = 7
# Three shuffled K-fold splitters that differ only in their random seed.
folds1, folds2, folds3 = (
    KFold(n_splits=NFOLDS, shuffle=True, random_state=seed)
    for seed in (2019, 239, 42)
)

# Installation id of every scored (metric_point == 1) row.
instids = train2.loc[train2.metric_point == 1, 'installation_id'].values

# Number of trailing timesteps fed to the sequence model.
seq_len = 64
# Earlier hand-picked column list, kept for reference:
#num_cols = ['correct_attempts', 'incorrect_attempts', 'time_df', 'event_duration', 'code_4070_count',  'code_2010_count']
# Positional indices of the numeric features that are used: 3 + 4 base columns
# plus one per code-count column and one per miss column.
num_cols = list(range(3 + 4 + len(code_counts_cols) + len(miss_cols)))


In [None]:
# Bare expression: displays the numeric column index list as the cell output.
num_cols

In [None]:

def get_cat_emb(cat_name, cat_size, min_emb_size=2, max_emb_size=50, reg=regularizers.l2(3e-4)):
    """Build the input and embedding tensors for one categorical sequence feature.

    The input has shape ``(seq_len,)`` and is named ``<cat_name>_in``; the
    embedding layer is named ``<cat_name>_emb``, has a fixed width of 7 and
    treats index 0 as padding (``mask_zero=True``).

    ``min_emb_size``, ``max_emb_size`` and ``reg`` are accepted but currently
    unused: the width is hard-coded and no embeddings regularizer is applied.

    Returns:
        ``(input_tensor, embedded_tensor)``.
    """
    embedding_dim = 7
    sequence_input = Input((seq_len,), name=cat_name + '_in')
    embedding_layer = Embedding(cat_size, embedding_dim, name=cat_name + '_emb', mask_zero=True)
    return sequence_input, embedding_layer(sequence_input)
    
def buildMixedModel(cat_cols, cat_sizes_map):
    """Assemble the regression model over sequence and flat inputs.

    Inputs, in order: one ``(seq_len,)`` input per entry of ``cat_cols``, the
    numeric sequence input ``num_inp`` of shape ``(seq_len, len(num_cols))``
    and the flat ``text_inp`` of width ``q_train.shape[1]``.

    The embedded categorical sequences and the numeric sequence are
    concatenated, run through a bidirectional GRU and pooled by
    ``AttentionWeightedAverage``. The pooled vector is joined with a
    dropout + dense projection of ``text_inp`` and passed through a
    dropout/dense stack to a single linear output named ``output1``.

    Args:
        cat_cols: names of the categorical sequence features.
        cat_sizes_map: mapping from each name in ``cat_cols`` to the size
            passed to its embedding layer.
    """
    # Layers are created in the same order as before so that Keras'
    # automatically generated layer names stay the same.
    cat_pairs = [get_cat_emb(col, cat_sizes_map[col]) for col in cat_cols]
    cat_inps = [inp for inp, _ in cat_pairs]
    cat_embs = [emb for _, emb in cat_pairs]

    num_inp = Input((seq_len, len(num_cols)), name='num_inp')
    text_inp = Input((q_train.shape[1],), name='text_inp')

    # Flat branch.
    text_branch = Dropout(0.5)(text_inp)
    text_branch = Dense(96, activation='relu')(text_branch)

    # Sequence branch: embeddings + numeric features -> BiGRU -> attention pooling.
    seq_features = Concatenate(axis=-1)(cat_embs + [num_inp])
    recurrent = Bidirectional(GRU(64, return_sequences=True, recurrent_dropout=0.001, activation='relu'))
    seq_features = recurrent(seq_features)
    pooled = AttentionWeightedAverage()(seq_features)

    # Joint head: three dropout(0.5) + dense blocks, then a lighter dropout.
    head = Concatenate()([pooled, text_branch])
    for units in (256, 128, 64):
        head = Dropout(0.5)(head)
        head = Dense(units, activation="relu")(head)
    head = Dropout(0.25)(head)

    output1 = Dense(1, activation="linear", name="output1")(head)

    return Model(inputs=cat_inps + [num_inp, text_inp], outputs=[output1])

In [None]:















# Build the model once up front to print its architecture; every fold below
# builds its own fresh instance.
model = buildMixedModel(cat_cols, cat_sizes_map)
model.summary()

# Fold-averaged test prediction, one value per test row.
test_pred = np.zeros(matrix_titles_test.shape[0])
# Raw per-fold test predictions, one column per trained fold (15 columns allocated).
raw_preds = np.zeros((matrix_titles_test.shape[0], 15))

histories, metrics_last_assessment_val, metrics_val = [], [], []

def get_inps(idxs):
    """Return the model inputs for the training rows selected by ``idxs``.

    The result is a three-element list: the last ``seq_len`` steps of
    ``matrix_titles``, the last ``seq_len`` steps of ``matrix_numericals``
    restricted to the ``num_cols`` features, and the dense form of the
    matching ``q_train`` rows.
    """
    titles = matrix_titles[idxs, -seq_len:]
    numericals = matrix_numericals[idxs, -seq_len:, :][:, :, num_cols]
    flat_features = q_train[idxs].todense()
    return [titles, numericals, flat_features]

# Test-set generator built from the same three inputs as get_inps() (last
# seq_len steps of titles, numericals restricted to num_cols, dense q_test).
# Batch size 128, no shuffling; the second argument is None — presumably the
# targets slot, TODO confirm against FeatureSequence.
test_seq = FeatureSequence([matrix_titles_test[:, -seq_len:],
                            matrix_numericals_test[:, -seq_len:, :][:,:,num_cols], 
                            q_test.todense()], None, 128, shuffle=False)


def random_chs(a):
    """Pick one random position per distinct value of ``a``.

    ``a`` is turned into a single-column frame; for every group of equal
    values one row is drawn with ``np.random.choice`` and the drawn rows'
    original positions are returned as an integer array, one entry per group.
    """
    def pick_one(group):
        # Draw exactly one index label of this group (size=1, replace=True).
        chosen = np.random.choice(group.index, 1, True)
        return group.loc[chosen, :]

    frame = pd.DataFrame({'Group_Id': a})
    sampled = frame.groupby('Group_Id', as_index=False).apply(pick_one)
    sampled = sampled.reset_index()
    # After reset_index: group number, original row index, group value.
    sampled.columns = ['v0', 'v1', 'v2']
    return sampled['v1'].values.astype(int)

# Cross-validated training, test prediction and submission writing.
# ifold is the next free column of raw_preds; gcoefs accumulates the mean of
# the per-fold rounding coefficients (each fold contributes coefficients / NFOLDS).
ifold = 0
gcoefs = [0,0,0]

# Only the first splitter is used. The split is made over the scored rows in
# `instids` (positions, not ids).
# NOTE(review): rows of one installation can therefore end up in both the
# train and the validation part — confirm this is intended.
for folds in [folds1]:
    for fold, (train_idxs, val_idxs) in enumerate(folds.split(instids)):
        # Drop the previous fold's Keras graph/session state before building a new model.
        gc.collect()
        K.clear_session()
        gc.collect()    

        # A single "sample": all validation positions, as a (1, n_val) array.
        smpls = np.array([np.arange(len(val_idxs))])

        # Training batches of 128 (shuffled); validation is one batch holding the whole fold.
        trn_seq = FeatureSequence(get_inps(train_idxs),
                                  [train2.loc[(train2.metric_point == 1),'label'].values[train_idxs]], 
                                  128, shuffle=True)
        val_seq = FeatureSequence(get_inps(val_idxs),
                                  [train2.loc[(train2.metric_point == 1),'label'].values[val_idxs]], 
                                  len(val_idxs), shuffle=False)

        # Validation targets (label) and the all_attempts column for the same rows.
        Y = train2.loc[(train2.metric_point == 1),'label'].values[val_idxs]
        Y2 = train2.loc[(train2.metric_point == 1),'all_attempts'].values[val_idxs]
        kappa_metric = KappaEvaluationSeq(val_seq, Y, Y2, smpls, 'val')

        # early_stop, model_checkpoint and clr are created but not passed to
        # fit_generator below (see the callbacks list); they are only deleted
        # at the end of the fold.
        model_file = 'model_' + str(fold) + '.pth'
        early_stop = EarlyStopping(monitor='val_kappa', min_delta=0, patience=16, verbose=1, mode='max')
        model_checkpoint = ModelCheckpoint(model_file, monitor='val_kappa', verbose=1, mode='max',
                                           save_best_only=True, save_weights_only=False, period=1)
        clr = CyclicLR(base_lr=0.00001, max_lr=0.001, step_size=4*math.ceil(len(trn_seq)), mode='triangular2')
        model = buildMixedModel(cat_cols, cat_sizes_map)
        opt=keras.optimizers.Adam(lr=0.001, clipnorm=1.0, clipvalue=1.0)
        #opt = AdamW(weight_decay=0.015, beta_1=0.9, beta_2=0.999, batch_size=8,
        #            samples_per_epoch=len(train_idxs), epochs=8,
        #            clipnorm=0, clipvalue=0)

        model.compile(optimizer=opt, loss=['mean_squared_error'])

        # Sequences the ensembling callback predicts on at every snapshot.
        seqs_dict = {"val": val_seq, "test": test_seq}

        # "clr" schedule between alpha2 and alpha1 with 12-epoch cycles; with
        # swa_cycle_start_inx=0 a snapshot is also taken before training starts.
        se = StochasticEnsembling(seqs_dict=seqs_dict, cycle_len=12, iter_per_epoch=len(trn_seq),
                                  alpha1=0.005, alpha2=0.0005, lr_schedule_mode="clr",
                                  swa_cycle_start_inx=0, model_name="model", verbose=1)

        # 24 epochs = two full 12-epoch cycles.
        history = model.fit_generator(
                generator=trn_seq, steps_per_epoch=len(trn_seq),
                initial_epoch=0, epochs=24, shuffle=False, verbose=2,
                callbacks=[kappa_metric, se],  #clr, model_checkpoint, early_stop
                use_multiprocessing=False, workers=1, max_queue_size=2*4)

        histories.append(history)

        #model = load_model(model_file, custom_objects = {
        #    'AttentionWeightedAverage': AttentionWeightedAverage
        #})

        # Average the second- and third-to-last snapshot columns; the last
        # column (the prediction appended in the callback's on_train_end) is
        # not used here.
        qpred = (se.probs_dict["val"][:,-2] + se.probs_dict["val"][:,-3])/2.0

        # Fit the rounding coefficients on this fold's validation predictions.
        optR = OptimizedRounder(smpls) 
        optR.fit(qpred, Y)
        coefficients = optR.coefficients()
        for ic in range(3):
            gcoefs[ic] += coefficients[ic] / NFOLDS
        y_pred = optR.predict(qpred, coefficients)     

        # Fold score: median of qwk3 over the rows of smpls (a single row here).
        tts = []
        for i in range(smpls.shape[0]):
            tts.append(qwk3(np.array(Y)[smpls[i].astype(int)], np.array(y_pred.astype(int))[smpls[i].astype(int)]))
        kapa = np.median(tts)     

        # Same snapshot averaging for the test set.
        pred1 = (se.probs_dict["test"][:,-2] + se.probs_dict["test"][:,-3])/2.0
        
        test_pred += pred1.ravel() / NFOLDS
        
        raw_preds[:,ifold] = pred1.ravel()
        ifold += 1
        
        # Rounded per-fold test prediction; not read again in the code visible here.
        pred1 = optR.predict(pred1, coefficients).ravel().astype(int)

        del model, kappa_metric, clr, model_checkpoint, early_stop, se
        del trn_seq, val_seq, qpred, Y, y_pred
        print(fold, kapa)
        gc.collect()

        metrics_val.append(kapa)

# Raw per-fold test predictions; columns beyond the trained folds stay zero.
np.save('v24full', raw_preds)

print(np.mean(metrics_val))

# Round the fold-averaged test prediction with the fold-averaged coefficients.
optR = OptimizedRounder([1])
y_pred = optR.predict(test_pred, gcoefs).ravel()

submission = pd.DataFrame({'installation_id':test_ids})
submission['accuracy_group'] = y_pred.astype('int')
submission.to_csv('submission.csv', index=None)

In [None]:
# 0.603 default oob kappa

In [None]:
# !python train_model.py