In [15]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from xgboost import plot_importance
from catboost import CatBoostRegressor
from matplotlib import pyplot
import shap
import seaborn as sns

# Statistic lib
from scipy import stats
from scipy.stats import skew, norm

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from scipy import stats
import lightgbm as lgb
from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
import gc
import json
pd.set_option('display.max_columns', 1000)

# Objective

* In the last notebook we create our baseline model including a feature selection part. 
* Cohen cappa score of 0.456 (lb) with a local cv score of 0.529
* In this notebook we are going to add more features and remove others that i think they overfitt the train set and then check if our local cv score improve.
* Next, we will check if this improvement aligns with the lb.

# Notes
* Check the distribution of the target variable of the out of folds score and the prediction distribution. A good model should more or less have the same distribution.

In [45]:
def eval_qwk_lgb_regr(y_true, y_pred):
    """
    Fast cappa eval function for lgb.
    """
    dist = Counter(train['target'])
    for k in dist:
        dist[k] /= len(train)
    train['target'].hist()
    
    acum = 0
    bound = {}
    for i in range(3):
        acum += dist[i]
        bound[i] = np.percentile(y_pred, acum * 100)

    def classify(x):
        if x <= bound[0]:
            return 0
        elif x <= bound[1]:
            return 1
        elif x <= bound[2]:
            return 2
        else:
            return 3

    y_pred = np.array(list(map(classify, y_pred))).reshape(y_true.shape)

    return 'cappa', cohen_kappa_score(y_true, y_pred, weights='quadratic'), True

In [46]:
def cohenkappa(ypred, y):
    y = y.get_label().astype("int")
    ypred = ypred.reshape((4, -1)).argmax(axis = 0)
    loss = cohenkappascore(y, y_pred, weights = 'quadratic')
    return "cappa", loss, True

In [47]:
def read_data():
    
    print('Carregando arquivo dataset_treino_new.csv....')
    train = pd.read_csv('../dataset/dataset_treino_new.csv')
    print('dataset_treino.csv tem {} linhas and {} colunas'.format(train.shape[0], train.shape[1]))

    print('Carregando arquivo dataset_teste_new.csv....')
    test = pd.read_csv('../dataset/dataset_teste_new.csv')
    print('dataset_teste.csv tem {} linhas and {} colunas'.format(test.shape[0], test.shape[1]))

    
    return train, test

In [48]:
class Base_Model(object):
    
    def __init__(self, train_df, test_df, features, categoricals=[], n_splits=5, verbose=True):
        self.train_df = train_df
        self.test_df = test_df
        self.features = features
        self.n_splits = n_splits
        self.categoricals = categoricals
        self.target = 'target'
        self.cv = self.get_cv()
        self.verbose = verbose
        self.params = self.get_params()
        self.y_pred, self.score, self.model = self.fit()
        
    def train_model(self, train_set, val_set):
        raise NotImplementedError
        
    def get_cv(self):
        cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=42)
        return cv.split(self.train_df, self.train_df[self.target])
    
    def get_params(self):
        raise NotImplementedError
        
    def convert_dataset(self, x_train, y_train, x_val, y_val):
        raise NotImplementedError
        
    def convert_x(self, x):
        return x
        
    def fit(self):
        oof_pred = np.zeros((len(train), ))
        y_pred = np.zeros((len(test), ))
        for fold, (train_idx, val_idx) in enumerate(self.cv):
            x_train, x_val = self.train_df[self.features].iloc[train_idx], self.train_df[self.features].iloc[val_idx]
            y_train, y_val = self.train_df[self.target][train_idx], self.train_df[self.target][val_idx]
            train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val)
            model = self.train_model(train_set, val_set)
            conv_x_val = self.convert_x(x_val)
            oof_pred[val_idx] = model.predict(conv_x_val).reshape(oof_pred[val_idx].shape)
            x_test = self.convert_x(self.test_df[self.features])
            y_pred += model.predict(x_test).reshape(y_pred.shape) / self.n_splits
            print('Partial score of fold {} is: {}'.format(fold, eval_qwk_lgb_regr(y_val, oof_pred[val_idx])[1]))
        _, loss_score, _ = eval_qwk_lgb_regr(self.train_df[self.target], oof_pred)
        if self.verbose:
            print('Our oof cohen kappa score is: ', loss_score)
        return y_pred, loss_score, model

In [49]:
class Lgb_Model(Base_Model):
    
    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        return lgb.train(self.params, train_set, valid_sets=[train_set, val_set], verbose_eval=verbosity)
        
    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = lgb.Dataset(x_train, y_train, categorical_feature=self.categoricals)
        val_set = lgb.Dataset(x_val, y_val, categorical_feature=self.categoricals)
        return train_set, val_set
        
    def get_params(self):
        params = {'n_estimators':5000,
                    'boosting_type': 'gbdt',
                    'objective': 'regression',
                    'metric': 'rmse',
                    'subsample': 0.75,
                    'subsample_freq': 1,
                    'learning_rate': 0.01,
                    'feature_fraction': 0.9,
                    'max_depth': 15,
                    'lambda_l1': 1,  
                    'lambda_l2': 1,
                    'early_stopping_rounds': 100
                    }
        return params

In [50]:
class Xgb_Model(Base_Model):
    
    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        return xgb.train(self.params, train_set, 
                         num_boost_round=5000, evals=[(train_set, 'train'), (val_set, 'val')], 
                         verbose_eval=verbosity, early_stopping_rounds=100)
        
    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = xgb.DMatrix(x_train, y_train)
        val_set = xgb.DMatrix(x_val, y_val)
        return train_set, val_set
    
    def convert_x(self, x):
        return xgb.DMatrix(x)
        
    def get_params(self):
        params = {'colsample_bytree': 0.8,                 
            'learning_rate': 0.01,
            'max_depth': 10,
            'subsample': 1,
            'objective':'reg:squarederror',
            #'eval_metric':'rmse',
            'min_child_weight':3,
            'gamma':0.25,
            'n_estimators':5000}

        return params

In [51]:
class Catb_Model(Base_Model):
    
    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        clf = CatBoostRegressor(**self.params)
        clf.fit(train_set['X'], 
                train_set['y'], 
                eval_set=(val_set['X'], val_set['y']),
                verbose=verbosity, 
                cat_features=self.categoricals)
        return clf
        
    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = {'X': x_train, 'y': y_train}
        val_set = {'X': x_val, 'y': y_val}
        return train_set, val_set
        
    def get_params(self):
        params = {'loss_function': 'RMSE',
                   'task_type': "CPU",
                   'iterations': 5000,
                   'od_type': "Iter",
                    'depth': 10,
                  'colsample_bylevel': 0.5, 
                   'early_stopping_rounds': 300,
                    'l2_leaf_reg': 18,
                   'random_seed': 42,
                    'use_best_model': True
                    }
        return params

In [52]:
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

class Nn_Model(Base_Model):
    
    def __init__(self, train_df, test_df, features, categoricals=[], n_splits=5, verbose=True):
        features = features.copy()
        if len(categoricals) > 0:
            for cat in categoricals:
                enc = OneHotEncoder()
                train_cats = enc.fit_transform(train_df[[cat]])
                test_cats = enc.transform(test_df[[cat]])
                cat_cols = ['{}_{}'.format(cat, str(col)) for col in enc.active_features_]
                features += cat_cols
                train_cats = pd.DataFrame(train_cats.toarray(), columns=cat_cols)
                test_cats = pd.DataFrame(test_cats.toarray(), columns=cat_cols)
                train_df = pd.concat([train_df, train_cats], axis=1)
                test_df = pd.concat([test_df, test_cats], axis=1)
        scalar = MinMaxScaler()
        train_df[features] = scalar.fit_transform(train_df[features])
        test_df[features] = scalar.transform(test_df[features])
        print(train_df[features].shape)
        super().__init__(train_df, test_df, features, categoricals, n_splits, verbose)
        
    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0
        model = tf.keras.models.Sequential([
            tf.keras.layers.Input(shape=(train_set['X'].shape[1],)),
            tf.keras.layers.Dense(200, activation='relu'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(100, activation='relu'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(50, activation='relu'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(25, activation='relu'),
            tf.keras.layers.LayerNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(1, activation='relu')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=4e-4), loss='mse')
        print(model.summary())
        save_best = tf.keras.callbacks.ModelCheckpoint('nn_model.w8', save_weights_only=True, save_best_only=True, verbose=1)
        early_stop = tf.keras.callbacks.EarlyStopping(patience=20)
        model.fit(train_set['X'], 
                train_set['y'], 
                validation_data=(val_set['X'], val_set['y']),
                epochs=100,
                 callbacks=[save_best, early_stop])
        model.load_weights('nn_model.w8')
        return model
        
    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = {'X': x_train, 'y': y_train}
        val_set = {'X': x_val, 'y': y_val}
        return train_set, val_set
        
    def get_params(self):
        return None

In [53]:
from random import choice

class Cnn_Model(Base_Model):
    
    def __init__(self, train_df, test_df, features, categoricals=[], n_splits=5, verbose=True):
        features = features.copy()
        if len(categoricals) > 0:
            for cat in categoricals:
                enc = OneHotEncoder()
                train_cats = enc.fit_transform(train_df[[cat]])
                test_cats = enc.transform(test_df[[cat]])
                cat_cols = ['{}_{}'.format(cat, str(col)) for col in enc.active_features_]
                features += cat_cols
                train_cats = pd.DataFrame(train_cats.toarray(), columns=cat_cols)
                test_cats = pd.DataFrame(test_cats.toarray(), columns=cat_cols)
                train_df = pd.concat([train_df, train_cats], axis=1)
                test_df = pd.concat([test_df, test_cats], axis=1)
        scalar = MinMaxScaler()
        train_df[features] = scalar.fit_transform(train_df[features])
        test_df[features] = scalar.transform(test_df[features])
        self.create_feat_2d(features)
        super().__init__(train_df, test_df, features, categoricals, n_splits, verbose)
        
    def create_feat_2d(self, features, n_feats_repeat=50):
        self.n_feats = len(features)
        self.n_feats_repeat = n_feats_repeat
        self.mask = np.zeros((self.n_feats_repeat, self.n_feats), dtype=np.int32)
        for i in range(self.n_feats_repeat):
            l = list(range(self.n_feats))
            for j in range(self.n_feats):
                c = l.pop(choice(range(len(l))))
                self.mask[i, j] = c
        self.mask = tf.convert_to_tensor(self.mask)
        print(self.mask.shape)
       
        
    
    def train_model(self, train_set, val_set):
        verbosity = 100 if self.verbose else 0

        inp = tf.keras.layers.Input(shape=(self.n_feats))
        x = tf.keras.layers.Lambda(lambda x: tf.gather(x, self.mask, axis=1))(inp)
        x = tf.keras.layers.Reshape((self.n_feats_repeat, self.n_feats, 1))(x)
        x = tf.keras.layers.Conv2D(18, (50, 50), strides=50, activation='relu')(x)
        x = tf.keras.layers.Flatten()(x)
        #x = tf.keras.layers.Dense(200, activation='relu')(x)
        #x = tf.keras.layers.LayerNormalization()(x)
        #x = tf.keras.layers.Dropout(0.3)(x)
        x = tf.keras.layers.Dense(100, activation='relu')(x)
        x = tf.keras.layers.LayerNormalization()(x)
        x = tf.keras.layers.Dropout(0.3)(x)
        x = tf.keras.layers.Dense(50, activation='relu')(x)
        x = tf.keras.layers.LayerNormalization()(x)
        x = tf.keras.layers.Dropout(0.3)(x)
        out = tf.keras.layers.Dense(1)(x)
        
        model = tf.keras.Model(inp, out)
    
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4), loss='mse')
        print(model.summary())
        save_best = tf.keras.callbacks.ModelCheckpoint('nn_model.w8', save_weights_only=True, save_best_only=True, verbose=1)
        early_stop = tf.keras.callbacks.EarlyStopping(patience=20)
        model.fit(train_set['X'], 
                train_set['y'], 
                validation_data=(val_set['X'], val_set['y']),
                epochs=100,
                 callbacks=[save_best, early_stop])
        model.load_weights('nn_model.w8')
        return model
        
    def convert_dataset(self, x_train, y_train, x_val, y_val):
        train_set = {'X': x_train, 'y': y_train}
        val_set = {'X': x_val, 'y': y_val}
        return train_set, val_set
        
    def get_params(self):
        return None

In [65]:
# read data
train, test = read_data()

Carregando arquivo dataset_treino_new.csv....
dataset_treino.csv tem 114321 linhas and 184 colunas
Carregando arquivo dataset_teste_new.csv....
dataset_teste.csv tem 114393 linhas and 183 colunas


In [66]:
# Leitura dos dados
train, test = read_data()
df = train.append(test)
df = df.drop(columns = ['1'], axis = 1)

Carregando arquivo dataset_treino_new.csv....
dataset_treino.csv tem 114321 linhas and 184 colunas
Carregando arquivo dataset_teste_new.csv....
dataset_teste.csv tem 114393 linhas and 183 colunas


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [67]:
df["v19"] = np.log1p(df["v19"])

In [68]:
# Verificar a quantidade de features numericas e categoricas

numerical_feats = df.dtypes[df.dtypes != "object"]
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = df.dtypes[df.dtypes == "object"].index
print("Number of Categorical features: ", len(categorical_feats))

Number of Numerical features:  164
Number of Categorical features:  19


In [69]:
df.head()

Unnamed: 0,ID,grp_1,grp_10,grp_11,grp_12,grp_2,grp_3,grp_4,grp_5,grp_6,grp_7,grp_8,grp_9,ica_1,ica_10,ica_11,ica_12,ica_2,ica_3,ica_4,ica_5,ica_6,ica_7,ica_8,ica_9,pca_1,pca_10,pca_11,pca_12,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,pca_8,pca_9,srp_1,srp_10,srp_11,srp_12,srp_2,srp_3,srp_4,srp_5,srp_6,srp_7,srp_8,srp_9,target,tsvd_1,tsvd_10,tsvd_11,tsvd_12,tsvd_2,tsvd_3,tsvd_4,tsvd_5,tsvd_6,tsvd_7,tsvd_8,tsvd_9,v1,v10,v100,v101,v102,v103,v104,v105,v106,v107,v108,v109,v11,v110,v111,v112,v113,v114,v115,v116,v117,v118,v119 v123,v119 v48,v119 v85,v119^2,v119_x,v119_y,v120,v121,v122,v123 v48,v123 v85,v123^2,v123_x,v123_y,v124,v125,v126,v127,v129,v130,v131,v14,v15,v16,v17,v18,v19,v2,v20,v21,v22,v23,v24,v26,v27,v28,v29,v3,v30,v31,v35,v36,v37,v39,v4,v40,v42,v44,v45,v47,v48 v85,v48^2,v48_x,v48_y,v5,v50,v51,v52,v56,v57,v58,v59,v6,v61,v62 v119,v62 v123,v62 v48,v62 v85,v62^2,v62_x,v62_y,v66,v68,v69,v7,v70,v71,v72,v74,v75,v78,v79,v80,v81,v82,v84,v85^2,v85_x,v85_y,v86,v88,v9,v90,v91,v92,v93,v94,v98,v99
0,3,-91.71068,27.71637,54.711456,120.17605,49.214336,39.81932,6.008493,0.23973,-2.993159,-28.92962,74.760445,96.09949,-0.003843,-0.001623,0.000137,0.000271,-0.000946,0.002434,-0.001772,-0.000151,0.003242,-0.0008,-0.000453,-0.001254,-48.16452,-0.424896,2.754712,-1.715571,1.603576,-0.998099,-14.700655,-1.278328,8.71047,2.471012,3.199084,-0.098913,-1.847689,-2.408795,-29.066944,22.876034,-31.847212,9.526614,-133.76234,5.28748,17.142406,-20.641521,-0.262313,-36.283512,1.0,153.40079,0.464463,1.347929,2.678761,-62.110138,-3.816818,7.057063,-13.019275,-1.45188,8.745543,2.470521,3.073173,1.335739,0.503281,19.4702,8.389236,2.757375,4.374296,1.574039,0.007294,12.579185,E,2.382692,3.930922,16.434109,B,0.433213,O,,15.634908,2.857144,1.95122,6.592012,5.909091,-1.253049e-06,-8e-06,-1.07517e-06,3.965754e-13,-6.297423e-07,-6.297423e-07,1.059603,0.803572,8.0,24.219074,3.397185,3.959225,1.98978,1.98978,0.035754,AU,1.804126,3.113719,0,0.636364,2.857144,11.636387,1.355013,8.571428,3.67035,0.10672,0.13879,8.727474,18.869284,7.730923,XDX,-1.716131e-08,C,1.720818,3.393503,0.590122,8.880867,C,C,A,8.375452,11.326591,0.454546,4.012088,3.921026,7.711453,12.707581,10.498338,9.848672,C,20.781006,148.1511,12.171734,12.171734,7.915266,0.89942,7.277793,G,DI,3.971118,0.529802,10.890984,2.599278,15.858151,-6.297423e-07,1.98978,12.171734,1.707317,1.0,1,1.0,C,15.231789,17.142857,3.176895,11.784549,F,1,B,D,8.571429,E,3.0,7.528326,8.861647,1.299638,2.914931,1.707317,1.707317,0.866426,3.3213,9.999999,0.905342,A,0.442252,5.814018,3.517721,8.877414,1.191337
1,4,-93.24794,38.592617,54.51168,123.92817,48.573597,45.229984,7.24658,-17.26448,0.030932,-14.308875,94.82114,91.5798,-0.000614,0.001376,0.000588,-0.0041,0.000371,-0.003999,-0.000468,-8.4e-05,-0.000147,0.000706,-0.000207,0.001949,-25.950914,1.779193,-2.487668,-0.286266,5.677019,-0.39685,-0.618112,12.524258,1.614324,-1.088056,-5.308384,1.685836,0.007697,2.013834,-28.92959,57.01802,-27.310768,18.56283,-157.43951,14.462655,9.338835,-19.964195,-7.352732,-38.573727,1.0,171.0141,0.914076,-2.541605,-1.772064,-48.366608,-0.820012,1.806945,0.119447,12.426791,1.585607,-1.087751,-5.426009,1.46955,1.31291,14.475939,6.623713,2.462898,5.125846,2.512034,1.505335,12.085176,B,1.825361,4.247858,15.495952,A,3.108809,U,G,10.308044,10.476191,2.222223,10.595357,8.136964,4.003565,18.140322,3.792658,2.13616,1.461561,1.461561,1.144708,2.436195,6.749117,33.998367,7.108153,7.503433,2.73924,2.73924,0.598896,AF,1.614802,2.96362,0,1.560137,1.589403,11.636386,1.992031,4.932127,3.554267,0.773906,0.1812,7.023803,18.036585,6.76311,GUV,1.845672e-07,C,1.826276,2.673322,5.043831,8.296139,C,C,A,8.068506,14.579479,0.642856,0.378418,4.205991,14.305766,12.934363,10.782008,9.156046,E,32.20734,154.048,12.411608,12.411608,9.191265,1.37921,7.134018,G,DY,4.067039,5.330551,10.535108,2.41279,15.075894,2.923122,5.478479,24.823215,5.189874,4.0,2,2.0,A,18.274548,9.516129,2.452166,12.053353,F,2,B,D,13.333334,D,2.090909,7.277655,3.430692,1.423294,6.733697,2.594937,2.594937,1.158301,1.761547,9.059583,0.969183,B,0.542669,5.301047,3.743106,8.303966,1.235546
2,5,-93.17317,27.09815,54.80686,122.12799,50.506966,41.667606,18.028326,-16.219355,-0.176257,-19.010742,92.56423,82.86307,-0.003126,-0.009395,5.1e-05,0.000103,-0.000203,-0.002771,-0.000622,-0.003719,0.000614,-0.000804,-0.000271,-0.000774,-38.11515,-9.903248,0.260174,-0.687128,17.07039,1.68926,-8.619469,-1.635108,12.980241,-0.603744,-0.164112,7.184428,1.045899,5.40709,-26.20066,33.87494,-36.5935,20.01429,-145.68756,9.170828,3.110579,-32.03048,-19.990597,-39.241848,1.0,167.8226,3.00443,9.587715,-2.677776,-62.600937,5.930745,-1.857007,-9.216124,-1.70903,14.247915,-0.606809,-2.938097,0.943877,0.765864,15.491329,5.879353,3.292788,5.924457,1.668401,0.008275,11.670572,C,1.375753,1.184211,14.756098,B,3.367348,S,,11.205562,12.941177,3.129252,3.478911,6.233767,-6.919294e-07,-4e-06,-6.786108e-07,7.799425e-14,-2.792745e-07,-2.792745e-07,2.138728,2.238806,9.333333,31.493532,6.020325,6.138481,2.477596,2.477596,0.013452,AE,1.773709,3.922193,2,0.883118,1.176472,9.603541,1.984127,5.882352,3.170847,0.244541,0.134757,5.310079,17.952332,5.245035,FQ,-2.785053e-07,E,2.244897,5.306122,0.836005,7.499999,C,,A,7.959184,12.730516,0.25974,7.378964,4.410969,13.077201,12.346939,8.89756,5.343819,C,30.887331,161.57785,12.711328,12.711328,5.32616,0.604504,9.637628,F,AS,4.030613,4.277456,9.105481,3.979592,16.075602,-2.792745e-07,2.477596,12.711328,2.429906,1.0,1,1.0,A,11.040463,5.882353,3.928571,8.460654,B,3,B,B,11.764705,E,3.333334,10.194432,8.266199,1.530613,5.904443,2.429906,2.429906,1.071429,3.367346,12.666667,0.811447,G,0.27148,5.15656,4.214944,11.588858,0.841837
3,6,-84.85693,31.718266,40.14042,119.73096,47.808308,47.3436,9.107174,-4.467571,-2.826444,-32.088654,70.51964,96.0872,-0.002676,0.001442,0.000254,0.000359,0.000605,-0.000728,-0.000576,-0.001072,0.002642,0.000579,0.000567,-0.007559,-44.795555,-0.982714,6.848362,2.78433,-0.511768,-5.436988,-13.71354,-1.863892,3.84938,2.223841,-3.667469,-3.011027,-6.738361,0.335619,-15.731087,20.106495,-34.67513,1.853619,-141.46463,9.822379,10.091172,-19.457485,-6.309244,-34.524216,1.0,155.97241,-3.106392,2.676349,6.09745,-59.753407,-9.19182,7.248538,-11.691413,-2.0254,3.205872,2.224963,-2.602192,0.797415,6.542669,18.256351,8.50728,2.503055,4.872158,2.573664,0.113967,12.554274,B,2.230754,1.990131,16.347483,B,2.643678,J,,13.777666,10.574713,1.511063,4.949609,7.180722,1.025151,6.896295,0.8974875,0.3198,0.5655087,0.5655087,1.166281,1.956521,7.018256,22.106771,2.876987,3.286226,1.812795,1.812795,0.002267,CJ,1.41523,2.954381,1,1.677108,1.034483,14.094723,1.945044,5.517242,3.610789,1.224114,0.208339,8.304757,18.376408,7.517125,ACUE,-4.805344e-07,D,1.308269,2.30364,8.926662,8.87452,C,C,B,8.898468,11.302795,0.433735,0.287322,4.22593,11.523045,12.935823,12.708574,9.670823,C,19.353779,148.71448,12.194855,12.194855,11.627439,3.329176,4.780357,H,BW,3.965517,1.732102,11.777912,2.0977,15.92739,0.5655087,1.812795,12.194855,1.587045,1.0,1,1.0,A,18.568129,9.425287,1.987549,13.594727,F,2,B,D,13.448277,B,1.947261,4.797873,13.31582,1.37931,2.518711,1.587045,1.587045,1.242817,1.408046,8.965516,1.042425,B,0.763925,5.498902,3.423944,6.942002,1.334611
4,8,-94.072464,37.100567,46.883675,126.04482,45.960175,42.72321,9.226593,-14.774398,1.247865,-20.24784,90.88272,89.35329,-0.000492,0.000568,0.000501,0.000123,0.000532,0.000437,-0.000739,6e-06,-0.000297,0.000705,-0.000112,0.001467,-26.297642,1.164047,-1.712914,0.004744,5.217902,-0.014489,-0.537369,-0.370327,0.80141,0.351222,0.653618,-0.431471,0.837381,-1.831667,-29.11207,42.456104,-29.633032,16.808092,-157.43951,14.462655,9.338835,-21.29248,-9.871851,-36.43765,1.0,169.87088,-0.078218,-1.792219,-1.073189,-47.78185,-0.078944,0.665971,-0.354514,-0.395111,0.76925,0.351135,0.76501,1.46955,1.050328,14.475939,6.623713,2.462898,5.125846,2.512034,0.242556,12.085176,C,1.980834,3.087909,15.495952,A,3.108809,T,G,14.097098,10.476191,2.222223,8.070175,8.136964,4.003565,18.140322,3.792658,2.13616,1.461561,1.461561,1.144708,2.436195,6.749117,33.998367,7.108153,7.503433,2.73924,2.73924,0.139864,Z,1.614802,2.96362,0,1.560137,1.589403,10.991097,1.992031,4.932127,3.554267,0.773906,0.1812,7.023803,18.036585,6.414567,HIT,1.845672e-07,E,1.826276,2.673322,5.043831,8.296139,C,,A,8.068506,13.765616,0.642856,0.378418,4.205991,10.13892,12.934363,10.782008,9.156046,I,32.20734,154.048,12.411608,12.411608,8.670867,1.364536,7.134018,H,,4.067039,5.330551,10.535108,2.41279,15.075894,1.461561,2.73924,12.411608,2.594937,1.0,1,1.0,C,18.274548,9.516129,2.452166,12.494872,F,1,B,D,13.333334,C,2.090909,7.517742,3.688459,1.423294,6.733697,2.594937,2.594937,1.158301,1.761547,9.059583,0.969183,G,0.542669,5.301047,3.743106,7.644154,1.235546


In [70]:
for c in df.columns:
    col_type = df[c].dtype
    if col_type == 'object' or col_type.name == 'category':
        df[c] =df[c].astype('category')

In [72]:
scaler = StandardScaler()
for c in df.columns:
    col_type = df[c].dtype
    if col_type == 'float64' and c != 'target' and c != 'ID':
        df[c] = scaler.fit_transform(df[c].values.reshape(-1, 1))

In [83]:
treino = df[df['target'].notnull()]
teste = df[df['target'].isnull()]
    
# Separando features preditoras e target
train_x = treino.drop(['ID','target'], axis=1)
train_y = treino['target']

In [84]:
#cat_model = Catb_Model(reduce_train, ajusted_test, features, categoricals=categoricals)
lgb_model = Lgb_Model(treino, teste, train_x, categoricals=categorical_feats)
#xgb_model = Xgb_Model(treino, teste, train_x, categoricals=categorical_feats)

ValueError: Must pass DataFrame with boolean values only

In [None]:
#cnn_model = Cnn_Model(reduce_train, ajusted_test, features, categoricals=categoricals)
nn_model = Nn_Model(train, test, features, categoricals=categoricals)

In [None]:
weights = {'lbg': 0.60, 'cat': 0, 'xgb': 0.20, 'nn': 0.20}

final_pred = (lgb_model.y_pred * weights['lbg']) + (xgb_model.y_pred * weights['xgb']) + (nn_model.y_pred * weights['nn'])
#final_pred = cnn_model.y_pred
print(final_pred.shape)

In [None]:
#pd.DataFrame([(round(a, 2), round(b, 2), round(c, 2), round(d, 2)) for a, b, c, d in zip(lgb_model.y_pred, cat_model.y_pred, xgb_model.y_pred, nn_model.y_pred)], columns=['lgb', 'cat', 'xgb', 'nn']).head(50)

In [None]:
dist = Counter(train['target'])
for k in dist:
    dist[k] /= len(train)
train['target'].hist()

acum = 0
bound = {}
for i in range(3):
    acum += dist[i]
    bound[i] = np.percentile(final_pred, acum * 100)
print(bound)

def classify(x):
    if x <= bound[0]:
        return 0
    elif x <= bound[1]:
        return 1
    elif x <= bound[2]:
        return 2
    else:
        return 3
    
final_pred = np.array(list(map(classify, final_pred)))

sample_submission['accuracy_group'] = final_pred.astype(int)
sample_submission.to_csv('submission.csv', index=False)
sample_submission['accuracy_group'].value_counts(normalize=True)