# Kaggle
## Competição DSA de Machine Learning - Dezembro 2019

**Versão 1.0.0: LB = ??? CV = 0.448106**
- modelo: Catboost (com parametros de otimizações)
- features engineering: label encoder nas features categoricas
- missing values: média para as features numéricas e NA para as features categoricas

**Versão 1.0.1: LB = 0.49111 CV = 0.467674 ***
- modelo: Catboost (com parametros de otimizações)
- features engineering: label encoder nas features categoricas e PCA
- missing values: média para as features numéricas e NA para as features categoricas

# Importando as bibliotecas

In [1]:
# Importar os principais pacotes
import numpy as np
import pandas as pd
import itertools
import seaborn as sns
sns.set()

import matplotlib.pyplot as plt
%matplotlib inline

import time
import gc
from tqdm import tqdm

# Evitar que aparece os warnings
import warnings
warnings.filterwarnings("ignore")

# Seta algumas opções no Jupyter para exibição dos datasets
pd.set_option('display.float_format', lambda x: '%.4f' % x)
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

# Variavel para controlar o treinamento no Kaggle
TRAIN_OFFLINE = True

In [2]:
# Importa os pacotes de algoritmos
from catboost import Pool, CatBoostClassifier, cv

# Importa pacotes do sklearn
from sklearn.metrics import log_loss, accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, ParameterGrid, StratifiedKFold

from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection

from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD

# Carregando os dados de treino e teste

In [3]:
def read_data():
    
    if TRAIN_OFFLINE:
        print('Carregando arquivo dataset_treino.csv....')
        train = pd.read_csv('../dataset/dataset_treino.csv')
        print('dataset_treino.csv tem {} linhas and {} colunas'.format(train.shape[0], train.shape[1]))
        
        print('Carregando arquivo dataset_teste.csv....')
        test = pd.read_csv('../dataset/dataset_teste.csv')
        print('dataset_teste.csv tem {} linhas and {} colunas'.format(test.shape[0], test.shape[1]))
        
    else:
        print('Carregando arquivo dataset_treino.csv....')
        train = pd.read_csv('/kaggle/input/competicao-dsa-machine-learning-dec-2019/dataset_treino.csv')
        print('dataset_treino.csv tem {} linhas and {} colunas'.format(train.shape[0], train.shape[1]))
        
        print('Carregando arquivo dataset_treino.csv....')
        test = pd.read_csv('/kaggle/input/competicao-dsa-machine-learning-dec-2019/dataset_teste.csv')
        print('dataset_teste.csv tem {} linhas and {} colunas'.format(test.shape[0], test.shape[1]))
    
    return train, test

In [4]:
# Leitura dos dados
train, test = read_data()

Carregando arquivo dataset_treino.csv....
dataset_treino.csv tem 114321 linhas and 133 colunas
Carregando arquivo dataset_teste.csv....
dataset_teste.csv tem 114393 linhas and 132 colunas


In [5]:
train.head()

Unnamed: 0,ID,target,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16,v17,v18,v19,v20,v21,v22,v23,v24,v25,v26,v27,v28,v29,v30,v31,v32,v33,v34,v35,v36,v37,v38,v39,v40,v41,v42,v43,v44,v45,v46,v47,v48,v49,v50,v51,v52,v53,v54,v55,v56,v57,v58,v59,v60,v61,v62,v63,v64,v65,v66,v67,v68,v69,v70,v71,v72,v73,v74,v75,v76,v77,v78,v79,v80,v81,v82,v83,v84,v85,v86,v87,v88,v89,v90,v91,v92,v93,v94,v95,v96,v97,v98,v99,v100,v101,v102,v103,v104,v105,v106,v107,v108,v109,v110,v111,v112,v113,v114,v115,v116,v117,v118,v119,v120,v121,v122,v123,v124,v125,v126,v127,v128,v129,v130,v131
0,3,1,1.3357,8.7275,C,3.921,7.9153,2.5993,3.1769,0.0129,10.0,0.5033,16.4341,6.0857,2.8668,11.6364,1.355,8.5714,3.6704,0.1067,0.1489,18.8693,7.7309,XDX,-0.0,C,0.1394,1.7208,3.3935,0.5901,8.8809,C,A,1.083,1.0108,7.2701,8.3755,11.3266,0.4545,0,4.0121,7.7115,7.6534,12.7076,2.0155,10.4983,9.8487,0.1136,C,12.1717,8.0866,0.8994,7.2778,G,16.748,0.0371,1.2996,DI,3.9711,0.5298,10.891,1.5884,15.8582,1,0.1535,6.3632,18.3039,C,9.3141,15.2318,17.1429,11.7845,F,1,1.615,B,D,2.2309,7.2924,8.5714,E,3.0,7.5283,8.8616,0.6498,1.2996,1.7073,0.8664,9.5518,3.3213,0.0957,0.9053,A,0.4423,5.814,3.5177,0.462,7.4368,5.4545,8.8774,1.1913,19.4702,8.3892,2.7574,4.3743,1.574,0.0073,12.5792,E,2.3827,3.9309,B,0.4332,O,,15.6349,2.8571,1.9512,6.592,5.9091,-0.0,1.0596,0.8036,8.0,1.9898,0.0358,AU,1.8041,3.1137,2.0243,0,0.6364,2.8571
1,4,1,,,C,,9.1913,,,2.3016,,1.3129,,6.5076,,11.6364,,,,,,,6.7631,GUV,,C,3.0561,,,,,C,A,,,3.6151,,14.5795,,0,,14.3058,,,,,,2.45,E,,,1.3792,,G,,1.1295,,DY,,,,,,2,2.5447,,,A,,,,12.0534,F,2,,B,D,,,,D,,7.2777,3.4307,,,,,9.848,,2.6786,,B,,,,,,,8.304,,,,,,,1.5053,,B,1.8254,4.2479,A,,U,G,10.308,,,10.5954,,,,,,,0.5989,AF,,,1.9578,0,,
2,5,1,0.9439,5.3101,C,4.411,5.3262,3.9796,3.9286,0.0196,12.6667,0.7659,14.7561,6.3847,2.5056,9.6035,1.9841,5.8824,3.1708,0.2445,0.1443,17.9523,5.245,FQ,-0.0,E,0.114,2.2449,5.3061,0.836,7.5,,A,1.4541,1.7347,4.0439,7.9592,12.7305,0.2597,0,7.379,13.0772,6.1735,12.3469,2.9268,8.8976,5.3438,0.126,C,12.7113,6.8367,0.6045,9.6376,F,15.102,0.0856,0.7653,AS,4.0306,4.2775,9.1055,2.1514,16.0756,1,0.1236,5.5179,16.3772,A,8.3673,11.0405,5.8824,8.4607,B,3,2.4136,B,B,1.964,5.9184,11.7647,E,3.3333,10.1944,8.2662,1.5306,1.5306,2.4299,1.0714,8.4475,3.3673,0.1114,0.8114,G,0.2715,5.1566,4.2149,0.3097,5.6633,5.974,11.5889,0.8418,15.4913,5.8794,3.2928,5.9245,1.6684,0.0083,11.6706,C,1.3758,1.1842,B,3.3673,S,,11.2056,12.9412,3.1293,3.4789,6.2338,-0.0,2.1387,2.2388,9.3333,2.4776,0.0135,AE,1.7737,3.9222,1.1205,2,0.8831,1.1765
3,6,1,0.7974,8.3048,C,4.2259,11.6274,2.0977,1.9875,0.1719,8.9655,6.5427,16.3475,9.6467,3.9033,14.0947,1.945,5.5172,3.6108,1.2241,0.2316,18.3764,7.5171,ACUE,-0.0,D,0.1488,1.3083,2.3036,8.9267,8.8745,C,B,1.5876,1.6667,8.7035,8.8985,11.3028,0.4337,0,0.2873,11.523,7.931,12.9358,1.4709,12.7086,9.6708,0.1084,C,12.1949,8.592,3.3292,4.7804,H,16.6217,0.1397,1.1782,BW,3.9655,1.7321,11.7779,1.2292,15.9274,1,0.1403,6.293,17.0116,A,9.7031,18.5681,9.4253,13.5947,F,2,2.2725,B,D,2.1882,8.2136,13.4483,B,1.9473,4.7979,13.3158,1.681,1.3793,1.587,1.2428,10.7471,1.408,0.0391,1.0424,B,0.7639,5.4989,3.4239,0.8325,7.3755,6.747,6.942,1.3346,18.2564,8.5073,2.5031,4.8722,2.5737,0.114,12.5543,B,2.2308,1.9901,B,2.6437,J,,13.7777,10.5747,1.5111,4.9496,7.1807,0.5655,1.1663,1.9565,7.0183,1.8128,0.0023,CJ,1.4152,2.9544,1.9908,1,1.6771,1.0345
4,8,1,,,C,,,,,,,1.0503,,6.3201,,10.9911,,,,,,,6.4146,HIT,,E,,,,,,,A,,,6.0832,,,,0,,10.1389,,,,,,,I,,,1.3645,,H,,,,,,,,,,1,,,,C,,,,,F,1,,B,D,,,,C,,,,,,,,,,,,G,,,,,,,,,,,,,,,,C,,,A,,T,G,14.0971,,,,,,,,,,,Z,,,,0,,


In [6]:
test.head()

Unnamed: 0,ID,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16,v17,v18,v19,v20,v21,v22,v23,v24,v25,v26,v27,v28,v29,v30,v31,v32,v33,v34,v35,v36,v37,v38,v39,v40,v41,v42,v43,v44,v45,v46,v47,v48,v49,v50,v51,v52,v53,v54,v55,v56,v57,v58,v59,v60,v61,v62,v63,v64,v65,v66,v67,v68,v69,v70,v71,v72,v73,v74,v75,v76,v77,v78,v79,v80,v81,v82,v83,v84,v85,v86,v87,v88,v89,v90,v91,v92,v93,v94,v95,v96,v97,v98,v99,v100,v101,v102,v103,v104,v105,v106,v107,v108,v109,v110,v111,v112,v113,v114,v115,v116,v117,v118,v119,v120,v121,v122,v123,v124,v125,v126,v127,v128,v129,v130,v131
0,0,1.3755,11.3611,C,4.2008,6.577,2.0818,1.7844,0.0111,9.5238,1.3129,16.8519,6.9612,3.1075,12.2301,1.7731,2.8571,3.4938,0.0968,0.2255,17.9102,7.4378,AFPB,0.0,E,0.0692,0.9542,4.3866,0.573,6.9888,,A,1.1152,2.0818,5.2285,8.1784,12.1832,0.8333,0,2.8428,11.7546,6.7658,10.855,1.2963,10.6134,9.4172,0.1005,J,12.7745,7.5093,0.1188,8.5908,C,17.0213,0.0443,1.3383,AF,2.6022,0.292,9.0386,0.8178,14.9115,1,0.0915,5.9679,15.6115,A,8.0297,15.9124,8.5714,11.2222,F,1,1.929,B,D,2.1469,6.171,15.7143,Q,1.9048,8.9187,8.7507,2.4535,0.8922,1.8605,-0.8922,7.905,1.7844,0.1053,0.989,B,0.6583,4.9293,2.9924,0.76,5.948,5.0,10.0135,0.8178,19.708,4.1868,1.8739,4.129,1.7019,0.0045,11.6881,B,1.5025,2.1217,A,3.5688,U,,12.2467,11.4286,1.2766,4.7779,5.8333,-0.0,1.4599,3.2673,7.619,1.8152,0.0,AF,1.2924,3.9033,1.4859,0,2.3333,1.4286
1,1,,,C,,,,,,,1.291,,6.6183,,10.7608,,,,,,,5.8952,FOG,,D,,,,,,A,A,,,8.3874,,,,4,,6.7376,,,,,,,C,,,1.0271,,A,,,,DI,,,,,,1,,,,C,,,,,F,5,,B,D,,,,E,,,,,,,,,,,,C,,,,,,,,,,,,,,,,D,,,B,,E,,16.2849,,,,,,,,,,,I,,,,0,,
2,2,-0.0,8.2015,C,4.5444,6.5501,1.5584,2.4675,0.0072,7.1429,1.5755,15.1429,6.8942,1.8998,13.3062,1.3021,3.3333,3.1774,0.0707,0.1724,17.5375,8.2535,HXZ,-0.0,B,0.0596,2.8571,2.2078,0.3271,8.3117,C,A,0.974,1.5584,7.959,8.3117,12.1951,-0.0,0,6.7074,7.7013,6.8831,12.0779,3.4286,10.2157,7.9084,0.0822,I,12.7708,7.7922,0.8407,9.8852,A,15.0,0.0201,0.7792,DO,5.4545,-0.0,9.452,2.8571,15.5611,1,0.0837,5.6337,15.7787,C,9.0909,13.2,6.6667,7.8606,F,1,1.4881,B,D,1.9103,5.7143,16.6667,C,1.4286,8.9523,8.7449,1.9481,0.7792,1.8605,0.7792,5.8862,5.7143,0.0439,0.9187,C,0.5526,4.351,3.1637,0.605,6.2338,12.6316,10.2146,0.9524,19.8,6.5356,0.763,4.9175,1.9819,0.0028,11.747,D,1.1888,2.0888,A,3.1169,Q,AJ,15.6227,13.3333,3.75,4.7406,12.6316,0.0,0.6,2.5,5.7143,1.9709,0.0141,AV,1.1287,5.8442,1.4759,0,1.2632,-0.0
3,7,2.6619,3.0412,C,1.6572,9.7731,2.0783,1.4309,1.2522,7.9596,1.5755,14.4244,6.8562,5.0866,10.397,2.8035,7.3029,1.122,0.6085,0.4766,15.2856,5.5395,AGMU,0.0,E,1.78,1.4069,1.3509,5.3474,6.4748,E,A,2.3621,3.8529,6.2238,6.6347,13.7419,2.0112,0,0.0,10.445,6.0592,13.1894,1.8268,7.6359,12.8507,1.5042,I,16.7917,7.8018,0.9464,8.5784,A,15.3827,1.3513,3.813,CN,4.3565,19.5354,10.4259,1.1284,10.9352,1,1.8097,2.1817,12.3661,B,8.4013,17.6771,13.195,14.1243,F,1,3.905,B,D,0.6446,6.3709,11.2033,C,3.5556,6.2849,3.2644,3.2374,4.2206,6.0,1.307,13.0371,0.4317,2.0359,1.4951,B,0.7755,3.1762,5.0713,1.0205,4.98,8.6034,5.7436,0.6888,0.1161,3.6277,,7.4862,4.313,1.0486,5.9571,B,2.6909,6.2116,A,3.9329,F,G,13.7016,6.805,1.9012,12.4461,10.7263,18.2448,2.3229,4.1496,4.404,8.1636,1.1003,B,1.9887,1.5588,2.4488,0,5.3855,1.4938
4,10,1.2528,11.2834,C,4.6384,8.5205,2.3025,3.5102,0.0743,7.6129,1.0503,15.59,6.2736,2.5002,11.3545,1.3535,3.0189,4.974,0.5308,0.1488,18.6695,6.7415,AWW,-0.0,E,0.1587,2.5865,3.2619,3.7866,8.86,C,A,0.9989,1.1964,5.4769,7.7427,10.815,0.3859,0,0.4898,11.0976,7.3815,13.1377,2.9797,12.5891,9.2373,0.128,I,10.5796,7.8781,1.4937,5.5132,H,15.6688,0.063,0.8804,,5.2596,0.7723,11.0482,2.4624,16.9453,2,0.1235,8.1886,17.4902,B,9.4695,18.6951,9.8113,13.147,F,2,1.465,B,D,3.1275,7.7991,14.717,C,1.0323,6.8373,8.7157,1.3205,0.5418,0.9524,0.8634,12.0734,2.2348,0.15,0.9932,B,0.39,5.6321,3.8851,0.4175,6.9413,7.6527,8.5892,1.076,19.2277,6.6068,1.4939,4.929,1.9069,0.0551,14.667,B,2.467,5.3125,A,1.8284,A,I,13.3954,10.1887,3.0318,7.6372,8.0386,0.6627,0.8522,1.6762,6.5806,1.3257,0.2585,A,1.8638,2.6665,2.3743,0,0.6817,2.2642


In [7]:
df = train.append(test)

# Feature Engineering

In [8]:
df = df.drop(['ID'], axis=1)

In [9]:
numerical_feats = df.dtypes[df.dtypes != "object"].index
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = df.dtypes[df.dtypes == "object"].index
print("Number of Categorical features: ", len(categorical_feats))

Number of Numerical features:  113
Number of Categorical features:  19


In [10]:
# Transformando as features categorias com LabelEncoder
# Realizando tratamento de missing value
# Corrigindo Skewness com Winsorize
le = LabelEncoder()

for col in df.columns:
    if df[col].dtype !='object':
        if col in ['ID','target']:
            continue
        #df[col] = mstats.winsorize(df[col], limits=[0.05, 0.05])[0]
        df[col].fillna(df[col].mean(),inplace=True)
    else:
        df[col].fillna('NA',inplace=True)
        le.fit(np.unique(list(df[col].values)))
        df[col] = le.transform(df[col])

In [11]:
# Add decomposed components: PCA / ICA etc.
n_comp = 12

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_df = tsvd.fit_transform(df.drop(["target"], axis=1))

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_df = pca.fit_transform(df.drop(["target"], axis=1))

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_df = ica.fit_transform(df.drop(["target"], axis=1))

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_df = grp.fit_transform(df.drop(["target"], axis=1))

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_df = srp.fit_transform(df.drop(["target"], axis=1))

In [12]:
# Append decomposition components to datasets
for i in range(1, n_comp+1):
    df['pca_' + str(i)]  = pca2_results_df[:,i-1]
    df['ica_' + str(i)]  = ica2_results_df[:,i-1]
    df['tsvd_' + str(i)] = tsvd_results_df[:,i-1]
    df['grp_' + str(i)]  = grp_results_df[:,i-1]
    df['srp_' + str(i)]  = srp_results_df[:,i-1]
    
y_train = df["target"]
y_mean = np.mean(y_train)

In [13]:
df.head()

Unnamed: 0,target,v1,v10,v100,v101,v102,v103,v104,v105,v106,v107,v108,v109,v11,v110,v111,v112,v113,v114,v115,v116,v117,v118,v119,v12,v120,v121,v122,v123,v124,v125,v126,v127,v128,v129,v13,v130,v131,v14,v15,v16,v17,v18,v19,v2,v20,v21,v22,v23,v24,v25,v26,v27,v28,v29,v3,v30,v31,v32,v33,v34,v35,v36,v37,v38,v39,v4,v40,v41,v42,v43,v44,v45,v46,v47,v48,v49,v5,v50,v51,v52,v53,v54,v55,v56,v57,v58,v59,v6,v60,v61,v62,v63,v64,v65,v66,v67,v68,v69,v7,v70,v71,v72,v73,v74,v75,v76,v77,v78,v79,v8,v80,v81,v82,v83,v84,v85,v86,v87,v88,v89,v9,v90,v91,v92,v93,v94,v95,v96,v97,v98,v99,pca_1,ica_1,tsvd_1,grp_1,srp_1,pca_2,ica_2,tsvd_2,grp_2,srp_2,pca_3,ica_3,tsvd_3,grp_3,srp_3,pca_4,ica_4,tsvd_4,grp_4,srp_4,pca_5,ica_5,tsvd_5,grp_5,srp_5,pca_6,ica_6,tsvd_6,grp_6,srp_6,pca_7,ica_7,tsvd_7,grp_7,srp_7,pca_8,ica_8,tsvd_8,grp_8,srp_8,pca_9,ica_9,tsvd_9,grp_9,srp_9,pca_10,ica_10,tsvd_10,grp_10,srp_10,pca_11,ica_11,tsvd_11,grp_11,srp_11,pca_12,ica_12,tsvd_12,grp_12,srp_12
0,1.0,1.3357,0.5033,19.4702,8.3892,2.7574,4.3743,1.574,0.0073,12.5792,4,2.3827,3.9309,16.4341,1,0.4332,15,25,15.6349,2.8571,1.9512,6.592,5.9091,-0.0,6.0857,1.0596,0.8036,8.0,1.9898,0.0358,21,1.8041,3.1137,2.0243,0,2.8668,0.6364,2.8571,11.6364,1.355,8.5714,3.6704,0.1067,0.1489,8.7275,18.8693,7.7309,21417,-0.0,2,0.1394,1.7208,3.3935,0.5901,8.8809,2,2,0,1.083,1.0108,7.2701,8.3755,11.3266,0.4545,0,4.0121,3.921,7.7115,7.6534,12.7076,2.0155,10.4983,9.8487,0.1136,2,12.1717,8.0866,7.9153,0.8994,7.2778,6,16.748,0.0371,1.2996,90,3.9711,0.5298,10.891,2.5993,1.5884,15.8582,1,0.1535,6.3632,18.3039,2,9.3141,15.2318,17.1429,3.1769,11.7845,5,1,1.615,1,3,2.2309,7.2924,8.5714,4,0.0129,3.0,7.5283,8.8616,0.6498,1.2996,1.7073,0.8664,9.5518,3.3213,0.0957,10.0,0.9053,0,0.4423,5.814,3.5177,0.462,7.4368,5.4545,8.8774,1.1913,-9594.327,0.0008,21417.288,-3276.9464,-13.4116,-21.4027,0.0017,-34.7049,-1684.946,-32.7123,23.6509,0.0008,-35.2376,-7670.8542,-91.8458,-15.2318,-0.0016,5.1324,-4594.6384,25.7582,-4.4981,0.0016,-15.0688,-5392.1847,-56.4814,-4.5938,0.0029,-6.4122,1358.1126,50.8409,2.7138,-0.0013,-1.9104,-3281.2709,-9.802,2.6144,0.0009,3.4181,3969.6267,-27.9049,0.6028,-0.0021,-2.5416,15339.8161,-4.3617,-0.3357,0.0021,0.2959,9736.7653,21.7053,-1.4956,0.0015,-0.507,-2103.9069,16.4145,0.288,0.0002,1.8134,-8354.9767,-41.6862
1,1.0,1.6297,1.3129,12.0859,6.8602,2.8929,5.3007,2.6455,1.5053,11.7821,1,1.8254,4.2479,15.4471,0,3.3687,21,17,10.308,10.5475,2.2905,10.5954,8.3653,3.1721,6.5076,1.2936,2.7408,6.8228,3.5534,0.5989,6,1.6726,3.2387,1.9578,0,3.804,1.9291,1.7378,11.6364,2.0823,4.9193,3.8267,0.8443,0.2225,7.4502,17.7724,6.7631,9923,1.0946,2,3.0561,1.8756,2.741,5.0992,8.2069,2,2,0,1.6234,2.1639,3.6151,8.121,14.5795,0.7411,0,1.2454,4.1432,14.3058,7.1836,12.9283,2.2155,10.7896,9.1391,2.45,4,12.5444,8.0183,9.1913,1.3792,7.1997,6,15.7105,1.1295,1.5613,106,4.0777,7.7067,10.5831,2.4361,1.714,14.5793,2,2.5447,6.3354,15.8441,0,9.2889,17.554,9.4501,2.483,12.0534,5,2,2.4348,1,3,2.4016,7.3077,13.34,3,2.3016,2.2089,7.2777,3.4307,2.177,1.6087,2.8191,1.2212,9.848,1.9243,2.6786,9.0318,0.9668,1,0.5835,5.4707,3.8547,0.6671,6.4579,7.6236,8.304,1.2493,1899.6712,-0.0007,9923.6139,-1525.5428,-19.2616,-38.2255,-0.0002,52.6445,-754.1062,-37.3502,39.7011,-0.0,-46.11,-3570.0133,-110.7139,-1.2163,-0.0041,20.1943,-2115.5702,13.5935,2.97,0.0004,-1.1145,-2498.7859,-56.0806,-2.4421,-0.0008,3.1292,601.9377,39.1353,10.4786,0.0002,-5.6992,-1505.683,-15.5514,-6.214,0.0023,9.1539,1858.5134,-23.516,0.4945,-0.0031,6.2572,7122.8627,2.4151,0.0259,-0.0001,0.173,4501.9979,25.3524,-0.3629,-0.0024,0.0475,-1014.5804,17.5606,0.3928,0.0002,0.6244,-3820.6224,-39.4691
2,1.0,0.9439,0.7659,15.4913,5.8794,3.2928,5.9245,1.6684,0.0083,11.6706,2,1.3758,1.1842,14.7561,1,3.3673,19,25,11.2056,12.9412,3.1293,3.4789,6.2338,-0.0,6.3847,2.1387,2.2388,9.3333,2.4776,0.0135,5,1.7737,3.9222,1.1205,2,2.5056,0.8831,1.1765,9.6035,1.9841,5.8824,3.1708,0.2445,0.1443,5.3101,17.9523,5.245,9090,-0.0,4,0.114,2.2449,5.3061,0.836,7.5,2,7,0,1.4541,1.7347,4.0439,7.9592,12.7305,0.2597,0,7.379,4.411,13.0772,6.1735,12.3469,2.9268,8.8976,5.3438,0.126,2,12.7113,6.8367,5.3262,0.6045,9.6376,5,15.102,0.0856,0.7653,19,4.0306,4.2775,9.1055,3.9796,2.1514,16.0756,1,0.1236,5.5179,16.3772,0,8.3673,11.0405,5.8824,3.9286,8.4607,1,3,2.4136,1,1,1.964,5.9184,11.7647,4,0.0196,3.3333,10.1944,8.2662,1.5306,1.5306,2.4299,1.0714,8.4475,3.3673,0.1114,12.6667,0.8114,6,0.2715,5.1566,4.2149,0.3097,5.6633,5.974,11.5889,0.8418,2732.6731,0.001,9090.2365,-1394.4319,-10.4171,48.9488,0.0023,-6.8839,-695.3198,-31.5713,41.446,-0.0004,8.9855,-3248.503,-22.8441,-10.1406,-0.003,40.5862,-1939.47,32.3213,1.5928,-0.0043,-10.07,-2265.2401,-42.9493,-11.8377,-0.0011,-3.8399,575.1506,30.513,4.2655,-0.0005,-11.4771,-1398.1368,-10.3326,-3.8305,-0.0028,3.6814,1696.9502,-14.7135,5.7908,-0.0032,4.0606,6494.2326,-3.2265,-2.9338,0.0039,4.5569,4132.4078,16.8467,8.0074,-0.0024,-0.6382,-915.1379,20.2883,-0.5912,-0.0004,-8.1904,-3539.8104,-41.1784
3,1.0,0.7974,6.5427,18.2564,8.5073,2.5031,4.8722,2.5737,0.114,12.5543,1,2.2308,1.9901,16.3475,1,2.6437,9,25,13.7777,10.5747,1.5111,4.9496,7.1807,0.5655,9.6467,1.1663,1.9565,7.0183,1.8128,0.0023,64,1.4152,2.9544,1.9908,1,3.9033,1.6771,1.0345,14.0947,1.945,5.5172,3.6108,1.2241,0.2316,8.3048,18.3764,7.5171,1953,-0.0,3,0.1488,1.3083,2.3036,8.9267,8.8745,2,2,1,1.5876,1.6667,8.7035,8.8985,11.3028,0.4337,0,0.2873,4.2259,11.523,7.931,12.9358,1.4709,12.7086,9.6708,0.1084,2,12.1949,8.592,11.6274,3.3292,4.7804,7,16.6217,0.1397,1.1782,50,3.9655,1.7321,11.7779,2.0977,1.2292,15.9274,1,0.1403,6.293,17.0116,0,9.7031,18.5681,9.4253,1.9875,13.5947,5,2,2.2725,1,3,2.1882,8.2136,13.4483,1,0.1719,1.9473,4.7979,13.3158,1.681,1.3793,1.587,1.2428,10.7471,1.408,0.0391,8.9655,1.0424,1,0.7639,5.4989,3.4239,0.8325,7.3755,6.747,6.942,1.3346,9869.6757,0.0009,1953.767,-295.1241,-18.631,18.6575,0.0011,94.0023,-121.7333,17.5104,-17.996,0.0014,38.9967,-725.4804,-56.3704,-10.769,0.0002,8.6154,-370.7652,19.8171,-3.2155,0.0027,-10.3373,-480.7006,-56.088,-1.1867,-0.003,-2.5321,90.996,96.199,-1.4649,-0.0009,-0.4554,-307.7178,-20.412,-1.6939,-0.0014,-2.1,380.0843,-24.8668,-0.8598,0.0014,1.5924,1404.0283,-1.6271,-1.8807,0.0007,-0.1968,866.6257,26.1441,-3.688,-0.0007,-3.2927,-190.1471,17.0031,1.4439,0.0009,1.6372,-729.0015,-41.0948
4,1.0,1.6297,1.0503,12.0859,6.8602,2.8929,5.3007,2.6455,1.0849,11.7821,2,2.1503,4.1778,15.4471,0,3.3687,20,17,14.0971,10.5475,2.2905,8.2977,8.3653,3.1721,6.3201,1.2936,2.7408,6.8228,3.5534,0.9293,90,1.6726,3.2387,2.0295,0,3.804,1.9291,1.7378,10.9911,2.0823,4.9193,3.8267,0.8443,0.2225,7.4502,17.7724,6.4146,10300,1.0946,4,1.7021,1.8756,2.741,5.0992,8.2069,2,7,0,1.6234,2.1639,6.0832,8.121,13.3672,0.7411,0,1.2454,4.1432,10.1389,7.1836,12.9283,2.2155,10.7896,9.1391,1.6351,8,12.5444,8.0183,8.7411,1.3645,7.1997,7,15.7105,1.2552,1.5613,118,4.0777,7.7067,10.5831,2.4361,1.714,14.5793,1,1.6911,6.3354,15.8441,2,9.2889,17.554,9.4501,2.483,12.2703,5,1,2.4348,1,3,2.4016,7.3077,13.34,2,1.5044,2.2089,7.2863,6.2214,2.177,1.6087,2.8191,1.2212,10.1771,1.9243,1.5195,9.0318,0.9668,6,0.5835,5.4707,3.8547,0.6671,6.4579,7.6236,7.6653,1.2493,1522.6749,-0.0001,10300.9033,-1553.9293,-15.2189,-49.3656,-0.0003,92.2481,-787.45,45.3198,-44.4599,0.0011,3.5071,-3729.2254,-120.8265,0.4819,-0.0039,-34.1597,-2166.0778,12.6165,2.9449,0.0,0.1789,-2613.5993,-51.3214,-3.5274,-0.0005,1.5357,608.6435,119.0693,11.2528,0.0005,-5.7076,-1573.7202,-13.8709,0.0503,0.0028,10.4639,1933.6629,-23.7194,-0.5844,0.0038,-0.0048,7402.4353,-2.468,-1.1316,-0.0003,-0.9238,4672.0747,24.0584,-0.3879,0.0004,-1.0912,-989.5619,16.654,1.343,0.0009,0.071,-3959.5626,-42.8985


In [14]:
df.shape

(228714, 192)

In [15]:
# Separar o dataset de treino e teste
treino = df[df['target'].notnull()]
teste  = df[df['target'].isnull()]

In [16]:
treino.to_csv('../dataset/dataset_treino_new.csv', index=False)
teste.to_csv('../dataset/dataset_teste_new.csv', index=False)

# Algoritmo Catboost


In [None]:
#Checking for any categorical features
columns = list(df.columns)
if 'target' in columns:
    columns.remove('target')
        
cat_features = list(np.where(df[columns].dtypes == np.object)[0])
print(cat_features)

In [None]:
def cross_val(X, y, X_test, param, cat_features, n_splits=3):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    
    acc = []
    predict = None
    
    for tr_ind, val_ind in skf.split(X, y):
        X_train = X[tr_ind]
        y_train = y[tr_ind]
        
        X_valid = X[val_ind]
        y_valid = y[val_ind]
        
        clf = CatBoostClassifier(iterations=500,
                                loss_function = param['loss_function'],
                                depth=param['depth'],
                                l2_leaf_reg = param['l2_leaf_reg'],
                                eval_metric = 'Accuracy',
                                leaf_estimation_iterations = 10,
                                use_best_model=True,
                                logging_level='Silent'
        )
        
        clf.fit(X_train, 
                y_train,
                #cat_features=cat_features,
                eval_set=(X_valid, y_valid)
        )
        
        y_pred = clf.predict(X_valid)
        accuracy = accuracy_score(y_valid, y_pred)
        acc.append(accuracy)
    return sum(acc)/n_splits

In [None]:
def catboost_GridSearchCV(X, y, X_test, params, cat_features, n_splits=5):
    ps = {'acc':0,
          'param': []
    }
    
    predict=None
    
    for prms in tqdm(list(ParameterGrid(params)), ascii=True, desc='Params Tuning:'):
                          
        acc = cross_val(X, y, X_test, prms, cat_features, n_splits=5)

        if acc>ps['acc']:
            ps['acc'] = acc
            ps['param'] = prms
    print('Acc: '+str(ps['acc']))
    print('Params: '+str(ps['param']))
    
    return ps['param']

In [None]:
RANDOM_STATE = 0

# Separar o dataset de treino e teste
treino = df[df['target'].notnull()]
teste  = df[df['target'].isnull()]

# Separando features preditoras e target
X_train = treino.drop(['target'], axis=1)
y_train = treino['target']
X_teste = teste.drop(['target'], axis=1)

# Padronizando os dados de treino
#numerical_feats = X_train.dtypes[X_train.dtypes != "object"].index

scaler = StandardScaler()
#X_train[numerical_feats] = scaler.fit_transform(X_train[numerical_feats])
#X_teste[numerical_feats] = scaler.fit_transform(X_teste[numerical_feats])

X_train = scaler.fit_transform(X_train)
X_teste = scaler.fit_transform(X_teste)

X_train.shape, X_teste.shape

In [None]:
# Treinando o modelo com otimizacao

params = {'depth':[2, 3, 4],
          'loss_function': ['Logloss'],
          'l2_leaf_reg':np.logspace(-20, -19, 3)
}

param = catboost_GridSearchCV(X_train, y_train, X_teste, params, cat_features)

clf = CatBoostClassifier(iterations=2500,
                        loss_function = param['loss_function'],
                        depth=param['depth'],
                        l2_leaf_reg = param['l2_leaf_reg'],
                        eval_metric = 'Accuracy',
                        leaf_estimation_iterations = 10,
                        use_best_model=True
)

X_train, X_valid, y_train, y_valid = train_test_split(X_train,
                                                      y_train, 
                                                      shuffle=True,
                                                      random_state=RANDOM_STATE,
                                                      train_size=0.8,
                                                      stratify=y_train
)
clf.fit(X_train, 
        y_train,
        cat_features=cat_features,
        logging_level='Silent',
        eval_set=(X_valid, y_valid)
)

# Resultados

In [None]:
# make the prediction using the resulting model
preds_proba_train = clf.predict_proba(X_train)
preds_proba_test  = clf.predict_proba(X_valid)

print("Log Loss (Treino): %f" % log_loss(y_train, preds_proba_train[:,1]))
print("Log Loss (Test): %f" % log_loss(y_valid, preds_proba_test[:,1]))

In [None]:
plt.hist(preds_proba_test[:,1])
plt.show()

# Submissão

In [None]:
submission = pd.read_csv('../dataset/sample_submission.csv')
submission['PredictedProb'] = clf.predict_proba(X_teste)[:,1]
print(submission.shape)
submission.head()

In [None]:
submission.to_csv('../submission/submission_cat_v.1.0.1.csv', index=False)

In [None]:
plt.hist(submission.PredictedProb)
plt.show()