# Kaggle
## Competição DSA de Machine Learning - Dezembro 2019

Versão 1.0.0: LB = 0.48866 CV = 0.463102
- modelo: LightGBM (com algumas otimizações)
- features engineering: gerado através do Auto_ViML

Versão 1.0.1: LB = 0.48991 CV = 0.462946
- modelo: LightGBM (com algumas otimizações)
- features engineering: gerado através do Auto_ViML (com novas features)

Versão 1.0.2: LB = 0.48915 CV = 0.464442
- modelo: LightGBM (com algumas otimizações)
- features engineering: gerado através do Auto_ViML (com agrupamento pela coluna v2)

Versão 1.0.3: LB = ???? CV = 
- modelo: LightGBM (com algumas otimizações)
- features engineering: gerado através do Auto_ViML (com agrupamento pela coluna v3)

# Importando as bibliotecas

In [1]:
# Importar os principais pacotes
import numpy as np
import pandas as pd
import itertools
import seaborn as sns
sns.set()

import matplotlib.pyplot as plt
%matplotlib inline

import time
import datetime
import gc
from scipy.stats import mstats

# Evitar que aparece os warnings
import warnings
warnings.filterwarnings("ignore")

# Seta algumas opções no Jupyter para exibição dos datasets
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

# Variavel para controlar o treinamento no Kaggle
TRAIN_OFFLINE = True

In [2]:
# Importa os pacotes de algoritmos
import lightgbm as lgb
from lightgbm.sklearn import LGBMClassifier

# Importa pacotes do sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE


Using TensorFlow backend.


# Carregando os dados de treino e teste

In [3]:
# Funcao de leitura dos dados
def read_data():
    
    if TRAIN_OFFLINE:
        print('Carregando arquivo dataset_treino.csv....')
        train = pd.read_csv('../dataset/dataset_treino.csv')
        print('dataset_treino.csv tem {} linhas and {} colunas'.format(train.shape[0], train.shape[1]))
        
        print('Carregando arquivo dataset_teste.csv....')
        test = pd.read_csv('../dataset/dataset_teste.csv')
        print('dataset_teste.csv tem {} linhas and {} colunas'.format(test.shape[0], test.shape[1]))
        
    else:
        print('Carregando arquivo dataset_treino.csv....')
        train = pd.read_csv('/kaggle/input/competicao-dsa-machine-learning-dec-2019/dataset_treino.csv')
        print('dataset_treino.csv tem {} linhas and {} colunas'.format(train.shape[0], train.shape[1]))
        
        print('Carregando arquivo dataset_treino.csv....')
        test = pd.read_csv('/kaggle/input/competicao-dsa-machine-learning-dec-2019/dataset_teste.csv')
        print('dataset_teste.csv tem {} linhas and {} colunas'.format(test.shape[0], test.shape[1]))
    
    return train, test

In [4]:
# Leitura dos dados
train, test = read_data()

Carregando arquivo dataset_treino.csv....
dataset_treino.csv tem 114321 linhas and 133 colunas
Carregando arquivo dataset_teste.csv....
dataset_teste.csv tem 114393 linhas and 132 colunas


In [5]:
# Juntando os dois dataset (treino e teste)
df = train.append(test)
df.shape

(228714, 133)

In [6]:
# Verificar a quantidade de features numericas e categoricas

numerical_feats = df.dtypes[df.dtypes != "object"].index
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = df.dtypes[df.dtypes == "object"].index
print("Number of Categorical features: ", len(categorical_feats))

Number of Numerical features:  114
Number of Categorical features:  19


# Feature Engineering

In [7]:
# Tratando os dados outliers com Winsorize
for col in df.columns:
    if df[col].dtype !='object':
        if col in ['ID','target']:
            continue
        df[col] = mstats.winsorize(df[col], limits=[0.05, 0.05])[0]
        df[col].fillna(df[col].mean(),inplace=True)

In [8]:
# Tratando os dados outliers com Winsorize e missing com media (numerico) e XXX (categorica)
for col in df.columns:
    if df[col].dtype !='object':
        if col in ['ID','target']:
            continue
        df[col] = mstats.winsorize(df[col], limits=[0.05, 0.05])[0]
        df[col].fillna(df[col].mean(),inplace=True)
    else:
        df[col].fillna('XXX',inplace=True)
        
# Função para criação de novas features, agrupando por colunas
'''def ft(data):
    df = pd.DataFrame()

    numerical_cols = data.dtypes[data.dtypes != "object"].index

    for col in numerical_cols:
        if col in ['ID','target','v2']:
            continue
        
        df[col + '_mean']   = data.groupby(['v2'])[col].mean()
        df[col + '_median'] = data.groupby(['v2'])[col].median()
        df[col + '_max']    = data.groupby(['v2'])[col].max()
        df[col + '_min']    = data.groupby(['v2'])[col].min()
        df[col + '_std'] = data.groupby(['v2'])[col].std()

    return df'''
    
#new_df = ft(df)
#new_df = new_df.reset_index()
#new_df = pd.merge(df, new_df, on='v2',how='left')
new_df = df.copy()

In [9]:
# Tratando as features categoricas com LabelEncoder
le = LabelEncoder()

for col in categorical_feats:
    le.fit(np.unique(list(new_df[col].values)))
    new_df[col] = le.transform(new_df[col])

In [11]:
# Drop features 
#new_df = new_df.drop(new_df[to_drop], axis=1)
new_df.shape

(228714, 133)

In [12]:
new_df.head()

Unnamed: 0,ID,target,v1,v10,v100,v101,v102,v103,v104,v105,v106,v107,v108,v109,v11,v110,v111,v112,v113,v114,v115,v116,v117,v118,v119,v12,v120,v121,v122,v123,v124,v125,v126,v127,v128,v129,v13,v130,v131,v14,v15,v16,v17,v18,v19,v2,v20,v21,v22,v23,v24,v25,v26,v27,v28,v29,v3,v30,v31,v32,v33,v34,v35,v36,v37,v38,v39,v4,v40,v41,v42,v43,v44,v45,v46,v47,v48,v49,v5,v50,v51,v52,v53,v54,v55,v56,v57,v58,v59,v6,v60,v61,v62,v63,v64,v65,v66,v67,v68,v69,v7,v70,v71,v72,v73,v74,v75,v76,v77,v78,v79,v8,v80,v81,v82,v83,v84,v85,v86,v87,v88,v89,v9,v90,v91,v92,v93,v94,v95,v96,v97,v98,v99
0,3,1.0,1.335739,0.525165,19.470199,8.389237,2.757375,4.374296,1.781359,0.014526,12.579184,4,2.382692,3.930922,16.434108,1,1.692213,14,35,15.634907,8.292682,1.95122,6.592012,5.919002,2.395187e-07,6.085711,1.059603,1.294964,8.0,1.98978,0.035754,21,1.804126,3.113719,2.024285,0,2.86683,0.79503,2.857144,11.636387,1.355013,8.571429,3.67035,0.190826,0.154388,8.727474,18.869283,7.730923,21416,-1.716131e-08,2,0.139412,1.720818,3.393503,1.212316,8.880867,2,2,0,1.083033,1.111398,7.270147,8.375452,11.326592,0.454546,0,4.012088,3.921026,7.711453,7.653429,12.707581,2.015505,10.498338,9.848672,0.113561,2,12.171733,8.086643,7.915266,0.89942,7.277792,6,16.747968,0.04973,1.299638,90,3.971118,0.529802,10.890984,2.599278,1.588448,15.858152,1,0.153461,6.363189,18.303925,2,9.314079,15.231789,17.142857,3.176895,11.784549,5,1,1.614988,1,3,2.23094,7.292418,11.111111,4,0.024422,3.0,7.528326,8.861647,1.049409,1.299638,1.707317,0.866426,9.551836,3.3213,0.095678,9.999999,0.905342,0,0.442252,5.814018,3.51772,0.462019,7.436824,5.454545,8.877414,1.191337
1,4,1.0,1.335739,0.525165,19.470199,8.389237,2.757375,4.374296,1.781359,0.014526,12.579184,1,2.382692,3.930922,16.434108,0,1.692213,20,17,15.634907,8.292682,1.95122,6.592012,5.919002,2.395187e-07,6.085711,1.059603,1.294964,8.0,1.98978,0.035754,6,1.804126,3.113719,2.024285,0,2.86683,0.79503,2.857144,11.636387,1.355013,8.571429,3.67035,0.190826,0.154388,8.727474,18.869283,7.730923,9923,-1.716131e-08,2,0.139412,1.720818,3.393503,1.212316,8.880867,2,2,0,1.083033,1.111398,7.270147,8.375452,11.326592,0.454546,0,4.012088,3.921026,7.711453,7.653429,12.707581,2.015505,10.498338,9.848672,0.113561,4,12.171733,8.086643,7.915266,0.89942,7.277792,6,16.747968,0.04973,1.299638,106,3.971118,0.529802,10.890984,2.599278,1.588448,15.858152,1,0.153461,6.363189,18.303925,0,9.314079,15.231789,17.142857,3.176895,11.784549,5,1,1.614988,1,3,2.23094,7.292418,11.111111,3,0.024422,3.0,7.528326,8.861647,1.049409,1.299638,1.707317,0.866426,9.551836,3.3213,0.095678,9.999999,0.905342,1,0.442252,5.814018,3.51772,0.462019,7.436824,5.454545,8.877414,1.191337
2,5,1.0,1.335739,0.525165,19.470199,8.389237,2.757375,4.374296,1.781359,0.014526,12.579184,2,2.382692,3.930922,16.434108,1,1.692213,18,35,15.634907,8.292682,1.95122,6.592012,5.919002,2.395187e-07,6.085711,1.059603,1.294964,8.0,1.98978,0.035754,5,1.804126,3.113719,2.024285,0,2.86683,0.79503,2.857144,11.636387,1.355013,8.571429,3.67035,0.190826,0.154388,8.727474,18.869283,7.730923,9090,-1.716131e-08,4,0.139412,1.720818,3.393503,1.212316,8.880867,2,7,0,1.083033,1.111398,7.270147,8.375452,11.326592,0.454546,0,4.012088,3.921026,7.711453,7.653429,12.707581,2.015505,10.498338,9.848672,0.113561,2,12.171733,8.086643,7.915266,0.89942,7.277792,5,16.747968,0.04973,1.299638,19,3.971118,0.529802,10.890984,2.599278,1.588448,15.858152,1,0.153461,6.363189,18.303925,0,9.314079,15.231789,17.142857,3.176895,11.784549,1,1,1.614988,1,1,2.23094,7.292418,11.111111,4,0.024422,3.0,7.528326,8.861647,1.049409,1.299638,1.707317,0.866426,9.551836,3.3213,0.095678,9.999999,0.905342,6,0.442252,5.814018,3.51772,0.462019,7.436824,5.454545,8.877414,1.191337
3,6,1.0,1.335739,0.525165,19.470199,8.389237,2.757375,4.374296,1.781359,0.014526,12.579184,1,2.382692,3.930922,16.434108,1,1.692213,9,35,15.634907,8.292682,1.95122,6.592012,5.919002,2.395187e-07,6.085711,1.059603,1.294964,8.0,1.98978,0.035754,64,1.804126,3.113719,2.024285,0,2.86683,0.79503,2.857144,11.636387,1.355013,8.571429,3.67035,0.190826,0.154388,8.727474,18.869283,7.730923,1953,-1.716131e-08,3,0.139412,1.720818,3.393503,1.212316,8.880867,2,2,1,1.083033,1.111398,7.270147,8.375452,11.326592,0.454546,0,4.012088,3.921026,7.711453,7.653429,12.707581,2.015505,10.498338,9.848672,0.113561,2,12.171733,8.086643,7.915266,0.89942,7.277792,7,16.747968,0.04973,1.299638,50,3.971118,0.529802,10.890984,2.599278,1.588448,15.858152,1,0.153461,6.363189,18.303925,0,9.314079,15.231789,17.142857,3.176895,11.784549,5,1,1.614988,1,3,2.23094,7.292418,11.111111,1,0.024422,3.0,7.528326,8.861647,1.049409,1.299638,1.707317,0.866426,9.551836,3.3213,0.095678,9.999999,0.905342,1,0.442252,5.814018,3.51772,0.462019,7.436824,5.454545,8.877414,1.191337
4,8,1.0,1.335739,0.525165,19.470199,8.389237,2.757375,4.374296,1.781359,0.014526,12.579184,2,2.382692,3.930922,16.434108,0,1.692213,19,17,15.634907,8.292682,1.95122,6.592012,5.919002,2.395187e-07,6.085711,1.059603,1.294964,8.0,1.98978,0.035754,90,1.804126,3.113719,2.024285,0,2.86683,0.79503,2.857144,11.636387,1.355013,8.571429,3.67035,0.190826,0.154388,8.727474,18.869283,7.730923,10300,-1.716131e-08,4,0.139412,1.720818,3.393503,1.212316,8.880867,2,7,0,1.083033,1.111398,7.270147,8.375452,11.326592,0.454546,0,4.012088,3.921026,7.711453,7.653429,12.707581,2.015505,10.498338,9.848672,0.113561,8,12.171733,8.086643,7.915266,0.89942,7.277792,7,16.747968,0.04973,1.299638,128,3.971118,0.529802,10.890984,2.599278,1.588448,15.858152,1,0.153461,6.363189,18.303925,2,9.314079,15.231789,17.142857,3.176895,11.784549,5,1,1.614988,1,3,2.23094,7.292418,11.111111,2,0.024422,3.0,7.528326,8.861647,1.049409,1.299638,1.707317,0.866426,9.551836,3.3213,0.095678,9.999999,0.905342,6,0.442252,5.814018,3.51772,0.462019,7.436824,5.454545,8.877414,1.191337


# Algoritmo LigthGBM com Hyperparametros

In [13]:
# Configurações Gerais

GENERATE_SUBMISSION_FILES = True
SUBMISSION_SUFIX = "_lgbm_v.1.0.3"
STRATIFIED_KFOLD = False
RANDOM_SEED = np.random.seed(123)
NUM_THREADS = 4
NUM_FOLDS = 10
EARLY_STOPPING = 1000

In [14]:
# Selecionando os melhores parametros
LIGHTGBM_PARAMS = {'boosting_type': 'gbdt', 
                  'colsample_bytree': 0.881783, 
                  'is_unbalance': True, 
                  'learning_rate': 0.0129388, 
                  'min_child_samples': 315, 
                  'num_leaves': 139, 
                  'reg_alpha': 0.484807, 
                  'reg_lambda': 0.515065, 
                  'subsample_for_bin': 280000, 
                  'subsample': 0.635119, 
                  'n_estimators': 10000
                 }

In [17]:
# ------------------------- LIGHTGBM MODEL -------------------------
# Funcao para processar todo o pipeline do treinamento e gerar a submissao
def run_model(dataset, categorical_feature = None):
    
    # Separar o dataset de treino e teste
    treino = dataset[dataset['target'].notnull()]
    teste  = dataset[dataset['target'].isnull()]
    
    # Separando features preditoras e target
    X_ = treino.drop(['ID','target'], axis=1)
    y_ = treino['target']
    
    # Aplicando a funcao SMOTE
    # SMOTE eh um metodo de oversampling. Ele cria exemplos sinteticos da classe minoritaria ao inves de criar copias
    sm = SMOTE(random_state=0)
    X, y = sm.fit_sample(X_, y_)

    #X = X_.copy()
    #y = y_.copy()
    
    # Padronizando os dados de treino
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    

    #del_features = ['ID','target']
    predictors = list(X_.columns)
    
    print("Train/valid shape: {}, test shape: {}".format(X.shape, y.shape))

    # Defini o tipo de Cross-Validation
    if not STRATIFIED_KFOLD:
        folds = KFold(n_splits= NUM_FOLDS, shuffle=True, random_state= RANDOM_SEED)
    else:
        folds = StratifiedKFold(n_splits= NUM_FOLDS, shuffle=True, random_state= RANDOM_SEED)

    # Hold oof predictions, test predictions, feature importance and training/valid auc
    oof_preds = np.zeros(X.shape[0])
    sub_preds = np.zeros(teste.shape[0])
    importance_df = pd.DataFrame()
    eval_results = dict()

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
        train_x, train_y = X[train_idx], y[train_idx]
        valid_x, valid_y = X[valid_idx], y[valid_idx]

        params = {'random_state': RANDOM_SEED, 'nthread': NUM_THREADS}
        
        clf = LGBMClassifier(**{**params, **LIGHTGBM_PARAMS})
        
        if not categorical_feature:
            clf.fit(train_x, train_y, 
                    eval_set=[(train_x, train_y), (valid_x, valid_y)],
                    eval_metric='logloss', 
                    verbose=100, 
                    early_stopping_rounds= EARLY_STOPPING)
        else:
            clf.fit(train_x, train_y, 
                    eval_set=[(train_x, train_y), (valid_x, valid_y)],
                    eval_metric='logloss', 
                    verbose=100, 
                    early_stopping_rounds=EARLY_STOPPING,
                    feature_name= list(X_[predictors].columns), 
                    categorical_feature= categorical_feature)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(teste.drop(['ID','target'], axis=1), num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        # Feature importance by GAIN and SPLIT
        fold_importance = pd.DataFrame()
        fold_importance["feature"] = predictors
        fold_importance["gain"] = clf.booster_.feature_importance(importance_type='gain')
        fold_importance["split"] = clf.booster_.feature_importance(importance_type='split')
        importance_df = pd.concat([importance_df, fold_importance], axis=0)
        eval_results['train_{}'.format(n_fold+1)]  = clf.evals_result_['training']['binary_logloss']
        eval_results['valid_{}'.format(n_fold+1)] = clf.evals_result_['valid_1']['binary_logloss']

        print('Fold %2d Log Loss : %.6f' % (n_fold + 1, log_loss(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full Log Loss score %.6f' % log_loss(y, oof_preds))
    teste['target'] = sub_preds.copy()

    # Get the average feature importance between folds
    mean_importance = importance_df.groupby('feature').mean().reset_index()
    mean_importance.sort_values(by= 'gain', ascending=False, inplace=True)
    
    # Save feature importance, test predictions and oof predictions as csv
    if GENERATE_SUBMISSION_FILES:

        # Save submission (test data) and feature importance
        submission = pd.read_csv('../dataset/sample_submission.csv')
        submission['PredictedProb'] = sub_preds.copy()
        submission.to_csv('../submission/submission{}.csv'.format(SUBMISSION_SUFIX), index=False)
        
        mean_importance.to_csv('feature_importance{}.csv'.format(SUBMISSION_SUFIX), index=False)
        plt.hist(submission.PredictedProb)
        plt.show()

    
    return mean_importance

In [18]:
run_model(new_df)

Train/valid shape: (114321, 131), test shape: (114321,)
Training until validation scores don't improve for 1000 rounds
[100]	training's binary_logloss: 0.556955	valid_1's binary_logloss: 0.561395
[200]	training's binary_logloss: 0.584978	valid_1's binary_logloss: 0.59416
[300]	training's binary_logloss: 0.58889	valid_1's binary_logloss: 0.603554
[400]	training's binary_logloss: 0.584124	valid_1's binary_logloss: 0.604451
[500]	training's binary_logloss: 0.577714	valid_1's binary_logloss: 0.603314
[600]	training's binary_logloss: 0.571009	valid_1's binary_logloss: 0.601735
[700]	training's binary_logloss: 0.564507	valid_1's binary_logloss: 0.600222
[800]	training's binary_logloss: 0.558324	valid_1's binary_logloss: 0.598828
[900]	training's binary_logloss: 0.552476	valid_1's binary_logloss: 0.597535
[1000]	training's binary_logloss: 0.546927	valid_1's binary_logloss: 0.596251
Early stopping, best iteration is:
[26]	training's binary_logloss: 0.52988	valid_1's binary_logloss: 0.53191
Fol

KeyboardInterrupt: 