# Kaggle
## Competição DSA de Machine Learning - Dezembro 2019

Versão 1.0.0: LB = 0.48866 CV = 0.463102
- modelo: LightGBM (com algumas otimizações)
- features engineering: gerado através do Auto_ViML

Versão 1.0.1: LB = 0.48991 CV = 0.462946
- modelo: LightGBM (com algumas otimizações)
- features engineering: gerado através do Auto_ViML (com novas features)

Versão 1.0.2: LB = ???? CV = ????
- modelo: LightGBM (com algumas otimizações)
- features engineering: gerado através do Auto_ViML
- Agrupamento pela coluna v2

## 1. Importando as bibliotecas

In [1]:
# Importar os principais pacotes
import numpy as np
import pandas as pd
import itertools
import seaborn as sns
sns.set()

import matplotlib.pyplot as plt
%matplotlib inline

import time
import datetime
import gc

# Evitar que os warnings apareçam
import warnings
warnings.filterwarnings("ignore")

# Seta algumas opções no Jupyter para exibição dos datasets
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

# Variavel para controlar o treinamento no Kaggle
TRAIN_OFFLINE = True

In [2]:
# Importa os pacotes de algoritmos
import lightgbm as lgb
from lightgbm.sklearn import LGBMClassifier

# Importa pacotes do sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, log_loss
from sklearn.preprocessing import scale, MinMaxScaler, StandardScaler


## 2. Carregando os dados de treino e teste

In [3]:
def read_data():
    """Load the training and test datasets as pandas DataFrames.

    The file locations depend on the module-level ``TRAIN_OFFLINE`` flag:
    when it is truthy the local ``*_modificado.csv`` files are read,
    otherwise the competition files from the Kaggle input directory.

    Returns:
        tuple: ``(train, test)`` DataFrames, in that order.
    """
    if TRAIN_OFFLINE:
        train_path = '../dataset/dataset_treino_modificado.csv'
        test_path = '../dataset/dataset_teste_modificado.csv'
    else:
        kaggle_dir = '/kaggle/input/competicao-dsa-machine-learning-dec-2019/'
        train_path = kaggle_dir + 'dataset_treino.csv'
        test_path = kaggle_dir + 'dataset_teste.csv'

    print('Carregando arquivo dataset_treino.csv....')
    train = pd.read_csv(train_path)
    print('dataset_treino.csv tem {} linhas and {} colunas'.format(train.shape[0], train.shape[1]))

    # The Kaggle branch used to announce the training file here by mistake.
    print('Carregando arquivo dataset_teste.csv....')
    test = pd.read_csv(test_path)
    print('dataset_teste.csv tem {} linhas and {} colunas'.format(test.shape[0], test.shape[1]))

    return train, test

In [4]:
# Leitura dos dados
train, test = read_data()

Carregando arquivo dataset_treino.csv....
dataset_treino.csv tem 114321 linhas and 44 colunas
Carregando arquivo dataset_teste.csv....
dataset_teste.csv tem 114393 linhas and 51 colunas


In [5]:
train.head()

Unnamed: 0,v31,v129,v50,v110,v66,v47,v38,v113,v56,v79,v24,v71,v74,v101,v3,v62,v30,v85,v72,v67,v100,v94,v84,v111,v95,v106,v108,v22,v125,v112,v52,v91,v107,v10_bin,v14_bin,v26_bin,v28_bin,v34_bin,v46_bin,v55_bin,v57_bin,v58_bin,v65_bin,target
0,0,0,0.11127,0,0,0,0,0,0,0,0,0,0,0.386152,0,1,0,0.269716,1,0.598997,0.141689,0.281723,0.139374,0.338176,0.212921,0.495177,0.131094,0,0,0,0,0,0,49,45,122,175,141,251,138,9,106,128,0
1,0,0,0.163392,0,1,1,0,1,1,1,1,1,0,0.0,0,1,1,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0,1,0,0,61,35,0,0,175,0,0,0,0,0,1
2,0,0,0.146414,0,0,2,0,0,2,2,2,1,0,0.233377,0,1,-1,0.32859,1,0.441279,0.764561,0.239724,0.172676,0.34009,0.163506,0.40794,0.135128,2,2,1,0,0,0,61,58,30,175,62,124,138,21,54,37,1
3,0,0,0.053418,0,2,1,0,0,-1,3,1,1,0,0.0,0,2,-1,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,3,2,2,1,1,38,43,0,0,62,0,0,0,0,0,0
4,1,0,0.171427,1,1,3,6,-1,3,4,0,1,1,0.398321,0,0,-1,0.114583,6,0.506923,0.973248,0.197568,0.128673,0.236744,0.170047,0.612259,0.132526,4,4,3,3,0,0,55,137,54,69,120,15,32,61,22,151,1


In [6]:
test.head()

Unnamed: 0,v31,v129,v50,v110,v66,v47,v38,v113,v56,v79,v24,v71,v74,v101,v3,v62,v30,v85,v72,v67,v100,v94,v84,v111,v95,v106,v108,v22,v125,v112,v52,v91,v107,v10_bin,v14_bin,v26_bin,v28_bin,v34_bin,v46_bin,v55_bin,v57_bin,v58_bin,v65_bin,target_Logistic Regression_predictions,target_Linear Discriminant_predictions,target_Naive Bayes_predictions,target_Bagging_predictions,target_CatBoost_predictions,Class_proba_0,Class_proba_1,target_Ensembled_predictions
0,0,0,0.053278,0,2,2,0,-1,2,2,1,1,0,0.24699,0,1,-1,0.136213,1,0.429988,0.986097,0.199456,0.090104,0.217561,0.174763,0.604196,0.119166,10794,19,7,7,3,3,49,100,30,43,62,16,52,9,10,129,1,1,0,0,0,0.707753,0.292247,0
1,0,0,0.096527,1,1,3,4,-1,4,6,3,1,0,0.0,0,1,4,0.0,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,113,53,14,10,2,2,48,44,0,0,212,0,0,0,0,0,1,1,1,1,1,0.119297,0.880703,1
2,0,0,0.087654,0,1,1,0,1,28,3,0,1,0,0.358836,0,1,1,0.136213,1,0.480519,0.990476,0.208014,0.084725,0.196042,0.159372,0.607002,0.104227,18210,66,17,10,2,2,55,158,163,16,158,2,21,141,0,129,1,1,1,1,1,0.293546,0.706454,1
3,0,0,0.092684,0,0,1,0,0,0,3,1,1,0,0.220365,0,1,5,0.333333,1,0.44768,0.05315,0.30332,0.248601,0.234898,0.200637,0.33129,0.175756,2444,65,5,10,3,3,55,38,49,175,114,207,182,95,131,21,1,1,1,1,1,0.357478,0.642522,1
4,0,0,0.118745,0,0,1,0,13,-1,3,1,1,0,0.362228,0,2,1,0.092971,2,0.498549,0.963224,0.244056,0.073417,0.134688,0.140762,0.746046,0.165096,7527,36,16,8,3,3,38,45,153,152,69,27,21,141,36,170,1,1,1,1,1,0.267498,0.732502,1


## 3. Feature Engineering

In [7]:
# Keep only the columns that also exist in `train`, i.e. drop the extra
# prediction/probability columns at the end of the test file. Selecting by
# name (instead of slicing off the last 8 columns) makes the cell safe to
# re-run: a second execution no longer removes real feature columns.
test = test[[col for col in test.columns if col in train.columns]]

In [8]:
# Stack train and test rows into one frame; test rows have no `target`, so
# they get NaN there. DataFrame.append is deprecated (removed in pandas 2.0),
# so pd.concat is used instead. sort=True keeps the alphabetical column
# order shown in the recorded df.head() output.
df = pd.concat([train, test], sort=True)

In [9]:
df.head()

Unnamed: 0,target,v100,v101,v106,v107,v108,v10_bin,v110,v111,v112,v113,v125,v129,v14_bin,v22,v24,v26_bin,v28_bin,v3,v30,v31,v34_bin,v38,v46_bin,v47,v50,v52,v55_bin,v56,v57_bin,v58_bin,v62,v65_bin,v66,v67,v71,v72,v74,v79,v84,v85,v91,v94,v95
0,0.0,0.141689,0.386152,0.495177,0,0.131094,49,0,0.338176,0,0,0,0,45,0,0,122,175,0,0,0,141,0,251,0,0.11127,0,138,0,9,106,1,128,0,0.598997,0,1,0,0,0.139374,0.269716,0,0.281723,0.212921
1,1.0,0.0,0.0,0.0,0,0.0,61,0,0.0,0,1,1,0,35,1,1,0,0,0,1,0,175,0,0,1,0.163392,1,0,1,0,0,1,0,1,0.0,1,1,0,1,0.0,0.0,0,0.0,0.0
2,1.0,0.764561,0.233377,0.40794,0,0.135128,61,0,0.34009,1,0,2,0,58,2,2,30,175,0,-1,0,62,0,124,2,0.146414,0,138,2,21,54,1,37,0,0.441279,1,1,0,2,0.172676,0.32859,0,0.239724,0.163506
3,0.0,0.0,0.0,0.0,1,0.0,38,0,0.0,2,0,3,0,43,3,1,0,0,0,-1,0,62,0,0,1,0.053418,2,0,-1,0,0,2,0,2,0.0,1,2,0,3,0.0,0.0,1,0.0,0.0
4,1.0,0.973248,0.398321,0.612259,0,0.132526,55,1,0.236744,3,-1,4,0,137,4,0,54,69,0,-1,1,120,6,15,3,0.171427,3,32,3,61,22,0,151,1,0.506923,1,6,1,4,0.128673,0.114583,0,0.197568,0.170047


## 4. Criar e avaliar alguns algoritmos de Machine Learning

## 4.1. Algoritmo LightGBM - Baseline

In [10]:
# General settings
# N_FOLDS: number of folds passed as `nfold` to the lgb.cv calls below.
# MAX_EVALS: number of evaluations passed as `max_evals` to hyperopt's fmin.
N_FOLDS = 5
MAX_EVALS = 5

In [11]:
# Work on a reproducible 16000-row subset of the training data
# (10000 rows for fitting, 6000 rows held out for evaluation)
df_sample = train.sample(n=16000, random_state=42)

# Separate the integer target vector from the feature columns
labels = np.array(df_sample['target'].astype(np.int32)).ravel()
features = df_sample.drop('target', axis=1)

# Hold out exactly 6000 rows for testing
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=6000,
    random_state=42,
)

print('Train shape: ', train_features.shape)
print('Test shape: ', test_features.shape)

train_features.head()

Train shape:  (10000, 43)
Test shape:  (6000, 43)


Unnamed: 0,v31,v129,v50,v110,v66,v47,v38,v113,v56,v79,v24,v71,v74,v101,v3,v62,v30,v85,v72,v67,v100,v94,v84,v111,v95,v106,v108,v22,v125,v112,v52,v91,v107,v10_bin,v14_bin,v26_bin,v28_bin,v34_bin,v46_bin,v55_bin,v57_bin,v58_bin,v65_bin
47330,0,0,0.165348,0,0,1,0,0,19,3,2,1,0,0.386583,0,1,5,0.104905,1,0.522758,0.953991,0.218162,0.072848,0.198991,0.182555,0.616986,0.149325,12281,53,14,10,0,0,49,63,93,69,114,40,32,95,54,151
11405,0,0,0.074938,0,2,1,0,30,16,3,1,1,0,0.0,0,1,-1,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45,33,15,7,0,0,23,45,0,0,141,0,0,0,0,0
15294,0,0,0.103444,1,2,3,0,-1,7,6,3,2,0,0.17038,0,1,4,0.323436,1,0.457241,0.05463,0.270963,0.170138,0.263253,0.174342,0.261754,0.128156,1520,34,5,2,0,0,84,100,51,175,141,206,170,9,184,37
37644,0,0,0.100789,0,2,1,0,19,-1,3,1,1,0,0.476015,0,1,-1,0.1599,1,0.49904,0.859925,0.224547,0.102141,0.160181,0.149524,0.687146,0.19018,3154,9,6,6,0,0,38,45,153,175,114,54,21,98,54,170
24175,0,0,0.082064,0,0,1,0,0,-1,3,1,1,0,0.270713,0,1,1,0.230915,1,0.416347,0.881249,0.303255,0.19491,0.261041,0.16144,0.397617,0.097428,4746,4,3,3,3,3,49,45,30,158,62,121,170,62,54,7


In [12]:
# Baseline classifier: LightGBM defaults with only the random seed fixed
model = lgb.LGBMClassifier(random_state=50)

# LightGBM Dataset wrappers around the sampled train / hold-out splits.
# train_set is the object handed to lgb.cv in later cells; test_set is
# built here but not referenced again in the visible cells.
train_set = lgb.Dataset(train_features, label = train_labels)
test_set = lgb.Dataset(test_features, label = test_labels)

In [16]:
# Start from the classifier's default parameters, leaving out n_estimators:
# the number of boosting rounds is decided by early stopping instead.
hyperparameters = {
    key: value
    for key, value in model.get_params().items()
    if key != 'n_estimators'
}

# Cross validate on the training Dataset, stopping once the AUC has not
# improved for 100 rounds
cv_results = lgb.cv(
    hyperparameters,
    train_set,
    num_boost_round=10000,
    early_stopping_rounds=100,
    nfold=N_FOLDS,
    metrics='auc',
    seed=42,
    verbose_eval=True,
)

# Mean AUC and its standard deviation at the last recorded round
best = cv_results['auc-mean'][-1]
best_std = cv_results['auc-stdv'][-1]

print('The maximium ROC AUC in cross validation was {:.5f} with std of {:.5f}.'.format(best, best_std))
print('The ideal number of iterations was {}.'.format(len(cv_results['auc-mean'])))

[1]	cv_agg's auc: 0.69967 + 0.0103059
[2]	cv_agg's auc: 0.708585 + 0.0134018
[3]	cv_agg's auc: 0.712464 + 0.0145416
[4]	cv_agg's auc: 0.714034 + 0.0126541
[5]	cv_agg's auc: 0.716827 + 0.0114023
[6]	cv_agg's auc: 0.718418 + 0.0110519
[7]	cv_agg's auc: 0.719265 + 0.0111177
[8]	cv_agg's auc: 0.720627 + 0.0117034
[9]	cv_agg's auc: 0.721753 + 0.0113278
[10]	cv_agg's auc: 0.72181 + 0.0116593
[11]	cv_agg's auc: 0.723475 + 0.0109437
[12]	cv_agg's auc: 0.723141 + 0.0109949
[13]	cv_agg's auc: 0.723747 + 0.010562
[14]	cv_agg's auc: 0.724239 + 0.0116522
[15]	cv_agg's auc: 0.725165 + 0.0115041
[16]	cv_agg's auc: 0.724843 + 0.0114311
[17]	cv_agg's auc: 0.725475 + 0.011641
[18]	cv_agg's auc: 0.72586 + 0.0112587
[19]	cv_agg's auc: 0.725155 + 0.0116055
[20]	cv_agg's auc: 0.725358 + 0.0118072
[21]	cv_agg's auc: 0.725418 + 0.0115992
[22]	cv_agg's auc: 0.726107 + 0.0117984
[23]	cv_agg's auc: 0.727088 + 0.0112039
[24]	cv_agg's auc: 0.726719 + 0.0108398
[25]	cv_agg's auc: 0.727584 + 0.010783
[26]	cv_agg's a

In [18]:
# Reuse the number of boosting rounds recorded by the cross validation run
model.n_estimators = len(cv_results['auc-mean'])

# Fit on the training split, then score the positive-class probabilities
# predicted for the hold-out split
preds = model.fit(train_features, train_labels).predict_proba(test_features)[:, 1]
baseline_auc = roc_auc_score(test_labels, preds)

print('The baseline model scores {:.5f} ROC AUC on the test set.'.format(baseline_auc))

The baseline model scores 0.75322 ROC AUC on the test set.


In [19]:
import csv
from hyperopt import STATUS_OK
from timeit import default_timer as timer

def objective(hyperparameters):
    """Objective function for LightGBM hyperparameter optimization with hyperopt.

    Runs ``lgb.cv`` with AUC as the metric on the module-level ``train_set``
    and appends one row to the CSV file named by the module-level
    ``OUT_FILE`` on every call. The module-level ``ITERATION`` counter is
    incremented once per call.

    Args:
        hyperparameters: dict sampled from the search space. Its
            ``'boosting_type'`` entry must be a nested dict holding the
            actual ``'boosting_type'`` and, optionally, ``'subsample'``.
            The dict passed in is not modified.

    Returns:
        dict with keys ``'loss'`` (1 - AUC, because hyperopt minimizes),
        ``'hyperparameters'`` (the flattened parameters actually used, plus
        ``'n_estimators'``), ``'iteration'``, ``'train_time'`` and
        ``'status'``.
    """
    # Keep track of evals
    global ITERATION
    ITERATION += 1

    # Work on a shallow copy so the caller's dict is left untouched
    params = dict(hyperparameters)

    # The number of trees is found by early stopping, not taken from the input
    params.pop('n_estimators', None)

    # Lift the nested boosting type and its subsample ratio to top-level keys
    boosting = params['boosting_type']
    params['subsample'] = boosting.get('subsample', 1.0)
    params['boosting_type'] = boosting['boosting_type']

    # hyperopt's quniform yields floats; LightGBM needs integers for these
    for parameter_name in ('num_leaves', 'subsample_for_bin', 'min_child_samples'):
        params[parameter_name] = int(params[parameter_name])

    start = timer()

    # Perform N_FOLDS cross validation with early stopping
    cv_results = lgb.cv(params, train_set, num_boost_round=10000, nfold=N_FOLDS,
                        early_stopping_rounds=100, metrics='auc', seed=50)

    run_time = timer() - start

    # Score at the last recorded boosting round
    best_score = cv_results['auc-mean'][-1]

    # Loss must be minimized
    loss = 1 - best_score

    # Number of boosting rounds recorded by the cross validation
    params['n_estimators'] = len(cv_results['auc-mean'])

    # Append the result to the log; `with` closes the file even if the write
    # fails, and newline='' is what the csv module expects for its files
    with open(OUT_FILE, 'a', newline='') as of_connection:
        csv.writer(of_connection).writerow(
            [loss, params, ITERATION, run_time, best_score])

    # Dictionary with information for evaluation
    return {'loss': loss, 'hyperparameters': params, 'iteration': ITERATION,
            'train_time': run_time, 'status': STATUS_OK}

In [20]:
from hyperopt import hp
from hyperopt.pyll.stochastic import sample

In [None]:
# Log-uniform prior for the learning rate, bounded by 0.005 and 0.2
# (the bounds are given to hyperopt on the log scale)
learning_rate = {'learning_rate': hp.loguniform('learning_rate', np.log(0.005), np.log(0.2))}

In [None]:
# Draw 10000 samples from the learning rate domain
learning_rate_dist = [sample(learning_rate)['learning_rate'] for _ in range(10000)]

# Density plot of the sampled learning rates
plt.figure(figsize=(8, 6))
sns.kdeplot(learning_rate_dist, color='red', linewidth=2, shade=True)
plt.title('Learning Rate Distribution', size=18)
plt.xlabel('Learning Rate', size=16)
plt.ylabel('Density', size=16);

In [None]:
# Quantized uniform prior for the number of leaves (30 to 150, step 1)
num_leaves = {'num_leaves': hp.quniform('num_leaves', 30, 150, 1)}

# Sample 10000 times from the number of leaves distribution
num_leaves_dist = [sample(num_leaves)['num_leaves'] for _ in range(10000)]

# Density plot of the sampled values
plt.figure(figsize=(8, 6))
sns.kdeplot(num_leaves_dist, linewidth=2, shade=True)
plt.title('Number of Leaves Distribution', size=18)
plt.xlabel('Number of Leaves', size=16)
plt.ylabel('Density', size=16);

In [None]:
# Conditional domain for the boosting type: 'gbdt' and 'dart' each carry
# their own subsample ratio, while 'goss' always uses 1.0.
# Each hp.uniform gets its own label (the same labels used in the full
# `space` definition), since hyperopt expects labels to be unique within
# a search space.
boosting_type = {'boosting_type': hp.choice('boosting_type',
                                            [{'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)},
                                             {'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)},
                                             {'boosting_type': 'goss', 'subsample': 1.0}])}

# Draw a sample
hyperparams = sample(boosting_type)
hyperparams

In [None]:
# Read the subsample ratio from the nested choice (1.0 when it is absent)
subsample = hyperparams['boosting_type'].get('subsample', 1.0)

# Replace the nested dict by its boosting type and expose the subsample
# ratio as a top-level key
hyperparams.update(
    boosting_type=hyperparams['boosting_type']['boosting_type'],
    subsample=subsample,
)

hyperparams

### Complete Bayesian Domain

In [26]:
# Define the search space handed to hyperopt's fmin.
# - 'boosting_type' is a nested choice: each option is a dict holding the
#   boosting type and its subsample ratio ('goss' is fixed at 1.0).
# - The quniform entries produce floats; objective() casts num_leaves,
#   subsample_for_bin and min_child_samples to int before training.
# - NOTE(review): the hyperopt labels 'gdbt_subsample' and
#   'colsample_by_tree' do not match the parameter names; presumably the
#   dict returned by fmin is keyed by these labels -- confirm before
#   renaming them.
space = {
    'boosting_type': hp.choice('boosting_type', 
                                            [{'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)}, 
                                             {'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)},
                                             {'boosting_type': 'goss', 'subsample': 1.0}]),
    'num_leaves': hp.quniform('num_leaves', 20, 150, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
    'subsample_for_bin': hp.quniform('subsample_for_bin', 20000, 300000, 20000),
    'min_child_samples': hp.quniform('min_child_samples', 20, 500, 5),
    'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0),
    'is_unbalance': hp.choice('is_unbalance', [True, False]),
}

### Example of Sampling from the Domain

In [None]:
# Draw one random configuration from the complete search space
x = sample(space)

# Flatten the nested boosting choice: keep the boosting type under its own
# key and move the subsample ratio (1.0 when absent) to the top level
subsample = x['boosting_type'].get('subsample', 1.0)
x.update(boosting_type=x['boosting_type']['boosting_type'], subsample=subsample)

x

In [None]:
# Draw another random configuration from the search space
x = sample(space)
# Subsample ratio of the chosen boosting option (1.0 when it has none)
subsample = x['boosting_type'].get('subsample', 1.0)
# Flatten the nested choice into top-level keys
x['boosting_type'] = x['boosting_type']['boosting_type']
x['subsample'] = subsample
x

#### Optimization Algorithm

In [21]:
from hyperopt import tpe

# Create the algorithm
tpe_algorithm = tpe.suggest

#### Results History

In [22]:
from hyperopt import Trials

# Record results
trials = Trials()

In [None]:
# CSV file that objective() appends one row to per evaluation
OUT_FILE = 'bayes_test.csv'

# Evaluation counter incremented by objective()
ITERATION = 0

# Create (or truncate) the log and write the column names. The context
# manager closes the file even if the write fails; newline='' is what the
# csv module expects for files it writes.
headers = ['loss', 'hyperparameters', 'iteration', 'runtime', 'score']
with open(OUT_FILE, 'w', newline='') as of_connection:
    writer = csv.writer(of_connection)
    writer.writerow(headers)

#### Automated Hyperparameter Optimization in Practice

In [24]:
from hyperopt import fmin

In [None]:
# Reset the evaluation counter used by objective(). A `global` statement is
# not needed here: at module (cell) level it has no effect.
ITERATION = 0

# Run optimization
best = fmin(fn=objective, space=space, algo=tpe.suggest, trials=trials,
            max_evals=MAX_EVALS)

best

In [None]:
# Sort the trials with lowest loss (highest AUC) first
trials_dict = sorted(trials.results, key = lambda x: x['loss'])
trials_dict[:1]

In [None]:
results = pd.read_csv(OUT_FILE)

In [None]:
import ast

def evaluate(results, name):
    """Evaluate the best hyperparameters in ``results`` on the hold-out data.

    The row with the highest ``'score'`` is used to train an
    ``LGBMClassifier`` on the module-level ``train_features`` /
    ``train_labels``; its ROC AUC on ``test_features`` / ``test_labels``
    is printed.

    Args:
        results: DataFrame with at least the columns ``'hyperparameters'``
            (a dict, or its string representation as read back from the
            trial CSV), ``'score'`` and ``'iteration'``.
        name: label of the search method, used only in the printed messages.

    Returns:
        DataFrame with one row per evaluated hyperparameter set (best score
        first), one column per hyperparameter, plus ``'iteration'`` and
        ``'score'``.
    """
    new_results = results.copy()

    # String to dictionary; entries that are already dicts are kept as they are
    new_results['hyperparameters'] = new_results['hyperparameters'].map(
        lambda hyp: ast.literal_eval(hyp) if isinstance(hyp, str) else hyp)

    # Sort with best values on top
    new_results = new_results.sort_values('score', ascending = False).reset_index(drop = True)

    # Print out cross validation high score
    print('The highest cross validation score from {} was {:.5f} found on iteration {}.'.format(name, new_results.loc[0, 'score'], new_results.loc[0, 'iteration']))

    # Use best hyperparameters to create a model
    hyperparameters = new_results.loc[0, 'hyperparameters']
    model = lgb.LGBMClassifier(**hyperparameters)

    # Train and make predictions
    model.fit(train_features, train_labels)
    preds = model.predict_proba(test_features)[:, 1]

    print('ROC AUC from {} on test data = {:.5f}.'.format(name, roc_auc_score(test_labels, preds)))

    # Build the hyperparameter table in one step from the list of dicts.
    # This replaces the row-by-row DataFrame.append loop, which copies the
    # frame on every iteration and is no longer available in pandas 2.0.
    hyp_df = pd.DataFrame(list(new_results['hyperparameters']))

    # Put the iteration and score in the hyperparameter dataframe
    hyp_df['iteration'] = new_results['iteration']
    hyp_df['score'] = new_results['score']

    return hyp_df

In [None]:
bayes_results = evaluate(results, name = 'Bayesian')
bayes_results

#### Continue Optimization

#### Next Steps

In [None]:
import json

MAX_EVALS = 50

# Create (or truncate) a new trial log and write the column names. The
# context manager closes the file even if the write fails; newline='' is
# what the csv module expects for files it writes.
OUT_FILE = 'bayesian_trials_50.csv'
headers = ['loss', 'hyperparameters', 'iteration', 'runtime', 'score']
with open(OUT_FILE, 'w', newline='') as of_connection:
    writer = csv.writer(of_connection)
    writer.writerow(headers)

# Record results in a fresh Trials object
trials = Trials()

# Reset the evaluation counter used by objective(); no `global` statement
# is needed at module (cell) level
ITERATION = 0

best = fmin(fn=objective, space=space, algo=tpe.suggest, trials=trials, max_evals=MAX_EVALS)

# Sort the trials with lowest loss (highest AUC) first
trials_dict = sorted(trials.results, key=lambda x: x['loss'])

print('Finished, best results')
print(trials_dict[:1])

# Save the trial results
with open('trials.json', 'w') as f:
    f.write(json.dumps(trials_dict))

 38%|███▊      | 19/50 [1:55:11<3:39:13, 424.29s/it, best loss: 0.2630466912492445] 

In [None]:
import json

# Persist the sorted trial results as JSON
with open('trials.json', 'w') as f:
    json.dump(trials_dict, f)

## 4.2. Algoritmo LightGBM - Full Dataset

In [None]:
# Split the integer target vector off the full training frame
train_labels = np.array(train['target'].astype(np.int32)).ravel()
train = train.drop('target', axis=1)

print('Training shape: ', train.shape)
print('Testing shape: ', test.shape)

In [None]:
# The frame returned by evaluate() has one column per hyperparameter and no
# 'hyperparameters' column, so indexing it by that name raises a KeyError.
# Reload the trial log written by objective() instead, best score first,
# and parse the stringified hyperparameter dicts.
bayes_results = (pd.read_csv(OUT_FILE)
                 .sort_values('score', ascending=False)
                 .reset_index(drop=True))
bayes_results['hyperparameters'] = bayes_results['hyperparameters'].map(ast.literal_eval)

In [None]:
# Best hyperparameters from the search; the number of trees is determined
# again by early stopping below
hyperparameters = dict(**bayes_results.loc[0, 'hyperparameters'])
hyperparameters.pop('n_estimators', None)

# Build the Dataset from the full training frame. The existing `train_set`
# wraps only the 10000-row sample, so using it here would not give a score
# on the full dataset.
full_train_set = lgb.Dataset(train, label=train_labels)

# Cross validation with n_folds and early stopping
cv_results = lgb.cv(hyperparameters, full_train_set,
                    num_boost_round=10000, early_stopping_rounds=100,
                    metrics='auc', nfold=N_FOLDS)

print('The cross validation score on the full dataset for Bayesian optimization = {:.5f} with std: {:.5f}.'.format(
    cv_results['auc-mean'][-1], cv_results['auc-stdv'][-1]))
print('Number of estimators = {}.'.format(len(cv_results['auc-mean'])))

In [None]:
# Fit on the full training frame, using the number of boosting rounds
# recorded by the preceding cross validation
model = lgb.LGBMClassifier(n_estimators = len(cv_results['auc-mean']), **hyperparameters)
model.fit(train, train_labels)

# Positive-class probability (second predict_proba column) for each test row
preds = model.predict_proba(test)[:, 1]

In [None]:
plt.hist(preds)
plt.show()

## 4.3. Algoritmo LightGBM Hyperparametros

In [None]:
# General settings read by run_model()

# When true, run_model() writes the submission and feature-importance CSVs
GENERATE_SUBMISSION_FILES = True
# Suffix appended to the names of the generated CSV files
SUBMISSION_SUFIX = "_lgbm_v.1.0.2"
# Selects StratifiedKFold (True) or plain KFold (False) for the splits
STRATIFIED_KFOLD = False
# Seed for the fold splitter and the classifier
RANDOM_SEED = 42 #737851
# Passed to the classifier as `nthread`
NUM_THREADS = 4
# Number of cross-validation folds
NUM_FOLDS = 10
# Passed to fit() as `early_stopping_rounds`
EARLY_STOPPING = 100

'''hyperparameters: {'boosting_type': 'gbdt', 
                    'colsample_bytree': 0.8292757502521847, 
                    'is_unbalance': False, 
                    'learning_rate': 0.0292651925552576, 
                    'min_child_samples': 430, 
                    'num_leaves': 60, 
                    'reg_alpha': 0.43110525046663706, 
                    'reg_lambda': 0.5664343989313816, 
                    'subsample_for_bin': 160000, 
                    'subsample': 0.6142376582118154, 
                    'n_estimators': 151}
                
                
LIGHTGBM_PARAMS = {
    'boosting_type': 'goss',
    'n_estimators': 10000,
    'learning_rate': 0.005134,
    'num_leaves': 54,
    'max_depth': 10,
    'subsample_for_bin': 240000,
    'reg_alpha': 0.436193,
    'reg_lambda': 0.479169,
    'colsample_bytree': 0.508716,
    'min_split_gain': 0.024766,
    'subsample': 1,
    'is_unbalance': False,
    'silent':-1,
    'verbose':-1
}'''


In [None]:
# ------------------------- LIGHTGBM MODEL -------------------------

def run_model(data, categorical_feature = None, model_params = None):
    """Train LightGBM with K-fold cross validation and predict the test rows.

    Rows of ``data`` with a non-null ``target`` are used for training and
    validation; rows with a null ``target`` are treated as the test set.
    Test predictions are the average over all folds. Per-fold and overall
    log loss on the out-of-fold predictions are printed.

    Behaviour is controlled by the module-level settings
    ``STRATIFIED_KFOLD``, ``NUM_FOLDS``, ``RANDOM_SEED``, ``NUM_THREADS``,
    ``EARLY_STOPPING``, ``GENERATE_SUBMISSION_FILES`` and
    ``SUBMISSION_SUFIX``.

    Args:
        data: DataFrame holding the feature columns and a ``target`` column.
        categorical_feature: optional list of categorical features; when
            given it is forwarded to ``fit`` together with the feature names.
        model_params: optional dict of LightGBM parameters. Defaults to the
            module-level ``hyperparameters`` dict, which is what this
            function always used before the parameter existed.

    Returns:
        DataFrame with the mean ``gain`` and ``split`` importance of every
        feature across folds, sorted by ``gain`` in descending order.

    Side effects:
        When ``GENERATE_SUBMISSION_FILES`` is true, writes the submission
        CSV and the feature-importance CSV and shows a histogram of the
        predicted probabilities.
    """
    if model_params is None:
        # Backward-compatible default: fall back to the notebook-level dict
        model_params = hyperparameters

    df = data[data['target'].notnull()]
    test = data[data['target'].isnull()]
    predictors = [col for col in df.columns if col != 'target']

    print("Train/valid shape: {}, test shape: {}".format(df.shape, test.shape))

    fold_class = StratifiedKFold if STRATIFIED_KFOLD else KFold
    folds = fold_class(n_splits=NUM_FOLDS, shuffle=True, random_state=RANDOM_SEED)

    # Hold out-of-fold predictions, test predictions and feature importance
    oof_preds = np.zeros(df.shape[0])
    sub_preds = np.zeros(test.shape[0])
    fold_importances = []

    # Column selections do not change between folds, so do them once
    features = df[predictors]
    target = df['target']
    test_features = test[predictors]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(features, target)):
        train_x, train_y = features.iloc[train_idx], target.iloc[train_idx]
        valid_x, valid_y = features.iloc[valid_idx], target.iloc[valid_idx]

        # Values in model_params take precedence over the fixed settings
        params = {'random_state': RANDOM_SEED, 'nthread': NUM_THREADS}
        clf = LGBMClassifier(**{**params, **model_params})

        fit_kwargs = dict(eval_set=[(train_x, train_y), (valid_x, valid_y)],
                          eval_metric='logloss', verbose=400,
                          early_stopping_rounds=EARLY_STOPPING)
        if categorical_feature:
            fit_kwargs.update(feature_name=predictors,
                              categorical_feature=categorical_feature)
        clf.fit(train_x, train_y, **fit_kwargs)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test_features, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        # Feature importance by GAIN and SPLIT
        fold_importances.append(pd.DataFrame({
            'feature': predictors,
            'gain': clf.booster_.feature_importance(importance_type='gain'),
            'split': clf.booster_.feature_importance(importance_type='split'),
        }))

        print('Fold %2d Log Loss : %.6f' % (n_fold + 1, log_loss(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full Log Loss score %.6f' % log_loss(target, oof_preds))

    # Get the average feature importance between folds
    importance_df = pd.concat(fold_importances, axis=0)
    mean_importance = importance_df.groupby('feature').mean().reset_index()
    mean_importance = mean_importance.sort_values(by='gain', ascending=False)

    # Save feature importance and test predictions as csv
    if GENERATE_SUBMISSION_FILES:

        # NOTE(review): assumes the test rows are in the same order as the
        # rows of sample_submission.csv -- confirm
        submission = pd.read_csv('../dataset/sample_submission.csv')
        submission['PredictedProb'] = sub_preds.copy()
        submission.to_csv('../submission/submission{}.csv'.format(SUBMISSION_SUFIX), index=False)

        mean_importance.to_csv('feature_importance{}.csv'.format(SUBMISSION_SUFIX), index=False)
        plt.hist(submission.PredictedProb)
        plt.show()
    return mean_importance

In [None]:
run_model(df)