# Kaggle
## Competição DSA de Machine Learning - Dezembro 2019

Versão 1.0.0: LB = 0.50557
- modelo: XGBoost (com algumas otimizações)
- features categóricas: removidas
- dados missing: atribuído o valor médio

Versão 1.0.1: LB = 0.48972 / CV = 0.469777
- modelo: XGBoost executando todas as otimizações
- feature engineering: gerado através do Auto_ViML

Versão 1.0.2: LB = 0.55264 / CV = 0.469158
- modelo: XGBoost executando todas as otimizações
- dados missing: removidas as colunas com mais de 40% de NA; nas demais, NA substituído por -999
- features categóricas: label encoder
- feature engineering: usando pacote Boruta

Versão 1.0.3: LB = ??? / CV = ???
- modelo: XGBoost executando todas as otimizações
- feature engineering: gerado através do Auto_ViML (modificado v1)

Versão 1.0.4: LB = 0.49299 / CV = 0.470500
- modelo: XGBoost executando todas as otimizações
- feature engineering: Kernel_Feature_Engineering_v.1.0.0

## 1. Importando as bibliotecas

In [None]:
# Importar os principais pacotes
import numpy as np
import pandas as pd
import itertools
import seaborn as sns
sns.set()

import matplotlib.pyplot as plt
%matplotlib inline

import time
import datetime
import gc

# Silence all warnings for the rest of the session
import warnings
warnings.filterwarnings("ignore")

# Let Jupyter show up to 200 columns and 200 rows when displaying a DataFrame
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 200)

# Switch between local files (True) and the Kaggle input directory (False)
TRAIN_OFFLINE = True

In [None]:
# Importa os pacotes de algoritmos
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb

# Importa pacotes do sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, log_loss
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder

## 2. Carregando os dados de treino e teste

In [None]:
def read_data():
    """Load the train and test datasets as pandas DataFrames.

    When the module-level TRAIN_OFFLINE flag is truthy the local
    ``*_new.csv`` files under ``../dataset`` are read; otherwise the files
    from the Kaggle competition input directory are read.

    Returns:
        tuple: ``(train, test)`` DataFrames, in that order.
    """
    if TRAIN_OFFLINE:
        train_path = '../dataset/dataset_treino_new.csv'
        test_path = '../dataset/dataset_teste_new.csv'
    else:
        kaggle_dir = '/kaggle/input/competicao-dsa-machine-learning-dec-2019/'
        train_path = kaggle_dir + 'dataset_treino.csv'
        test_path = kaggle_dir + 'dataset_teste.csv'

    frames = []
    for path in (train_path, test_path):
        # Report the file that is really being read, so the log can no
        # longer announce the train file while the test file is loading.
        file_name = path.rsplit('/', 1)[-1]
        print('Carregando arquivo {}....'.format(file_name))
        frame = pd.read_csv(path)
        print('{} tem {} linhas and {} colunas'.format(file_name, frame.shape[0], frame.shape[1]))
        frames.append(frame)

    train, test = frames
    return train, test

In [None]:
# Load the train and test DataFrames
train, test = read_data()

In [None]:
# Stack train and test so the preprocessing below is applied to both at once.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed; the original
# row labels are kept, exactly as append did.
df = pd.concat([train, test])

In [None]:
# Drop the column literally named '1'
# NOTE(review): presumably an artifact of how the *_new CSVs were written — confirm
df = df.drop(columns=['1'])

In [None]:
# Replace v19 with log(1 + v19)
# NOTE(review): presumably v19 is non-negative and right-skewed — confirm
df["v19"] = np.log1p(df["v19"])

In [None]:
# Count numeric versus categorical (object-typed) columns
column_types = df.dtypes
is_object = column_types == "object"

numerical_feats = column_types[~is_object].index
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = column_types[is_object].index
print("Number of Categorical features: ", len(categorical_feats))

In [None]:
# Encode every object-typed column as integer codes with LabelEncoder
le = LabelEncoder()

object_columns = [col for col in df if df[col].dtype == 'object']
for col in object_columns:
    # Values are cast to str first, so missing values become the text 'nan'
    # and get a code of their own.
    as_text = np.array(df[col].astype(str)).reshape((-1,))
    df[col] = le.fit_transform(as_text)

In [None]:
# Preview the first rows after label encoding
df.head()

## 3. Feature Engineering

In [None]:
# Absolute pairwise correlation between all columns
corr_matrix = df.corr().abs()

# Keep only the strict upper triangle so every pair is inspected once.
# The builtin bool replaces the np.bool alias, which NumPy 1.24 removed.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A column is redundant when its correlation with an earlier column exceeds 0.95.
# ID and target are never candidates: later cells need both columns.
protected = {'ID', 'target'}
to_drop = [column for column in upper.columns
           if column not in protected and (upper[column] > 0.95).any()]
to_drop

In [None]:
# Remove the highly correlated columns listed in to_drop.
# The names are passed directly instead of a DataFrame slice, which only
# worked because iterating a DataFrame yields its column names.
df = df.drop(columns=to_drop)
df.shape

In [None]:
# Preview the first rows after dropping correlated columns
df.head()

## 4. Algoritmo XGBoost - Extreme Gradient Boosting

In [None]:
# Rows with a known target form the training set; the others are the rows to score
has_target = df['target'].notnull()
treino = df[has_target]
teste = df[~has_target]

# Separate the label from the predictors
train_y = treino['target']
train_x = treino.drop(['ID', 'target'], axis=1)

# Standardize the predictors; the fitted scaler is kept for the test set
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)

In [None]:
# Helper that cross-validates, fits and reports on an XGBoost model
def run_model(modelo, X_tr, y_tr, useTrainCV=True, cv_folds=5, early_stopping_rounds=10):
    """Optionally tune the number of trees with CV, fit the model and report it.

    Args:
        modelo: estimator exposing get_xgb_params(), get_params(),
            set_params(), fit(), predict_proba() and feature_importances_
            (an XGBClassifier in this notebook). It is modified in place.
        X_tr: training feature matrix.
        y_tr: training labels.
        useTrainCV: when True, run xgb.cv with early stopping and set the
            model's n_estimators to the number of rounds in the CV result.
        cv_folds: number of stratified folds used by xgb.cv.
        early_stopping_rounds: early-stopping patience passed to xgb.cv.

    Side effects:
        Prints the training log loss (and the CV log loss when CV ran) and
        draws a bar chart of the 25 largest feature importances.
    """
    # Stays None when cross-validation is skipped, so the report below never
    # touches results that were not computed.
    cv_logloss = None

    if useTrainCV:
        xgb_param = modelo.get_xgb_params()
        xgtrain = xgb.DMatrix(X_tr, label=y_tr)

        print ('Start cross validation')
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=modelo.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics=['logloss'],
                          stratified=True,
                          seed=42,
                          verbose_eval=True,
                          early_stopping_rounds=early_stopping_rounds)

        # The number of rows in the CV result is used as the tuned tree count
        best_tree = cvresult.shape[0]
        modelo.set_params(n_estimators=best_tree)
        cv_logloss = cvresult['test-logloss-mean'].iloc[-1]
        print('Best number of trees = {}'.format(best_tree))

    # eval_metric is not passed to fit(): without an eval_set it has no effect
    # on training, and recent xgboost releases reject it as a fit() argument.
    modelo.fit(X_tr, y_tr)

    # Probability of the positive class on the training data
    train_pred_prob = modelo.predict_proba(X_tr)[:,1]

    # Report
    print("Log Loss (Treino): %f" % log_loss(y_tr, train_pred_prob))
    if cv_logloss is not None:
        print("Log Loss (Test): %f" % cv_logloss)

    # Importances are indexed by feature position because X_tr may be a plain array
    feature_imp = pd.Series(modelo.feature_importances_.astype(float)).sort_values(ascending=False)

    plt.figure(figsize=(18,8))
    feature_imp[:25].plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.tight_layout()

## Passo 06: reduzindo Learning Rate

In [None]:
%%time

# Hyperparameters of the tuned XGBoost classifier
xgb_settings = dict(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=4,
    min_child_weight=1,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.6,
    reg_alpha=0.005,
    objective='binary:logistic',
    n_jobs=-1,
    scale_pos_weight=1,
    seed=1234,
)
modeloXGB_v2 = XGBClassifier(**xgb_settings)

# Cross-validate, fit and report on the training data
run_model(modeloXGB_v2, train_x, train_y)

In [None]:
# Show the tuned XGBoost model
print(modeloXGB_v2)

## 5. Submissions

In [None]:
# Prepare the test predictors with the same columns used for training
new_teste = teste.drop(['ID','target'], axis=1)
# Reuse the statistics learned from the training set: re-fitting the scaler
# here would scale the test rows differently from the data the model saw.
new_teste = scaler.transform(new_teste)
new_teste.shape

In [None]:
# Fill the sample submission with the predicted probability of class 1
# NOTE(review): assumes sample_submission.csv rows are in the same order as teste — confirm
submission = pd.read_csv('../dataset/sample_submission.csv')
class_probabilities = modeloXGB_v2.predict_proba(new_teste)
submission['PredictedProb'] = class_probabilities[:, 1]
print(submission.shape)
submission.head()

In [None]:
# Write the submission file without the DataFrame index column
submission.to_csv('../submission/submission_xgb_v.1.0.4.csv', index=False)

In [None]:
# Histogram of the predicted probabilities, as a quick sanity check
plt.hist(submission.PredictedProb)
plt.show()