# Kaggle
## Competição DSA de Machine Learning - Dezembro 2019

Versão 1.0.0: LB = ???
- modelo: LSTM com 1 camada
- features categóricas: removidas
- dados missing: preenchidos com a mediana de cada coluna

## 1. Importando bibliotecas

In [None]:
# Importar os principais pacotes
import numpy as np
import pandas as pd
import seaborn as sns
sns.set()

import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm_notebook as tqdm
import re
import codecs
import time
import datetime
import gc
from numba import jit
from collections import Counter
import copy
from typing import Any

# Evitar que os warnings apareçam
import warnings
warnings.filterwarnings("ignore")

# Raise pandas' display limits so wide/long DataFrames are shown in Jupyter
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

# Flag read by read_data(): True loads the local files under ../dataset/,
# False loads the competition files under /kaggle/input/
TRAIN_OFFLINE = True

In [None]:
# Importa os pacotes de algoritmos de redes neurais (Keras)
import keras
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, SpatialDropout1D, Masking, Dropout
from keras.layers import TimeDistributed
from keras.layers import Bidirectional
from keras.losses import binary_crossentropy, categorical_crossentropy
from keras.utils import to_categorical
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam
from keras.utils import np_utils

# Importa pacotes do sklearn
from sklearn import preprocessing
import sklearn.metrics as mtr
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler

## 2. Carregando os dados

In [None]:
def read_data():
    """Load the training and test datasets as pandas DataFrames.

    The module-level flag ``TRAIN_OFFLINE`` selects the source: when it is
    truthy the local ``*_modificado.csv`` files under ``../dataset/`` are
    read, otherwise the competition files under ``/kaggle/input/`` are read.
    A progress line and the resulting shape are printed for each file.

    Returns:
        tuple: ``(train, test)`` DataFrames, in that order.
    """
    if TRAIN_OFFLINE:
        train_path = '../dataset/dataset_treino_modificado.csv'
        test_path = '../dataset/dataset_teste_modificado.csv'
    else:
        train_path = '/kaggle/input/competicao-dsa-machine-learning-dec-2019/dataset_treino.csv'
        test_path = '/kaggle/input/competicao-dsa-machine-learning-dec-2019/dataset_teste.csv'

    def _load(label, path):
        # Announce the file, read it and report its shape.
        print('Carregando arquivo {}....'.format(label))
        frame = pd.read_csv(path)
        print('{} tem {} linhas and {} colunas'.format(label, frame.shape[0], frame.shape[1]))
        return frame

    train = _load('dataset_treino.csv', train_path)
    # The Kaggle branch used to announce "dataset_treino.csv" here although
    # it was loading the test file; the shared helper fixes that message.
    test = _load('dataset_teste.csv', test_path)

    return train, test

In [None]:
# Load the train and test DataFrames (the source is chosen inside read_data)
train, test = read_data()

# Uncomment to work on a small subset of the training data while testing
#train = train[:1000]

## 3. Feature Engineering

In [None]:
# Replace missing values with the median of each column
train.fillna(train.median(), inplace=True)

# Every column except 'target' is used as a predictor; no categorical or ID
# column is dropped here.
# NOTE(review): this presumes the loaded training file is already all-numeric
# (StandardScaler below cannot handle object columns) -- confirm.
train_x = train.drop('target', axis=1)

# Standardise the predictors (zero mean, unit standard deviation)
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)

# Target variable
train_y = train['target']

# The LSTM consumes input shaped [samples, seq_length, 1]: each feature
# column becomes one step of the sequence. The sequence length must therefore
# equal the number of feature columns, so derive it from the data instead of
# hard-coding it (it was previously fixed at 43).
seq_length = train_x.shape[1]
X = np.reshape(train_x, (train_x.shape[0], seq_length, 1))

# One-hot encode the target
y = to_categorical(train_y)

## 4. Modelos de Deep Learning (LSTM)

In [None]:
# Classe para controlar as iteraçoes da rede neural LSTM
class CyclicLR(Callback):
    """Keras callback that cycles the learning rate between two bounds.

    After every batch the optimizer's learning rate is set to the value
    returned by :meth:`clr`, which rises linearly from ``base_lr`` towards
    ``max_lr`` and falls back again over ``2 * step_size`` iterations. The
    amplitude of the cycle is multiplied by ``scale_fn``.

    Args:
        base_lr: lower bound of the learning rate.
        max_lr: upper bound of the learning rate (before scaling).
        step_size: number of batches in half a cycle.
        mode: ``'triangular'`` (constant amplitude), ``'triangular2'``
            (amplitude halved each cycle) or ``'exp_range'`` (amplitude
            multiplied by ``gamma ** iteration``). Ignored when ``scale_fn``
            is supplied.
        gamma: base of the exponential used by ``'exp_range'``.
        scale_fn: optional one-argument scaling function; overrides ``mode``.
        scale_mode: when ``'cycle'`` the scaling function receives the cycle
            number, otherwise the iteration count. Only honoured together
            with a custom ``scale_fn``; the built-in modes set it themselves.

    Raises:
        ValueError: if ``scale_fn`` is not given and ``mode`` is not one of
            the three built-in policies.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super().__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                # Previously an unknown mode left scale_fn/scale_mode unset
                # and only failed later, inside clr(), with AttributeError.
                raise ValueError(
                    "mode must be 'triangular', 'triangular2' or 'exp_range' "
                    "when scale_fn is not given, got {!r}".format(mode))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Reset the cycle position, optionally changing bounds/step size.

        Only ``clr_iterations`` is zeroed; ``trn_iterations`` keeps counting.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.

    def clr(self):
        """Return the learning rate for the current ``clr_iterations``."""
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        # x is the distance from the peak of the current cycle (0 at the
        # peak, 1 at either end).
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            scale = self.scale_fn(cycle)
        else:
            scale = self.scale_fn(self.clr_iterations)
        return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*scale

    def on_train_begin(self, logs=None):
        """Print the current rate and initialise the optimizer's learning rate."""
        print(self.clr())

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())

    def on_batch_end(self, epoch, logs=None):
        """Advance one iteration and push the new learning rate.

        The first argument is named ``epoch`` for backward compatibility; it
        is not used by this method.
        """
        self.trn_iterations += 1
        self.clr_iterations += 1

        K.set_value(self.model.optimizer.lr, self.clr())

    def on_epoch_end(self, epoch, logs=None):
        """Print the learning rate reached at the end of the epoch."""
        print(self.clr())

In [None]:
def build_LSTM(x_tr, y_tr, x_val, y_val, shape, epochs, batch_size):
    """Build, compile and train a single-layer LSTM classifier.

    The network is one LSTM layer (50 units, input and recurrent dropout of
    0.5) followed by a softmax output layer, trained with categorical
    cross-entropy and the Adam optimizer.

    Args:
        x_tr: training inputs; the LSTM is declared with
            ``input_shape=(shape, 1)``.
        y_tr: one-hot encoded training targets; ``y_tr.shape[1]`` sets the
            number of output units.
        x_val: validation inputs.
        y_val: one-hot encoded validation targets.
        shape: sequence length of each sample.
        epochs: maximum number of training epochs.
        batch_size: mini-batch size.

    Returns:
        The trained ``Sequential`` model.

    Side effects:
        Writes the best weights (lowest ``val_loss``) to ``best_model.h5``
        in the current working directory.
    """
    # Size the output layer from the targets rather than assuming 2 classes.
    n_classes = y_tr.shape[1]

    model = Sequential()
    model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5, input_shape=(shape, 1)))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Stop once val_loss has not improved for 5 epochs and roll back to the
    # best weights seen.
    es = EarlyStopping(monitor='val_loss',
                       mode='min',
                       restore_best_weights=True,
                       verbose=1,
                       patience=5)

    # Persist only the weights of the best epoch.
    mc = ModelCheckpoint('best_model.h5',
                         monitor='val_loss',
                         mode='min',
                         save_best_only=True,
                         verbose=1,
                         save_weights_only=True)

    # validation_data is documented as a tuple; some Keras/TF versions treat
    # a list as inputs only, so pass a tuple explicitly.
    model.fit(x_tr, y_tr,
              validation_data=(x_val, y_val),
              callbacks=[es, mc],
              epochs=epochs,
              batch_size=batch_size,
              verbose=1,
              shuffle=True)

    return model

In [None]:
# Run a garbage-collection pass to free unreferenced memory
gc.collect()

In [None]:
# Training configuration

# Maximum number of epochs and mini-batch size passed to build_LSTM
epochs, batch_size = 200, 1024

# Shuffled K-fold splitter with a fixed seed (kf = number of folds)
kf = 2
folds = KFold(random_state=42, shuffle=True, n_splits=kf)

In [None]:
# Hold out 10% of the samples for validation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

nn = build_LSTM(X_train, y_train, X_test, y_test, seq_length, epochs, batch_size)

# The plotting and prediction cells read a list called `models_nn`, which was
# never defined; expose the single trained model under that name.
models_nn = [nn]

## 5. Resultado

In [None]:
# Training curves of the first model in models_nn: loss on the top panel,
# accuracy on the bottom one, each with a dashed reference line.
_hist = models_nn[0].history.history
plt.figure(figsize=(18, 8))
for _panel, (_metric, _reference) in enumerate([("loss", 1), ("accuracy", .7)], start=1):
    plt.subplot(2, 1, _panel)
    for _series in (_metric, "val_" + _metric):
        plt.plot(_hist[_series], "o-", alpha=.9, label=_series)
    plt.axhline(_reference, linestyle="--", c="C2")
    plt.legend()
plt.show()

## 6. Previsões

In [None]:
# Funcao para percorrer todos os modelos e fazer previsoes
def predict(x_te, models_nn):
    """Average the class-1 probability predicted by every model.

    Args:
        x_te: input array passed unchanged to each model.
        models_nn: non-empty sequence of trained models whose prediction
            output is 2-D with the class-1 probability in column 1.

    Returns:
        1-D array with the mean class-1 probability per sample.

    Raises:
        ValueError: if ``models_nn`` is empty (this previously surfaced as an
            ``UnboundLocalError``).
    """
    model_num_nn = len(models_nn)
    if model_num_nn == 0:
        raise ValueError("models_nn must contain at least one model")

    y_pred_nn = None
    for m in models_nn:
        # Prefer predict_proba when the model provides it (original
        # behaviour); recent Keras models only provide predict.
        predict_fn = getattr(m, "predict_proba", None) or m.predict
        class_1 = np.asarray(predict_fn(x_te, batch_size=1024))[:, 1]
        y_pred_nn = class_1 if y_pred_nn is None else y_pred_nn + class_1

    return y_pred_nn / model_num_nn

In [None]:
# Drop every categorical (object-dtype) column
drop_features = [col for col in test.columns if test[col].dtype == 'object']

new_test = test.drop(drop_features, axis=1)

# Fill missing values with the training medians so the test set is imputed
# the same way as the training set (this previously used the sentinel -999).
# NOTE(review): relies on test feature columns having the same names as the
# training columns -- confirm.
new_test.fillna(train.median(), inplace=True)
new_test.drop(['ID'], axis=1, inplace=True)

# Standardise with the statistics learned from the training data; refitting
# the scaler on the test set would put it on a different scale from the one
# the model was trained on.
new_test = scaler.transform(new_test)

# Reshape to the [samples, seq_length, 1] layout expected by the LSTM
new_test = np.reshape(new_test, (new_test.shape[0], seq_length, 1))

In [None]:
# Predict on the prepared test set (predict averages over all models in models_nn)
pred_test = predict(new_test, models_nn)
pred_test

## 7. Submissions

In [None]:
# Build the submission table: test IDs alongside the predicted probabilities
# (the reshape flattens pred_test to one value per row)
submission = pd.DataFrame({'ID': test["ID"], 'PredictedProb': pred_test.reshape((pred_test.shape[0]))})
print(submission.head())

In [None]:
# Write the submission file without the DataFrame index column
submission.to_csv('../submission/submission_lstm_v1.csv', index=False)

In [None]:
# Relative frequency of each distinct predicted probability
submission['PredictedProb'].value_counts(normalize=True)

In [None]:
# Histogram of the predicted probabilities
plt.hist(submission.PredictedProb)
plt.show()