# Kaggle
## Competição DSA de Machine Learning - Dezembro 2019

# Importando as bibliotecas

In [36]:
# here we will import the libraries used for machine learning
import sys 
import numpy as np # linear algebra
from scipy.stats import randint
import pandas as pd # data processing, CSV file I/O, data manipulation 
import matplotlib.pyplot as plt # this is used for the plot the graph 
import seaborn as sns # used for plot interactive graph. 

# Importa pacotes do sklearn
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD

from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Carregando os dados de treino e teste

In [2]:
def read_data(data_dir='../dataset'):
    """Load the competition training and test sets from CSV files.

    Parameters
    ----------
    data_dir : str or os.PathLike, default '../dataset'
        Directory that contains ``dataset_treino.csv`` and
        ``dataset_teste.csv``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``, in that order.

    A progress line and the resulting shape are printed for each file.
    """
    import os

    frames = []
    for filename in ('dataset_treino.csv', 'dataset_teste.csv'):
        print('Carregando arquivo {}....'.format(filename))
        frame = pd.read_csv(os.path.join(data_dir, filename))
        print('{} tem {} linhas and {} colunas'.format(filename, frame.shape[0], frame.shape[1]))
        frames.append(frame)

    train, test = frames
    return train, test

In [3]:
# Load the training and test datasets
train, test = read_data()

Carregando arquivo dataset_treino.csv....
dataset_treino.csv tem 114321 linhas and 133 colunas
Carregando arquivo dataset_teste.csv....
dataset_teste.csv tem 114393 linhas and 132 colunas


In [4]:
# Stack train and test rows into one frame so every transformation is applied to both.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# sort=True keeps the alphabetically sorted column order shown in the recorded outputs
# (the columns of the two files differ, so they are not aligned to begin with).
df = pd.concat([train, test], sort=True)

# Feature Engineering

In [5]:
# The row identifier is not used as a feature, so remove it from the frame
df = df.drop(columns=['ID'])

In [30]:
# Partition the column labels by dtype: object columns are treated as categorical,
# everything else as numerical.
is_object_col = (df.dtypes == "object").to_numpy()

numerical_feats = df.columns[~is_object_col]
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = df.columns[is_object_col]
print("Number of Categorical features: ", len(categorical_feats))

Number of Numerical features:  192
Number of Categorical features:  0


In [9]:
# Encode categorical features with LabelEncoder and impute missing values.
#   * numeric columns: NaN -> column mean (computed over the combined train+test frame)
#   * object columns:  NaN -> 'NA' category, then integer label codes
# NOTE(review): no winsorizing / skewness correction is applied in this cell.
le = LabelEncoder()

for col in df.columns:
    if df[col].dtype != 'object':
        # 'target' must keep its NaNs: they mark the test rows when the frame is split later
        if col in ['ID', 'target']:
            continue
        # Assign the result back instead of calling fillna(inplace=True) on df[col]:
        # the chained in-place form works on a temporary object and does not update
        # df when pandas Copy-on-Write is active.
        df[col] = df[col].fillna(df[col].mean())
    else:
        # The encoder is refitted per column; only the integer codes are kept
        df[col] = le.fit_transform(df[col].fillna('NA'))

In [10]:
# Add decomposed components: PCA / ICA etc.
n_comp = 12

# Build the predictor matrix once instead of re-copying the frame for every transformer
decomp_features = df.drop(["target"], axis=1)

# NOTE: despite the *_df suffix, each result below is the array returned by fit_transform,
# with one row per row of df and n_comp columns.

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_df = tsvd.fit_transform(decomp_features)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_df = pca.fit_transform(decomp_features)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_df = ica.fit_transform(decomp_features)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_df = grp.fit_transform(decomp_features)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_df = srp.fit_transform(decomp_features)



In [11]:
# Attach every decomposition component to the frame as its own column.
# Columns are named <method>_<k> with k starting at 1, and are added component by
# component (pca_1, ica_1, tsvd_1, grp_1, srp_1, pca_2, ...).
component_sets = (
    ('pca_', pca2_results_df),
    ('ica_', ica2_results_df),
    ('tsvd_', tsvd_results_df),
    ('grp_', grp_results_df),
    ('srp_', srp_results_df),
)
for comp_idx in range(n_comp):
    for prefix, components in component_sets:
        df[prefix + str(comp_idx + 1)] = components[:, comp_idx]

# Target column of the combined frame and its mean
y_train = df["target"]
y_mean = np.mean(y_train)

In [12]:
# Preview the first rows of the combined frame, including the appended component columns
df.head()

Unnamed: 0,target,v1,v10,v100,v101,v102,v103,v104,v105,v106,...,pca_11,ica_11,tsvd_11,grp_11,srp_11,pca_12,ica_12,tsvd_12,grp_12,srp_12
0,1.0,1.335739,0.503281,19.470199,8.389237,2.757375,4.374296,1.574039,0.007294,12.579184,...,-1.495551,0.001516,-0.507048,-2103.90695,16.414498,0.287991,0.000181,1.813369,-8354.976687,-41.686171
1,1.0,1.629654,1.31291,12.085858,6.860181,2.892925,5.300739,2.64548,1.505335,11.782091,...,-0.362886,-0.002445,0.047501,-1014.5804,17.560602,0.392828,0.000237,0.624377,-3820.622435,-39.469108
2,1.0,0.943877,0.765864,15.491329,5.879353,3.292788,5.924457,1.668401,0.008275,11.670572,...,8.00744,-0.002375,-0.638238,-915.137928,20.288315,-0.591242,-0.000435,-8.19043,-3539.810399,-41.178446
3,1.0,0.797415,6.542669,18.256352,8.507281,2.503055,4.872157,2.573664,0.113967,12.554274,...,-3.687951,-0.000718,-3.292667,-190.147106,17.003139,1.443903,0.000916,1.637239,-729.001495,-41.094768
4,1.0,1.629654,1.050328,12.085858,6.860181,2.892925,5.300739,2.64548,1.084907,11.782091,...,-0.387866,0.000394,-1.091193,-989.561933,16.653962,1.343012,0.000851,0.070965,-3959.562617,-42.898538


In [13]:
# Dimensions of the combined frame as (rows, columns)
df.shape

(228714, 192)

In [14]:
# Split the combined frame back into training and test rows:
# rows with a target value go to treino, rows whose target is missing go to teste
has_target = df['target'].notnull()
treino = df[has_target]
teste = df[~has_target]

In [32]:
def add_entropy_binning(temp_train, targ, num_vars, important_features, temp_test,entropy_binning):
    """
    Entropy-bin the leading continuous variables and replace them with their bins.

    For each selected column a single-feature ``DecisionTreeClassifier``
    (``criterion='entropy'``) is fitted against ``temp_train[targ]``. The split
    thresholds of that tree become the bin edges passed to ``np.digitize``.
    The original column is dropped and replaced by ``<name>_bin`` (or
    ``bin_<name>`` for non-string labels) in ``temp_train`` and, when one is
    given, in ``temp_test``.

    Only the first entries of ``num_vars`` are binned, so order it with the most
    important continuous variables first:

        ==============  ===============  =================
        len(num_vars)   columns binned   tree max_depth
        ==============  ===============  =================
        1-2             all              2
        3-5             first 2          len(num_vars) - 2
        6-10            first 5          5
        11-50           first 10         10
        > 50            first 50         10
        ==============  ===============  =================

    Parameters
    ----------
    temp_train : pandas.DataFrame
        Training frame holding the ``num_vars`` columns and the ``targ`` column.
        Modified in place.
    targ : hashable
        Label of the target column in ``temp_train``.
    num_vars : list
        Labels of the continuous columns. A list is updated in place (binned
        labels are replaced by the bin column names); any other sequence is
        copied into a new list first.
    important_features : list
        Feature labels to keep in sync: a binned label that is present is
        replaced by its bin column name. Handled like ``num_vars``.
    temp_test : pandas.DataFrame or str
        Test frame to bin with the thresholds learned on ``temp_train``
        (modified in place). Pass a string (e.g. ``''``) when there is none.
    entropy_binning : bool
        When falsy nothing is binned.

    Returns
    -------
    tuple
        ``(temp_train, num_vars, important_features, temp_test)``.

    Notes
    -----
    A column whose binning fails is reported and skipped without altering
    either frame or either list.
    """
    seed = 99

    # The bookkeeping below appends/removes labels, which needs real lists.
    if not isinstance(num_vars, list):
        num_vars = list(num_vars)
    if not isinstance(important_features, list):
        important_features = list(important_features)

    # Snapshot of the labels: num_vars itself is edited while we iterate.
    candidate_vars = list(num_vars)
    n_candidates = len(candidate_vars)

    if not entropy_binning or n_candidates == 0:
        print('    No Entropy Binning specified or there are no numeric vars in data set to Bin')
        return temp_train, num_vars, important_features, temp_test

    # Tree depth and number of leading columns to bin, by number of candidates.
    if n_candidates <= 2:
        max_depth, n_selected = 2, n_candidates
    elif n_candidates <= 5:
        max_depth, n_selected = n_candidates - 2, 2
    elif n_candidates <= 10:
        max_depth, n_selected = 5, 5
    elif n_candidates <= 50:
        max_depth, n_selected = 10, 10
    else:
        max_depth, n_selected = 10, 50
    continuous_vars = candidate_vars[:n_selected]
    print('Entropy Binning %d continuous variables...' %len(continuous_vars))

    from sklearn.tree import DecisionTreeClassifier

    clf = DecisionTreeClassifier(criterion='entropy', min_samples_leaf=2,
                                 max_depth=max_depth,
                                 random_state=seed)
    has_test = not isinstance(temp_test, str)
    new_bincols = []

    for each_num in continuous_vars:
        if isinstance(each_num, str):
            bincol = each_num + '_bin'
        else:
            bincol = 'bin_' + str(each_num)

        # Compute everything that can fail before touching the frames, so a
        # failure never leaves a column half-replaced.
        try:
            clf.fit(temp_train[each_num].values.reshape(-1, 1), temp_train[targ].values)
            tree = clf.tree_
            # Leaves have identical (sentinel) children; selecting split nodes this way
            # keeps genuine thresholds that happen to be <= -2, which a
            # `threshold > -2` filter would silently discard.
            is_split_node = tree.children_left != tree.children_right
            entropy_threshold = np.sort(tree.threshold[is_split_node])
            train_bins = np.digitize(temp_train[each_num].values, entropy_threshold)
            test_bins = None
            if has_test:
                test_bins = np.digitize(temp_test[each_num].values, entropy_threshold)
        except Exception as err:
            print('Error in %s during Entropy Binning: %s' % (each_num, err))
            continue

        #### Replace the original continuous variable with its bin ###
        temp_train[bincol] = train_bins
        temp_train.drop(each_num, axis=1, inplace=True)
        if has_test:
            temp_test[bincol] = test_bins
            temp_test.drop(each_num, axis=1, inplace=True)

        if each_num in important_features:
            important_features.append(bincol)
            important_features.remove(each_num)
        num_vars.append(bincol)
        if each_num in num_vars:
            num_vars.remove(each_num)
        new_bincols.append(bincol)

    print('    Binning and replacing %s numeric features.' %(len(new_bincols)))
    return temp_train, num_vars, important_features, temp_test

In [None]:
# Column labels of the numeric predictors (target excluded). `.columns` is required here:
# `.index` would return the row labels, while the entropy-binning step looks these values
# up as column names. A plain list is used because that step appends to and removes from it.
saved_num_vars = treino[numerical_feats].drop(['target'], axis=1).columns.tolist()

In [37]:
# Separate the predictor features from the target
X = treino.drop(['target'], axis=1)
y = treino['target']

# Standardize the predictors. fit_transform returns a bare array, so the result is wrapped
# back into a DataFrame: the entropy-binning step selects columns of X_train / X_test by label.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Hold out 20% of the training rows for validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
#########     SELECT IMPORTANT FEATURES HERE   #############################
# NOTE(review): find_top_features_xgb and the arguments red_preds, train_sel, each_target,
# corr_limit and verbose are not defined anywhere in the visible notebook, so this cell
# raises NameError as written -- confirm where they are supposed to come from.
important_features, num_vars, imp_cats = find_top_features_xgb(train,red_preds,train_sel,each_target,corr_limit,verbose)


In [38]:


############   Add Entropy Binning of Continuous Variables Here ##############################
# NOTE(review): `important_features` and `Binning_Flag` are not defined anywhere in the
# visible notebook (hence the recorded NameError); an earlier cell has to provide them.
# add_entropy_binning also reads the 'target' column from its first argument and selects
# columns by label, so X_train must be a DataFrame that carries that column -- TODO confirm.

if len(saved_num_vars) > 0:
    #### Do binning only when there are numeric features ####
    # Arguments follow add_entropy_binning's six-parameter signature:
    # (temp_train, targ, num_vars, important_features, temp_test, entropy_binning)
    part_train, num_vars, important_features, part_cv = add_entropy_binning(
        X_train, 'target', saved_num_vars, important_features, X_test, Binning_Flag)

### TODO: add another feature tied to KMeans clustering using predictor and target variables ###

NameError: name 'important_features' is not defined

# Gera dataset tratado

In [None]:
# Write the processed training and test frames to CSV, leaving out the row index
for frame, csv_path in (
    (treino, '../dataset/dataset_treino_new.csv'),
    (teste, '../dataset/dataset_teste_new.csv'),
):
    frame.to_csv(csv_path, index=False)