In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.feature_selection import f_classif, chi2
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.utils import shuffle
from imblearn.over_sampling import RandomOverSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, KBinsDiscretizer
from sklearn.feature_selection import VarianceThreshold, SelectKBest, SelectPercentile, GenericUnivariateSelect, RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier


### Load test and train data

In [None]:
# Load the labelled training set; the first CSV column becomes the index.
AbsenteeismAtWork = pd.read_csv('data/train_data.csv', index_col=0)
# Strip the ',' characters from the work-load column so it can be cast to int.
# astype(str) keeps this working even if the column was already parsed as numeric.
AbsenteeismAtWork['Work load Average/day '] = (
    AbsenteeismAtWork['Work load Average/day ']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(int)
)

# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
X_train = AbsenteeismAtWork.drop(columns='Absent')
y_train = AbsenteeismAtWork['Absent']
X_test = pd.read_csv('data/test_data.csv', index_col=0)
# NOTE(review): evaluation labels are read from sample_submission.csv — confirm this
# file holds the true test labels rather than placeholder predictions.
y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

### Funções auxiliares

#### Transformação

In [None]:
def robustScaling2(X_train, X_test):
    """Scale both splits with a RobustScaler fitted on the training split only.

    Returns the tuple ``(scaled_train, scaled_test)``.
    """
    robust = RobustScaler().fit(X_train)
    return robust.transform(X_train), robust.transform(X_test)

def discretize2(X_train, X_test):
    """Bin the listed numeric columns into 5 uniform-width ordinal bins.

    The discretizer is fitted on the training split and then applied to the
    test split. Both inputs are copied first, so the caller's DataFrames are
    left untouched (previously their columns were overwritten in place).

    Parameters
    ----------
    X_train, X_test : DataFrames containing every column in
        ``featuresToDiscretize``.

    Returns
    -------
    (X_train, X_test) : copies of the inputs with the listed columns replaced
        by their bin index.
    """
    featuresToDiscretize = ['Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age', 'Work load Average/day ', 'Hit target', 'Weight', 'Height', 'Body mass index']
    discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    # Work on copies so repeated calls with the same frames stay idempotent.
    X_train = X_train.copy()
    X_test = X_test.copy()
    X_train[featuresToDiscretize] = discretizer.fit_transform(X_train[featuresToDiscretize])
    X_test[featuresToDiscretize] = discretizer.transform(X_test[featuresToDiscretize])
    return X_train, X_test


#### Class balancing (resampling)

In [None]:
def smoteeenSampler(X_train, y_train, random_state=0):
    """Resample the training set with SMOTEENN, then shuffle the result.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    random_state : seed given to both the sampler and the shuffle so repeated
        runs produce the same resampled set. Defaults to 0, the seed the
        sampler already used.

    Returns
    -------
    (X_balanced, y_balanced)
    """
    smote_enn = SMOTEENN(random_state=random_state)
    # fit_resample is the current imbalanced-learn API; fit_sample was
    # deprecated and later removed.
    X_balanced, y_balanced = smote_enn.fit_resample(X_train, y_train)
    X_balanced, y_balanced = shuffle(X_balanced, y_balanced, random_state=random_state)
    return X_balanced, y_balanced

def smotetomekSampler(X_train, y_train, random_state=0):
    """Resample the training set with SMOTETomek, then shuffle the result.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    random_state : seed given to both the sampler and the shuffle so repeated
        runs produce the same resampled set. Defaults to 0, the seed the
        sampler already used.

    Returns
    -------
    (X_balanced, y_balanced)
    """
    smote_tomek = SMOTETomek(random_state=random_state)
    # fit_resample is the current imbalanced-learn API; fit_sample was
    # deprecated and later removed.
    X_balanced, y_balanced = smote_tomek.fit_resample(X_train, y_train)
    X_balanced, y_balanced = shuffle(X_balanced, y_balanced, random_state=random_state)
    return X_balanced, y_balanced

def overSampler(X_train, y_train, random_state=0):
    """Resample the training set with RandomOverSampler, then shuffle the result.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    random_state : seed given to both the sampler and the shuffle so repeated
        runs produce the same resampled set. Pass ``None`` to get the previous
        unseeded behaviour.

    Returns
    -------
    (X_balanced, y_balanced)
    """
    ros = RandomOverSampler(random_state=random_state)
    # fit_resample is the current imbalanced-learn API; fit_sample was
    # deprecated and later removed.
    X_balanced, y_balanced = ros.fit_resample(X_train, y_train)
    X_balanced, y_balanced = shuffle(X_balanced, y_balanced, random_state=random_state)
    return X_balanced, y_balanced


#### Feature selection

In [None]:
def selectKBest_f_classif(X_train, y_train, X_test):
    """Keep the 8 features with the highest f_classif score.

    The selector is fitted on the training split and applied to both splits.
    Returns ``(X_train_selected, X_test_selected)``.
    """
    kbest = SelectKBest(f_classif, k=8)
    kbest.fit(X_train, y_train)
    return kbest.transform(X_train), kbest.transform(X_test)

def selectKBest_chi2(X_train, y_train, X_test):
    """Keep the 8 features with the highest chi2 score.

    The selector is fitted on the training split and applied to both splits.
    Returns ``(X_train_selected, X_test_selected)``.
    """
    kbest = SelectKBest(chi2, k=8)
    kbest.fit(X_train, y_train)
    return kbest.transform(X_train), kbest.transform(X_test)

def selectPercentile_f_classif(X_train, y_train, X_test):
    """Keep the top 25 percent of features ranked by f_classif.

    The selector is fitted on the training split and applied to both splits.
    Returns ``(X_train_selected, X_test_selected)``.
    """
    top_quarter = SelectPercentile(f_classif, percentile=25)
    top_quarter.fit(X_train, y_train)
    return top_quarter.transform(X_train), top_quarter.transform(X_test)

def selectPercentile_chi2(X_train, y_train, X_test):
    """Keep the top 25 percent of features ranked by chi2.

    The selector is fitted on the training split and applied to both splits.
    Returns ``(X_train_selected, X_test_selected)``.
    """
    top_quarter = SelectPercentile(chi2, percentile=25)
    top_quarter.fit(X_train, y_train)
    return top_quarter.transform(X_train), top_quarter.transform(X_test)


def selectVarianceThreshold(X_train, y_train, X_test):
    """Filter features with a default-configured VarianceThreshold.

    The selector is fitted on the training split and applied to both splits.
    Returns ``(X_train_selected, X_test_selected)``.
    """
    variance_filter = VarianceThreshold()
    variance_filter.fit(X_train, y_train)
    return variance_filter.transform(X_train), variance_filter.transform(X_test)

def selectGenericUnivariateSelect(X_train, y_train, X_test):
    """Keep the 19 best features by f_classif using GenericUnivariateSelect.

    The selector is fitted on the training split and applied to both splits.

    Returns
    -------
    (X_train_selected, X_test_selected)
    """
    # `mode` is keyword-only in current scikit-learn; passing it positionally
    # raises a TypeError there.
    gus_selector = GenericUnivariateSelect(f_classif, mode='k_best', param=19)
    gus_selector.fit(X_train, y_train)
    X_train_selected = gus_selector.transform(X_train)
    X_test_selected = gus_selector.transform(X_test)

    return X_train_selected, X_test_selected


def rfeLogReg(X_train, y_train, X_test):
    """Select 12 features by recursive feature elimination with LogisticRegression.

    The selector is fitted on the training split and applied to both splits.

    Returns
    -------
    (X_train_selected, X_test_selected)
    """
    # `n_features_to_select` is keyword-only in current scikit-learn; passing
    # it positionally raises a TypeError there.
    rfe_log_selector = RFE(LogisticRegression(), n_features_to_select=12)
    rfe_log_selector.fit(X_train, y_train)
    X_train_selected = rfe_log_selector.transform(X_train)
    X_test_selected = rfe_log_selector.transform(X_test)

    return X_train_selected, X_test_selected


def rfeSVC(X_train, y_train, X_test):
    """Select 12 features by recursive feature elimination with a linear-kernel SVC.

    The selector is fitted on the training split and applied to both splits.

    Returns
    -------
    (X_train_selected, X_test_selected)
    """
    # `n_features_to_select` is keyword-only in current scikit-learn; passing
    # it positionally raises a TypeError there.
    rfe_svc_selector = RFE(SVC(kernel='linear'), n_features_to_select=12)
    rfe_svc_selector.fit(X_train, y_train)
    X_train_selected = rfe_svc_selector.transform(X_train)
    X_test_selected = rfe_svc_selector.transform(X_test)

    return X_train_selected, X_test_selected


def sfmTree(X_train, y_train, X_test):
    """Select features through SelectFromModel wrapped around a fitted ExtraTreesClassifier.

    A 50-tree ExtraTreesClassifier is fitted on the training split and handed
    to SelectFromModel as an already-fitted estimator (prefit=True); the
    resulting selector is applied to both splits.
    Returns ``(X_train_selected, X_test_selected)``.
    """
    # NOTE(review): the forest has no random_state, so the chosen features may
    # differ between runs.
    forest = ExtraTreesClassifier(n_estimators=50)
    forest.fit(X_train, y_train)
    from_model = SelectFromModel(forest, prefit=True)
    return from_model.transform(X_train), from_model.transform(X_test)

#### Funções para grid search e aplicação das técnicas do pipeline

In [None]:
def gridSearch(model, param_grid, modelName, X_train, y_train, X_test, y_test):
    """Grid-search `model`, print the best parameters and report test metrics.

    GridSearchCV is fitted on the training split with refit=True, its
    ``best_params_`` are printed, and its predictions for X_test are passed to
    evaluateModel together with y_test. Returns None.
    """
    search = GridSearchCV(model, param_grid, refit=True, verbose=0)
    search.fit(X_train, y_train)
    print(search.best_params_)
    evaluateModel(modelName, y_test, search.predict(X_test))
    
    
def apllyGridSearchWithTransformation(model, transformer, param_grid, modelName):
    """Reload the data, apply `transformer`, then grid-search and evaluate `model`.

    Parameters
    ----------
    model : estimator handed to gridSearch.
    transformer : callable ``(X_train, X_test) -> (X_train, X_test)``.
    param_grid : parameter grid handed to gridSearch.
    modelName : label printed with the metrics.

    Returns None; results are printed by gridSearch.
    """
    # `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
    X_train = AbsenteeismAtWork.drop(columns='Absent')
    y_train = AbsenteeismAtWork['Absent']
    X_test = pd.read_csv('data/test_data.csv', index_col=0)
    # NOTE(review): confirm sample_submission.csv holds the true test labels.
    y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

    X_train, X_test = transformer(X_train, X_test)

    gridSearch(model, param_grid, modelName, X_train, y_train, X_test, y_test)


def apllyGridSearchWithFSelect(model, transformer, selector, param_grid, modelName):
    """Reload the data, transform it, select features, then grid-search and evaluate `model`.

    Parameters
    ----------
    model : estimator handed to gridSearch.
    transformer : callable ``(X_train, X_test) -> (X_train, X_test)``.
    selector : callable ``(X_train, y_train, X_test) -> (X_train, X_test)``.
    param_grid : parameter grid handed to gridSearch.
    modelName : label printed with the metrics.

    Returns None; results are printed by gridSearch.
    """
    # `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
    X_train = AbsenteeismAtWork.drop(columns='Absent')
    y_train = AbsenteeismAtWork['Absent']
    X_test = pd.read_csv('data/test_data.csv', index_col=0)
    # NOTE(review): confirm sample_submission.csv holds the true test labels.
    y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

    X_train, X_test = transformer(X_train, X_test)
    X_train, X_test = selector(X_train, y_train, X_test)

    gridSearch(model, param_grid, modelName, X_train, y_train, X_test, y_test)

def apllyGridSearchWithLoadBalancing(model, transformer, selector, balancer, param_grid, modelName):
    """Reload the data, transform, rebalance, select features, then grid-search and evaluate `model`.

    Parameters
    ----------
    model : estimator handed to gridSearch.
    transformer : callable ``(X_train, X_test) -> (X_train, X_test)``.
    selector : callable ``(X_train, y_train, X_test) -> (X_train, X_test)``.
    balancer : callable ``(X_train, y_train) -> (X_train, y_train)``; applied
        to the training split only, before feature selection.
    param_grid : parameter grid handed to gridSearch.
    modelName : label printed with the metrics.

    Returns None; results are printed by gridSearch.
    """
    # `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
    X_train = AbsenteeismAtWork.drop(columns='Absent')
    y_train = AbsenteeismAtWork['Absent']
    X_test = pd.read_csv('data/test_data.csv', index_col=0)
    # NOTE(review): confirm sample_submission.csv holds the true test labels.
    y_test = pd.read_csv('data/sample_submission.csv', index_col=0)

    X_train, X_test = transformer(X_train, X_test)
    X_train, y_train = balancer(X_train, y_train)
    X_train, X_test = selector(X_train, y_train, X_test)

    gridSearch(model, param_grid, modelName, X_train, y_train, X_test, y_test)


def evaluateModel(name, y_test, predicted):
    """Print a one-line summary of classification metrics followed by `name`.

    The line contains overall accuracy, ROC AUC computed from the predicted
    labels, and the (recall, precision) pair for class 0 and for class 1.
    The per-class pair was previously labelled "(Accuracy, Precision)" even
    though the first value is produced by recall_score.

    Parameters
    ----------
    name : label printed after the metrics.
    y_test : true labels.
    predicted : predicted labels.

    Returns None.
    """
    metrics = (
        accuracy_score(y_test, predicted),
        roc_auc_score(y_test, predicted),
        recall_score(y_test, predicted, pos_label=0),
        precision_score(y_test, predicted, pos_label=0),
        recall_score(y_test, predicted, pos_label=1),
        precision_score(y_test, predicted, pos_label=1),
    )
    print("Accuracy: %0.3f || AUROC %0.3f || (Recall, Precision) 0:( %0.3f, %0.3f)  1:( %0.3f, %0.3f) ->"
          % metrics, name)


## Avaliação dos modelos

#### SVC

In [None]:
# Hyper-parameter grid explored for every SVC experiment below.
param_grid_svc = {
    'class_weight': ['balanced', None], 
    'C': [0.1,1, 10, 100], 
    'gamma': [1,0.1,0.01,0.001], 
    'kernel': ['rbf', 'linear']
} 
print("SVC com robust scaling + select variance threshold + smoteenSampler:")
apllyGridSearchWithLoadBalancing(SVC(), robustScaling2, selectVarianceThreshold, smoteeenSampler, param_grid_svc, "SVC")
# The next two labels were swapped relative to the selector actually passed;
# each message now names the selector used by the call that follows it.
print("SVC com discretizacao e select selectPercentile_chi2 + over sampler:")
apllyGridSearchWithLoadBalancing(SVC(), discretize2, selectPercentile_chi2, overSampler, param_grid_svc, "SVC")
print("SVC com discretizacao e select selectVarianceThreshold + over sampler:")
apllyGridSearchWithLoadBalancing(SVC(), discretize2, selectVarianceThreshold, overSampler, param_grid_svc, "SVC")

#### KNN

In [None]:
# Hyper-parameter grid explored for every KNN experiment below.
grid_params_knn = dict(
    n_neighbors=[3, 5, 7, 11, 13, 15, 17],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# Same transformer and selector each time; only the balancer changes.
# NOTE(review): smotetomekSampler is listed twice, reproducing the original
# duplicated run — confirm whether another balancer was intended.
for balancer in (overSampler, smotetomekSampler, smotetomekSampler):
    apllyGridSearchWithLoadBalancing(KNeighborsClassifier(), discretize2, selectPercentile_chi2, balancer, grid_params_knn, "knn")

#### Random Forest

In [None]:
# Hyper-parameter grid for the random forest.
# `random_state` is no longer part of the grid: searching over seeds picks a
# lucky seed rather than a better model, and it multiplied the number of fits
# by six. The seed is fixed on the estimator instead so runs are reproducible.
grid_params_randomforest = {
    'n_estimators' : [10,50,100,200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
    'max_depth' : [1,20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'bootstrap': [True, False],
}

apllyGridSearchWithLoadBalancing(RandomForestClassifier(random_state=0), discretize2, selectPercentile_chi2, overSampler, grid_params_randomforest, "Random Forest")
