# BigData API serving: preprocessing notebook

In [9]:
%load_ext autoreload
%autoreload 2
from utils import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Relative path of the reference dataset used throughout the notebook.
csv_filepath = "../data/ref_data.csv"

## Step 1 : load the dataset

In [5]:
# Read the reference CSV and split it into numerical features (X_num),
# categorical features (X_cat), the target (y) and `labels`.
# With debugging=True the helper prints the shapes and a preview of the data.
# NOTE(review): `labels` is presumably the list of column names — confirm in utils.
X_num, X_cat, y, labels = load_heterogeneous_dataset(csv_filepath, debugging=True)

Number of rows and columns with numerical values : (91713, 74)
Number of rows and columns with categorical values : (91713, 7)
y.shape = (91713,)
X_num =        hospital_id   age        bmi  elective_surgery  height  icu_id  \
0              118  68.0  22.730000                 0   180.3      92   
1               81  77.0  27.420000                 0   160.0      90   
2              118  25.0  31.950000                 0   172.7      93   
3              118  81.0  22.640000                 1   165.1      92   
4               33  19.0        NaN                 0   188.0      91   
...            ...   ...        ...               ...     ...     ...   
91708           30  75.0  23.060250                 0   177.8     927   
91709          121  56.0  47.179671                 0   183.0     925   
91710          195  48.0  27.236914                 0   170.2     908   
91711           66   NaN  23.297481                 0   154.9     922   
91712          104  82.0  22.031250        

## Step 2 : impute missing numerical and categorical values

In [6]:
# imputer_variables returns a new dataset: the concatenation of the imputed
# numerical data and the imputed categorical data.
# NOTE(review): the printed output suggests the categorical columns come first
# in the concatenated array — confirm in utils.
X_concat = imputer_variables(X_num, X_cat, debugging=True)

X_cat_filled: [[2. 1. 1. ... 2. 9. 0.]
 [2. 0. 1. ... 5. 8. 6.]
 [2. 0. 0. ... 5. 5. 3.]
 ...
 [2. 1. 0. ... 5. 5. 3.]
 [2. 0. 0. ... 5. 8. 6.]
 [2. 0. 2. ... 5. 1. 1.]]
X_num_imput: [[118.          68.          22.73       ...   0.           0.
    0.        ]
 [ 81.          77.          27.42       ...   0.           0.
    0.        ]
 [118.          25.          31.95       ...   0.           0.
    0.        ]
 ...
 [195.          48.          27.23691351 ...   0.           0.
    0.        ]
 [ 66.          62.30951592  23.29748133 ...   0.           0.
    0.        ]
 [104.          82.          22.03125    ...   0.           0.
    0.        ]]
newDataset: [[2. 1. 1. ... 0. 0. 0.]
 [2. 0. 1. ... 0. 0. 0.]
 [2. 0. 0. ... 0. 0. 0.]
 ...
 [2. 1. 0. ... 0. 0. 0.]
 [2. 0. 0. ... 0. 0. 0.]
 [2. 0. 2. ... 0. 0. 0.]]


## Step 3 : choose the criterion used as the scoring method

## Step 4 : compare classifier models using cross-validation

In [11]:
# Build the dictionary of candidate classifiers to compare.
clfs = init_clfs()

In [None]:
# Compare the candidate classifiers on the imputed dataset. The helper returns
# the retained classifier, the train/test split it used and a `strategy` value.
# NOTE(review): n_splits=1 — scikit-learn's K-fold splitters require at least
# 2 splits; confirm which splitter the helper uses internally.
(
    bestClf,
    X_train,
    X_test,
    y_train,
    y_test,
    strategy,
) = comparison_cross_validation(
    X_concat,
    y,
    clfs,
    n_splits=1,
    debugging=True,
)

## Step 5 : variable selection

In [None]:
# Rank the features on the training split. `sorted_idx` is used further down to
# index the columns of X_train, so it holds column indices in ranked order.
# NOTE(review): presumably ordered from most to least important — confirm in utils.
sorted_idx = feature_importance(X_train, y_train, labels, debugging=True)

In [None]:
# Determine how many of the ranked features to keep; the result is used further
# down as the cut-off in sorted_idx[:nb_selected_features].
nb_selected_features = feature_selection(X_train, X_test, y_train, y_test, bestClf, sorted_idx, debugging=True)

## Step 6 : search for the best hyperparameters of the best classifier found

In [10]:
# Build the dictionary of hyperparameter grids; it is indexed by classifier
# type further down (clfs_parameters[type(bestClf)]).
clfs_parameters = init_clfs_parameters()

In [None]:
# Debug snippet, kept disabled: prints a classifier's class name and the
# parameter grid registered for its type in clfs_parameters.
# model = DecisionTreeClassifier()
# print(DecisionTreeClassifier.__name__)
# print(clfs_parameters[type(model)])

DecisionTreeClassifier
{'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']}


In [None]:
# Parameter grid registered for the type of the best classifier.
param_grid = clfs_parameters[type(bestClf)]

# Restrict the training matrix to the first nb_selected_features ranked columns.
top_feature_idx = sorted_idx[:nb_selected_features]
X_train_selected = X_train[:, top_feature_idx]

# Tune the best classifier on the reduced training set.
newBestClf = fine_tune_model(
    X_train_selected,
    y_train,
    bestClf,
    param_grid,
    debugging=True,
)

## Step 7 : create the pickle files and the pipeline

## Step 8 : save the new dataset as a csv file

In [None]:
# Replace the original ref_data.csv with the newly imputed data.
# NOTE(review): this is the same path the dataset is loaded from at the top of
# the notebook, so a later re-run would presumably load the already-imputed
# file instead of the raw data — confirm this is intended.
csv_filepath = '../data/ref_data.csv'
create_data_csv(X_concat, y, labels, csv_filepath, debugging=True)