# Feature Engineering & Model

Import the necessary packages:

In [1]:
import re
import pandas as pd
import numpy as np

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt

# Train-test split and Grid Search
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import StandardScaler

# Feature Engine
from feature_engine.selection import DropFeatures
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from feature_engine.encoding import OneHotEncoder, RareLabelEncoder

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

# Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Feature Selection
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

Read the data, and create a copy of it:

In [2]:
df = pd.read_csv('data/train_data.csv')
df_ = df.copy(deep = True)

df_ = df_.replace('?', np.nan)

Train-test split:

In [3]:
target = 'readmitted'

X = df_.drop(columns = target)
y = df_[target].apply(lambda x: False if x == 'No' else True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0, stratify = y)

## Feature Engineering

Class to correct the 'race' column:

In [4]:
class CorrectRace(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.col = 'race'
        self.map = { 'white' : 'caucasian'
                   , 'africanamerican' : 'african-american'
                   , 'african american' : 'african-american'
                   , 'afro american' : 'african-american'
                   , 'euro' : 'european'
                   , '?' : 'other'
                   , 'asian' : 'other'
                   , 'latino' : 'other'
                   }
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        X[self.col] = X[self.col].str.lower().replace(self.map)
        
        return X

Convert the 'age' column to numerical:

In [5]:
class ConvertAge(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.col = 'age'
        self.map = { '[0-10)' : 5
                    ,'[10-20)' : 15
                    ,'[20-30)' : 25
                    ,'[30-40)' : 35
                    ,'[40-50)' : 45
                    ,'[50-60)' : 55
                    ,'[60-70)' : 65
                    ,'[70-80)' : 75
                    ,'[80-90)' : 85
                    ,'[90-100)' : 95
                   }
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        X[self.col] = X[self.col].map(self.map)
        
        return X

Correct column types:

In [6]:
class CorrectType(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.cols = ['admission_type_code',
                     'discharge_disposition_code',
                     'admission_source_code',
                     'blood_transfusion',
                     'max_glu_serum']
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        for col in self.cols:
            X[col] = X[col].astype(str)
            X[col].replace('nan', np.NaN, inplace = True)
        
        return X

Correct the admission_type_code column:

In [7]:
class CorrectAdmissionTypeCode(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.col = 'admission_type_code'
        self.map = { '5.0' : 'Unknown'
                   , '6.0' : 'Unknown'
                   , '8.0' : 'Unknown'}
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        X[self.col] = X[self.col].replace(self.map)
        X[self.col] = X[self.col].fillna('Unknown')
        
        return X

Correct the discharge_disposition_code column:

In [8]:
class CorrectDischargeDispositionCode(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.col = 'discharge_disposition_code'
        self.map = { '18.0' : 'Unknown'
                   , '25.0' : 'Unknown'
                   , '26.0' : 'Unknown'}
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        X[self.col] = X[self.col].replace(self.map)
        X[self.col] = X[self.col].map(str)
        X[self.col] = X[self.col].fillna('Unknown')
        
        return X

Correct the admission_source_code column:

In [9]:
class CorrectAdmissionSourceCode(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.col = 'admission_source_code'
        self.map = { '9' : 'Unknown'
                   , '15' : 'Unknown'
                   , '16' : 'Unknown'
                   , '19' : 'Unknown'
                   , '20' : 'Unknown'}
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        X[self.col] = X[self.col].replace(self.map)
        
        return X

Correct the max_glu_serum colum:

In [10]:
class CorrectMaxGluSerum(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.col = 'max_glu_serum'
        self.map = {'norm' : 'normal',
                    '>200' : 'abnormal',
                    '>300' : 'abnormal',
                    }
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        X[self.col] = X[self.col].str.lower()
        X[self.col] = X[self.col].replace(self.map)
        
        return X

Correct the A1Cresult colum:

In [11]:
class CorrectA1CResult(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.col = 'A1Cresult'
        self.map = {'norm' : 'normal',
                    '>8' : 'abnormal',
                    '>7' : 'abnormal',
                    }
    
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        X[self.col] = X[self.col].str.lower()
        X[self.col] = X[self.col].replace(self.map)
        
        return X

Separation of feature types:

In [12]:
features_to_drop = [ 'admission_id'
                    ,'patient_id'
                    ,'weight'
                    ,'medical_specialty'
                    ,'payer_code'
                    ,'blood_type'
                    ,'has_prosthesis'
                    ,'diag_1'
                    ,'diag_2'
                    ,'diag_3']

categorical_features = ['race'
                        ,'gender'
                        ,'admission_type_code'
                        ,'discharge_disposition_code'
                        ,'admission_source_code'
                        ,'complete_vaccination_status'
                        ,'blood_transfusion'
                        ,'max_glu_serum'
                        ,'A1Cresult'
                        ,'diuretics'
                        ,'insulin'
                        ,'change'
                        ,'diabetesMed'
                       ]

categorical_features_ohe_2mf = ['gender'
                               ,'complete_vaccination_status']

numerical_features = ['age'
                     ,'time_in_hospital'
                     ,'num_lab_procedures'
                     ,'num_procedures'
                     ,'num_medications'
                     ,'number_outpatient'
                     ,'number_emergency'
                     ,'number_inpatient'
                     ,'number_diagnoses'
                     ,'hemoglobin_level'
                      
                     ]

Pipeline:

In [16]:
pipe = Pipeline([
    # Remove unwanted columns
    ('drop', DropFeatures(features_to_drop = features_to_drop)),
    
    # Correct type of columns
    ('object', CorrectType()),
    
    # Correct the race column
    ('race', CorrectRace()),
    
    # Map the age column
    ('age', ConvertAge()),
    
    # Correct the admission type column
    ('admission_type', CorrectAdmissionTypeCode()),
    
    # Correct the discharge disposition code column
    ('discharge_code', CorrectDischargeDispositionCode()),
    
    # Correct the admission source code column
    ('admission_code', CorrectAdmissionSourceCode()),
    
    # Correct the max glu serum column
    ('max_glu_serum', CorrectMaxGluSerum()),
    
    # Correct the A1Cresult column
    ('A1Cresult', CorrectA1CResult()),
    
    # Impute the categorical features with the mode
    ('categorical_imputer', CategoricalImputer(imputation_method = 'frequent',
                                               variables = categorical_features)),
    
    # Impute the numerical features with the median
    ('numerical_imputer', MeanMedianImputer(imputation_method = 'median',
                                            variables = numerical_features)),
    
    # Rare label encoding
    ('rare', RareLabelEncoder(tol = 0.01, 
                              n_categories = 2,
                              variables = categorical_features)),
    
    # One-hot encoding
    ('ohe', OneHotEncoder(variables = [f for f in categorical_features if f not in categorical_features_ohe_2mf])),
    
    # One-hot encoding of top 2
    ('ohe_diag', OneHotEncoder(top_categories = 10
                               , variables = categorical_features_ohe_2mf)),
    
    # Standard scaler
    ('scaling', StandardScaler()),
    
    # Model
    #('clf', RandomForestClassifier(n_estimators = 250
    #                              ,max_depth = 10
    #                              ,class_weight = 'balanced'
    #                              ,random_state = 42
    #                              ,n_jobs = -1))
    
    ('clf', LogisticRegression(class_weight = 'balanced',
                               C = 0.1,
                               random_state = 42))
    
    #('clf', XGBClassifier(scale_pos_weight = 9))
])

Fit the pipeline:

In [14]:
# X_train_tf = pipe.fit_transform(X_train)
# X_train_tf.columns

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)


Index(['age', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses', 'hemoglobin_level',
       'race_caucasian', 'race_african-american', 'race_other',
       'race_european', 'race_hispanic', 'race_black',
       'admission_type_code_1.0', 'admission_type_code_2.0',
       'admission_type_code_Unknown', 'admission_type_code_3.0',
       'admission_type_code_Rare', 'discharge_disposition_code_1.0',
       'discharge_disposition_code_Unknown', 'discharge_disposition_code_11.0',
       'discharge_disposition_code_6.0', 'discharge_disposition_code_3.0',
       'discharge_disposition_code_2.0', 'discharge_disposition_code_Rare',
       'discharge_disposition_code_5.0', 'discharge_disposition_code_22.0',
       'admission_source_code_7', 'admission_source_code_1',
       'admission_source_code_4', 'admission_source_code_6',
       'admission_source_code_17', 'admission_sou

In [17]:
# pipe.fit(X_train, y_train, clf__eval_metric = 'auc')
pipe.fit(X_train, y_train)

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)


Pipeline(steps=[('drop',
                 DropFeatures(features_to_drop=['admission_id', 'patient_id',
                                                'weight', 'medical_specialty',
                                                'payer_code', 'blood_type',
                                                'has_prosthesis', 'diag_1',
                                                'diag_2', 'diag_3'])),
                ('object', CorrectType()), ('race', CorrectRace()),
                ('age', ConvertAge()),
                ('admission_type', CorrectAdmissionTypeCode()),
                ('discharge_code', CorrectDischargeDispositionCode(...
                                          'discharge_disposition_code',
                                          'admission_source_code',
                                          'blood_transfusion', 'max_glu_serum',
                                          'A1Cresult', 'diuretics', 'insulin',
                                          'change', 'di

In [18]:
y_pred = pipe.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.93      0.68      0.79     21702
        True       0.18      0.57      0.28      2722

    accuracy                           0.67     24424
   macro avg       0.55      0.62      0.53     24424
weighted avg       0.84      0.67      0.73     24424



In [47]:
#param_grid = {    
#    # try different gradient boosted tree model paramenters
#    'clf__n_estimators' : [250, 500, 1000],
#    'clf__max_depth' : [None, 5, 10, 15]
#}

param_grid = {    
    # try different gradient boosted tree model paramenters
    'clf__penalty' : ['l1', 'l2', 'elasticnet'],
    'clf__C' : [1e-4, 1e-3, 1e-2, 1e-1, 0, 1]
}

grid_search = GridSearchCV(pipe, 
                           param_grid,
                           cv = 5, 
                           n_jobs = -1, 
                           scoring='precision',
                           verbose = 5)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1416, in fit
    for class_, warm_start_coef_ in zip(classes_, warm_start_coef))
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/joblib/parallel.py", line 1048, in __call__
    if self.dispatch_one_batch(iterator):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/joblib/parallel.py

  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got 

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty

    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 59

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got 

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got 

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-pack

  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent

  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
 

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

  "considered f

  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-pack

  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent"

  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent"

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got 

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-pack

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got 

  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered f

    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
Traceback (most recent call last):
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/pgomes/.virtualenvs/blu15/lib/python3.7/site-packages/sklearn/pipeline.p

  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('drop',
                                        DropFeatures(features_to_drop=['admission_id',
                                                                       'patient_id',
                                                                       'weight',
                                                                       'medical_specialty',
                                                                       'payer_code',
                                                                       'blood_type',
                                                                       'has_prosthesis',
                                                                       'diag_1',
                                                                       'diag_2',
                                                                       'diag_3'])),
                                       ('object', CorrectType()),
                 

In [48]:
print(grid_search.best_params_)

y_pred = grid_search.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prc = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1s = f1_score(y_test, y_pred)

print(acc)
print(prc)
print(rec)
print(f1s)

print(classification_report(y_test, y_pred))

{'clf__C': 0.1, 'clf__penalty': 'l2'}
0.6701604978709466
0.1834045584045584
0.5675973548861132
0.27722949937197205
              precision    recall  f1-score   support

       False       0.93      0.68      0.79     21702
        True       0.18      0.57      0.28      2722

    accuracy                           0.67     24424
   macro avg       0.55      0.63      0.53     24424
weighted avg       0.84      0.67      0.73     24424



In [33]:
y_pred_2 = pipe.predict(X_test)

In [34]:
print(accuracy_score(y_test, y_pred_2))
print(precision_score(y_test, y_pred_2))
print(recall_score(y_test, y_pred_2))
print(f1_score(y_test, y_pred_2))

0.647109400589584
0.1642946601389047
0.5301249081557678
0.2508474576271187
