In [1]:
%matplotlib inline

In [193]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pickle

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LogisticRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_curve

# Model Training and improvement

## Live Demo

In [10]:
diab = pd.read_csv('data/diabetic_data.csv')

In [16]:
# we separate the data of the target feature
# the target may be called label since it is classification
diab_attrs = diab.drop(columns='readmitted')
diab_tgt = diab['readmitted']

In [17]:
# transforming one-hot-encoding the categorial var
# this way we transform all cols of the dataframe
diab_attrs = pd.get_dummies(diab_attrs, drop_first=True)

In [19]:
logistic_regression = LogisticRegression()

In [20]:
logistic_regression.fit(diab_attrs, diab_tgt)

In [24]:
scaler = MinMaxScaler()

In [25]:
diab_attrs_scaled = scaler.fit_transform(diab_attrs)

In [27]:
logistic_regression.fit(diab_attrs_scaled, diab_tgt)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
# passing the preprocces of data to a pipeline
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('model', LogisticRegression())
])

In [30]:
pipeline

In [34]:
# when we fit, we fit the original data, 
# they are then scalled and passed to the model
# we take sample data to improve performance
sample_data = diab.sample(5000, random_state=42)

In [38]:
sample_data_attrs = pd.get_dummies(sample_data.drop(columns='readmitted'))
sample_tgt = sample_data['readmitted']

In [39]:
pipeline.fit(sample_data_attrs, sample_tgt)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
sample_data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [55]:
categorical_columns = sample_data.dtypes[sample_data.dtypes == np.object_].index.values

In [56]:
categorical_columns = columns[:-1]

In [64]:
# numerical_columns = sample_data.dtypes[sample_data.dtypes != np.object_].index.values
numerical_columns = ['discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses']

In [69]:
# now we can add one hot encoder to the pipeline, working similar to pd.get_dummies
# we use column selector and transformer to be passed to the ohe
# the remainder has to option to pass through the unchanged cols or to drop them
preprocessor = ColumnTransformer([
    ('categorical', OneHotEncoder(), categorical_columns),
    ('numerical', MinMaxScaler(), numerical_columns)
], remainder='passthrough')

In [70]:
preprocessor

In [74]:
# we can pass a func to FunctionTransformer and that will transform the dataset
# applying the function to it
FunctionTransformer(np.log10).transform([10, 1000])

array([1., 3.])

In [77]:
number_processor = Pipeline([
    ('log_transformer', FunctionTransformer(np.log10)),
    ('minmax', MinMaxScaler())
])

In [78]:
# now we nest pipelines
preprocessor = ColumnTransformer([
    ('categorical', OneHotEncoder(), categorical_columns),
    ('numerical', number_processor, numerical_columns)
], remainder='passthrough')

In [79]:
preprocessor

In [81]:
# example how to have data preprocessing and a model
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', LogisticRegression())
])

In [82]:
pipeline

In [89]:
sample_data_attrs = sample_data.drop(columns='readmitted')
sample_data_tgt = sample_data['readmitted']

In [90]:
pipeline.fit(sample_data_attrs, sample_data_tgt)

  result = func(self.values, **kwargs)


ValueError: Input X contains infinity or a value too large for dtype('float64').

In [96]:
# we have log(0) returns NaN, therefore we add ONLY for testing the pipeline a
# small value so we EPS
EPS = 1e-10
number_processor = Pipeline([
    ('log_transformer', FunctionTransformer(lambda x: np.log10(x + EPS))),
    ('minmax', MinMaxScaler())
])

In [97]:
preprocessor = ColumnTransformer([
    ('categorical', OneHotEncoder(), categorical_columns),
    ('numerical', number_processor, numerical_columns)
], remainder='drop')

In [98]:
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', LogisticRegression())
])

In [99]:
pipeline.fit(sample_data_attrs, sample_data_tgt)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [100]:
pipeline.score(sample_data_attrs, sample_data_tgt)

0.6488

In [102]:
# saving the workflow to be accessible when kernel is restarted or on another notebook
# pickle.dump() and pickle.load()

In [107]:
diab = pd.read_csv('data/diabetic_data.csv')

In [108]:
attrs = diab.sample(5000, random_state=12341234).drop(columns='readmitted')
tgt = diab.sample(5000, random_state=12341234)['readmitted']

In [109]:
pipeline.fit(attrs, tgt)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [124]:
pipeline['classifier'].coef_

array([[-0.27641266,  0.12975668, -0.26094978, ...,  0.23186478,
         0.37680239,  0.30292346],
       [ 0.09279994, -0.07867702,  0.00259239, ...,  0.14837258,
         0.18261066,  0.14278449],
       [ 0.18361272, -0.05107965,  0.25835739, ..., -0.38023736,
        -0.55941305, -0.44570795]])

In [125]:
pipeline_reg = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', LogisticRegression(C=1e-5))
])

In [126]:
pipeline_reg.fit(attrs, tgt)

In [127]:
pipeline_reg['classifier'].coef_

array([[-7.66815221e-05,  7.30547546e-05, -1.76902688e-05, ...,
         2.41420711e-04,  7.92213914e-04,  1.37170065e-04],
       [-5.97262342e-05, -1.51177786e-04, -1.73471068e-05, ...,
         5.86809270e-04,  1.44648474e-03,  2.32759266e-04],
       [ 1.36407756e-04,  7.81230318e-05,  3.50373756e-05, ...,
        -8.28229981e-04, -2.23869865e-03, -3.69929331e-04]])

In [133]:
train_test_split(attrs)[0].shape

(3750, 49)

In [134]:
train_test_split(attrs)[1].shape

(1250, 49)

In [135]:
attrs_train, attrs_test, tgt_train, tgt_test = train_test_split(attrs, tgt)

In [137]:
attrs_train.shape, tgt_train.shape

((3750, 49), (3750,))

In [139]:
attrs_test.shape, tgt_test.shape

((1250, 49), (1250,))

In [140]:
pipeline.fit(attrs_train, tgt_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [141]:
pipeline.score(attrs_train, tgt_train)

0.6872

In [146]:
diab_attrs = diab.drop(columns='readmitted')
diab_tgt = diab['readmitted']

In [153]:
diab_attrs_trn, diab_attrs_test, diab_tgt_trn, diab_tgt_test = train_test_split(diab_attrs, diab_tgt)

In [154]:
pipeline.fit(diab_attrs_trn, diab_tgt_trn)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [155]:
pipeline.score(diab_attrs_trn, diab_tgt_trn)

0.5867617001205387

In [156]:
pipeline.score(diab_attrs_test, diab_tgt_test)

ValueError: Found unknown categories ['Surgery-PlasticwithinHeadandNeck', 'Dermatology'] in column 5 during transform

In [159]:
print(classification_report(diab_tgt_trn, pipeline.predict(diab_attrs_trn)))

              precision    recall  f1-score   support

         <30       0.42      0.01      0.02      8557
         >30       0.51      0.38      0.44     26729
          NO       0.61      0.84      0.71     41038

    accuracy                           0.59     76324
   macro avg       0.52      0.41      0.39     76324
weighted avg       0.56      0.59      0.54     76324



In [186]:
y_pred = pipeline.predict(diab_attrs_trn).shape

In [187]:
y_pred[:10]

(76324,)

In [188]:
y_true = diab_tgt_trn

In [189]:
y_true.shape

(76324,)

In [190]:
y_true = y_true.tolist()

In [191]:
roc_curve(y_true, y_pred)

ValueError: multiclass format is not supported

In [196]:
cv = GridSearchCV(estimator=pipeline, param_grid={
    'classifier__C':[0.5, 1, 10],
    'preprocess__numerical__minmax__feature_range':[(-1, 1), (-0.5, 0.5), (0, 1)]
}, cv=5)

In [198]:
cv = cv.fit(diab_attrs_trn, diab_tgt_trn)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Traceback (most recent call last):
  File "C:\Users\Master\miniconda3\envs\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "C:\Users\Master\miniconda3\envs\venv\lib\site-packages\sklearn\metrics\_scorer.py", line 455, in __call__
    return estimator.score(*args, **kwargs)
  File "C:\Users\Master\miniconda3\envs\venv\lib\site-packages\sklearn\pipeline.py", line 1000, in score
    Xt = transform.transform(Xt)
  File "C:\Users\Master\miniconda3\envs\venv\lib\site-packages\sklearn\utils\_set_output.py", line 313, in 

In [200]:
cv.best_params_

{'classifier__C': 0.5, 'preprocess__numerical__minmax__feature_range': (-1, 1)}

In [202]:
cv.cv_results_

{'mean_fit_time': array([5.14991384, 3.75744328, 3.71765094, 3.85362659, 3.73255897,
        3.73856144, 3.4566947 , 3.89111924, 3.65641522]),
 'std_fit_time': array([2.19690482, 0.18455827, 0.27462254, 0.37522707, 0.19204441,
        0.51649063, 0.05769683, 0.3843129 , 0.32655766]),
 'mean_score_time': array([0.21941733, 0.0616909 , 0.03474441, 0.03807344, 0.05365639,
        0.05824485, 0.03231311, 0.0307826 , 0.04927611]),
 'std_score_time': array([0.26066178, 0.02001281, 0.00589739, 0.00877147, 0.03769493,
        0.02515632, 0.00632715, 0.00546502, 0.02887292]),
 'param_classifier__C': masked_array(data=[0.5, 0.5, 0.5, 1.0, 1.0, 1.0, 10.0, 10.0, 10.0],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value=1e+20),
 'param_preprocess__numerical__minmax__feature_range': masked_array(data=[(-1, 1), (-0.5, 0.5), (0, 1), (-1, 1), (-0.5, 0.5),
                    (0, 1), (-1, 1), (-0.5, 0.5), (0, 1)],
              mask