In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

<h1>Model Training and Improvement - Classification</h1>

In [3]:
diabetes_data = pd.read_csv("data/diabetes/diabetic_data.csv")

In [4]:
diabetes_data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [5]:
diabetes_data.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [6]:
diabetes_data.shape

(101766, 50)

In [7]:
diabetes_data_target = diabetes_data["readmitted"]
diabetes_data_attributes = diabetes_data.drop("readmitted", axis = 1)

In [8]:
diabetes_data_attributes = pd.get_dummies(diabetes_data_attributes)

In [9]:
diabetes_data_attributes.shape

(101766, 2472)

In [10]:
diabetes_data_attributes_scaled = MinMaxScaler().fit_transform(diabetes_data_attributes)

In [11]:
logistic_regression_base = LogisticRegression(C = 1e9)
logistic_regression_base.fit(diabetes_data_attributes_scaled, diabetes_data_target)
logistic_regression_base.score(diabetes_data_attributes_scaled, diabetes_data_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5953068804905371

In [12]:
logistic_regression_base = LogisticRegression(C = 1)
logistic_regression_base.fit(diabetes_data_attributes_scaled, diabetes_data_target)
logistic_regression_base.score(diabetes_data_attributes_scaled, diabetes_data_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5950317394807696

In [13]:
logistic_regression_base = LogisticRegression(C = 0.001)
logistic_regression_base.fit(diabetes_data_attributes_scaled, diabetes_data_target)
logistic_regression_base.score(diabetes_data_attributes_scaled, diabetes_data_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5645402197197492

In [14]:
diabetes_data_attributes_train, diabetes_data_attributes_test, diabetes_data_target_train, diabetes_data_target_test = train_test_split(diabetes_data_attributes, diabetes_data_target, test_size = 10000, random_state = 30, stratify = diabetes_data_target)

In [15]:
diabetes_data_attributes_train.shape, diabetes_data_attributes_test.shape

((91766, 2472), (10000, 2472))

In [16]:
diabetes_data_target_train.shape, diabetes_data_target_test.shape

((91766,), (10000,))

In [17]:
diabetes_data.groupby("readmitted").size()

readmitted
<30    11357
>30    35545
NO     54864
dtype: int64

In [18]:
diabetes_data_target_train.groupby(diabetes_data_target_train).size() / len(diabetes_data_target_train)

readmitted
<30    0.111599
>30    0.349280
NO     0.539121
Name: readmitted, dtype: float64

In [19]:
diabetes_data_target_test.groupby(diabetes_data_target_test).size() / len(diabetes_data_target_test)

readmitted
<30    0.1116
>30    0.3493
NO     0.5391
Name: readmitted, dtype: float64

In [20]:
logistic_regression_split = LogisticRegression(C = 5)
logistic_regression_split.fit(diabetes_data_attributes_train, diabetes_data_target_train)

LogisticRegression(C=5)

In [21]:
logistic_regression_split.score(diabetes_data_attributes_train, diabetes_data_target_train)

0.5413551860166075

In [22]:
logistic_regression_split.score(diabetes_data_attributes_test, diabetes_data_target_test)

0.5436

In [23]:
diabetes_predictions_test = logistic_regression_split.predict(diabetes_data_attributes_test)

In [24]:
print(classification_report(diabetes_data_target_test, diabetes_predictions_test))

              precision    recall  f1-score   support

         <30       0.00      0.00      0.00      1116
         >30       0.45      0.14      0.21      3493
          NO       0.56      0.92      0.69      5391

    accuracy                           0.54     10000
   macro avg       0.34      0.35      0.30     10000
weighted avg       0.46      0.54      0.45     10000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
params = {
    "C": [0.01, 0.1, 1, 10, 100, 1000],
    "fit_intercept": [True, False],
    "max_iter": [10, 100, 1000]
}

In [26]:
grid_search = GridSearchCV(estimator = LogisticRegression(), param_grid = params)

In [27]:
grid_search.fit(diabetes_data_attributes_train[:100], diabetes_data_target_train[:100])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [0.01, 0.1, 1, 10, 100, 1000],
                         'fit_intercept': [True, False],
                         'max_iter': [10, 100, 1000]})

In [28]:
grid_search.best_estimator_

LogisticRegression(C=0.01, max_iter=10)

In [29]:
grid_search.best_estimator_.score(diabetes_data_attributes_test, diabetes_data_target_test)

0.5378

In [30]:
print(classification_report(
    diabetes_data_target_test,
    grid_search.best_estimator_.predict(diabetes_data_attributes_test)
))

              precision    recall  f1-score   support

         <30       0.00      0.00      0.00      1116
         >30       0.40      0.03      0.06      3493
          NO       0.54      0.98      0.70      5391

    accuracy                           0.54     10000
   macro avg       0.31      0.34      0.25     10000
weighted avg       0.43      0.54      0.40     10000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
grid_search.cv_results_

{'mean_fit_time': array([0.20897737, 0.06635809, 0.0789609 , 0.06056695, 0.07655387,
        0.06036391, 0.06935768, 0.0705617 , 0.07875905, 0.0617672 ,
        0.07016439, 0.07255764, 0.06136823, 0.07135668, 0.07795596,
        0.06436429, 0.06156402, 0.07775507, 0.06376047, 0.08175178,
        0.06916285, 0.07355905, 0.07915688, 0.09654474, 0.11133637,
        0.13152294, 0.11173873, 0.07055879, 0.0793551 , 0.10134177,
        0.07775841, 0.08435683, 0.09474916, 0.07695794, 0.10203667,
        0.07895541]),
 'std_fit_time': array([0.28406906, 0.01038309, 0.02045736, 0.00872715, 0.01994727,
        0.00475637, 0.0207783 , 0.00652377, 0.03124552, 0.0097974 ,
        0.01171577, 0.02346081, 0.00955501, 0.00928682, 0.02473257,
        0.01379942, 0.005533  , 0.02848066, 0.01266135, 0.02078669,
        0.00969401, 0.02032115, 0.00892693, 0.02387442, 0.02067251,
        0.0265823 , 0.0280214 , 0.00338224, 0.00688366, 0.02815697,
        0.0052266 , 0.00780837, 0.02773593, 0.00900929, 0.027

In [32]:
grid_search.best_params_

{'C': 0.01, 'fit_intercept': True, 'max_iter': 10}