# Pipeline Diabetes Dataset

`David Andrés Torres Forero`

This notebook builds on the earlier analysis and exploration of the `10 Years Diabetes Dataset`, available on Kaggle: https://www.kaggle.com/datasets/jimschacko/10-years-diabetes-dataset.

The goal is to propose a modeling workflow (preprocessing pipeline plus classifier) built around the ML algorithm that performed best in the EDA notebook.

## Importing libraries

In [1]:
#General libraries 
import pandas as pd
import numpy as np
import seaborn as sns

#Pipeline creation libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

#Modeling libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

#Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

#Automation libraries
from joblib import dump

In [2]:
#df = pd.read_csv('/work/diabetic_data.csv')

In [3]:
# Disabled preprocessing, kept as a bare string literal: nothing below is executed,
# the cell merely evaluates to (and displays) this string.
# The text describes: replacing "?" with NaN, dropping identifier/diagnosis columns,
# mapping label values such as "No"/"Steady"/"<30" to 0/1, and dropping a set of
# medication columns.
# NOTE(review): as written, the final `db.drop(` call has no closing parenthesis and
# "medical_specialty" is listed twice, so this would need fixing before being re-enabled.
# Presumably these steps produced the prepared CSV loaded in the next cell -- confirm.
'''
db = df.copy()
db = db.replace("?",np.nan)
db = db.drop(columns=["encounter_id","patient_nbr",'discharge_disposition_id',
                    'admission_source_id','payer_code', 'medical_specialty',
                    'diag_1', 'diag_2', 'diag_3',"medical_specialty"])

db = db.replace({"No":0,"Down":0, "Steady":1, "Up":1, "Ch":1,"NO":0, "<30":1, ">30":1, "Yes": 1})

db = db.drop(columns=["examide", "citoglipton",'glyburide-metformin', 'glipizide-metformin',
                    'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone','acarbose', 
                    'miglitol', 'troglitazone', 'tolazamide', 'tolbutamide','acetohexamide','chlorpropamide',
                    'repaglinide', 'nateglinide']
                    '''

'\ndb = df.copy()\ndb = db.replace("?",np.nan)\ndb = db.drop(columns=["encounter_id","patient_nbr",\'discharge_disposition_id\',\n                    \'admission_source_id\',\'payer_code\', \'medical_specialty\',\n                    \'diag_1\', \'diag_2\', \'diag_3\',"medical_specialty"])\n\ndb = db.replace({"No":0,"Down":0, "Steady":1, "Up":1, "Ch":1,"NO":0, "<30":1, ">30":1, "Yes": 1})\n\ndb = db.drop(columns=["examide", "citoglipton",\'glyburide-metformin\', \'glipizide-metformin\',\n                    \'glimepiride-pioglitazone\', \'metformin-rosiglitazone\', \'metformin-pioglitazone\',\'acarbose\', \n                    \'miglitol\', \'troglitazone\', \'tolazamide\', \'tolbutamide\',\'acetohexamide\',\'chlorpropamide\',\n                    \'repaglinide\', \'nateglinide\']\n                    '

In [4]:
# Load the already-prepared dataset from its fixed location under /work.
db = pd.read_csv(filepath_or_buffer='/work/diabetic_data_prepared.csv')

In [5]:
# Hold out 30% of the rows for testing; the split is stratified on the
# 'readmitted' target and seeded so it is repeatable.
train, test = train_test_split(
    db,
    stratify=db['readmitted'],
    test_size=0.3,
    random_state=42,
)

## Pipeline

### Pipeline for numeric and categorical types

In [6]:
# Numeric branch: fill missing values with the column mean, then rescale
# each column with MinMaxScaler. Step names are kept in Spanish as-is.
numeric_pipeline = Pipeline(steps=[
    ('Imputación con la media', SimpleImputer(strategy='mean')),
    ('Escalado minmax', MinMaxScaler()),
])

In [7]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array. Categories unseen during fit are
# ignored (encoded as all zeros) instead of raising.
#
# scikit-learn renamed OneHotEncoder's `sparse` argument to `sparse_output`
# in 1.2 and removed the old name in 1.4, so try the current name first and
# fall back to the legacy one on older installations.
try:
    _onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    _onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

categorical_pipeline = Pipeline(
        [
            ('imputación moda', SimpleImputer(strategy='most_frequent')),
            ('onehot encoder', _onehot_encoder)
        ]
)

### Database transformation

In [83]:
# Print the column index / dtype / non-null summary of the prepared frame.
# Presumably used to pick the positional column indices for the
# ColumnTransformer defined in the next cell -- confirm.
db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 22 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   race                99493 non-null   object
 1   gender              101766 non-null  object
 2   age                 101766 non-null  object
 3   weight              3197 non-null    object
 4   admission_type_id   101766 non-null  int64 
 5   time_in_hospital    101766 non-null  int64 
 6   num_lab_procedures  101766 non-null  int64 
 7   num_procedures      101766 non-null  int64 
 8   num_medications     101766 non-null  int64 
 9   number_outpatient   101766 non-null  int64 
 10  number_emergency    101766 non-null  int64 
 11  number_inpatient    101766 non-null  int64 
 12  number_diagnoses    101766 non-null  int64 
 13  max_glu_serum       101766 non-null  object
 14  A1Cresult           101766 non-null  object
 15  metformin           101766 non-null  int64 
 16  gl

In [9]:
# Positional indices of the columns routed to each preprocessing branch.
# They refer to column positions of the frame handed to the transformer,
# so the column order of that frame must not change.
_numeric_positions = [*range(4, 13), *range(15, 21)]
_categorical_positions = [0, 1, 2, 3, 13, 14]

column_transformer = ColumnTransformer(
    transformers=[
        ('numeric pipeline', numeric_pipeline, _numeric_positions),
        ('categorical pipeline', categorical_pipeline, _categorical_positions),
    ]
)

### Features and target selection

In [10]:
# Names of the model input columns, in the order the positional indices of
# the column transformer expect. Downstream cells read the list through
# `features[0]`, so it stays wrapped in an outer single-element list.
_feature_names = [
    'race',
    'gender',
    'age',
    'weight',
    'admission_type_id',
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses',
    'max_glu_serum',
    'A1Cresult',
    'metformin',
    'glimepiride',
    'glipizide',
    'glyburide',
    'pioglitazone',
    'rosiglitazone',
]
features = [_feature_names]

In [11]:
# Build the feature matrix and target vector for each split.
#
# Rows containing any missing value are removed from the whole split frame
# first, and X and y are then taken from that same cleaned frame. The
# previous extra `dropna()` on X alone was a no-op after the frame-level
# drop, and had it ever removed rows it would have left X and y with
# different lengths, so it is gone.
#
# NOTE(review): the db.info() output above shows `weight` with only 3197
# non-null values out of 101766, so dropping every row with a missing value
# discards the vast majority of the data and leaves the pipeline's imputers
# with nothing to impute. Consider dropping only rows with a missing target
# (or excluding `weight`) -- this changes the reported scores, so it is left
# as-is here.

#Train
train = train.dropna()
X_train = train[features[0]]
y_train = train['readmitted']

#Test
test = test.dropna()
X_test = test[features[0]]
y_test = test['readmitted']

# Guard against features and target ever getting out of step.
assert X_train.index.equals(y_train.index), "X_train / y_train rows are misaligned"
assert X_test.index.equals(y_test.index), "X_test / y_test rows are misaligned"

## Pipeline ML model

In [12]:
# Full SVM workflow: shared preprocessing followed by an RBF-kernel SVC
# with C=1.0.
svc_pipeline = Pipeline(steps=[
    ('preprocesamiento', column_transformer),
    ('modelo', SVC(kernel='rbf', C=1.0)),
])

## Results SVM

In [13]:
# 10-fold cross-validation of the SVM pipeline on the training split,
# collecting both training-fold and validation-fold scores.
svc_results = cross_validate(
    svc_pipeline,
    X_train,
    y_train,
    scoring=['accuracy', 'f1', 'precision', 'recall'],
    cv=10,
    return_train_score=True,
)

In [14]:
# Report mean +- standard deviation across folds for each metric, with a
# dashed rule between metrics. 'test_*' keys hold the validation-fold scores.
_metric_labels = {'accuracy': 'Accuracy', 'f1': 'f1', 'precision': 'precision', 'recall': 'recall'}
for _position, (_metric, _label) in enumerate(_metric_labels.items()):
    if _position > 0:
        print('---'*30)
    _train_scores = svc_results['train_' + _metric]
    _val_scores = svc_results['test_' + _metric]
    print('train ' + _label + ' ', _train_scores.mean(), '+-', _train_scores.std())
    print('val ' + _label, _val_scores.mean(), '+-', _val_scores.std())


train Accuracy  0.6505669177709298 +- 0.005024479061978669
val Accuracy 0.6094787130596542 +- 0.013913523289869415
------------------------------------------------------------------------------------------
train f1  0.7645840543740607 +- 0.0022914154737320135
val f1 0.738499603444507 +- 0.012417828993396855
------------------------------------------------------------------------------------------
train precision  0.6362869202944232 +- 0.003932600665484908
val precision 0.611977168878834 +- 0.007331599281520526
------------------------------------------------------------------------------------------
train recall  0.9577295379488857 +- 0.0030846407644336588
val recall 0.9316557655038761 +- 0.0323697627674376


In [15]:

# This cell repeats, argument for argument, the SVC cross-validation already
# run in the "Results SVM" section. The fold assignment and the SVC fit are
# deterministic, so a second run only costs time. Recompute only when the
# result is not already available in the kernel.
if 'svc_results' not in globals():
    svc_results = cross_validate(svc_pipeline,X_train,y_train,
                            cv=10, return_train_score=True,
                            scoring=['accuracy', 'f1', 'precision', 'recall'])

## Random Forest

In [16]:
# Random forest workflow: shared preprocessing followed by a shallow forest
# (trees limited to depth 3).
# The forest is seeded so that cross-validation scores are reproducible
# across kernel restarts; 42 matches the seed used for the train/test split.
rdc_pipeline = Pipeline(
    [
        ('preprocesamiento', column_transformer),
        ('modelo',RandomForestClassifier(max_depth=3, random_state=42))
    ]
)

In [17]:
# 10-fold cross-validation of the random forest pipeline on the training
# split, collecting both training-fold and validation-fold scores.
rdc_results = cross_validate(
    rdc_pipeline,
    X_train,
    y_train,
    scoring=['accuracy', 'f1', 'precision', 'recall'],
    cv=10,
    return_train_score=True,
)

In [18]:
# Report mean +- standard deviation across folds for each metric, with a
# dashed rule between metrics. 'test_*' keys hold the validation-fold scores.
_metric_labels = {'accuracy': 'Accuracy', 'f1': 'f1', 'precision': 'precision', 'recall': 'recall'}
for _position, (_metric, _label) in enumerate(_metric_labels.items()):
    if _position > 0:
        print('---'*30)
    _train_scores = rdc_results['train_' + _metric]
    _val_scores = rdc_results['test_' + _metric]
    print('train ' + _label + ' ', _train_scores.mean(), '+-', _train_scores.std())
    print('val ' + _label, _val_scores.mean(), '+-', _val_scores.std())

train Accuracy  0.6051819382574012 +- 0.003473480242046975
val Accuracy 0.598435716399611 +- 0.006792422322299558
------------------------------------------------------------------------------------------
train f1  0.7496551603419267 +- 0.0015677742115983045
val f1 0.7459995657929769 +- 0.004096125442146442
------------------------------------------------------------------------------------------
train precision  0.6003719580820168 +- 0.0021325842865747018
val precision 0.5965744217948702 +- 0.004279098373978748
------------------------------------------------------------------------------------------
train recall  0.9977568355597869 +- 0.0012913996218934937
val recall 0.9953367248062015 +- 0.006222817085226626


## LogisticRegression

In [19]:
# Logistic regression workflow: shared preprocessing followed by a
# LogisticRegression allowed up to 1000 solver iterations.
lr_pipeline = Pipeline(steps=[
    ('preprocesamiento', column_transformer),
    ('modelo', LogisticRegression(max_iter=1000)),
])

In [20]:
# 10-fold cross-validation of the logistic regression pipeline on the
# training split, collecting both training-fold and validation-fold scores.
lr_results = cross_validate(
    lr_pipeline,
    X_train,
    y_train,
    scoring=['accuracy', 'f1', 'precision', 'recall'],
    cv=10,
    return_train_score=True,
)

In [21]:
# Report mean +- standard deviation across folds for each metric, with a
# dashed rule between metrics. 'test_*' keys hold the validation-fold scores.
_metric_labels = {'accuracy': 'Accuracy', 'f1': 'f1', 'precision': 'precision', 'recall': 'recall'}
for _position, (_metric, _label) in enumerate(_metric_labels.items()):
    if _position > 0:
        print('---'*30)
    _train_scores = lr_results['train_' + _metric]
    _val_scores = lr_results['test_' + _metric]
    print('train ' + _label + ' ', _train_scores.mean(), '+-', _train_scores.std())
    print('val ' + _label, _val_scores.mean(), '+-', _val_scores.std())

train Accuracy  0.6412141983107151 +- 0.0049752493373685786
val Accuracy 0.6255654673825731 +- 0.024437526069583795
------------------------------------------------------------------------------------------
train f1  0.7386502252752913 +- 0.002723319563871483
val f1 0.7264551212418848 +- 0.019128050514559677
------------------------------------------------------------------------------------------
train precision  0.649770683409401 +- 0.004578030574961014
val precision 0.6405404720275611 +- 0.016208509551633853
------------------------------------------------------------------------------------------
train recall  0.8557627711165988 +- 0.005597504433470905
val recall 0.8400859980620154 +- 0.03828034170270812


## DecisionTreeClassifier

In [22]:
# Decision tree workflow: shared preprocessing followed by a tree limited
# to depth 5.
# The tree is seeded because scikit-learn permutes candidate features at
# each split, so equally good splits can otherwise be resolved differently
# from run to run; 42 matches the seed used for the train/test split.
dtc_pipeline = Pipeline(
    [
        ('preprocesamiento', column_transformer),
        ('modelo',DecisionTreeClassifier(max_depth=5, random_state=42))
    ]
)

In [23]:
# 10-fold cross-validation of the decision tree pipeline on the training
# split, collecting both training-fold and validation-fold scores.
dtc_results = cross_validate(
    dtc_pipeline,
    X_train,
    y_train,
    scoring=['accuracy', 'f1', 'precision', 'recall'],
    cv=10,
    return_train_score=True,
)

In [25]:
# Report mean +- standard deviation across folds for each metric, with a
# dashed rule between metrics. 'test_*' keys hold the validation-fold scores.
_metric_labels = {'accuracy': 'Accuracy', 'f1': 'f1', 'precision': 'precision', 'recall': 'recall'}
for _position, (_metric, _label) in enumerate(_metric_labels.items()):
    if _position > 0:
        print('---'*30)
    _train_scores = dtc_results['train_' + _metric]
    _val_scores = dtc_results['test_' + _metric]
    print('train ' + _label + ' ', _train_scores.mean(), '+-', _train_scores.std())
    print('val ' + _label, _val_scores.mean(), '+-', _val_scores.std())

train Accuracy  0.6410097254572491 +- 0.0048950726522787496
val Accuracy 0.6016234727095928 +- 0.028574643027624847
------------------------------------------------------------------------------------------
train f1  0.7287259495143513 +- 0.014293080377532935
val f1 0.6965535864216982 +- 0.030363136780053566
------------------------------------------------------------------------------------------
train precision  0.6601989080451034 +- 0.014393107747038504
val precision 0.635720344294107 +- 0.02529446052643493
------------------------------------------------------------------------------------------
train recall  0.8165153521168665 +- 0.05296007231636327
val recall 0.7763989825581395 +- 0.07163929144623694


## Architecture

In [72]:
# Fit the logistic regression pipeline (preprocessing + model) on the full
# training split; this fitted object is what gets exported below.
lr_pipeline.fit(X_train, y_train)

## Exporting joblib

In [73]:
# Persist the fitted logistic regression pipeline with joblib under two file
# names. Both files are written by the same joblib `dump` call, so the
# '.pkl' file is a joblib dump too, not a separate format.
# The second call is the cell's last expression, so its return value is what
# the notebook displays.
dump(lr_pipeline,'model.joblib')
dump(lr_pipeline,'model.pkl')

['model.pkl']

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=49f0a3e6-eba7-4bd1-896c-a13c36012a92' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>