# Pipeline Diabetes Dataset

`David Andrés Torres Forero`

This notebook builds on a previous exploratory analysis of the `10 Years Diabetes Dataset`, available on Kaggle: https://www.kaggle.com/datasets/jimschacko/10-years-diabetes-dataset.

The goal is to propose a modelling workflow built around the ML algorithm that performed best in the EDA notebook.

## Importing libraries

In [1]:
#General libraries 
import pandas as pd
import numpy as np
import seaborn as sns

#Pipeline creation libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

#Modeling libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

#Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

#Automation libraries
from joblib import dump

In [2]:
# Load the raw dataset; the cleaning cell below derives `db` from this frame.
df = pd.read_csv(filepath_or_buffer='/work/diabetic_data.csv')

In [3]:
# Build the modelling frame `db` from the raw frame `df` in one chain.
# Every step returns a new DataFrame, so `df` itself is left untouched and
# no explicit copy is needed.
db = (
    df
    # "?" is the placeholder for missing values in the raw file.
    .replace("?", np.nan)
    # Columns excluded from modelling. 'medical_specialty' was listed twice in
    # the original call; it is named once here (the resulting frame is the same).
    .drop(columns=["encounter_id", "patient_nbr", 'discharge_disposition_id',
                   'admission_source_id', 'payer_code', 'medical_specialty',
                   'diag_1', 'diag_2', 'diag_3'])
    # Frame-wide recoding of string labels to 0/1. Among others this collapses
    # "<30" and ">30" to 1 and "NO" to 0, i.e. it yields a binary encoding.
    # NOTE(review): "Down" is mapped to 0 alongside "No" — confirm this is intended.
    .replace({"No":0,"Down":0, "Steady":1, "Up":1, "Ch":1,"NO":0, "<30":1, ">30":1, "Yes": 1})
    # Further columns removed after recoding.
    # NOTE(review): presumably low-information columns identified in the EDA
    # notebook — TODO confirm.
    .drop(columns=["examide", "citoglipton", 'glyburide-metformin', 'glipizide-metformin',
                   'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'acarbose',
                   'miglitol', 'troglitazone', 'tolazamide', 'tolbutamide', 'acetohexamide', 'chlorpropamide',
                   'repaglinide', 'nateglinide'])
)

In [4]:
# Hold out 30% of the rows, stratified on the `readmitted` column;
# the fixed seed keeps the partition reproducible across runs.
train, test = train_test_split(
    db,
    stratify=db['readmitted'],
    test_size=0.3,
    random_state=42,
)

In [5]:
# Persist both partitions (without the index column) so the modelling
# section below can start from the saved files.
train.to_csv(path_or_buf='/work/diabetic_train.csv', index=False)
test.to_csv(path_or_buf='/work/diabetic_test.csv', index=False)

## Reading the previously saved datasets

In [6]:
# Reload the two partitions written by the previous cell.
train, test = map(
    pd.read_csv,
    ('/work/diabetic_train.csv', '/work/diabetic_test.csv'),
)

## Pipeline

### Pipeline for numeric and categorical types

In [7]:
# Numeric branch: fill missing values with the column mean, then rescale
# with MinMaxScaler. The (Spanish) step names are kept as-is because they
# are the keys used to address the steps at runtime.
numeric_pipeline = Pipeline(
    steps=[
        ('Imputación con la media', SimpleImputer(strategy='mean')),
        ('Escalado minmax', MinMaxScaler()),
    ]
)

In [8]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array. Categories unseen during fit are
# ignored (encoded as all zeros) instead of raising at predict time.
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name, so `sparse=False` raises a
# TypeError on current releases. Try the new name first and fall back to the
# old one so the notebook runs on either side of the rename.
try:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

categorical_pipeline = Pipeline(
        [
            ('imputación moda', SimpleImputer(strategy='most_frequent')),
            ('onehot encoder', onehot_encoder)
        ]
)

### Database transformation

In [9]:
# Route columns to the two branches by position. The positions follow the
# order of the feature list used to build X_train / X_test:
#   0-3 and 12-13      -> categorical branch
#   4-11 and 14-22     -> numeric branch
# Reordering the feature list therefore requires updating these indices.
column_transformer = ColumnTransformer(
    transformers=[
        ('numeric pipeline', numeric_pipeline, [*range(4, 12), *range(14, 23)]),
        ('categorical pipeline', categorical_pipeline, [*range(0, 4), 12, 13]),
    ]
)

### Features and target selection

In [10]:
# Feature columns, in the order the column transformer's positional indices
# expect. Kept as a single-element nested list because the cells below read
# the names through `features[0]`.
features = [[
    'race',
    'gender',
    'age',
    'weight',
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses',
    'max_glu_serum',
    'A1Cresult',
    'metformin',
    'glimepiride',
    'glipizide',
    'glyburide',
    'pioglitazone',
    'rosiglitazone',
    'insulin',
    'change',
    'diabetesMed',
]]

In [11]:
# Build the feature matrices and target vectors for both partitions.
#
# Rows with any missing value are removed from the whole frame *before* X and
# y are sliced, so both always come from the same set of rows. The original
# cell additionally called `.dropna()` on X alone; after the frame-level
# dropna that call was a no-op, and had it ever removed a row it would have
# left X and y with different lengths. It is therefore omitted.
#
# NOTE(review): because every incomplete row is discarded here, the
# SimpleImputer steps in the pipeline never see a missing value during
# training. Confirm whether dropping rows (rather than imputing) is intended.

#Train
train = train.dropna()
X_train = train[features[0]]
y_train = train['readmitted']

#Test
test = test.dropna()
X_test = test[features[0]]
y_test = test['readmitted']

# Guard against any future edit that filters X and y separately.
assert X_train.index.equals(y_train.index), "X_train and y_train are misaligned"
assert X_test.index.equals(y_test.index), "X_test and y_test are misaligned"

## Pipeline ML model

In [12]:
# End-to-end estimator: column-wise preprocessing followed by an RBF-kernel
# support vector classifier.
final_pipeline = Pipeline(
    steps=[
        ('preprocesamiento', column_transformer),
        ('modelo', SVC(kernel='rbf', C=1.0)),
    ]
)

## Results

In [13]:
# 10-fold cross-validation on the training partition. Train-fold scores are
# requested too, so train and validation metrics can be compared.
results = cross_validate(
    estimator=final_pipeline,
    X=X_train,
    y=y_train,
    scoring=['accuracy', 'f1', 'precision', 'recall'],
    cv=10,
    return_train_score=True,
)

In [14]:
def _report_cv_scores(cv_results):
    """Print mean and standard deviation of each cross-validation metric.

    One train/validation pair of lines is printed per metric, with a dashed
    rule between consecutive metrics. `cv_results` is the mapping returned by
    `cross_validate`; validation scores are stored under its `test_*` keys.
    """
    # (printed label, key in cv_results), grouped per metric.
    metric_groups = (
        (('train Accuracy ', 'train_accuracy'), ('val Accuracy', 'test_accuracy')),
        (('train f1 ', 'train_f1'), ('val f1', 'test_f1')),
        (('train precision ', 'train_precision'), ('val precision', 'test_precision')),
        (('train recall ', 'train_recall'), ('val recall', 'test_recall')),
    )
    for group_index, group in enumerate(metric_groups):
        if group_index > 0:
            # Separator goes between groups only, not after the last one.
            print('---'*30)
        for label, key in group:
            fold_scores = cv_results[key]
            print(label, fold_scores.mean(), '+-', fold_scores.std())


_report_cv_scores(results)


train Accuracy  0.6602777455581297 +- 0.006229127475181921
val Accuracy 0.6136008117363547 +- 0.017982493674595777
------------------------------------------------------------------------------------------
train f1  0.7676909407503114 +- 0.002326566096526807
val f1 0.7375064922339936 +- 0.01331330494333595
------------------------------------------------------------------------------------------
train precision  0.6453855729167023 +- 0.006142850926717598
val precision 0.6173075222694355 +- 0.011641486992039846
------------------------------------------------------------------------------------------
train recall  0.9473790574514297 +- 0.008929329669090555
val recall 0.9168907461240311 +- 0.034438994153557834


## Architecture

In [15]:
# Fit on the full training partition. Left as a bare expression so the
# notebook displays the fitted pipeline as the cell output.
final_pipeline.fit(X=X_train, y=y_train)

## Exporting joblib

In [17]:
# Serialise the fitted pipeline twice with joblib; the two files differ only
# in name/extension. The last call's return value is the cell output.
dump(value=final_pipeline, filename='model.joblib')
dump(value=final_pipeline, filename='model.pkl')

['model.pkl']

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=49f0a3e6-eba7-4bd1-896c-a13c36012a92' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>