#### Importing Required Libraries

In [1]:
import pandas as pd
import numpy as np
import clfmodels
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
import warnings
warnings.filterwarnings("ignore")

#### Importing Dataset

In [2]:
df = pd.read_csv("Clean-Model-Data.csv")
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,2,0,0,6,25,1,1,41,...,0,1,1,0,0,0,0,1,0,2
1,149190,55629189,2,0,1,1,1,7,3,59,...,0,3,1,0,0,0,0,0,1,1
2,64410,86047875,0,0,2,1,1,7,2,11,...,0,1,1,0,0,0,0,1,1,2
3,500364,82442376,2,1,3,1,1,7,2,44,...,0,3,1,0,0,0,0,0,1,2
4,16680,42519267,2,1,4,1,1,7,1,51,...,0,2,1,0,0,0,0,0,1,2


<b>
    Spliiting Dependent & Independent Variables
</b>

In [3]:
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [4]:
x = df[['race', 'gender', 'age', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed']]

y = df[['readmitted']]

#### Splitting Training & Test Dataset

In [5]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25)

#### Prediction Model

##### Gaussian Naive Bayes

In [6]:
gnb = clfmodels.GaussianNaiveBayes(x_train,y_train)
gnb

{'Name': 'Gaussian Naive Bayes Classifier',
 'Model': GaussianNB(),
 'Train Data - Accuracy/Score': 0.49585975577799907}

In [7]:
gnb['Model'].score(x_test,y_test)

0.4966197625972801

In [29]:
print(classification_report(y_test,gnb['Model'].predict(x_test)))

              precision    recall  f1-score   support

           0       0.17      0.39      0.24      2824
           1       0.42      0.03      0.05      8679
           2       0.60      0.79      0.68     13939

    accuracy                           0.49     25442
   macro avg       0.40      0.41      0.33     25442
weighted avg       0.49      0.49      0.42     25442



In [30]:
confusion_matrix(y_test,gnb['Model'].predict(x_test))

array([[ 1115,    59,  1650],
       [ 2725,   252,  5702],
       [ 2583,   293, 11063]], dtype=int64)

#### Multinomial Naive Bayes

In [31]:
mnb = clfmodels.MultinomialNaiveBayes(x_train,y_train)
mnb

{'Name': 'Multinomial Naive Bayes Classifier',
 'Model': MultinomialNB(),
 'Train Data - Accuracy/Score': 0.44050364236675227}

In [32]:
mnb['Model'].score(x_test,y_test)

0.4391557267510416

#### SVM

In [None]:
#svc = clfmodels.SVMClassifier(x_train,y_train)
#svc

In [None]:
#svc['Model'].score(x_test,y_test)

#### CART

In [33]:
cart = clfmodels.CART(x_train,y_train)
cart

{'Name': 'Classification & Regression Tree',
 'Model': DecisionTreeClassifier(),
 'Train Data - Accuracy/Score': 1.0}

In [34]:
cart['Model'].score(x_test,y_test)

0.47433377879097555

In [35]:
print(classification_report(y_test,cart['Model'].predict(x_test)))

              precision    recall  f1-score   support

           0       0.14      0.15      0.15      2824
           1       0.39      0.40      0.39      8679
           2       0.60      0.59      0.60     13939

    accuracy                           0.47     25442
   macro avg       0.38      0.38      0.38     25442
weighted avg       0.48      0.47      0.48     25442



In [36]:
confusion_matrix(y_test,cart['Model'].predict(x_test))

array([[ 431, 1125, 1268],
       [1146, 3457, 4076],
       [1405, 4354, 8180]], dtype=int64)

In [12]:
cart['Model'].get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

#### ID3 Tree

In [37]:
id3 = clfmodels.ID3(x_train,y_train)
id3

{'Name': 'ID3 Classifier',
 'Model': DecisionTreeClassifier(criterion='entropy'),
 'Train Data - Accuracy/Score': 1.0}

In [38]:
id3['Model'].score(x_test,y_test)

0.4735476770694128

#### C4.5 Tree

In [39]:
c45 = clfmodels.C4_5(x_train,y_train)
c45

{'Name': 'C4.5 Classifier',
 'Model': DecisionTreeClassifier(criterion='entropy'),
 'Train Data - Accuracy/Score': 1.0}

In [16]:
c45['Model'].score(x_test,y_test)

0.47971857558368053

#### Random Forest

In [40]:
rfc = clfmodels.RandomForest(x_train,y_train)
rfc

{'Name': 'Random Forest Classifier',
 'Model': RandomForestClassifier(),
 'Train Data - Accuracy/Score': 1.0}

In [41]:
rfc['Model'].score(x_test,y_test)

0.5877682572124833

<b> Hyperparameter Tuning </b>

In [10]:
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

In [12]:
params = {
    'n_estimators': np.arange(100,1500,100),
    'max_features': ['auto','sqrt','log2'],
    'max_depth': np.arange(3,15,1),
    'min_samples_split': np.arange(2,15,1),
    'min_samples_leaf': np.arange(1,15,1)
}

In [None]:
rfg = GridSearchCV(rfc['Model'],params,cv = 5)
rfg.fit(x_train,y_train)

In [26]:
rfr = RandomizedSearchCV(rfc['Model'],params,cv=5)

In [27]:
rfr.fit(x_train,y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(),
                   param_distributions={'max_depth': array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                                        'min_samples_split': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                                        'n_estimators': array([ 100,  200,  300,  400,  500,  600,  700,  800,  900, 1000, 1100,
       1200, 1300, 1400])})

In [28]:
rfr.best_params_

{'n_estimators': 1300,
 'min_samples_split': 13,
 'min_samples_leaf': 13,
 'max_features': 'auto',
 'max_depth': 14}

In [29]:
rfr.score(x_train,y_train)

0.6220978984329961

In [30]:
rfr.score(x_test,y_test)

0.5881613080732647

### Gradient Booster

In [19]:
grb = clfmodels.GradientBooster(x_train,y_train)
grb

{'Name': 'Gradient Boost Classifier',
 'Model': GradientBoostingClassifier(),
 'Train Data - Accuracy/Score': 0.5934044337298884}

In [20]:
grb['Model'].score(x_test,y_test)

0.5917380709063753

In [23]:
#Hyperparameter Tuning

### Ada Boost

In [24]:
adb = clfmodels.AdaBoost(x_train,y_train)
adb

{'Name': 'Ada Boost Classifier',
 'Model': AdaBoostClassifier(),
 'Train Data - Accuracy/Score': 0.5820580682354174}

In [26]:
adb['Model'].score(x_test,y_test)

0.5826192909362472

In [27]:
# Hyperparameter Tuning

### XGBoost

In [29]:
xgb = clfmodels.XGBoost(x_train,y_train)
xgb

{'Name': 'XGBoost Classifier',
 'Model': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=None, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=None, n_jobs=None,
               num_parallel_tree=None, objective='multi:softprob', ...),
 'Train Data - Accuracy/Score': 0.6581940149887322}

In [30]:
xgb['Model'].score(x_test,y_test)

0.5941749862432198

In [31]:
# Hyperparameter Tuning

In [1]:
from sklearn.linear_model import LogisticRegression

In [44]:
mod = LogisticRegression()

In [45]:
mod.fit(x_train,y_train)

In [46]:
mod.score(x_train,y_train)

0.561854724595147

In [47]:
mod.score(x_test,y_test)

0.5685480701202735

In [48]:
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

In [49]:
params = {
    'C':[0.0001,0.01,0.1,1,10],
    'penalty':['l1','l2'],
    'solver':['liblinear','saga']}

In [50]:
gs = GridSearchCV(mod,params,cv=5,scoring='accuracy')