In [1]:
# Load all the important libraries to create the DecisionTree Model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

In [2]:
#import the data set

Data = pd.read_csv(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\Classification_Models\Chronic Kidney Disease Prediction Assignment\Dataset\CKD.csv")

In [3]:
Data.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.0,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.0,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.0,12300.0,4.705597,no,no,no,yes,poor,no,yes
2,4.0,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.0,...,34.0,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.0,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.0,50.0,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.0,12400.0,4.705597,no,no,no,yes,poor,no,yes


In [4]:
#Lets check total number of rows and columns 
Data.shape

(399, 25)

In [5]:
#Check the dataset info 
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399 entries, 0 to 398
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             399 non-null    float64
 1   bp              399 non-null    float64
 2   sg              399 non-null    object 
 3   al              399 non-null    float64
 4   su              399 non-null    float64
 5   rbc             399 non-null    object 
 6   pc              399 non-null    object 
 7   pcc             399 non-null    object 
 8   ba              399 non-null    object 
 9   bgr             399 non-null    float64
 10  bu              399 non-null    float64
 11  sc              399 non-null    float64
 12  sod             399 non-null    float64
 13  pot             399 non-null    float64
 14  hrmo            399 non-null    float64
 15  pcv             399 non-null    float64
 16  wc              399 non-null    float64
 17  rc              399 non-null    flo

In [6]:
# Lets check the description of data set
Data.describe()

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,pcv,wc,rc
count,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0
mean,51.492308,76.459948,0.899749,0.39599,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,38.868902,8408.191126,4.705597
std,16.995379,13.492053,1.314769,1.041155,74.864224,49.336046,5.623758,9.215829,2.823323,2.715753,8.157274,2526.204544,0.841006
min,2.0,50.0,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1,9.0,2200.0,2.1
25%,42.0,70.0,0.0,0.0,101.0,27.0,0.9,135.0,4.0,10.85,34.0,6950.0,4.5
50%,54.0,76.459948,0.0,0.0,127.0,44.0,1.4,137.528754,4.627244,12.518156,38.868902,8408.191126,4.705597
75%,64.0,80.0,2.0,0.0,150.0,62.5,3.077356,141.0,4.8,14.6,44.0,9400.0,5.1
max,90.0,180.0,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8,54.0,26400.0,8.0


In [7]:
# Lets check if the data set has NA values
Data[Data.isna().any(axis=1)]

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification


In [8]:
# Lets check if the data set has Null values
Data[Data.isnull().any(axis=1)]

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification


In [9]:
# Lets check how many classes are there in the target variable "Classification"
Data.classification.value_counts()

yes    249
no     150
Name: classification, dtype: int64

# From above output we got to know that there are 250 records in "Yes" class and 150 records in "No" class. so its called as imbalanced data set

In [10]:
# Lets upsample the "No" records to match with "Yes" records
Data_No = Data[Data["classification"]=="no"]

In [11]:
import random

samples_index = random.sample(range(150),99)

In [12]:
Data_No_final = Data_No.iloc[samples_index]

In [13]:
Data_No_final.reset_index(inplace=True,drop=True)

In [14]:
Data_No_final

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,34.0,80.0,b,0.0,0.0,normal,normal,notpresent,notpresent,121.0,...,52.0,9200.0,6.3,no,no,no,yes,poor,no,no
1,55.0,80.0,b,0.0,0.0,normal,normal,notpresent,notpresent,130.0,...,41.0,9100.0,6.0,no,no,no,yes,poor,no,no
2,47.0,80.0,a,0.0,0.0,normal,normal,notpresent,notpresent,93.0,...,52.0,8100.0,5.2,no,no,no,yes,poor,no,no
3,43.0,60.0,b,0.0,0.0,normal,normal,notpresent,notpresent,108.0,...,43.0,7200.0,5.5,no,no,no,yes,poor,no,no
4,32.0,70.0,b,0.0,0.0,normal,normal,notpresent,notpresent,100.0,...,43.0,6700.0,5.9,no,no,no,yes,poor,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,45.0,60.0,a,0.0,0.0,normal,normal,notpresent,notpresent,114.0,...,43.0,9200.0,5.8,no,no,no,yes,poor,no,no
95,28.0,60.0,b,0.0,0.0,normal,normal,notpresent,notpresent,79.0,...,51.0,6500.0,5.0,no,no,no,yes,poor,no,no
96,38.0,60.0,a,0.0,0.0,normal,normal,notpresent,notpresent,91.0,...,46.0,9100.0,5.8,no,no,no,yes,poor,no,no
97,73.0,60.0,a,0.0,0.0,normal,normal,notpresent,notpresent,127.0,...,52.0,11000.0,4.7,no,no,no,yes,poor,no,no


In [15]:
Data = Data.append(Data_No_final,ignore_index=True)

  Data = Data.append(Data_No_final,ignore_index=True)


In [16]:
Data.classification.value_counts()

yes    249
no     249
Name: classification, dtype: int64

# From above output we have upsampled the "No" class counts to match with "Yes" counts

In [17]:
# Lets convert the nominal categoty columns to numerical columns using one-hot-encoding via pandas get_dummies method

Data = pd.get_dummies(Data,drop_first=True)

In [18]:
Data.head()

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.0,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,0,0,0,0,0,0,1,1,0,1
1,3.0,76.459948,2.0,0.0,148.112676,22.0,0.7,137.528754,4.627244,10.7,...,1,0,0,0,0,0,1,0,0,1
2,4.0,76.459948,1.0,0.0,99.0,23.0,0.6,138.0,4.4,12.0,...,1,0,0,0,0,0,1,0,0,1
3,5.0,76.459948,1.0,0.0,148.112676,16.0,0.7,138.0,3.2,8.1,...,1,0,0,0,0,0,1,0,1,1
4,5.0,50.0,0.0,0.0,148.112676,25.0,0.6,137.528754,4.627244,11.8,...,1,0,0,0,0,0,1,0,0,1


In [19]:
Data.columns

Index(['age', 'bp', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hrmo', 'pcv',
       'wc', 'rc', 'sg_b', 'sg_c', 'sg_d', 'sg_e', 'rbc_normal', 'pc_normal',
       'pcc_present', 'ba_present', 'htn_yes', 'dm_yes', 'cad_yes',
       'appet_yes', 'pe_yes', 'ane_yes', 'classification_yes'],
      dtype='object')

In [20]:
# Lets seperate the dependent and independent variables
dependent = Data[['classification_yes']]
independent = Data[['age', 'bp', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hrmo', 'pcv',
       'wc', 'rc', 'sg_b', 'sg_c', 'sg_d', 'sg_e', 'rbc_normal', 'pc_normal',
       'pcc_present', 'ba_present', 'htn_yes', 'dm_yes', 'cad_yes',
       'appet_yes', 'pe_yes', 'ane_yes']]

In [21]:
import sklearn
sklearn.metrics.get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_

In [23]:
# Lets spli the data to trai  and test set

X_Train,X_Test,Y_Train,Y_Test = train_test_split(independent,dependent,random_state=0,test_size=30)

In [24]:
# Model Parameters 
Model_Params = {"criterion":['gini','entropy','log_loss'],'splitter':['best', 'random'],"max_depth":[i for i in range(1,len(Data.columns.to_list()))],
               'min_samples_split':[i for i in np.arange(0.1,1.0,0.3)],'min_samples_leaf':[i for i in np.arange(0.1,1.0,0.3)],
               'max_features':['auto','sqrt','log2'],'min_impurity_decrease':[i for i in np.arange(0.1,1.0,0.3)],'ccp_alpha':[0.00001,0.0001,0.001,0.01,0.1,1.0]}

In [25]:
# Lets create a model now using the Gridsearch CV

DTC_Model_Creation = GridSearchCV(DecisionTreeClassifier(),Model_Params,n_jobs=-1,scoring={"roc_auc_score":'roc_auc','f1_weighted_score':'f1_weighted'},
                                            refit='roc_auc_score',verbose=2)

In [26]:
DTC_Model_Creation.fit(X_Train,Y_Train)

Fitting 5 folds for each of 78732 candidates, totalling 393660 fits




In [27]:
# The best parameter that has givem high performance scores
DTC_Model_Creation.best_params_

{'ccp_alpha': 1e-05,
 'criterion': 'log_loss',
 'max_depth': 5,
 'max_features': 'auto',
 'min_impurity_decrease': 0.1,
 'min_samples_leaf': 0.1,
 'min_samples_split': 0.4,
 'splitter': 'best'}

In [28]:
# The best score for the above parameters
DTC_Model_Creation.best_score_

0.9754462967701301

# Since there are lot of failures in the gridsearch CV results so the entries will be NA. so we don't need those entries to be saved as it may confuse us in future so we are removing those NA entries and have only the valid entries of grid search CV results saved and used for future reference

In [29]:
# Creating as data frame to save the GridSearch CV results
CV_Output = pd.DataFrame.from_dict(DTC_Model_Creation.cv_results_)

In [30]:
# Check the entries count
CV_Output.shape

(78732, 29)

In [31]:
# Final Data frame that has valid entries in it
CV_Output

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_ccp_alpha,param_criterion,param_max_depth,param_max_features,param_min_impurity_decrease,param_min_samples_leaf,...,std_test_roc_auc_score,rank_test_roc_auc_score,split0_test_f1_weighted_score,split1_test_f1_weighted_score,split2_test_f1_weighted_score,split3_test_f1_weighted_score,split4_test_f1_weighted_score,mean_test_f1_weighted_score,std_test_f1_weighted_score,rank_test_f1_weighted_score
0,0.008799,0.000748,0.010506,0.001411,0.00001,gini,1,auto,0.1,0.1,...,0.038416,4234,0.847685,0.789263,0.838949,0.902414,0.891344,0.853931,0.040487,3917
1,0.008793,0.001942,0.008976,0.000319,0.00001,gini,1,auto,0.1,0.1,...,0.123941,12960,0.801222,0.836256,0.599587,0.592028,0.339324,0.633683,0.178129,12714
2,0.009000,0.000002,0.009798,0.000400,0.00001,gini,1,auto,0.1,0.1,...,0.064679,5456,0.925117,0.836256,0.732575,0.881144,0.775086,0.830035,0.069583,4546
3,0.009202,0.002226,0.008599,0.001200,0.00001,gini,1,auto,0.1,0.1,...,0.118997,15530,0.789263,0.333333,0.704116,0.592028,0.339324,0.551613,0.186591,15220
4,0.007999,0.001549,0.009999,0.001414,0.00001,gini,1,auto,0.1,0.1,...,0.159985,9696,0.333333,0.789263,0.752395,0.697783,1.000000,0.714555,0.216381,9365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78727,0.003710,0.000611,0.005701,0.001336,1.0,log_loss,27,log2,0.7,0.7,...,0.000000,21203,0.333333,0.333333,0.333333,0.339324,0.339324,0.335730,0.002935,21203
78728,0.005404,0.000486,0.010398,0.001018,1.0,log_loss,27,log2,0.7,0.7,...,0.000000,21203,0.333333,0.333333,0.333333,0.339324,0.339324,0.335730,0.002935,21203
78729,0.005417,0.000377,0.009199,0.000400,1.0,log_loss,27,log2,0.7,0.7,...,0.000000,21203,0.333333,0.333333,0.333333,0.339324,0.339324,0.335730,0.002935,21203
78730,0.005400,0.000490,0.009000,0.000634,1.0,log_loss,27,log2,0.7,0.7,...,0.000000,21203,0.333333,0.333333,0.333333,0.339324,0.339324,0.335730,0.002935,21203


In [32]:
# Lets save the grid search CV results to the csv file
CV_Output.to_csv(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\Classification_Models\Chronic Kidney Disease Prediction Assignment\Grid Serach CV Results\DTC_GridSerachCV_Results.csv")

In [33]:
# Lets test the model with best parameter that we have got agains the test data and check the performance

Y_Predicted = DTC_Model_Creation.predict(X_Test)

In [34]:
Y_Predicted

array([1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 1], dtype=uint8)

In [35]:
# Lets check the confusion matrix for the above predicted against the actual results
confusion_matrix(Y_Predicted,Y_Test)

array([[14,  2],
       [ 2, 12]], dtype=int64)

In [36]:
#lets check the classification test report for the predicted against the actual results
print(classification_report(Y_Predicted,Y_Test))

              precision    recall  f1-score   support

           0       0.88      0.88      0.88        16
           1       0.86      0.86      0.86        14

    accuracy                           0.87        30
   macro avg       0.87      0.87      0.87        30
weighted avg       0.87      0.87      0.87        30



In [37]:
DTC_Model_Creation.predict_proba(X_Test)[:,1]

array([0.68      , 0.03636364, 0.03636364, 0.03636364, 0.97474747,
       0.03636364, 0.97474747, 0.03636364, 0.97474747, 0.03636364,
       0.97474747, 0.97474747, 0.03636364, 0.97474747, 0.03636364,
       0.97474747, 0.03636364, 0.97474747, 0.03636364, 0.03636364,
       0.03636364, 0.97474747, 0.97474747, 0.03636364, 0.03636364,
       0.03636364, 0.97474747, 0.03636364, 0.97474747, 0.97474747])

In [38]:
# lets check the roc_auc_score results for the predicted against the actual results
roc_auc_score(Y_Test,DTC_Model_Creation.predict_proba(X_Test)[:,1])

0.8928571428571428

In [40]:
# Wow we can see our auc score is 100 percent and weighted f1 score is also 93 percent lets save out model
import pickle
pickle.dump(DTC_Model_Creation,open(r'C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\Classification_Models\Chronic Kidney Disease Prediction Assignment\Final Model\DTC_Final_Model.sav','wb'))

In [41]:
# Lets load and test our model
def Final_model_prod(model,columns,stdscaler='None'):
    query_values=[]
    for col_idx in range(0,len(columns)-1):
        query_values.append(float(input("Please enter valid {}: \n Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' ".format(columns[col_idx]))))
    pred_class=model.predict(stdscaler.transform([query_values]))
    
    if pred_class==0:
        print ("This patient doesn't have Chronic Kidney Disorder")
    else:
        print ("This patient have Chronic Kidney Disorder Please proceed proper medication")

In [42]:
# Load the saved model
Final_model= pickle.load(open(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\Classification_Models\Chronic Kidney Disease Prediction Assignment\Final Model\DTC_Final_Model.sav",'rb'))

# here the scaler is the varaible that we have used for standardising the independent variables so we should use the same variable here  


In [None]:
# Test the final model
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings("ignore", category=UserWarning)
Final_model_prod(Final_model,Data.columns.to_list(),scaler)

In [None]:
# Test the final model
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings("ignore", category=UserWarning)
Final_model_prod(Final_model,Data.columns.to_list(),scaler)