In [2]:
# Load all the important libraries to create the GaussianNaiveBayes Model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

In [3]:
#import the data set

Data = pd.read_csv(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\Classification_Models\Chronic Kidney Disease Prediction Assignment\Dataset\CKD.csv")

In [4]:
Data.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.0,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.0,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.0,12300.0,4.705597,no,no,no,yes,poor,no,yes
2,4.0,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.0,...,34.0,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.0,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.0,50.0,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.0,12400.0,4.705597,no,no,no,yes,poor,no,yes


In [5]:
#Lets check total number of rows and columns 
Data.shape

(399, 25)

In [6]:
#Check the dataset info 
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399 entries, 0 to 398
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             399 non-null    float64
 1   bp              399 non-null    float64
 2   sg              399 non-null    object 
 3   al              399 non-null    float64
 4   su              399 non-null    float64
 5   rbc             399 non-null    object 
 6   pc              399 non-null    object 
 7   pcc             399 non-null    object 
 8   ba              399 non-null    object 
 9   bgr             399 non-null    float64
 10  bu              399 non-null    float64
 11  sc              399 non-null    float64
 12  sod             399 non-null    float64
 13  pot             399 non-null    float64
 14  hrmo            399 non-null    float64
 15  pcv             399 non-null    float64
 16  wc              399 non-null    float64
 17  rc              399 non-null    flo

In [7]:
# Lets check the description of data set
Data.describe()

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,pcv,wc,rc
count,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0
mean,51.492308,76.459948,0.899749,0.39599,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,38.868902,8408.191126,4.705597
std,16.995379,13.492053,1.314769,1.041155,74.864224,49.336046,5.623758,9.215829,2.823323,2.715753,8.157274,2526.204544,0.841006
min,2.0,50.0,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1,9.0,2200.0,2.1
25%,42.0,70.0,0.0,0.0,101.0,27.0,0.9,135.0,4.0,10.85,34.0,6950.0,4.5
50%,54.0,76.459948,0.0,0.0,127.0,44.0,1.4,137.528754,4.627244,12.518156,38.868902,8408.191126,4.705597
75%,64.0,80.0,2.0,0.0,150.0,62.5,3.077356,141.0,4.8,14.6,44.0,9400.0,5.1
max,90.0,180.0,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8,54.0,26400.0,8.0


In [8]:
# Lets check if the data set has NA values
Data[Data.isna().any(axis=1)]

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification


In [9]:
# Lets check if the data set has Null values
Data[Data.isnull().any(axis=1)]

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification


In [10]:
# Lets check how many classes are there in the target variable "Classification"
Data.classification.value_counts()

yes    249
no     150
Name: classification, dtype: int64

# From above output we got to know that there are 250 records in "Yes" class and 150 records in "No" class. so its called as imbalanced data set

In [11]:
# Lets upsample the "No" records to match with "Yes" records
Data_No = Data[Data["classification"]=="no"]

In [12]:
import random

samples_index = random.sample(range(150),99)

In [13]:
Data_No_final = Data_No.iloc[samples_index]

In [14]:
Data_No_final.reset_index(inplace=True,drop=True)

In [15]:
Data_No_final

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,56.0,70.000000,b,0.0,0.0,normal,normal,notpresent,notpresent,70.0,...,50.000000,11000.000000,5.100000,no,no,no,yes,poor,no,no
1,47.0,60.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,109.0,...,41.000000,8300.000000,5.200000,no,no,no,yes,poor,no,no
2,57.0,60.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,105.0,...,44.000000,10400.000000,6.200000,no,no,no,yes,poor,no,no
3,74.0,60.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,88.0,...,53.000000,6000.000000,4.500000,no,no,no,yes,poor,no,no
4,37.0,60.000000,b,0.0,0.0,normal,normal,notpresent,notpresent,111.0,...,50.000000,5500.000000,5.700000,no,no,no,yes,poor,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,41.0,80.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,122.0,...,41.000000,9100.000000,5.200000,no,no,no,yes,poor,no,no
95,35.0,60.000000,b,0.0,0.0,normal,normal,notpresent,notpresent,105.0,...,43.000000,5800.000000,6.200000,no,no,no,yes,poor,no,no
96,50.0,76.459948,a,0.0,0.0,normal,normal,notpresent,notpresent,92.0,...,48.000000,4700.000000,5.400000,no,no,no,yes,poor,no,no
97,30.0,60.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,138.0,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,no,no


In [16]:
Data = Data.append(Data_No_final,ignore_index=True)

  Data = Data.append(Data_No_final,ignore_index=True)


In [17]:
Data.classification.value_counts()

yes    249
no     249
Name: classification, dtype: int64

# From above output we have upsampled the "No" class counts to match with "Yes" counts

In [18]:
# Lets convert the nominal categoty columns to numerical columns using one-hot-encoding via pandas get_dummies method

Data = pd.get_dummies(Data,drop_first=True)

In [19]:
Data.head()

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.0,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,0,0,0,0,0,0,1,1,0,1
1,3.0,76.459948,2.0,0.0,148.112676,22.0,0.7,137.528754,4.627244,10.7,...,1,0,0,0,0,0,1,0,0,1
2,4.0,76.459948,1.0,0.0,99.0,23.0,0.6,138.0,4.4,12.0,...,1,0,0,0,0,0,1,0,0,1
3,5.0,76.459948,1.0,0.0,148.112676,16.0,0.7,138.0,3.2,8.1,...,1,0,0,0,0,0,1,0,1,1
4,5.0,50.0,0.0,0.0,148.112676,25.0,0.6,137.528754,4.627244,11.8,...,1,0,0,0,0,0,1,0,0,1


In [20]:
Data.columns

Index(['age', 'bp', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hrmo', 'pcv',
       'wc', 'rc', 'sg_b', 'sg_c', 'sg_d', 'sg_e', 'rbc_normal', 'pc_normal',
       'pcc_present', 'ba_present', 'htn_yes', 'dm_yes', 'cad_yes',
       'appet_yes', 'pe_yes', 'ane_yes', 'classification_yes'],
      dtype='object')

In [21]:
# Lets seperate the dependent and independent variables
dependent = Data[['classification_yes']]
independent = Data[['age', 'bp', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hrmo', 'pcv',
       'wc', 'rc', 'sg_b', 'sg_c', 'sg_d', 'sg_e', 'rbc_normal', 'pc_normal',
       'pcc_present', 'ba_present', 'htn_yes', 'dm_yes', 'cad_yes',
       'appet_yes', 'pe_yes', 'ane_yes']]

# Lets scale all the columns to same measuremet scale from 0 to 1 using standardisation as you can see everu column values are in different measurements

In [22]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
independent = scaler.fit_transform(independent)

In [23]:
import sklearn
sklearn.metrics.get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_

In [25]:
# Lets spli the data to train  and test set

X_Train,X_Test,Y_Train,Y_Test = train_test_split(independent,dependent,random_state=0,test_size=30)

In [26]:
# Model Parameters 
Model_Params = {'var_smoothing':[0.00000001,0.00000001,0.0000001,0.000001,0.00001,0.0001,0.001,0.01,0.1,1.0] }

In [27]:
# Lets create a model now using the Gridsearch CV

GNB_Model_Creation = GridSearchCV(GaussianNB(),Model_Params,n_jobs=-1,scoring={"roc_auc_score":'roc_auc','f1_weighted_score':'f1_weighted'},
                                            refit='roc_auc_score',verbose=2)

In [28]:
GNB_Model_Creation.fit(X_Train,Y_Train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  y = column_or_1d(y, warn=True)


In [29]:
# The best parameter that has givem high performance scores
GNB_Model_Creation.best_params_

{'var_smoothing': 0.1}

In [30]:
# The best score for the above parameters
GNB_Model_Creation.best_score_

0.9988111874348021

In [31]:
# Creating as data frame to save the GridSearch CV results
CV_Output = pd.DataFrame.from_dict(GNB_Model_Creation.cv_results_)

In [32]:
# Check the entries count
CV_Output.shape

(10, 22)

In [33]:
# Final Data frame that has valid entries in it
CV_Output

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_var_smoothing,params,split0_test_roc_auc_score,split1_test_roc_auc_score,split2_test_roc_auc_score,split3_test_roc_auc_score,...,std_test_roc_auc_score,rank_test_roc_auc_score,split0_test_f1_weighted_score,split1_test_f1_weighted_score,split2_test_f1_weighted_score,split3_test_f1_weighted_score,split4_test_f1_weighted_score,mean_test_f1_weighted_score,std_test_f1_weighted_score,rank_test_f1_weighted_score
0,0.011904,0.01019,0.008401,0.001018,0.0,{'var_smoothing': 1e-08},1.0,0.988909,1.0,0.988668,...,0.005493,3,1.0,0.968081,0.98936,0.967742,0.989247,0.982886,0.012836,5
1,0.008201,0.002039,0.007304,0.000875,0.0,{'var_smoothing': 1e-08},1.0,0.988909,1.0,0.988668,...,0.005493,3,1.0,0.968081,0.98936,0.967742,0.989247,0.982886,0.012836,5
2,0.007503,0.002968,0.005802,0.001472,0.0,{'var_smoothing': 1e-07},1.0,0.988909,1.0,0.988668,...,0.005493,3,1.0,0.968081,0.98936,0.967742,0.989247,0.982886,0.012836,5
3,0.006598,0.002246,0.006,0.001788,1e-06,{'var_smoothing': 1e-06},1.0,0.988909,1.0,0.988668,...,0.005493,3,1.0,0.968081,0.98936,0.967742,0.989247,0.982886,0.012836,5
4,0.005001,0.003795,0.005305,0.000404,1e-05,{'var_smoothing': 1e-05},1.0,0.988909,1.0,0.988668,...,0.005493,3,1.0,0.98936,0.98936,0.967742,0.989247,0.987142,0.010545,1
5,0.004703,0.002785,0.0054,0.000489,0.0001,{'var_smoothing': 0.0001},1.0,0.988909,1.0,0.988668,...,0.005493,3,1.0,0.98936,0.98936,0.967742,0.989247,0.987142,0.010545,1
6,0.0048,0.001471,0.0056,0.0008,0.001,{'var_smoothing': 0.001},1.0,0.988909,1.0,0.988668,...,0.005493,3,1.0,0.98936,0.98936,0.967742,0.989247,0.987142,0.010545,1
7,0.0058,0.00172,0.0056,0.001743,0.01,{'var_smoothing': 0.01},1.0,0.988683,1.0,0.988668,...,0.005548,10,1.0,0.978723,0.98936,0.967742,0.989247,0.985015,0.010948,4
8,0.0042,0.001469,0.0042,0.000401,0.1,{'var_smoothing': 0.1},1.0,0.996831,1.0,0.997225,...,0.001461,1,1.0,0.957428,0.968053,0.956979,0.989247,0.974341,0.017364,9
9,0.0032,0.001166,0.0046,0.000801,1.0,{'var_smoothing': 1.0},1.0,0.997284,1.0,0.9963,...,0.001602,2,0.836256,0.824709,0.813034,0.711204,0.924364,0.821913,0.06789,10


In [34]:
# Lets save the grid search CV results to the csv file
CV_Output.to_csv(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\Classification_Models\Chronic Kidney Disease Prediction Assignment\Grid Serach CV Results\GNB_GridSerachCV_Results.csv")

In [35]:
# Lets test the model with best parameter that we have got agains the test data and check the performance

Y_Predicted = GNB_Model_Creation.predict(X_Test)

In [36]:
Y_Predicted

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 1], dtype=uint8)

In [37]:
# Lets check the confusion matrix for the above predicted against the actual results
confusion_matrix(Y_Predicted,Y_Test)

array([[16,  0],
       [ 0, 14]], dtype=int64)

In [38]:
#lets check the classification test report for the predicted against the actual results
print(classification_report(Y_Predicted,Y_Test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      1.00      1.00        14

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [42]:
GNB_Model_Creation.predict_proba(X_Test)[:,1]

array([7.74316490e-13, 4.66538726e-17, 2.32938252e-16, 8.86701372e-17,
       1.00000000e+00, 2.38220895e-19, 1.00000000e+00, 2.68702016e-15,
       1.00000000e+00, 1.61650788e-15, 1.00000000e+00, 1.00000000e+00,
       6.08608530e-14, 1.00000000e+00, 8.80654657e-18, 1.00000000e+00,
       3.61011718e-14, 1.00000000e+00, 4.70022510e-20, 1.00000000e+00,
       4.53630194e-15, 1.00000000e+00, 1.00000000e+00, 9.98550985e-01,
       7.88821143e-15, 2.38220895e-19, 1.34722957e-16, 1.22330036e-14,
       1.00000000e+00, 1.00000000e+00])

In [46]:
# lets check the roc_auc_score results for the predicted against the actual results
roc_auc_score(Y_Test,GNB_Model_Creation.predict_proba(X_Test)[:,1])

1.0

In [45]:
# Wow we can see our auc score is 100 percent and weighted f1 score is also 93 percent lets save out model
import pickle
pickle.dump(GNB_Model_Creation,open(r'C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\Classification_Models\Chronic Kidney Disease Prediction Assignment\Final Model\GNB_Final_Model.sav','wb'))

In [47]:
# Lets load and test our model
def Final_model_prod(model,columns,stdscaler='None'):
    query_values=[]
    for col_idx in range(0,len(columns)-1):
        query_values.append(float(input("Please enter valid {}: \n Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' ".format(columns[col_idx]))))
    pred_class=model.predict(stdscaler.transform([query_values]))
    
    if pred_class==0:
        print ("This patient doesn't have Chronic Kidney Disorder")
    else:
        print ("This patient have Chronic Kidney Disorder Please proceed proper medication")

In [48]:
# Load the saved model
Final_model= pickle.load(open(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\Classification_Models\Chronic Kidney Disease Prediction Assignment\Final Model\GNB_Final_Model.sav",'rb'))

# here the scaler is the varaible that we have used for standardising the independent variables so we should use the same variable here  


In [229]:
# Test the final model
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings("ignore", category=UserWarning)
Final_model_prod(Final_model,Data.columns.to_list(),scaler)

Please enter valid age: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 40
Please enter valid bp: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 179
Please enter valid al: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 3
Please enter valid su: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 1
Please enter valid bgr: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 165
Please enter valid bu: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 34
Please enter valid sc: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 0.9
Please enter valid sod: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 129
Please enter valid pot: 
 Note: If this is a boolean parame

In [53]:
# Test the final model
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings("ignore", category=UserWarning)
Final_model_prod(Final_model,Data.columns.to_list(),scaler)

Please enter valid age: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 40
Please enter valid bp: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 179
Please enter valid al: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 3
Please enter valid su: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 1
Please enter valid bgr: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 165
Please enter valid bu: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 34
Please enter valid sc: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 0.9
Please enter valid sod: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 129
Please enter valid pot: 
 Note: If this is a boolean parame