In [113]:
# Load all the important libraries to create the Logistic Regression Model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

In [82]:
#import the data set

Data = pd.read_csv(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\Classification_Models\Chronic Kidney Disease Prediction Assignment\Dataset\CKD.csv")

In [83]:
Data.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.0,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.0,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.0,12300.0,4.705597,no,no,no,yes,poor,no,yes
2,4.0,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.0,...,34.0,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.0,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.0,50.0,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.0,12400.0,4.705597,no,no,no,yes,poor,no,yes


In [84]:
#Lets check total number of rows and columns 
Data.shape

(399, 25)

In [85]:
#Check the dataset info 
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399 entries, 0 to 398
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             399 non-null    float64
 1   bp              399 non-null    float64
 2   sg              399 non-null    object 
 3   al              399 non-null    float64
 4   su              399 non-null    float64
 5   rbc             399 non-null    object 
 6   pc              399 non-null    object 
 7   pcc             399 non-null    object 
 8   ba              399 non-null    object 
 9   bgr             399 non-null    float64
 10  bu              399 non-null    float64
 11  sc              399 non-null    float64
 12  sod             399 non-null    float64
 13  pot             399 non-null    float64
 14  hrmo            399 non-null    float64
 15  pcv             399 non-null    float64
 16  wc              399 non-null    float64
 17  rc              399 non-null    flo

In [86]:
# Lets check the description of data set
Data.describe()

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,pcv,wc,rc
count,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0,399.0
mean,51.492308,76.459948,0.899749,0.39599,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,38.868902,8408.191126,4.705597
std,16.995379,13.492053,1.314769,1.041155,74.864224,49.336046,5.623758,9.215829,2.823323,2.715753,8.157274,2526.204544,0.841006
min,2.0,50.0,0.0,0.0,22.0,1.5,0.4,4.5,2.5,3.1,9.0,2200.0,2.1
25%,42.0,70.0,0.0,0.0,101.0,27.0,0.9,135.0,4.0,10.85,34.0,6950.0,4.5
50%,54.0,76.459948,0.0,0.0,127.0,44.0,1.4,137.528754,4.627244,12.518156,38.868902,8408.191126,4.705597
75%,64.0,80.0,2.0,0.0,150.0,62.5,3.077356,141.0,4.8,14.6,44.0,9400.0,5.1
max,90.0,180.0,5.0,5.0,490.0,391.0,76.0,163.0,47.0,17.8,54.0,26400.0,8.0


In [87]:
# Lets check if the data set has NA values
Data[Data.isna().any(axis=1)]

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification


In [88]:
# Lets check if the data set has Null values
Data[Data.isnull().any(axis=1)]

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification


In [89]:
# Lets check how many classes are there in the target variable "Classification"
Data.classification.value_counts()

yes    249
no     150
Name: classification, dtype: int64

# From above output we got to know that there are 250 records in "Yes" class and 150 records in "No" class. so its called as imbalanced data set

In [90]:
# Lets upsample the "No" records to match with "Yes" records
Data_No = Data[Data["classification"]=="no"]

In [91]:
import random

samples_index = random.sample(range(150),99)

In [92]:
Data_No_final = Data_No.iloc[samples_index]

In [93]:
Data_No_final.reset_index(inplace=True,drop=True)

In [94]:
Data_No_final

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,28.0,60.0,b,0.0,0.0,normal,normal,notpresent,notpresent,79.0,...,51.0,6500.0,5.0,no,no,no,yes,poor,no,no
1,52.0,80.0,a,0.0,0.0,normal,normal,notpresent,notpresent,125.0,...,43.0,4700.0,4.6,no,no,no,yes,poor,no,no
2,51.0,80.0,a,0.0,0.0,normal,normal,notpresent,notpresent,94.0,...,46.0,9500.0,6.4,no,no,no,yes,poor,no,no
3,60.0,60.0,a,0.0,0.0,normal,normal,notpresent,notpresent,134.0,...,48.0,10700.0,5.6,no,no,no,yes,poor,no,no
4,49.0,80.0,a,0.0,0.0,normal,normal,notpresent,notpresent,122.0,...,41.0,5600.0,4.9,no,no,no,yes,poor,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,43.0,80.0,b,0.0,0.0,normal,normal,notpresent,notpresent,81.0,...,48.0,6900.0,4.9,no,no,no,yes,poor,no,no
95,23.0,80.0,a,0.0,0.0,normal,normal,notpresent,notpresent,99.0,...,46.0,4300.0,5.5,no,no,no,yes,poor,no,no
96,62.0,80.0,a,0.0,0.0,normal,normal,notpresent,notpresent,132.0,...,44.0,4700.0,4.5,no,no,no,yes,poor,no,no
97,55.0,80.0,b,0.0,0.0,normal,normal,notpresent,notpresent,130.0,...,41.0,9100.0,6.0,no,no,no,yes,poor,no,no


In [95]:
Data = Data.append(Data_No_final,ignore_index=True)

  Data = Data.append(Data_No_final,ignore_index=True)


In [98]:
Data.classification.value_counts()

yes    249
no     249
Name: classification, dtype: int64

# From above output we have upsampled the "No" class counts to match with "Yes" counts

In [99]:
# Lets convert the nominal categoty columns to numerical columns using one-hot-encoding via pandas get_dummies method

Data = pd.get_dummies(Data,drop_first=True)

In [100]:
Data.head()

Unnamed: 0,age,bp,al,su,bgr,bu,sc,sod,pot,hrmo,...,pc_normal,pcc_present,ba_present,htn_yes,dm_yes,cad_yes,appet_yes,pe_yes,ane_yes,classification_yes
0,2.0,76.459948,3.0,0.0,148.112676,57.482105,3.077356,137.528754,4.627244,12.518156,...,0,0,0,0,0,0,1,1,0,1
1,3.0,76.459948,2.0,0.0,148.112676,22.0,0.7,137.528754,4.627244,10.7,...,1,0,0,0,0,0,1,0,0,1
2,4.0,76.459948,1.0,0.0,99.0,23.0,0.6,138.0,4.4,12.0,...,1,0,0,0,0,0,1,0,0,1
3,5.0,76.459948,1.0,0.0,148.112676,16.0,0.7,138.0,3.2,8.1,...,1,0,0,0,0,0,1,0,1,1
4,5.0,50.0,0.0,0.0,148.112676,25.0,0.6,137.528754,4.627244,11.8,...,1,0,0,0,0,0,1,0,0,1


In [102]:
Data.columns

Index(['age', 'bp', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hrmo', 'pcv',
       'wc', 'rc', 'sg_b', 'sg_c', 'sg_d', 'sg_e', 'rbc_normal', 'pc_normal',
       'pcc_present', 'ba_present', 'htn_yes', 'dm_yes', 'cad_yes',
       'appet_yes', 'pe_yes', 'ane_yes', 'classification_yes'],
      dtype='object')

In [104]:
# Lets seperate the dependent and independent variables
dependent = Data[['classification_yes']]
independent = Data[['age', 'bp', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hrmo', 'pcv',
       'wc', 'rc', 'sg_b', 'sg_c', 'sg_d', 'sg_e', 'rbc_normal', 'pc_normal',
       'pcc_present', 'ba_present', 'htn_yes', 'dm_yes', 'cad_yes',
       'appet_yes', 'pe_yes', 'ane_yes']]

# Lets scale all the columns to same measuremet scale from 0 to 1 using standardisation as you can see everu column values are in different measurements

In [105]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
independent = scaler.fit_transform(independent)

In [121]:
import sklearn
sklearn.metrics.get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_

In [112]:
# Lets spli the data to trai  and test set

X_Train,X_Test,Y_Train,Y_Test = train_test_split(independent,dependent,random_state=0,test_size=30)

In [130]:
# Model Parameters 
Model_Params = {"penalty":['l1', 'l2', 'elasticnet'],'C':[0.0001,0.001,0.01,0.1,1.0,100,1000,10000],"fit_intercept":[False,True],
               "intercept_scaling":[0.001,0.01,0.1,1,10,100,1000],'tol':[0.00001,0.0001,0.001,0.1],'class_weight':['balanced',None],
               "random_state":[50],'solver':['lbfgs',"liblinear",'newton-cg',"newton-cholesky", 'sag', 'saga'],'warm_start':[True],'n_jobs':[-1],'max_iter':[i for i in range(100,1100,200)]}

In [131]:
# Lets create a model now using the Gridsearch CV

LogisticRegression_Model_Creation = GridSearchCV(LogisticRegression(),Model_Params,n_jobs=-1,scoring={"roc_auc_score":'roc_auc','f1_weighted_score':'f1_weighted'},
                                            refit='roc_auc_score',verbose=3)

In [132]:
LogisticRegression_Model_Creation.fit(X_Train,Y_Train)

Fitting 5 folds for each of 80640 candidates, totalling 403200 fits


246400 fits failed out of a total of 403200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
22400 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Vinoth\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Vinoth\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Vinoth\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueEr

In [133]:
# The best parameter that has givem high performance scores
LogisticRegression_Model_Creation.best_params_

{'C': 0.1,
 'class_weight': 'balanced',
 'fit_intercept': False,
 'intercept_scaling': 0.001,
 'max_iter': 100,
 'n_jobs': -1,
 'penalty': 'l2',
 'random_state': 50,
 'solver': 'lbfgs',
 'tol': 1e-05,
 'warm_start': True}

In [134]:
# The best score for the above parameters
LogisticRegression_Model_Creation.best_score_

0.9998149861239594

# Since there are lot of failures in the gridsearch CV results so the entries will be NA. so we don't need those entries to be saved as it may confuse us in future so we are removing those NA entries and have only the valid entries of grid search CV results saved and used for future reference

In [211]:
# Creating as data frame to save the GridSearch CV results
CV_Output = pd.DataFrame.from_dict(LogisticRegression_Model_Creation.cv_results_)

In [212]:
# Check the entries count
CV_Output.shape

(80640, 32)

In [213]:
# Get the indexes of all the records that has NA entries in the dataframe
index_of_NA_Vals = CV_Output[CV_Output.isna().any(axis=1)].index

In [214]:
index_of_NA_Vals

Int64Index([    0,     1,     2,     3,     8,     9,    10,    11,    12,
               13,
            ...
            80630, 80631, 80632, 80633, 80634, 80635, 80636, 80637, 80638,
            80639],
           dtype='int64', length=64960)

In [215]:
# We are dropping all the indexed records that has NA entries in it
CV_Output.drop(index_of_NA_Vals,axis=0,inplace=True)

In [216]:
# We are resetting the index from 0 to 'n' post removing the above indexed records 
CV_Output.reset_index(drop=True,inplace=True)

In [217]:
# Final Data frame that has valid entries in it
CV_Output

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_fit_intercept,param_intercept_scaling,param_max_iter,param_n_jobs,...,std_test_roc_auc_score,rank_test_roc_auc_score,split0_test_f1_weighted_score,split1_test_f1_weighted_score,split2_test_f1_weighted_score,split3_test_f1_weighted_score,split4_test_f1_weighted_score,mean_test_f1_weighted_score,std_test_f1_weighted_score,rank_test_f1_weighted_score
0,0.008729,0.002795,0.006501,0.000446,0.0001,balanced,False,0.001,100,-1,...,0.000000,29121,0.333333,0.333333,0.333333,0.327377,0.327377,0.330951,0.002918,29681
1,0.007200,0.002785,0.005001,0.000633,0.0001,balanced,False,0.001,100,-1,...,0.000000,29121,0.333333,0.333333,0.333333,0.327377,0.327377,0.330951,0.002918,29681
2,0.008800,0.000980,0.004801,0.000980,0.0001,balanced,False,0.001,100,-1,...,0.000000,29121,0.333333,0.333333,0.333333,0.327377,0.327377,0.330951,0.002918,29681
3,0.006200,0.002040,0.007000,0.001673,0.0001,balanced,False,0.001,100,-1,...,0.000000,29121,0.333333,0.333333,0.333333,0.327377,0.327377,0.330951,0.002918,29681
4,0.012200,0.000401,0.004801,0.000980,0.0001,balanced,False,0.001,100,-1,...,0.000000,29121,0.333333,0.333333,0.333333,0.327377,0.327377,0.330951,0.002918,29681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15675,0.020009,0.002470,0.005705,0.000607,10000,balanced,True,1000,900,-1,...,0.000897,7731,0.989360,0.968053,0.968053,0.956930,0.989247,0.974328,0.012884,7546
15676,0.231076,0.004348,0.005661,0.000838,10000,balanced,True,1000,900,-1,...,0.000409,5842,0.989360,0.989360,0.978714,0.978490,0.989247,0.985034,0.005253,5504
15677,0.227255,0.007097,0.005861,0.000446,10000,balanced,True,1000,900,-1,...,0.000409,5842,0.989360,0.989360,0.978714,0.978490,0.989247,0.985034,0.005253,5504
15678,0.087422,0.005180,0.006220,0.000719,10000,balanced,True,1000,900,-1,...,0.000448,4241,0.989360,0.989360,0.978714,0.989247,0.989247,0.987186,0.004236,2049


In [218]:
# Lets save the grid search CV results to the csv file
CV_Output.to_csv(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\Classification_Models\Chronic Kidney Disease Prediction Assignment\Grid Serach CV Results\Logistic_Resgression_GridSerachCV_Results.csv")

In [219]:
# Lets test the model with best parameter that we have got agains the test data and check the performance

Y_Predicted = LogisticRegression_Model_Creation.predict(X_Test)

In [220]:
Y_Predicted

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 1], dtype=uint8)

In [221]:
# Lets check the confusion matrix for the above predicted against the actual results
confusion_matrix(Y_Predicted,Y_Test)

array([[16,  2],
       [ 0, 12]], dtype=int64)

In [222]:
#lets check the classification test report for the predicted against the actual results
print(classification_report(Y_Predicted,Y_Test))

              precision    recall  f1-score   support

           0       1.00      0.89      0.94        18
           1       0.86      1.00      0.92        12

    accuracy                           0.93        30
   macro avg       0.93      0.94      0.93        30
weighted avg       0.94      0.93      0.93        30



In [223]:
LogisticRegression_Model_Creation.predict_proba(X_Test)[:,1]

array([0.18915851, 0.01585382, 0.01205345, 0.02519809, 0.89912246,
       0.00439177, 0.96238925, 0.01424368, 0.99980092, 0.02413994,
       0.99998562, 0.79631658, 0.00541114, 0.99798859, 0.01275697,
       0.99886072, 0.06604053, 0.5504819 , 0.0011873 , 0.33371934,
       0.03033563, 0.99705005, 0.58671103, 0.40487364, 0.06567904,
       0.03163264, 0.01894699, 0.02244858, 0.99593421, 0.99946417])

In [224]:
# lets check the roc_auc_score results for the predicted against the actual results
roc_auc_score(Y_Test,LogisticRegression_Model_Creation.predict_proba(X_Test)[:,1])

1.0

In [225]:
# Wow we can see our auc score is 100 percent and weighted f1 score is also 93 percent lets save out model
import pickle
pickle.dump(LogisticRegression_Model_Creation,open(r'C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\Classification_Models\Chronic Kidney Disease Prediction Assignment\Final Model\LogisticRegression_Final_Model.sav','wb'))

In [226]:
# Lets load and test our model
def Final_model_prod(model,columns,stdscaler='None'):
    query_values=[]
    for col_idx in range(0,len(columns)-1):
        query_values.append(float(input("Please enter valid {}: \n Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' ".format(columns[col_idx]))))
    pred_class=model.predict(stdscaler.transform([query_values]))
    
    if pred_class==0:
        print ("This patient doesn't have Chronic Kidney Disorder")
    else:
        print ("This patient have Chronic Kidney Disorder Please proceed proper medication")

In [227]:
# Load the saved model
Final_model= pickle.load(open(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\Classification_Models\Chronic Kidney Disease Prediction Assignment\Final Model\LogisticRegression_Final_Model.sav",'rb'))

# here the scaler is the varaible that we have used for standardising the independent variables so we should use the same variable here  


In [229]:
# Test the final model
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings("ignore", category=UserWarning)
Final_model_prod(Final_model,Data.columns.to_list(),scaler)

Please enter valid age: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 40
Please enter valid bp: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 179
Please enter valid al: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 3
Please enter valid su: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 1
Please enter valid bgr: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 165
Please enter valid bu: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 34
Please enter valid sc: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 0.9
Please enter valid sod: 
 Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' 129
Please enter valid pot: 
 Note: If this is a boolean parame