In [None]:
# Load all the important libraries to create the RandomForestClassifier Model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

In [None]:
#import the data set

Data = pd.read_csv(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\Classification_Models\Chronic Kidney Disease Prediction Assignment\Dataset\CKD.csv")

In [None]:
Data.head()

In [None]:
#Lets check total number of rows and columns 
Data.shape

In [None]:
#Check the dataset info 
Data.info()

In [None]:
# Lets check the description of data set
Data.describe()

In [None]:
# Lets check if the data set has NA values
Data[Data.isna().any(axis=1)]

In [None]:
# Lets check if the data set has Null values
Data[Data.isnull().any(axis=1)]

In [None]:
# Lets check how many classes are there in the target variable "Classification"
Data.classification.value_counts()

# From above output we got to know that there are 250 records in "Yes" class and 150 records in "No" class. so its called as imbalanced data set

In [None]:
# Lets upsample the "No" records to match with "Yes" records
Data_No = Data[Data["classification"]=="no"]

In [None]:
import random

samples_index = random.sample(range(150),99)

In [None]:
Data_No_final = Data_No.iloc[samples_index]

In [None]:
Data_No_final.reset_index(inplace=True,drop=True)

In [None]:
Data_No_final

In [None]:
Data = Data.append(Data_No_final,ignore_index=True)

In [None]:
Data.classification.value_counts()

# From above output we have upsampled the "No" class counts to match with "Yes" counts

In [None]:
# Lets convert the nominal categoty columns to numerical columns using one-hot-encoding via pandas get_dummies method

Data = pd.get_dummies(Data,drop_first=True)

In [None]:
Data.head()

In [None]:
Data.columns

In [None]:
# Lets seperate the dependent and independent variables
dependent = Data[['classification_yes']]
independent = Data[['age', 'bp', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hrmo', 'pcv',
       'wc', 'rc', 'sg_b', 'sg_c', 'sg_d', 'sg_e', 'rbc_normal', 'pc_normal',
       'pcc_present', 'ba_present', 'htn_yes', 'dm_yes', 'cad_yes',
       'appet_yes', 'pe_yes', 'ane_yes']]

In [None]:
import sklearn
sklearn.metrics.get_scorer_names()

In [None]:
# Lets spli the data to trai  and test set

X_Train,X_Test,Y_Train,Y_Test = train_test_split(independent,dependent,random_state=0,test_size=30)

In [None]:
# Model Parameters 
Model_Params = {'n_estimators':[i for i in range(1,500,100) ],"criterion":['gini','entropy','log_loss'],"max_depth":[i for i in range(1,int(len(Data.columns.to_list())/2))],
               'min_samples_split':[i for i in np.arange(0.1,1.0,0.3)],'min_samples_leaf':[i for i in np.arange(0.1,1.0,0.3)],
               'max_features':['auto','sqrt','log2'],'min_impurity_decrease':[i for i in np.arange(0.1,1.0,0.3)],'ccp_alpha':[0.00001,0.001,0.1],'n_jobs':[-1],'warm_start':[True],
                'class_weight':['balanced','balanced_subsample']}

In [None]:
# Lets create a model now using the Gridsearch CV

RFC_Model_Creation = GridSearchCV(RandomForestClassifier(),Model_Params,n_jobs=-1,scoring={"roc_auc_score":'roc_auc','f1_weighted_score':'f1_weighted'},
                                            refit='roc_auc_score',verbose=2)

In [None]:
RFC_Model_Creation.fit(X_Train,Y_Train)

In [None]:
# The best parameter that has givem high performance scores
RFC_Model_Creation.best_params_

In [None]:
# The best score for the above parameters
RFC_Model_Creation.best_score_

# Since there are lot of failures in the gridsearch CV results so the entries will be NA. so we don't need those entries to be saved as it may confuse us in future so we are removing those NA entries and have only the valid entries of grid search CV results saved and used for future reference

In [None]:
# Creating as data frame to save the GridSearch CV results
CV_Output = pd.DataFrame.from_dict(RFC_Model_Creation.cv_results_)

In [None]:
# Check the entries count
CV_Output.shape

In [None]:
# Final Data frame that has valid entries in it
CV_Output

In [None]:
# Lets save the grid search CV results to the csv file
CV_Output.to_csv(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\Classification_Models\Chronic Kidney Disease Prediction Assignment\Grid Serach CV Results\RFC_GridSerachCV_Results.csv")

In [None]:
# Lets test the model with best parameter that we have got agains the test data and check the performance

Y_Predicted = RFC_Model_Creation.predict(X_Test)

In [None]:
Y_Predicted

In [None]:
# Lets check the confusion matrix for the above predicted against the actual results
confusion_matrix(Y_Predicted,Y_Test)

In [None]:
#lets check the classification test report for the predicted against the actual results
print(classification_report(Y_Predicted,Y_Test))

In [None]:
RFC_Model_Creation.predict_proba(X_Test)[:,1]

In [None]:
# lets check the roc_auc_score results for the predicted against the actual results
roc_auc_score(Y_Test,RFC_Model_Creation.predict_proba(X_Test)[:,1])

In [None]:
# Wow we can see our auc score is 100 percent and weighted f1 score is also 93 percent lets save out model
import pickle
pickle.dump(RFC_Model_Creation,open(r'C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\Classification_Models\Chronic Kidney Disease Prediction Assignment\Final Model\RFC_Final_Model.sav','wb'))

In [None]:
# Lets load and test our model
def Final_model_prod(model,columns,stdscaler='None'):
    query_values=[]
    for col_idx in range(0,len(columns)-1):
        query_values.append(float(input("Please enter valid {}: \n Note: If this is a boolean parameter please provide values as 1 for 'yes' and 0 for 'No' ".format(columns[col_idx]))))
    pred_class=model.predict(stdscaler.transform([query_values]))
    
    if pred_class==0:
        print ("This patient doesn't have Chronic Kidney Disorder")
    else:
        print ("This patient have Chronic Kidney Disorder Please proceed proper medication")

In [3]:
# Load the saved model
import pickle
Final_model= pickle.load(open(r"C:\Users\Vinoth\Desktop\HOPE AI\Machine Learning\Classification_Models\Chronic Kidney Disease Prediction Assignment\Final Model\RFC_Final_Model.sav",'rb'))

In [4]:
Final_model.best_params_

{'ccp_alpha': 0.001,
 'class_weight': 'balanced',
 'criterion': 'log_loss',
 'max_depth': 1,
 'max_features': 'log2',
 'min_impurity_decrease': 0.1,
 'min_samples_leaf': 0.1,
 'min_samples_split': 0.1,
 'n_estimators': 101,
 'n_jobs': -1,
 'warm_start': True}

In [5]:
Final_model.best_score_

0.9984923337335406

# here the scaler is the varaible that we have used for standardising the independent variables so we should use the same variable here  


In [None]:
# Test the final model
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings("ignore", category=UserWarning)
Final_model_prod(Final_model,Data.columns.to_list(),scaler)

In [None]:
# Test the final model
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings("ignore", category=UserWarning)
Final_model_prod(Final_model,Data.columns.to_list(),scaler)