In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt
#!pip install xgboost

In [2]:
# Read the CKD data set from the working directory; pandas generates a
# default integer index because no index column is requested.
raw_dataset = pd.read_csv("CKD.csv")
raw_dataset

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.000000,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.000000,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.000000,12300.000000,4.705597,no,no,no,yes,poor,no,yes
2,4.000000,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.000000,...,34.000000,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.000000,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.000000,50.000000,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.000000,12400.000000,4.705597,no,no,no,yes,poor,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,219.000000,...,37.000000,9800.000000,4.400000,no,no,no,yes,poor,no,yes
395,51.492308,70.000000,c,0.0,2.0,normal,normal,notpresent,notpresent,220.000000,...,27.000000,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes
396,51.492308,70.000000,c,3.0,0.0,normal,normal,notpresent,notpresent,110.000000,...,26.000000,9200.000000,3.400000,yes,yes,no,poor,poor,no,yes
397,51.492308,90.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,207.000000,...,38.868902,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes


In [3]:
# Working handle for preprocessing. This binds a second name to the same
# DataFrame object as `raw_dataset`; no copy is made here.
df = raw_dataset

In [4]:
# One-hot encode the non-numeric columns as 0/1 integers. The first level of
# each encoded column is dropped (drop_first=True), leaving k-1 indicator
# columns per k-level feature.
df = pd.get_dummies(
    df,
    drop_first=True,
    dtype=int,
)

In [5]:
# Target: the encoded `classification_yes` indicator column.
dep_y = df['classification_yes']
# Features: every remaining column.
indep_x = df.drop(columns=['classification_yes'])

# Feature Selection

## Recursive Feature Elimination - Regression

In [15]:
def split_scalar(indep_x,dep_y):
    """Split the data 80/20 (fixed seed) and standardize the features.

    The scaler is fitted on the training features only and then applied to
    both splits, so the test rows do not influence the scaling statistics.

    Returns:
        (x_train, x_test, y_train, y_test) where the two feature splits are
        the standardized arrays and the targets are returned as split.
    """
    raw_train, raw_test, y_train, y_test = train_test_split(
        indep_x, dep_y, test_size=0.2, random_state=0
    )
    scaler = StandardScaler().fit(raw_train)
    return scaler.transform(raw_train), scaler.transform(raw_test), y_train, y_test

def R2_prediction(regressor,x_test,y_test):
    """Score a fitted regressor on the test split.

    Args:
        regressor: fitted estimator exposing ``predict``.
        x_test: test features passed to ``regressor.predict``.
        y_test: true target values for ``x_test``.

    Returns:
        The R² score of the predictions against ``y_test``.
    """
    from sklearn.metrics import r2_score

    return r2_score(y_test, regressor.predict(x_test))

def linear(x_train,y_train,x_test,y_test=None):
    """Fit a LinearRegression model and return its R² score on the test split.

    Args:
        x_train: training features.
        y_train: training targets.
        x_test: test features.
        y_test: test targets. Optional for backward compatibility: when
            omitted, the notebook-level ``y_test`` variable is used, which is
            what the function did implicitly before this parameter existed.

    Returns:
        R² score of the fitted model on ``x_test`` / ``y_test``.

    Raises:
        NameError: if ``y_test`` is not passed and no notebook-level
            ``y_test`` exists.
    """
    from sklearn.linear_model import LinearRegression

    if y_test is None:
        # Legacy three-argument call: fall back to the global the original
        # body silently depended on. Prefer passing y_test explicitly.
        y_test = globals().get("y_test")
        if y_test is None:
            raise NameError("y_test was not passed and no notebook-level y_test is defined")

    regressor = LinearRegression()
    regressor.fit(x_train,y_train)
    return R2_prediction(regressor,x_test,y_test)

def decision(x_train,y_train,x_test,y_test=None):
    """Fit a DecisionTreeRegressor (seed 0) and return its R² on the test split.

    Args:
        x_train: training features.
        y_train: training targets.
        x_test: test features.
        y_test: test targets. Optional for backward compatibility: when
            omitted, the notebook-level ``y_test`` variable is used, which is
            what the function did implicitly before this parameter existed.

    Returns:
        R² score of the fitted model on ``x_test`` / ``y_test``.

    Raises:
        NameError: if ``y_test`` is not passed and no notebook-level
            ``y_test`` exists.
    """
    from sklearn.tree import DecisionTreeRegressor

    if y_test is None:
        # Legacy three-argument call: fall back to the global the original
        # body silently depended on. Prefer passing y_test explicitly.
        y_test = globals().get("y_test")
        if y_test is None:
            raise NameError("y_test was not passed and no notebook-level y_test is defined")

    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(x_train,y_train)
    return R2_prediction(regressor,x_test,y_test)

def random(x_train,y_train,x_test,y_test=None):
    """Fit a 10-tree RandomForestRegressor (seed 0) and return its test R².

    Args:
        x_train: training features.
        y_train: training targets.
        x_test: test features.
        y_test: test targets. Optional for backward compatibility: when
            omitted, the notebook-level ``y_test`` variable is used, which is
            what the function did implicitly before this parameter existed.

    Returns:
        R² score of the fitted model on ``x_test`` / ``y_test``.

    Raises:
        NameError: if ``y_test`` is not passed and no notebook-level
            ``y_test`` exists.
    """
    from sklearn.ensemble import RandomForestRegressor

    if y_test is None:
        # Legacy three-argument call: fall back to the global the original
        # body silently depended on. Prefer passing y_test explicitly.
        y_test = globals().get("y_test")
        if y_test is None:
            raise NameError("y_test was not passed and no notebook-level y_test is defined")

    # The keyword was previously misspelled (`n_setimators`), which made the
    # constructor raise TypeError before any fitting happened.
    regressor = RandomForestRegressor(n_estimators = 10, random_state=0)
    regressor.fit(x_train,y_train)
    return R2_prediction(regressor,x_test,y_test)

def xgboost(x_train,y_train,x_test,y_test=None):
    """Fit an XGBRegressor and return its R² score on the test split.

    The signature now matches the sibling helpers (``linear``, ``decision``,
    ``random``). Previously the second and third parameters were fused into a
    single name (``y_train_x_test``), so ``y_train`` and ``x_test`` were never
    bound from the arguments.

    Args:
        x_train: training features.
        y_train: training targets.
        x_test: test features.
        y_test: test targets. When omitted, the notebook-level ``y_test``
            variable is used, matching the previous implicit behaviour.

    Returns:
        R² score of the fitted model on ``x_test`` / ``y_test``.

    Raises:
        NameError: if ``y_test`` is not passed and no notebook-level
            ``y_test`` exists.
    """
    from xgboost import XGBRegressor

    if y_test is None:
        # Fall back to the global the original body silently depended on.
        y_test = globals().get("y_test")
        if y_test is None:
            raise NameError("y_test was not passed and no notebook-level y_test is defined")

    # `random_state` was misspelled as `randon_state`, so the seed was never
    # passed under the name the estimator recognises.
    regressor = XGBRegressor(n_jobs=5, learning_rate=0.01, max_depth=10, random_state=1)
    regressor.fit(x_train,y_train)
    # The score used to be stored in `R2` while `R2_score` was returned, which
    # yielded a NameError or a stale notebook global instead of this result.
    return R2_prediction(regressor,x_test,y_test)

In [16]:
def RFEfeatures(indep_y,dep_y,n):
    """Run recursive feature elimination with four regressors and score each.

    For each of LinearRegression, DecisionTreeRegressor, RandomForestRegressor
    and XGBRegressor, RFE keeps ``n`` features; the model is then refitted on
    a scaled 80/20 split of the reduced feature set and scored with R².

    Args:
        indep_y: feature DataFrame. The parameter name is kept for backward
            compatibility although it holds the independent variables.
        dep_y: target values aligned with the feature rows.
        n: number of features RFE should retain.

    Returns:
        (rfelist, cols_list, R2_values): per model, the reduced feature
        array, the names of the retained columns, and the test-split R².
    """
    # The body previously ignored this argument and read the notebook-level
    # `indep_x` instead, so the function only worked through hidden state.
    indep_x = indep_y

    rfelist = []
    cols_list = []
    R2_values = []

    from sklearn.linear_model import LinearRegression
    lin_model = LinearRegression()

    from sklearn.tree import DecisionTreeRegressor
    dc_model = DecisionTreeRegressor(random_state = 0)

    from sklearn.ensemble import RandomForestRegressor
    rf_model = RandomForestRegressor(n_estimators = 10,random_state = 0)

    from xgboost import XGBRegressor 
    xgb_model = XGBRegressor(n_jobs=5, learning_rate = 0.1, max_depth = 10, random_state = 1)

    rfemodellist = [lin_model,dc_model, rf_model, xgb_model]

    for model in rfemodellist:
        # NOTE(review): RFE is fitted on all rows, i.e. before the train/test
        # split below, so the held-out rows take part in feature selection.
        rfe = RFE(estimator = model, n_features_to_select = n)
        rfe.fit(indep_x,dep_y)
        rfe_features = rfe.transform(indep_x)
        rfelist.append(rfe_features)

        # Names of the columns whose RFE support mask entry is True
        selected_columns = [col for col, selected in zip(indep_x.columns,rfe.support_) if selected]
        cols_list.append(selected_columns)

        # Refit on the training split of the reduced features and score on the test split
        x_train,x_test,y_train,y_test = split_scalar(pd.DataFrame(rfe_features),dep_y)
        model.fit(x_train,y_train)
        R2_score = R2_prediction(model,x_test,y_test)
        R2_values.append(R2_score)

    return rfelist, cols_list, R2_values

In [24]:
# Run RFE with each estimator, keeping the 5 strongest features every time
rfelist, cols_list, R2_values = RFEfeatures(indep_x, dep_y, 5)

# Labels in the same order in which RFEfeatures evaluates its estimators
model_names = ['Linear','Decision', 'Random', 'xgboost']

# Report the retained columns and the test-split R2 score per estimator
for model_name, selected_features, R2_score in zip(model_names, cols_list, R2_values):
    print(f"Model : {model_name}")
    print("Selected Features :", selected_features)
    print(f"R2 Score : {R2_score}\n")

Model : Linear
Selected Features : ['sg_b', 'sg_c', 'sg_d', 'sg_e', 'htn_yes']
R2 Score : 0.6519261299873549

Model : Decision
Selected Features : ['bu', 'hrmo', 'rc', 'sg_c', 'sg_d']
R2 Score : 0.7395833333333334

Model : Random
Selected Features : ['al', 'hrmo', 'rc', 'sg_c', 'sg_d']
R2 Score : 0.909786074114886

Model : xgboost
Selected Features : ['al', 'hrmo', 'rc', 'sg_c', 'sg_d']
R2 Score : 0.9624820186037115



## Train Test Split
## Model Creation 

## DecisionTree

### Grid Search Cross Validation Method

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor


# Search space for the decision tree. Every value must be an option the
# estimator accepts: 'squared_error' was misspelled and 'auto' is not a valid
# splitter (the choices are 'best' and 'random'), so those candidates could
# never be fitted.
params_grid = {'criterion' : ['absolute_error', 'friedman_mse','squared_error','poisson'],
              'splitter' : ['best','random'],
              'max_features' : ['sqrt','log2']}

# A fixed seed keeps the search reproducible; feature sub-sampling and the
# random splitter are otherwise non-deterministic.
grid = GridSearchCV(DecisionTreeRegressor(random_state=0), params_grid, refit=True,verbose=3, n_jobs=-1)
grid.fit(indep_x,dep_y)

In [None]:
result = grid.cv_results_

# NOTE: predictions are made on the same rows the grid search was fitted on,
# so this R2 is an in-sample score, not a held-out estimate.
grid_predictions = grid.predict(indep_x)

from sklearn.metrics import r2_score
R2_score = r2_score(dep_y,grid_predictions)

# `.format` (not `,format`) so the best parameters are substituted into the
# `{}` placeholder instead of the literal braces being printed.
print("Best parameter of DecisionTreeRegressor Grid {}:".format(grid.best_params_), R2_score )

# Full cross-validation table, one row per parameter combination
Total = pd.DataFrame(result)
Total

### Hyper Tuning Parameter Method - Decision Tree

from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed
x_train,x_test,y_train,y_test=train_test_split(indep_x, dep_y, test_size=0.2, random_state=0)

from sklearn.tree import DecisionTreeRegressor
# splitter='random' draws random split thresholds, so the tree is seeded to
# keep the reported score reproducible across kernel restarts.
regressor_dt = DecisionTreeRegressor(criterion='squared_error', splitter='random', random_state=0)
regressor_dt = regressor_dt.fit(x_train,y_train)

y_pred=regressor_dt.predict(x_test)

from sklearn.metrics import r2_score
# R2 on the held-out 20 %
R2_score = r2_score(y_test,y_pred)

R2_score

## Random Forest

### Grid Search Cross Validation Method

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor


# Search space for the random forest. 'squared_error' was misspelled, so
# every candidate using that criterion failed to fit.
params_grid = {'n_estimators' : [50,100,500,1000,2000],
              'criterion' : ['absolute_error', 'friedman_mse','squared_error','poisson'],
              'max_features' : ['sqrt','log2']}

# A fixed seed keeps the search reproducible (bootstrap and feature sampling
# are otherwise random).
grid = GridSearchCV(RandomForestRegressor(random_state=0), params_grid, refit=True, verbose=3, n_jobs=-1)
grid.fit(indep_x,dep_y)

In [None]:
result = grid.cv_results_

# NOTE: predictions are made on the same rows the grid search was fitted on,
# so this R2 is an in-sample score, not a held-out estimate.
grid_predictions = grid.predict(indep_x)

from sklearn.metrics import r2_score
R2_score = r2_score(dep_y, grid_predictions)

# `.format` (not `,format`) so the best parameters are substituted into the
# `{}` placeholder instead of the literal braces being printed.
print("Best parameter of RandomForestRegressor Grid {}:".format(grid.best_params_),R2_score)

# Full cross-validation table, one row per parameter combination
Total = pd.DataFrame(result)
Total

### Hyper Tuning Parameter Method - Random Forest

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# 100-tree forest with a fixed seed, trained on the existing train split
regressor_rf = RandomForestRegressor(random_state=0, n_estimators=100).fit(x_train, y_train)

# Score the forest on the held-out split
y_pred = regressor_rf.predict(x_test)
R2_score = r2_score(y_test, y_pred)
R2_score

## Gradient Boosting

### Grid Search Cross Validation Method

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor


# Search space for gradient boosting. 'squared_error' was misspelled, so
# every candidate using that criterion failed to fit.
params_grid = {'loss' : ['squared_error', 'absolute_error', 'huber', 'quantile'],
              'criterion' : ['friedman_mse','squared_error'],
              'max_features' : ['sqrt','log2']}

# A fixed seed keeps the search reproducible (feature sub-sampling is
# otherwise random).
grid = GridSearchCV(GradientBoostingRegressor(random_state=0), params_grid, refit=True, verbose=3, n_jobs=-1)
grid.fit(indep_x,dep_y)

In [None]:
result = grid.cv_results_

# NOTE: predictions are made on the same rows the grid search was fitted on,
# so this R2 is an in-sample score, not a held-out estimate.
grid_predictions = grid.predict(indep_x)

from sklearn.metrics import r2_score
R2_score = r2_score(dep_y, grid_predictions)

# `.format` (not `,format`) so the best parameters are substituted into the
# `{}` placeholder instead of the literal braces being printed.
print("Best parameter of GradientBoostingRegressor Grid {}:".format(grid.best_params_),R2_score)

# Full cross-validation table, one row per parameter combination
Total = pd.DataFrame(result)
Total

### Hyper Tuning Parameter Method - GradientBoosting

from sklearn.ensemble import GradientBoostingRegressor
# Stored under its own name: this model used to be assigned to `regressor_rf`,
# which silently replaced the fitted random forest from the earlier section.
regressor_gb = GradientBoostingRegressor(n_estimators=100,random_state=0)
regressor_gb.fit(x_train, y_train)

y_pred=regressor_gb.predict(x_test)

from sklearn.metrics import r2_score
# R2 on the held-out split
R2_score = r2_score(y_test,y_pred)
R2_score

## By Comparing all the Models
# The Gradient Boosting algorithm gives the highest score of all the models compared: R² ≈ 0.987 (98.7%)

#### Save the Best Model

In [None]:
# File name under which the chosen model is serialized
Finalized_Model = "Finalized_CKD_Regression_Model.sav"

In [None]:
# Serialize the most recently fitted grid-search object (`grid`) to disk.
# The context manager guarantees the file is flushed and closed even if
# pickling fails; the path comes from `Finalized_Model` rather than a
# duplicated literal.
with open(Finalized_Model, 'wb') as model_file:
    pickle.dump(grid, model_file)