## RFE _Classification - Feature Selection & Model Creation

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt

In [2]:
raw_dataset = pd.read_csv("CKD.csv", index_col=None)
raw_dataset

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,2.000000,76.459948,c,3.0,0.0,normal,abnormal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,yes,no,yes
1,3.000000,76.459948,c,2.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,34.000000,12300.000000,4.705597,no,no,no,yes,poor,no,yes
2,4.000000,76.459948,a,1.0,0.0,normal,normal,notpresent,notpresent,99.000000,...,34.000000,8408.191126,4.705597,no,no,no,yes,poor,no,yes
3,5.000000,76.459948,d,1.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,38.868902,8408.191126,4.705597,no,no,no,yes,poor,yes,yes
4,5.000000,50.000000,c,0.0,0.0,normal,normal,notpresent,notpresent,148.112676,...,36.000000,12400.000000,4.705597,no,no,no,yes,poor,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,51.492308,70.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,219.000000,...,37.000000,9800.000000,4.400000,no,no,no,yes,poor,no,yes
395,51.492308,70.000000,c,0.0,2.0,normal,normal,notpresent,notpresent,220.000000,...,27.000000,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes
396,51.492308,70.000000,c,3.0,0.0,normal,normal,notpresent,notpresent,110.000000,...,26.000000,9200.000000,3.400000,yes,yes,no,poor,poor,no,yes
397,51.492308,90.000000,a,0.0,0.0,normal,normal,notpresent,notpresent,207.000000,...,38.868902,8408.191126,4.705597,yes,yes,no,yes,poor,yes,yes


In [3]:
df = raw_dataset

In [4]:
df = pd.get_dummies(df,dtype = int, drop_first = True)

In [5]:
indep_x = df.drop(['classification_yes'],axis=1)
dep_y = df['classification_yes']

# Feature Selection

## Recursive Feature Elimination - Classsification

In [6]:
def split_scalar(indep_x,dep_y):
    x_train,x_test,y_train,y_test = train_test_split(indep_x,dep_y,test_size = 0.2,random_state = 0)
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)
    return x_train,x_test,y_train,y_test

def cm_prediction(classifier,x_test,y_test):
    y_pred = classifier.predict(x_test)

    from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
    cm = confusion_matrix(y_test,y_pred)
    accuracy = accuracy_score(y_test,y_pred)
    clas_report = classification_report(y_test,y_pred)
    return accuracy

def logistic(x_train,y_train,x_test):
    from sklearn.linear_model import LogisticRegression
    classifier = LogisticRegression(max_iter=1000, random_state=0)
    classifier.fit(x_train,y_train)
    Accuracy = cm_prediction(classifier,x_test,y_test)
    return Accuracy

def decision(x_train,y_train,x_test):
    from sklearn.tree import DecisionTreeClassifier
    classifier = DecisionTreeClassifier(random_state=0)
    classifier.fit(x_train,y_train)
    Accuracy = cm_prediction(classifier,x_test,y_test)
    return Accuracy    

def random(x_train,y_train,x_test):
    from sklearn.ensemble import RandomForestClassifier
    regressor = RandomForestClassifier(n_setimators = 10, random_state=0)
    regressor.fit(x_train,y_train)
    Accuracy = cm_prediction(classifier,x_test,y_test)
    return Accuracy

def xgboost(x_train,y_train_x_test):
    from xgboost import XGBClassifier
    classifier = XGBClassifier(n_jobs=5, learning_rate=0.01, max_depth=10, randon_state=1)
    classifier.fit(x_train,y_train)
    Accuracy = Rcm_prediction(classifier,x_test,y_test)
    return Accuracy

In [7]:
def RFEfeatures(indep_y,dep_y,n):
    rfelist = []
    cols_list = []
    Accuracy = []

    from sklearn.linear_model import LogisticRegression
    log_model = LogisticRegression()

    from sklearn.tree import DecisionTreeClassifier
    dc_model = DecisionTreeClassifier(random_state = 0)

    from sklearn.ensemble import RandomForestClassifier
    rf_model = RandomForestClassifier(n_estimators = 10,random_state = 0)

    from xgboost import XGBClassifier
    xgb_model = XGBClassifier(n_jobs=5, learning_rate = 0.1, max_depth = 10, random_state = 1)

    rfemodellist = [log_model, dc_model, rf_model, xgb_model]

    for model in rfemodellist:
        rfe = RFE(estimator = model, n_features_to_select = n)
        rfe.fit(indep_x,dep_y)
        rfe_features = rfe.transform(indep_x)
        rfelist.append(rfe_features)

        # Get the column names selected by RFE - (using list comprehension)
        selected_columns = [col for col, selected in zip(indep_x.columns,rfe.support_) if selected]
        cols_list.append(selected_columns)
        
        # Get the R2 values
        x_train,x_test,y_train,y_test = split_scalar(pd.DataFrame(rfe_features), dep_y)
        model.fit(x_train,y_train)
        accuracy = cm_prediction(model,x_test,y_test)
        Accuracy.append(accuracy)

    return rfelist, cols_list, Accuracy

In [8]:
# call the function with my data
rfelist, cols_list, Accuracy = RFEfeatures(indep_x, dep_y, 5)

# Print the selected column names and R2 values for each model
for model_name, selected_features, accuracy in zip(["Logistic", "Decision", "Random", "XGBoost"], cols_list, Accuracy):
    print(f"Model: {model_name}")
    print("Selected Features:", selected_features)
    print(f"Accuracy: {accuracy}\n")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model: Logistic
Selected Features: ['al', 'sg_c', 'sg_d', 'htn_yes', 'dm_yes']
Accuracy: 0.9875

Model: Decision
Selected Features: ['bu', 'hrmo', 'rc', 'sg_c', 'sg_d']
Accuracy: 0.9375

Model: Random
Selected Features: ['bgr', 'sc', 'hrmo', 'pcv', 'rc']
Accuracy: 0.9375

Model: XGBoost
Selected Features: ['al', 'hrmo', 'pcv', 'sg_c', 'sg_d']
Accuracy: 0.975



## Train Test Split

## Model Creation 

## Logistic Regression

### Grid Search Cross Validation Method

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

params_grid = {'penalty' : ['l1', 'l2', 'elasticnet'],
              'solver' : ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
              'multi_class': ['auto', 'ovr', 'multinomial']}

grid = GridSearchCV(LogisticRegression(), params_grid, refit=True, verbose = 3, n_jobs=-1)
grid.fit(indep_x,dep_y)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


165 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Anaconda3\envs\AIML\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Anaconda3\envs\AIML\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Anaconda3\envs\AIML\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [10]:
result = grid.cv_results_

grid_predictions = grid.predict(indep_x)

from sklearn.metrics import accuracy_score
Accuracy = accuracy_score(dep_y,grid_predictions)

print("Best parameters for Logistic Regression : {}", format (grid.best_params_), Accuracy)

Best parameters for Logistic Regression : {} {'multi_class': 'auto', 'penalty': 'l2', 'solver': 'newton-cg'} 0.9924812030075187


## Decision Tree

### Grid Search Cross Validation Method

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier


params_grid = {'criterion' : ['ginni', 'entropy', 'log_loss'],  # This parameter is tree-specific
              'splitter' : ['best','random'],
              'max_features': ['sqrt','log2']}

grid = GridSearchCV(DecisionTreeClassifier(), params_grid, refit=True, verbose=3, n_jobs=-1, scoring = 'f1_weighted')
grid.fit(indep_x,dep_y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Anaconda3\envs\AIML\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Anaconda3\envs\AIML\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Anaconda3\envs\AIML\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Anaconda3\envs\AIML\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sk

In [12]:
result = grid.cv_results_

grid_predictions = grid.predict(indep_x)

from sklearn.metrics import accuracy_score
Accuracy = accuracy_score(dep_y,grid_predictions)

print("Best parameters for Decision Tree : {}", format (grid.best_params_), Accuracy)

Best parameters for Decision Tree : {} {'criterion': 'log_loss', 'max_features': 'sqrt', 'splitter': 'random'} 1.0


from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(indep_x, dep_y, test_size=0.2, random_state=0)

from sklearn.tree import DecisionTreeClassifier
classifier_dt = DecisionTreeClassifier(criterion='entropy', splitter='random')
classifier_dt = classifier_dt.fit(x_train,y_train)

y_pred=classifier_dt.predict(x_test)

from sklearn.metrics import accuracy_score
Accuracy = accuracy_score(y_test,y_pred)

Accuracy

## Random Forest

### Grid Search Cross Validation Method

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier


params_grid = {'criterion' : ['ginni', 'entropy', 'log_loss'],  # This parameter is tree-specific
              'max_features': ['sqrt','log2'],
             'class_weight':['balanced','balanced_subsample']}

grid = GridSearchCV(RandomForestClassifier(), params_grid, refit=True, verbose=3, n_jobs=-1, scoring = 'f1_weighted')
grid.fit(indep_x,dep_y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Anaconda3\envs\AIML\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Anaconda3\envs\AIML\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Anaconda3\envs\AIML\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Anaconda3\envs\AIML\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
s

In [14]:
result = grid.cv_results_

grid_predictions = grid.predict(indep_x)

from sklearn.metrics import accuracy_score
Accuracy = accuracy_score(dep_y,grid_predictions)

print("Best parameters for Decision Tree : {}", format (grid.best_params_), Accuracy)

Best parameters for Decision Tree : {} {'class_weight': 'balanced', 'criterion': 'entropy', 'max_features': 'log2'} 1.0


## Gradient Bossting Claasifier

### Grid Search Cross Validation Method

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier


params_grid = {'loss' : ['log_loss', 'exponential'],
              'criterion' : ['friedman_mse','sqaured_error'],
              'max_features' : ['sqrt','log2']}

grid = GridSearchCV(GradientBoostingClassifier(), params_grid, refit=True, verbose=3, n_jobs=-1)
grid.fit(indep_x,dep_y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
11 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Anaconda3\envs\AIML\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Anaconda3\envs\AIML\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Anaconda3\envs\AIML\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Anaconda3\envs\AIML\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
s

In [16]:
result = grid.cv_results_

grid_predictions = grid.predict(indep_x)

from sklearn.metrics import accuracy_score
Accuracy = accuracy_score(dep_y,grid_predictions)

print("Best parameters for Decision Tree : {}", format (grid.best_params_), Accuracy)

Best parameters for Decision Tree : {} {'criterion': 'friedman_mse', 'loss': 'exponential', 'max_features': 'log2'} 1.0


## By Comparing all the Models
# 'Logistic Regression Algorithm' gives  Accuracy of 99%
# 'Decision Tree Algorithm' gives  Accuracy of 100%
# 'Random Forest Algorithm' gives  Accuracy of 100%
# 'Gradient Boosting Algorithm' gives  Accuracy of 100%