# Project: Chronic Kidney Disease (CKD) Classification

## Topic: Import libraries

- **Purpose:** The overall purpose of the libraries used in your project is to facilitate machine learning workflows. Each library contributes to specific tasks such as data manipulation, feature selection, model training, evaluation, and deployment.

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier

## Topic: Dataset Loading and Initial Exploration
- **Purpose:** Load and examine the cleaned Chronic Kidney Disease (CKD) dataset to ensure data readiness for analysis.

In [2]:
# Load dataset
dataset = pd.read_csv("cleaned_kidney_disease.csv")
dataset.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0.0,48.0,80.0,1.02,1.0,0.0,normal,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1.0,9.0,55.0,1.02,4.0,0.0,normal,normal,notpresent,notpresent,...,38,6000,5.2,no,no,no,good,no,no,ckd
2,2.0,62.0,80.0,1.01,2.0,1.125356,normal,normal,notpresent,notpresent,...,31,7500,5.2,no,yes,no,poor,no,yes,ckd
3,3.0,48.0,70.0,1.0075,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4.0,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


## Topic: Data Preprocessing
- **Purpose**: Prepare the dataset by converting categorical variables into numerical ones and separating features (X) and the target variable (y).

In [3]:
dataset.columns

Index(['id', 'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
       'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [4]:
# One-hot encoding
df = pd.get_dummies(dataset, dtype=int, drop_first=True)
df.head()

Unnamed: 0,id,age,bp,sg,al,su,bgr,bu,sc,sod,...,dm_ yes,dm_no,dm_yes,cad_no,cad_yes,appet_poor,pe_yes,ane_yes,classification_ckd\t,classification_notckd
0,0.0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,137.528754,...,0,0,1,1,0,0,0,0,0,0
1,1.0,9.0,55.0,1.02,4.0,0.0,148.036517,18.0,0.8,137.528754,...,0,1,0,1,0,0,0,0,0,0
2,2.0,62.0,80.0,1.01,2.0,1.125356,223.5,53.0,1.8,137.528754,...,0,0,1,1,0,1,0,1,0,0
3,3.0,48.0,70.0,1.0075,4.0,0.0,117.0,56.0,3.8,126.0,...,0,1,0,1,0,1,1,1,0,0
4,4.0,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,137.528754,...,0,1,0,1,0,0,0,0,0,0


- **Purpose:** Drop irrelevant columns and seperate independent and dependent columns to predictions`

In [5]:
# Separate independent and dependent variables
X = df.drop(['id', 'classification_notckd'], axis=1)
y = df['classification_notckd']

In [6]:
X

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,...,dm_\tyes,dm_ yes,dm_no,dm_yes,cad_no,cad_yes,appet_poor,pe_yes,ane_yes,classification_ckd\t
0,48.0,80.0,1.0200,1.0,0.000000,121.000000,36.0,1.2,137.528754,4.627244,...,0,0,0,1,1,0,0,0,0,0
1,9.0,55.0,1.0200,4.0,0.000000,148.036517,18.0,0.8,137.528754,4.627244,...,0,0,1,0,1,0,0,0,0,0
2,62.0,80.0,1.0100,2.0,1.125356,223.500000,53.0,1.8,137.528754,4.627244,...,0,0,0,1,1,0,1,0,1,0
3,48.0,70.0,1.0075,4.0,0.000000,117.000000,56.0,3.8,126.000000,2.800000,...,0,0,1,0,1,0,1,1,1,0
4,51.0,80.0,1.0100,2.0,0.000000,106.000000,26.0,1.4,137.528754,4.627244,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.0200,0.0,0.000000,140.000000,49.0,0.5,150.000000,4.900000,...,0,0,1,0,1,0,0,0,0,0
396,42.0,70.0,1.0250,0.0,0.000000,75.000000,31.0,1.2,141.000000,3.500000,...,0,0,1,0,1,0,0,0,0,0
397,12.0,80.0,1.0200,0.0,0.000000,100.000000,26.0,0.6,137.000000,4.400000,...,0,0,1,0,1,0,0,0,0,0
398,17.0,60.0,1.0250,0.0,0.000000,114.000000,50.0,1.0,135.000000,4.900000,...,0,0,1,0,1,0,0,0,0,0


In [7]:
y

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    1
399    1
Name: classification_notckd, Length: 400, dtype: int32

## Topic: Feature Selection
- **Purpose:** Select the top 5 features most relevant to the target variable to improve model performance.

In [8]:
# Select top 5 features using chi2
kbest = SelectKBest(score_func=chi2, k=5)
X_kbest = kbest.fit_transform(X, y)
selected_features = X.columns[kbest.get_support()]
X_kbest = pd.DataFrame(X_kbest, columns=selected_features)


In [9]:
selected_features

Index(['al', 'bgr', 'bu', 'sc', 'hemo'], dtype='object')

## Topic: Data Splitting and Scaling
**Purpose:** Split the data into training and testing sets and standardize feature values for better model performance.

In [10]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_kbest, y, test_size=0.25, random_state=0)


In [11]:
X_train

Unnamed: 0,al,bgr,bu,sc,hemo
250,0.0,140.000000,10.000000,1.200000,15.000000
63,1.0,150.000000,111.000000,6.100000,7.500000
312,0.0,148.036517,57.425722,3.072454,15.300000
159,1.0,223.500000,35.000000,1.300000,10.400000
283,0.0,148.036517,57.425722,3.072454,16.400000
...,...,...,...,...,...
323,0.0,130.000000,30.000000,1.100000,15.900000
192,0.0,130.000000,16.000000,0.900000,12.526437
117,0.0,219.000000,36.000000,1.300000,12.500000
47,3.0,148.036517,17.000000,0.800000,15.000000


In [12]:
X_test

Unnamed: 0,al,bgr,bu,sc,hemo
132,1.016949,219.0,113.875,6.331136,8.600000
309,0.000000,129.0,25.000,1.200000,17.200000
341,0.000000,130.0,37.000,0.900000,13.400000
196,3.000000,129.0,113.875,6.331136,8.100000
246,3.000000,106.0,113.875,6.331136,8.600000
...,...,...,...,...,...
146,1.000000,213.0,23.000,1.000000,12.526437
135,0.000000,214.0,24.000,1.300000,13.200000
390,0.000000,99.0,25.000,0.800000,15.000000
264,0.000000,132.0,24.000,0.700000,14.400000


In [13]:
y_train

250    1
63     0
312    1
159    0
283    1
      ..
323    1
192    0
117    0
47     0
172    0
Name: classification_notckd, Length: 300, dtype: int32

In [14]:
y_test

132    0
309    1
341    1
196    0
246    0
      ..
146    0
135    0
390    1
264    1
364    1
Name: classification_notckd, Length: 100, dtype: int32

In [15]:
# Feature scaling
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=selected_features)
X_test = pd.DataFrame(scaler.transform(X_test), columns=selected_features)

In [16]:
X_train

Unnamed: 0,al,bgr,bu,sc,hemo
0,-0.780612,0.051025,-1.371004,-0.544485,0.916361
1,0.033628,0.265627,2.051499,2.195695,-1.971588
2,-0.780612,0.223490,0.236072,0.502630,1.031879
3,0.033628,1.842953,-0.523850,-0.488563,-0.854914
4,-0.780612,0.223490,0.236072,0.502630,1.455445
...,...,...,...,...,...
295,-0.780612,-0.163577,-0.693281,-0.600407,1.262915
296,-0.780612,-0.163577,-1.167687,-0.712251,-0.036109
297,-0.780612,1.746382,-0.489964,-0.488563,-0.046288
298,1.662106,0.223490,-1.133801,-0.768173,0.916361


In [17]:
X_test

Unnamed: 0,al,bgr,bu,sc,hemo
0,0.047428,1.746382,2.148922,2.324951,-1.548022
1,-0.780612,-0.185038,-0.862712,-0.544485,1.763493
2,-0.780612,-0.163577,-0.456078,-0.712251,0.300266
3,1.662106,-0.185038,2.148922,2.324951,-1.740552
4,1.662106,-0.678623,2.148922,2.324951,-1.548022
...,...,...,...,...,...
95,0.033628,1.617621,-0.930484,-0.656329,-0.036109
96,-0.780612,1.639081,-0.896598,-0.488563,0.223254
97,-0.780612,-0.828844,-0.862712,-0.768173,0.916361
98,-0.780612,-0.120657,-0.896598,-0.824095,0.685325


## Topic: Model Training and Evaluation
- **Purpose:** Train and evaluate different machine learning models for CKD classification.

In [18]:
# Initialize and train Decision Tree model
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
print(f"Decision Tree Selected Features: {selected_features}")
print(f"Decision Tree Accuracy: {accuracy_score(y_test, y_pred_dt):.2f}")
print(classification_report(y_test, y_pred_dt))

Decision Tree Selected Features: Index(['al', 'bgr', 'bu', 'sc', 'hemo'], dtype='object')
Decision Tree Accuracy: 1.00
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        62
           1       1.00      1.00      1.00        38

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



In [19]:
# Initialize and train SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print(f"SVM Selected Features: {selected_features}")
print(f"SVM Accuracy: {accuracy_score(y_test, y_pred_svm):.2f}")
print(classification_report(y_test, y_pred_svm))

SVM Selected Features: Index(['al', 'bgr', 'bu', 'sc', 'hemo'], dtype='object')
SVM Accuracy: 0.98
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        62
           1       0.97      0.97      0.97        38

    accuracy                           0.98       100
   macro avg       0.98      0.98      0.98       100
weighted avg       0.98      0.98      0.98       100



In [20]:
# Initialize and train Logistic Regression model
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)
print(f"Logistic Regression Selected Features: {selected_features}")
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, y_pred_log):.2f}")
print(classification_report(y_test, y_pred_log))

Logistic Regression Selected Features: Index(['al', 'bgr', 'bu', 'sc', 'hemo'], dtype='object')
Logistic Regression Accuracy: 0.98
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        62
           1       0.97      0.97      0.97        38

    accuracy                           0.98       100
   macro avg       0.98      0.98      0.98       100
weighted avg       0.98      0.98      0.98       100



In [21]:
# Initialize and train XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
print(f"XGBoost Selected Features: {selected_features}")
print(f"XGBoost Accuracy: {accuracy_score(y_test, y_pred_xgb):.2f}")
print(classification_report(y_test, y_pred_xgb))

XGBoost Selected Features: Index(['al', 'bgr', 'bu', 'sc', 'hemo'], dtype='object')
XGBoost Accuracy: 1.00
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        62
           1       1.00      1.00      1.00        38

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



Parameters: { "use_label_encoder" } are not used.



In [22]:
# Initialize and train AdaBoost model
ada_model = AdaBoostClassifier()
ada_model.fit(X_train, y_train)
y_pred_ada = ada_model.predict(X_test)
print(f"AdaBoost Selected Features: {selected_features}")
print(f"AdaBoost Accuracy: {accuracy_score(y_test, y_pred_ada):.2f}")
print(classification_report(y_test, y_pred_ada))



AdaBoost Selected Features: Index(['al', 'bgr', 'bu', 'sc', 'hemo'], dtype='object')
AdaBoost Accuracy: 0.99
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        62
           1       0.97      1.00      0.99        38

    accuracy                           0.99       100
   macro avg       0.99      0.99      0.99       100
weighted avg       0.99      0.99      0.99       100



## Topic: Saving and Loading Models
- **Purpose:** Persist the trained model and scaler for future predictions without retraining.

In [23]:
import pickle
# Save Random Forest model and scaler
with open('Ckd_dt_model.pkl', 'wb') as model_file:
    pickle.dump(dt_model, model_file)

with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [25]:
import pickle

# Saving a model as `.sav`
with open('Ckd_dt_model.sav', 'wb') as file:
    pickle.dump(dt_model, file)

# Loading a model from `.sav`
with open('Ckd_dt_model.sav', 'rb') as file:
    loaded_model = pickle.load(file)


In [26]:
# Load the saved model and scaler
with open('Ckd_dt_model.pkl', 'rb') as model_file:
    loaded_rf_model = pickle.load(model_file)

with open('scaler.pkl', 'rb') as scaler_file:
    loaded_scaler = pickle.load(scaler_file)

## Topic: Prediction on New Input
- **Purpose:** Use the trained model to predict CKD classification for new input data.

In [27]:
new_input = np.array([[0,140, 10, 1.20, 15]])  # Example: [Seating Capacity=100, Meal Price=20.5, Location_Rural=1, Cuisine_Japanese=0, Cuisine_Mexican=1]

# Scale the new input data
scaled_input = loaded_scaler.transform(new_input)

# Make the prediction using the trained model
prediction = loaded_rf_model.predict(scaled_input)

# Print the prediction
print("Prediction for the new input:", prediction)

Prediction for the new input: [1]


