# Cronic Disease Prediction Using ML

## 1. Importing the tools

In [66]:
import numpy as np
import pandas as pd

# Data Preprocessing tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Classification Model's tools
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# For Model Evaluation Tools
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## 2. Loading the DataSet

In [67]:
df = pd.read_csv("kidney_disease.csv", na_values=['?', '\t?', ' ', '\t', 'NA', 'nan', 'NaN'])
df.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38.0,6000.0,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31.0,7500.0,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


## 3. Data Cleaning and preprocessing

In [68]:
# Printing the important Column
important_cloumns = ['age', 'bp', 'sg', 'al', 'hemo', 'sc', 'htn', 'dm', 'cad', 'appet', 'pc', 'classification']
df = df[important_cloumns]
df


Unnamed: 0,age,bp,sg,al,hemo,sc,htn,dm,cad,appet,pc,classification
0,48.0,80.0,1.020,1.0,15.4,1.2,yes,yes,no,good,normal,ckd
1,7.0,50.0,1.020,4.0,11.3,0.8,no,no,no,good,normal,ckd
2,62.0,80.0,1.010,2.0,9.6,1.8,no,yes,no,poor,normal,ckd
3,48.0,70.0,1.005,4.0,11.2,3.8,yes,no,no,poor,abnormal,ckd
4,51.0,80.0,1.010,2.0,11.6,1.4,no,no,no,good,normal,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,15.7,0.5,no,no,no,good,normal,notckd
396,42.0,70.0,1.025,0.0,16.5,1.2,no,no,no,good,normal,notckd
397,12.0,80.0,1.020,0.0,15.8,0.6,no,no,no,good,normal,notckd
398,17.0,60.0,1.025,0.0,14.2,1.0,no,no,no,good,normal,notckd


In [69]:
df.isnull().sum()

Unnamed: 0,0
age,9
bp,12
sg,47
al,46
hemo,52
sc,17
htn,2
dm,2
cad,2
appet,1


In [70]:
df.shape

(400, 12)

In [71]:
# Fill missing values with appropriate methods
df['age'].fillna(df['age'].median(), inplace=True)         # Numerical → median
df['bp'].fillna(df['bp'].median(), inplace=True)           # Numerical → median
df['sg'].fillna(df['sg'].mode()[0], inplace=True)          # Categorical (discrete numeric) → mode
df['al'].fillna(df['al'].mode()[0], inplace=True)          # Categorical (discrete numeric) → mode
df['hemo'].fillna(df['hemo'].median(), inplace=True)       # Numerical → median
df['sc'].fillna(df['sc'].median(), inplace=True)           # Numerical → median

# Removed fillna for 'htn', 'dm', 'cad', 'appet', 'pc' as they are handled in the encoding step (cell 6b120198)

df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].median(), inplace=True)         # Numerical → median
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bp'].fillna(df['bp'].median(), inplace=True)           # Numerical → median
The behavior will change in pandas 3.0. This inplace method will never w

Unnamed: 0,0
age,0
bp,0
sg,0
al,0
hemo,0
sc,0
htn,2
dm,2
cad,2
appet,1


## 4. Encoding

In [72]:
categorical_cols_to_map = ['htn', 'dm', 'cad', 'appet', 'pc', 'classification']

for col in categorical_cols_to_map:
    # Convert to string, clean whitespace/tabs, convert to lowercase for consistency
    df[col] = df[col].astype(str).str.strip().str.replace("\t", '', regex=True).str.lower()

    # Replace any string 'nan' with actual np.nan
    df[col] = df[col].replace('nan', np.nan)

    # Fill NaN values. If mode is empty (e.g., column is entirely NaN), provide a fallback default.
    if not df[col].mode().empty:
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        # Fallback for columns that are entirely NaN after cleaning.
        # This should ideally not be hit for 'classification' given original data,
        # but provides robustness.
        if col in ['htn', 'dm', 'cad']:
            df[col].fillna('no', inplace=True)
        elif col == 'appet':
            df[col].fillna('good', inplace=True)
        elif col == 'pc':
            df[col].fillna('normal', inplace=True)
        elif col == 'classification':
            df[col].fillna('notckd', inplace=True)

# Now apply mapping to the cleaned and filled columns
df['htn'] = df['htn'].map({'yes':1, "no":0})
df['dm'] = df['dm'].map({'yes':1, "no":0})
df['cad'] = df['cad'].map({'yes':1, "no":0})
df['appet'] = df['appet'].map({'good':1, "poor":0})
df['pc'] = df['pc'].map({'normal':1, "abnormal":0})
df['classification'] = df['classification'].map({'ckd':1, "notckd":0})

df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

Unnamed: 0,age,bp,sg,al,hemo,sc,htn,dm,cad,appet,pc,classification
0,48.0,80.0,1.020,1.0,15.4,1.2,1,1,0,1,1,1
1,7.0,50.0,1.020,4.0,11.3,0.8,0,0,0,1,1,1
2,62.0,80.0,1.010,2.0,9.6,1.8,0,1,0,0,1,1
3,48.0,70.0,1.005,4.0,11.2,3.8,1,0,0,0,0,1
4,51.0,80.0,1.010,2.0,11.6,1.4,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,15.7,0.5,0,0,0,1,1,0
396,42.0,70.0,1.025,0.0,16.5,1.2,0,0,0,1,1,0
397,12.0,80.0,1.020,0.0,15.8,0.6,0,0,0,1,1,0
398,17.0,60.0,1.025,0.0,14.2,1.0,0,0,0,1,1,0


## 5. Scaling: Normalization

In [73]:
# yaha kuchh columns pe scalling is liye kr rhe hai because , sometimes machine learning model higher values ko jyada importance de deta hain
# so agar hm ye chaahte hain ki ki ML Model sb ko equal importance de, tb hm MinMax scalling kr ke so ko normalize kr te hain
# so that ki ml model, sare columns ko equal importance de paaye

# Selecting the column to normalize
numeric_cols = ['age', 'bp', 'sg', 'al', 'hemo', 'sc']

# Initialize scaler
scaler = MinMaxScaler()

# fit transform the numeric columns
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

df.head()

Unnamed: 0,age,bp,sg,al,hemo,sc,htn,dm,cad,appet,pc,classification
0,0.522727,0.230769,0.75,0.2,0.836735,0.010582,1,1,0,1,1,1
1,0.056818,0.0,0.75,0.8,0.557823,0.005291,0,0,0,1,1,1
2,0.681818,0.230769,0.25,0.4,0.442177,0.018519,0,1,0,0,1,1
3,0.522727,0.153846,0.0,0.8,0.55102,0.044974,1,0,0,0,0,1
4,0.556818,0.230769,0.25,0.4,0.578231,0.013228,0,0,0,1,1,1


In [74]:
df.isnull().sum(0)

Unnamed: 0,0
age,0
bp,0
sg,0
al,0
hemo,0
sc,0
htn,0
dm,0
cad,0
appet,0


## 6. Data Balancing

In [75]:
!pip install imbalanced-learn



In [76]:
df['classification'].value_counts()

Unnamed: 0_level_0,count
classification,Unnamed: 1_level_1
1,250
0,150


In [77]:
from imblearn.over_sampling import SMOTE

X = df.drop('classification', axis=1)
y = df['classification']


smote = SMOTE(random_state=42)

X_balanced, y_balanced = smote.fit_resample(X, y)

y_balanced.value_counts()

Unnamed: 0_level_0,count
classification,Unnamed: 1_level_1
1,250
0,250


# 7. Train test & Split

In [78]:
# Split into train and test sets (80% train and 20% test)
x_train, x_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

# Checking the shape
print("Train shape : ", x_train.shape)
print("Test Shape : ", x_test.shape)

Train shape :  (400, 11)
Test Shape :  (100, 11)


# 8. Training the Multiple Classifier

In [79]:
# Training the multiple model

models = {
    "Logistic Regression" : LogisticRegression(),
    "Support Vector Machine" : SVC(),
    "Random Forest Classifier" : RandomForestClassifier(),
    "K-Nearest Neighbors" : KNeighborsClassifier(),
    "Decision Tree Classifier" : DecisionTreeClassifier(),
    "Gaussian Naive Bayes" : GaussianNB(),
    "AdaBoost Classifier" : AdaBoostClassifier(),
    "Gradient Boosting Classifier" : GradientBoostingClassifier()
}

# Training and evaluating each and every model
for name, model in models.items():
  print("="*60)
  print("Model : ", name)

  # Train the model
  model.fit(x_train, y_train)

  # Make predictions on the test set
  y_pred = model.predict(x_test)

 # Calculate metrices
  accuracy = accuracy_score(y_test, y_pred)
  classification_rep = classification_report(y_test, y_pred)
  confusion = confusion_matrix(y_test, y_pred)

 # Print the metrics
  print("Accuracy : ", accuracy)
  print("Classification Report : \n", classification_rep)
  print("Confusion Matrix : \n", confusion)

Model :  Logistic Regression
Accuracy :  0.96
Classification Report : 
               precision    recall  f1-score   support

           0       0.93      1.00      0.96        54
           1       1.00      0.91      0.95        46

    accuracy                           0.96       100
   macro avg       0.97      0.96      0.96       100
weighted avg       0.96      0.96      0.96       100

Confusion Matrix : 
 [[54  0]
 [ 4 42]]
Model :  Support Vector Machine
Accuracy :  0.97
Classification Report : 
               precision    recall  f1-score   support

           0       0.95      1.00      0.97        54
           1       1.00      0.93      0.97        46

    accuracy                           0.97       100
   macro avg       0.97      0.97      0.97       100
weighted avg       0.97      0.97      0.97       100

Confusion Matrix : 
 [[54  0]
 [ 3 43]]
Model :  Random Forest Classifier
Accuracy :  1.0
Classification Report : 
               precision    recall  f1-score

# 9. Selecting Best Model

In [80]:
model_gbc = GradientBoostingClassifier()
model_gbc.fit(x_train, y_train)
y_pred = model_gbc.predict(x_test)

print("Confusion Matrix : \n", confusion_matrix(y_test, y_pred))
print("Classification Report : \n", classification_report(y_test, y_pred))

Confusion Matrix : 
 [[54  0]
 [ 1 45]]
Classification Report : 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        54
           1       1.00      0.98      0.99        46

    accuracy                           0.99       100
   macro avg       0.99      0.99      0.99       100
weighted avg       0.99      0.99      0.99       100



# 10. Saving the Ml Model, Encoder and scaler for aproduction

In [86]:
import pickle
import os

# Create the 'models' directory if it doesn't exist
os.makedirs('models', exist_ok=True)

pickle.dump(scaler, open("scaler.pkl", "wb"))
pickle.dump(model_gbc, open("models/model_gbc.pkl", "wb"))

# 11. Prediction on the new data

In [100]:
# loading form the save file
scaler = pickle.load(open("scaler.pkl", "rb"))    # load the scaler
model_gbc = pickle.load(open("models/model_gbc.pkl", "rb")) # load the trained model

def predict_cronic_disease(age, bp, sg, al, hemo, sc, htn, dm, cad, appet, pc):
    #create a dataframe for taking the input variable following the same order
    df_dict = {
        'age' : [age],
        'bp' : [bp],
        'sg' : [sg], # Added 'sg' feature
        'al' : [al],
        'hemo' : [hemo],
        'sc' : [sc],
        'htn' : [htn],
        'dm' : [dm],
        'cad' : [cad],
        'appet' : [appet],
        'pc' : [pc]
    }
    df = pd.DataFrame(df_dict)

    # Encode the categorical columns and assign them back
    df['htn'] = df['htn'].map({'yes':1, "no":0})
    df['dm'] = df['dm'].map({'yes':1, 'no':0})
    df['cad'] = df['cad'].map({'yes':1, 'no':0})
    df['appet'] = df['appet'].map({'good':1, 'poor':0})
    df['pc'] = df['pc'].map({'normal':1, 'abnormal':0})

    # Normalize the numeric columns
    numeric_cols = ['age', 'bp', 'sg', 'al', 'hemo', 'sc']
    df[numeric_cols] = scaler.transform(df[numeric_cols])

    # Make the prediction
    prediction = model_gbc.predict(df)

    # Return the predicted value
    return prediction[0]



In [101]:
result = predict_cronic_disease(age = 30, bp = 30, sg = 1.020, al = 1.0, hemo = 15.9, sc = 1.2, htn = 'no', dm = 'no', cad = 'no', appet = 'good', pc = 'normal')

if (result == 1):
    print("Result : Cronic Disease Detected in the Patient")
else:
    print("Result : No Cronic Disease Detected in the Patient")

Result : No Cronic Disease Detected in the Patient
