In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# 📌 Step 1: Load the dataset
kidney_df = pd.read_csv("kidney_disease1.csv")

# 📌 Step 2: Drop unnecessary columns
kidney_df.drop(columns=['id'], inplace=True, errors='ignore')

# 📌 Step 3: Convert categorical labels to binary
kidney_df['classification'] = kidney_df['classification'].map({'ckd': 1, 'notckd': 0})

# 📌 Step 4: Convert Yes/No categorical values into binary (1 = Yes, 0 = No)
binary_cols = ['htn', 'dm', 'cad', 'pcc', 'ba', 'pe', 'ane']
for col in binary_cols:
    kidney_df[col] = kidney_df[col].map({'yes': 1, 'no': 0})

# 📌 Step 5: Convert categorical variables into numerical values
kidney_df['appet'] = kidney_df['appet'].map({'good': 1, 'poor': 0})
kidney_df['rbc'] = kidney_df['rbc'].map({'abnormal': 1, 'normal': 0})
kidney_df['pc'] = kidney_df['pc'].map({'abnormal': 1, 'normal': 0})

# 📌 Step 6: Convert 'pcv', 'wc', 'rc' to numeric
kidney_df[['pcv', 'wc', 'rc']] = kidney_df[['pcv', 'wc', 'rc']].apply(pd.to_numeric, errors='coerce')

# 📌 Step 7: Fill missing values
# Fill numerical columns with median
num_cols = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']
kidney_df[num_cols] = kidney_df[num_cols].apply(lambda x: x.fillna(x.median()))

# Fill categorical columns with mode
cat_cols = ['htn', 'dm', 'cad', 'appet', 'pe', 'ane']
kidney_df[cat_cols] = kidney_df[cat_cols].apply(lambda x: x.fillna(x.mode()[0]))

# Fill missing values in 'rbc' and 'pc' with mode
kidney_df['rbc'].fillna(kidney_df['rbc'].mode()[0], inplace=True)
kidney_df['pc'].fillna(kidney_df['pc'].mode()[0], inplace=True)

# 📌 Step 8: Drop 'pcc' and 'ba' (entirely missing)
kidney_df.drop(columns=['pcc', 'ba'], inplace=True, errors='ignore')

# Verify there are no missing values left
# Fill missing values in the classification column
kidney_df['classification'].fillna(kidney_df['classification'].mode()[0], inplace=True)

# Verify again that no missing values exist
print("Missing Values After Cleaning:\n", kidney_df.isnull().sum())


Missing Values After Cleaning:
 age               0
bp                0
sg                0
al                0
su                0
rbc               0
pc                0
bgr               0
bu                0
sc                0
sod               0
pot               0
hemo              0
pcv               0
wc                0
rc                0
htn               0
dm                0
cad               0
appet             0
pe                0
ane               0
classification    0
dtype: int64


In [2]:
# Define features (X) and target variable (y)
X_kidney = kidney_df.drop(columns=['classification'])  # All columns except target
y_kidney = kidney_df['classification']  # Target column


In [3]:
X_train_kidney, X_test_kidney, y_train_kidney, y_test_kidney = train_test_split(
    X_kidney, y_kidney, test_size=0.2, random_state=42
)

In [4]:
# Initialize StandardScaler
scaler = StandardScaler()

# Fit-transform training data, transform test data
X_train_kidney = pd.DataFrame(scaler.fit_transform(X_train_kidney), columns=X_train_kidney.columns)
X_test_kidney = pd.DataFrame(scaler.transform(X_test_kidney), columns=X_test_kidney.columns)

In [5]:
# Initialize SMOTE
smote = SMOTE(random_state=42)
X_train_kidney_balanced, y_train_kidney_balanced = smote.fit_resample(X_train_kidney, y_train_kidney)

# Check new class distribution
print("Balanced Class Distribution:\n", y_train_kidney_balanced.value_counts())

Balanced Class Distribution:
 classification
1.0    198
0.0    198
Name: count, dtype: int64


In [6]:
# Initialize XGBoost Classifier
xgb_kidney = XGBClassifier(n_estimators=100, learning_rate=0.05, max_depth=3, random_state=42)

# Train the model
xgb_kidney.fit(X_train_kidney_balanced, y_train_kidney_balanced)

# Predict on test data
y_pred_kidney = xgb_kidney.predict(X_test_kidney)

# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report

print("Model Accuracy:", accuracy_score(y_test_kidney, y_pred_kidney))
print("Classification Report:\n", classification_report(y_test_kidney, y_pred_kidney))


Model Accuracy: 0.9875
Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      1.00      0.98        28
         1.0       1.00      0.98      0.99        52

    accuracy                           0.99        80
   macro avg       0.98      0.99      0.99        80
weighted avg       0.99      0.99      0.99        80



In [7]:
print("Final Features Used for Training:", X_kidney.columns.tolist())
print("Number of Features in Training Data:", len(X_kidney.columns))


Final Features Used for Training: ['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
Number of Features in Training Data: 22


In [8]:
import pandas as pd
import numpy as np

# Ensure test input matches exactly with the 22 training features
test_input_values = [
    30,    # Age
    120,   # Blood Pressure (Normal: 120/80)
    1.020, # Specific Gravity (Normal Range: 1.015 - 1.025)
    0,     # Albumin (No protein leakage)
    0,     # Sugar (No sugar in urine)
    0,     # RBC (Normal)
    0,     # Pus Cells (Normal)
    100,   # **BGR (Blood Glucose Random) - Healthy: 80-120**
    15,    # Blood Urea (Healthy: < 20)
    0.8,   # Serum Creatinine (Healthy: 0.6 - 1.2)
    140,   # Sodium (Healthy: 135-145)
    4.5,   # Potassium (Healthy: 3.5-5.0)
    15,    # Hemoglobin (Normal Range: 12-17)
    42,    # Packed Cell Volume (Normal: 40-50)
    9000,  # WBC Count (Normal: 4000-11000)
    5.2,   # RBC Count (Healthy: 4.7-6.1)
    0,     # Hypertension (No)
    0,     # Diabetes Mellitus (No)
    0,     # Coronary Artery Disease (No)
    1,     # Appetite (Good)
    0,     # Pedal Edema (No)
    0      # Anemia (No)
]

# Convert to DataFrame with correct column names
test_input_df = pd.DataFrame([test_input_values], columns=X_kidney.columns)

# Scale test input
test_input_scaled = scaler.transform(test_input_df)

# Predict probability
prob = xgb_kidney.predict_proba(test_input_scaled)[0]

# Adjust threshold (Only predict `1` if probability > 0.6)
prediction = 1 if prob[1] > 0.6 else 0

print("✅ Final Kidney Disease Prediction:", prediction)  # Expected: 0


✅ Final Kidney Disease Prediction: 0


In [9]:
import pandas as pd
import numpy as np

# Ensure test input matches exactly with the 22 training features
test_input_values = [
    65,     # Age (Older individuals are more prone)
    150,    # Blood Pressure (High BP is a risk factor)
    1.010,  # Specific Gravity (Lower values indicate kidney issues)
    3,      # Albumin (Higher values suggest kidney damage)
    2,      # Sugar (Presence of sugar can indicate problems)
    1,      # RBC (Abnormal RBC count)
    1,      # Pus Cells (Abnormal)
    180,    # **Blood Glucose Random (High, common in diabetes & CKD)**
    120,    # Blood Urea (Very high, normal < 20)
    4.5,    # Serum Creatinine (High, normal ~0.6-1.2)
    125,    # Sodium (Lower sodium indicates kidney dysfunction)
    6.0,    # Potassium (Elevated potassium is a red flag)
    9.0,    # Hemoglobin (Low hemoglobin is common in CKD)
    30,     # Packed Cell Volume (Low PCV is a sign of CKD)
    6000,   # WBC Count (Lower than usual)
    3.5,    # RBC Count (Lower than normal ~4.7-6.1)
    1,      # Hypertension (Yes, common in CKD patients)
    1,      # Diabetes Mellitus (Yes, a risk factor)
    1,      # Coronary Artery Disease (Yes, another risk factor)
    0,      # Appetite (Poor)
    1,      # Pedal Edema (Yes, common in CKD)
    1       # Anemia (Yes, common in CKD)
]

# Convert to DataFrame with correct column names
test_input_df = pd.DataFrame([test_input_values], columns=X_kidney.columns)

# Scale test input
test_input_scaled = scaler.transform(test_input_df)

# Predict probability
prob = xgb_kidney.predict_proba(test_input_scaled)[0]

# Adjust threshold (Only predict `1` if probability > 0.6)
prediction = 1 if prob[1] > 0.6 else 0

print("✅ Final Kidney Disease Prediction:", prediction)  # Expected: 1


✅ Final Kidney Disease Prediction: 1


In [10]:
import joblib

# Save the trained model
joblib.dump(xgb_kidney, "kidney_disease_model.joblib")

print("✅ Model saved as 'kidney_disease_model.joblib'")


✅ Model saved as 'kidney_disease_model.joblib'
