In [None]:
!kaggle datasets download -d mansoordaku/ckdisease
!unzip ckdisease.zip

Dataset URL: https://www.kaggle.com/datasets/mansoordaku/ckdisease
License(s): unknown
ckdisease.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  ckdisease.zip
replace kidney_disease.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report, confusion_matrix
# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder


In [None]:
# Load the dataset.
# The archive is extracted into the current working directory, so a relative path
# is used here instead of a Colab-only absolute path.
data = pd.read_csv('kidney_disease.csv')  # Update filename if needed

# Inspect data
data.info()
# A bare `data.head()` in the middle of a cell is evaluated and thrown away;
# display() renders it explicitly.
display(data.head())

# Preprocess data: drop every row that has at least one missing value.
# This is a simple strategy that can discard a large share of the rows, so the
# number of removed rows is reported.
rows_before = len(data)
data = data.dropna()
print(f"Dropped {rows_before - len(data)} of {rows_before} rows containing missing values")



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [None]:
# List each column with its dtype; columns reported as `object` are the
# non-numeric ones that need encoding before modelling.
dtype_summary = data.dtypes
print(dtype_summary)

id                  int64
age               float64
bp                float64
sg                float64
al                float64
su                float64
rbc                object
pc                 object
pcc                object
ba                 object
bgr               float64
bu                float64
sc                float64
sod               float64
pot               float64
hemo              float64
pcv                object
wc                 object
rc                 object
htn                object
dm                 object
cad                object
appet              object
pe                 object
ane                object
classification     object
dtype: object


In [None]:

# Missing-value handling: keep only the rows in which every column is populated.
# (The defaults of dropna are spelled out: operate on rows, drop on any NaN.)
data = data.dropna(axis=0, how='any')

In [None]:

# Convert text (object) columns to a numerical format.
#
# Some object columns may hold numbers that were read as text (presumably because
# of stray characters in a few cells - TODO confirm against the raw CSV).
# Label-encoding those would replace real magnitudes with arbitrary lexicographic
# codes, so a column is parsed as numeric when most of its values are numbers.
# Only the remaining, genuinely categorical columns are label-encoded.
MIN_NUMERIC_SHARE = 0.5  # share of parseable values needed to treat a column as numeric

# column name -> fitted LabelEncoder, kept so integer codes can be mapped back
# to the original labels with label_encoders[col].inverse_transform(...)
label_encoders = {}

for column in data.select_dtypes(include=['object']).columns:
    # Strip surrounding whitespace so that e.g. 'yes' and ' yes' are one category.
    cleaned = data[column].astype(str).str.strip()
    as_number = pd.to_numeric(cleaned, errors='coerce')

    if as_number.notna().mean() > MIN_NUMERIC_SHARE:
        # Numeric column stored as text: keep the real values; unparseable
        # entries become NaN and are removed below.
        data[column] = as_number
    else:
        le = LabelEncoder()
        data[column] = le.fit_transform(cleaned)
        label_encoders[column] = le

# Remove rows whose numeric-as-text cells could not be parsed (no-op if none).
data = data.dropna()

In [None]:
# Separate features and target
# Adjust TARGET_COLUMN to the actual target column name
TARGET_COLUMN = 'classification'
# Row identifiers carry no clinical information. If the file happens to be ordered
# by class they leak the label, so they are excluded from the features.
ID_COLUMNS = ['id']

X = (
    data
    .drop(columns=[TARGET_COLUMN])              # raises KeyError if the target is missing
    .drop(columns=ID_COLUMNS, errors='ignore')  # tolerate files without an id column
)
y = data[TARGET_COLUMN]

In [None]:

# Map the target labels to consecutive integer codes.
# The fitted encoder stays available as `le_y` for decoding predictions later.
le_y = LabelEncoder().fit(y)
y = le_y.transform(y)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Multiple linear regression: regress the encoded class label on all features
# and score the held-out predictions by mean squared error.
mlr = LinearRegression().fit(X_train, y_train)
mlr_preds = mlr.predict(X_test)
mlr_mse = mean_squared_error(y_true=y_test, y_pred=mlr_preds)
print(f"MLR Mean Squared Error: {mlr_mse}")

MLR Mean Squared Error: 0.021218173781410277


In [None]:
# Simple linear regression: same target, but using only the first feature column.
# The column is selected with a list so the result stays a 2-D DataFrame.
X_train_first = X_train.iloc[:, [0]]
X_test_first = X_test.iloc[:, [0]]

slr = LinearRegression().fit(X_train_first, y_train)
slr_preds = slr.predict(X_test_first)
slr_mse = mean_squared_error(y_true=y_test, y_pred=slr_preds)
print(f"SLR Mean Squared Error: {slr_mse}")


SLR Mean Squared Error: 0.046060055773704624


In [None]:
# Logistic regression classifier; the iteration cap is set to 200.
log_reg = LogisticRegression(max_iter=200).fit(X_train, y_train)
log_preds = log_reg.predict(X_test)
log_accuracy = accuracy_score(y_true=y_test, y_pred=log_preds)
print(f"Logistic Regression Accuracy: {log_accuracy}")


Logistic Regression Accuracy: 1.0


In [None]:
# Decision tree classifier.
# random_state is fixed (same seed as the train/test split) so that tie-breaking
# between equally good splits is reproducible across Restart & Run All.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_preds = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_preds)
print(f"Decision Tree Accuracy: {dt_accuracy}")


Decision Tree Accuracy: 1.0


In [None]:
# Random forest of 100 trees.
# Bootstrapping and per-split feature sampling are random, so random_state is
# fixed (same seed as the train/test split) to make the reported accuracy reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_preds)
print(f"Random Forest Accuracy: {rf_accuracy}")


Random Forest Accuracy: 1.0


In [None]:
# Soft-voting ensemble over the three classifiers defined in earlier cells:
# the class with the highest averaged predicted probability wins.
base_estimators = [
    ('log_reg', log_reg),
    ('rf', rf),
    ('dt', dt),
]
ensemble_model = VotingClassifier(estimators=base_estimators, voting='soft')
ensemble_model.fit(X_train, y_train)

ensemble_preds = ensemble_model.predict(X_test)
ensemble_accuracy = accuracy_score(y_true=y_test, y_pred=ensemble_preds)
print(f"Ensemble Voting Classifier Accuracy: {ensemble_accuracy}")


Ensemble Voting Classifier Accuracy: 1.0
