In [44]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, r2_score


In [23]:
data= pd.read_csv('chronic_kidney_disease_full.arff.csv')

In [24]:
data=data.replace('?', np.nan)

In [25]:
numeric_columns = ['age', 'bp','sg','bgr', 'bu', 'sc', 'sod', 'pot',
                   'hemo', 'pcv',"wbcc","rbcc",'al', 'su']
for col in numeric_columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')
    data[col].fillna(data[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)


In [26]:
data

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,121.000000,...,44.0,7800.0,5.200000,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,148.036517,...,38.0,6000.0,4.707435,no,no,no,good,no,no,ckd
2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,423.000000,...,31.0,7500.0,4.707435,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.000000,...,32.0,6700.0,3.900000,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,106.000000,...,35.0,7300.0,4.600000,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,140.000000,...,47.0,6700.0,4.900000,no,no,no,good,no,no,notckd
396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,75.000000,...,54.0,7800.0,6.200000,no,no,no,good,no,no,notckd
397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,100.000000,...,49.0,6600.0,5.400000,no,no,no,good,no,no,notckd
398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,114.000000,...,51.0,7200.0,5.900000,no,no,no,good,no,no,notckd


In [29]:
columns_to_impute = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
for col in columns_to_impute:
        data[col] = data.groupby('class')[col].transform(lambda x: x.fillna(x.mode()[0]))


In [31]:
data.head(10)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,normal,normal,notpresent,notpresent,121.0,...,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,normal,normal,notpresent,notpresent,148.036517,...,38.0,6000.0,4.707435,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31.0,7500.0,4.707435,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35.0,7300.0,4.6,no,no,no,good,no,no,ckd
5,60.0,90.0,1.015,3.0,0.0,normal,normal,notpresent,notpresent,74.0,...,39.0,7800.0,4.4,yes,yes,no,good,yes,no,ckd
6,68.0,70.0,1.01,0.0,0.0,normal,normal,notpresent,notpresent,100.0,...,36.0,8406.122449,4.707435,no,no,no,good,no,no,ckd
7,24.0,76.469072,1.015,2.0,4.0,normal,abnormal,notpresent,notpresent,410.0,...,44.0,6900.0,5.0,no,yes,no,good,yes,no,ckd
8,52.0,100.0,1.015,3.0,0.0,normal,abnormal,present,notpresent,138.0,...,33.0,9600.0,4.0,yes,yes,no,good,no,yes,ckd
9,53.0,90.0,1.02,2.0,0.0,abnormal,abnormal,present,notpresent,70.0,...,29.0,12100.0,3.7,yes,yes,no,poor,no,yes,ckd


In [32]:
# Define your binary mapping
binary_map = {
    'yes': 1, 'no': 0,
    'present': 1, 'notpresent': 0,
    'abnormal': 1, 'normal': 0,
    'good': 1, 'poor': 0
}

# List of categorical columns to encode
binary_cols = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']

# Apply mapping
for col in binary_cols:
    data[col] = data[col].map(binary_map)


In [33]:
data

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.020,1.0,0.0,0,0,0,0,121.000000,...,44.0,7800.0,5.200000,1,1,0,1,0,0,ckd
1,7.0,50.0,1.020,4.0,0.0,0,0,0,0,148.036517,...,38.0,6000.0,4.707435,0,0,0,1,0,0,ckd
2,62.0,80.0,1.010,2.0,3.0,0,0,0,0,423.000000,...,31.0,7500.0,4.707435,0,1,0,0,0,1,ckd
3,48.0,70.0,1.005,4.0,0.0,0,1,1,0,117.000000,...,32.0,6700.0,3.900000,1,0,0,0,1,1,ckd
4,51.0,80.0,1.010,2.0,0.0,0,0,0,0,106.000000,...,35.0,7300.0,4.600000,0,0,0,1,0,0,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,0,0,0,0,140.000000,...,47.0,6700.0,4.900000,0,0,0,1,0,0,notckd
396,42.0,70.0,1.025,0.0,0.0,0,0,0,0,75.000000,...,54.0,7800.0,6.200000,0,0,0,1,0,0,notckd
397,12.0,80.0,1.020,0.0,0.0,0,0,0,0,100.000000,...,49.0,6600.0,5.400000,0,0,0,1,0,0,notckd
398,17.0,60.0,1.025,0.0,0.0,0,0,0,0,114.000000,...,51.0,7200.0,5.900000,0,0,0,1,0,0,notckd


In [34]:
X=data.drop(columns=['class'])
y=data['class']

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [41]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


In [42]:
y_pred = model.predict(X_test)


In [48]:

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Precision
print("Precision:", precision_score(y_test, y_pred,pos_label='ckd'))

# Recall
print("Recall:", recall_score(y_test, y_pred, pos_label='ckd'))

# F1 Score
print("F1 Score:", f1_score(y_test, y_pred, pos_label='ckd'))
# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix:
 [[52  0]
 [ 0 28]]


In [49]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5)
print("CV Accuracy:", scores.mean())


CV Accuracy: 0.99


In [50]:

from sklearn.linear_model import LogisticRegression

In [74]:
X_scaled= StandardScaler().fit_transform(X)

In [75]:
X_train_scaled,X_test_scaled,y_train_s,y_test_s=train_test_split(X_scaled,y,test_size=0.2,random_state=42)

In [None]:
log_model = LogisticRegression(max_iter=1000)  # Increase max_iter to ensure convergence
log_model.fit(X_train_scaled, y_train_s)


In [84]:
y_pred_log = log_model.predict(X_test_scaled)


In [85]:

print("Logistic Regression Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("Precision:", precision_score(y_test, y_pred_log, pos_label='ckd'))  # or pos_label=1
print("Recall:", recall_score(y_test, y_pred_log, pos_label='ckd'))
print("F1 Score:", f1_score(y_test, y_pred_log, pos_label='ckd'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log))


Logistic Regression Results:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix:
 [[52  0]
 [ 0 28]]


In [87]:
scores = cross_val_score(log_model, X_scaled, y, cv=5)
print("Cross-validation Accuracy:", scores.mean())


Cross-validation Accuracy: 0.9974999999999999
