In [208]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.io import arff

cols = []
with open("chronic_kidney_disease.arff", "r") as f:
    for line in f:
        line = line.strip()
        if line.lower().startswith("@attribute"):
            parts = line.split()
            colname = parts[1].strip("'\"") 
            cols.append(colname)
        elif line.lower().startswith("@data"):
            break

df = pd.read_csv(
    "chronic_kidney_disease.arff",
    comment="@",
    header=None,
    names=cols,
    na_values=["?", " "],
    engine="python",
    on_bad_lines="skip"
)

df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,,...,38,6000,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [209]:
filtered_data = df.drop(columns=["rbc", "pc", "pcc", "ba", "sc", "htn", "dm", "cad", "appet", "pe", "ane"])
filtered_data.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sod,pot,hemo,pcv,wbcc,rbcc,class
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,,,15.4,44,7800,5.2,ckd
1,7.0,50.0,1.02,4.0,0.0,,18.0,,,11.3,38,6000,,ckd
2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,,,9.6,31,7500,,ckd
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,111.0,2.5,11.2,32,6700,3.9,ckd
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,,,11.6,35,7300,4.6,ckd


In [210]:
filtered_data["hemo"] = filtered_data["hemo"] / 10
filtered_data.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sod,pot,hemo,pcv,wbcc,rbcc,class
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,,,1.54,44,7800,5.2,ckd
1,7.0,50.0,1.02,4.0,0.0,,18.0,,,1.13,38,6000,,ckd
2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,,,0.96,31,7500,,ckd
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,111.0,2.5,1.12,32,6700,3.9,ckd
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,,,1.16,35,7300,4.6,ckd


In [211]:
filtered_data["class"].replace("notckd", "c", inplace=True)
filtered_data["class"].replace("ckd", "a", inplace=True)
filtered_data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  filtered_data["class"].replace("notckd", "c", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  filtered_data["class"].replace("ckd", "a", inplace=True)


Unnamed: 0,age,bp,sg,al,su,bgr,bu,sod,pot,hemo,pcv,wbcc,rbcc,class
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,,,1.54,44,7800,5.2,a
1,7.0,50.0,1.02,4.0,0.0,,18.0,,,1.13,38,6000,,a
2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,,,0.96,31,7500,,a
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,111.0,2.5,1.12,32,6700,3.9,a
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,,,1.16,35,7300,4.6,a


In [212]:
# print(filtered_data.shape)
filtered_data.dropna(thresh=12, inplace=True)
# print(filtered_data.shape)
filtered_data.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sod,pot,hemo,pcv,wbcc,rbcc,class
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,,,1.54,44,7800.0,5.2,a
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,111.0,2.5,1.12,32,6700.0,3.9,a
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,,,1.16,35,7300.0,4.6,a
5,60.0,90.0,1.015,3.0,0.0,74.0,25.0,142.0,3.2,1.22,39,7800.0,4.4,a
6,68.0,70.0,1.01,0.0,0.0,100.0,54.0,104.0,4.0,1.24,36,,,a


In [213]:
affected = filtered_data[filtered_data['class'] == 'a']
not_affected = filtered_data[filtered_data['class'] == 'c']

In [214]:
affected.describe()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sod,pot,hemo
count,123.0,124.0,125.0,126.0,126.0,120.0,125.0,109.0,109.0,125.0
mean,56.902439,80.806452,1.0136,1.857143,0.777778,179.941667,80.444,133.770642,4.748624,1.06144
std,14.979447,15.803508,0.004561,1.429485,1.337992,93.631703,60.477776,7.642029,4.184221,0.214719
min,4.0,50.0,1.005,0.0,0.0,22.0,1.5,104.0,2.5,0.31
25%,49.5,70.0,1.01,1.0,0.0,107.0,37.0,132.0,3.8,0.95
50%,60.0,80.0,1.015,2.0,0.0,156.5,60.0,136.0,4.2,1.08
75%,65.0,90.0,1.015,3.0,1.0,239.5,107.0,139.0,4.9,1.2
max,90.0,180.0,1.025,5.0,5.0,490.0,322.0,145.0,47.0,1.61


In [215]:
not_affected.describe()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sod,pot,hemo
count,138.0,136.0,138.0,138.0,138.0,134.0,134.0,135.0,135.0,136.0
mean,46.362319,71.323529,1.0225,0.0,0.0,107.38806,32.761194,141.718519,4.341481,1.52
std,15.774387,8.59016,0.002509,0.0,0.0,18.818049,11.44991,4.806171,0.594174,0.128195
min,12.0,60.0,1.02,0.0,0.0,70.0,10.0,135.0,3.3,1.3
25%,34.0,60.0,1.02,0.0,0.0,93.0,23.25,138.0,3.7,1.41
50%,46.0,70.0,1.0225,0.0,0.0,107.5,32.5,141.0,4.5,1.5
75%,58.0,80.0,1.025,0.0,0.0,123.75,44.0,146.0,4.9,1.62
max,80.0,80.0,1.025,0.0,0.0,140.0,50.0,150.0,5.0,1.78


In [None]:
def plot_data(df, title):
    plt.figure(figsize=(20, 10))
    plt.title(title)
    for i, col in enumerate(df.select_dtypes(include=['int', 'float'])):
        plt.subplot(3, 3, i + 1)
        df[col].plot(kind='hist', title=col)
        plt.xlabel(col)
    plt.show()

plot_data(affected, "Affected")

TypeError: 'str' object is not callable

<Figure size 2000x1000 with 0 Axes>