## Data preprocessing
Start by importing the dataset as instructed in the UCI Machine Learning Repository.

In [13]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
  
# Download the dataset registered under id 336 in the UCI repository.
chronic_kidney_disease = fetch_ucirepo(id=336)

# Features and targets (as pandas dataframes), unpacked in one step.
X, y = (
    chronic_kidney_disease.data.features,
    chronic_kidney_disease.data.targets,
)


### Examine and prepare data
The imported features and targets are now merged into a single DataFrame, ready for handling.
Its structure is inspected with the pandas `info` method, and summary statistics are read with `describe`.


In [14]:
# Place the feature columns and the target column side by side in one frame.
df = pd.concat(objs=[X, y], axis="columns")

# Print the column names, non-null counts and dtypes of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     391 non-null    float64
 1   bp      388 non-null    float64
 2   sg      353 non-null    float64
 3   al      354 non-null    float64
 4   su      351 non-null    float64
 5   rbc     248 non-null    object 
 6   pc      335 non-null    object 
 7   pcc     396 non-null    object 
 8   ba      396 non-null    object 
 9   bgr     356 non-null    float64
 10  bu      381 non-null    float64
 11  sc      383 non-null    float64
 12  sod     313 non-null    float64
 13  pot     312 non-null    float64
 14  hemo    348 non-null    float64
 15  pcv     329 non-null    float64
 16  wbcc    294 non-null    float64
 17  rbcc    269 non-null    float64
 18  htn     398 non-null    object 
 19  dm      398 non-null    object 
 20  cad     398 non-null    object 
 21  appet   399 non-null    object 
 22  pe

In [15]:
# Summary statistics for every column; include='all' also reports the
# object-typed columns (unique/top/freq) next to the numeric ones.
df.describe(include='all')

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
count,391.0,388.0,353.0,354.0,351.0,248,335,396,396,356.0,...,329.0,294.0,269.0,398,398,398,399,399,399,400
unique,,,,,,2,2,2,2,,...,,,,2,3,2,2,2,2,3
top,,,,,,normal,normal,notpresent,notpresent,,...,,,,no,no,no,good,no,no,ckd
freq,,,,,,201,259,354,374,,...,,,,251,260,364,317,323,339,248
mean,51.483376,76.469072,1.017408,1.016949,0.450142,,,,,148.036517,...,38.884498,8406.122449,4.707435,,,,,,,
std,17.169714,13.683637,0.005717,1.352679,1.099191,,,,,79.281714,...,8.990105,2944.47419,1.025323,,,,,,,
min,2.0,50.0,1.005,0.0,0.0,,,,,22.0,...,9.0,2200.0,2.1,,,,,,,
25%,42.0,70.0,1.01,0.0,0.0,,,,,99.0,...,32.0,6500.0,3.9,,,,,,,
50%,55.0,80.0,1.02,0.0,0.0,,,,,121.0,...,40.0,8000.0,4.8,,,,,,,
75%,64.5,80.0,1.02,2.0,0.0,,,,,163.0,...,45.0,9800.0,5.4,,,,,,,


### Modification pipeline
The next code cell restructures the data frame to fit criteria set by the task.

In [16]:
# Map for replacing short names to full length. Only the columns listed here
# are kept; every other column of `df` is dropped by the selection below.
column_map = {
    'age': 'age',
    'bp': 'blood pressure',
    'sg': 'specific gravity',
    'al': 'albumin',
    'su': 'sugar',
    'bgr': 'blood glucose random',
    'bu': 'blood urea',
    'sod': 'sodium',
    'pot': 'potassium',
    'hemo': 'hemoglobin',
    'pcv': 'packed cell volume',
    'wbcc': 'white blood cell count',
    'rbcc': 'red blood cell count',
    'class': 'class'
}

# Raw class labels -> single-letter codes used from here on.
class_map = {
    'ckd': 'a',
    'notckd': 'c'
}

# A row with this many (or more) missing values is discarded.
MISSING_VALUES_DROP_THRESHOLD = 3

# Select the mapped columns and rename them according to the column map
df_modified = df[list(column_map.keys())].rename(columns=column_map)

# Remove white space to fix inconsistent values
class_labels = df_modified['class'].str.strip()

# Series.map turns every label that is missing from class_map into NaN
# without any warning, so fail loudly if an unexpected label shows up.
unknown_labels = set(class_labels.dropna().unique()) - set(class_map)
if unknown_labels:
    raise ValueError(f"Unmapped class labels: {sorted(unknown_labels)}")

# Then rename all instances according to map
df_modified['class'] = class_labels.map(class_map)

# Convert hemoglobin from g/dl --> g/l
df_modified['hemoglobin'] = df_modified['hemoglobin'] * 10

rows_before_clean = df_modified.shape[0]
# Remove rows with >=3 missing values. The explicit copy makes df_modified an
# independent frame, so later cells can assign to its columns without pandas
# emitting SettingWithCopyWarning.
missing_per_row = df_modified.isna().sum(axis=1)
df_modified = df_modified[missing_per_row < MISSING_VALUES_DROP_THRESHOLD].copy()
rows_after_clean = df_modified.shape[0]

# Check that modified columns look correct
df_modified.info()


<class 'pandas.core.frame.DataFrame'>
Index: 265 entries, 0 to 399
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   age                     262 non-null    float64
 1   blood pressure          262 non-null    float64
 2   specific gravity        264 non-null    float64
 3   albumin                 265 non-null    float64
 4   sugar                   265 non-null    float64
 5   blood glucose random    255 non-null    float64
 6   blood urea              260 non-null    float64
 7   sodium                  244 non-null    float64
 8   potassium               244 non-null    float64
 9   hemoglobin              262 non-null    float64
 10  packed cell volume      263 non-null    float64
 11  white blood cell count  249 non-null    float64
 12  red blood cell count    244 non-null    float64
 13  class                   265 non-null    object 
dtypes: float64(13), object(1)
memory usage: 31.1+ K

### Confirming modifications
The next code cells confirm that earlier modifications worked as intended.
The class column should only contain the values a and c, with stray whitespace in the original labels trimmed away.
Hemoglobin mean should be 10x higher than in the previous check (g/dl --> g/l).
Row count should have decreased after cleanup.

In [17]:
# Report the frame size before and after the missing-value filter,
# plus the difference between the two.
print(f"Row count before cleanup: {rows_before_clean}")
print(f"Row count after cleanup: {rows_after_clean}")
print(f"Number of rows removed: {rows_before_clean - rows_after_clean}")

Row count before cleanup: 400
Row count after cleanup: 265
Number of rows removed: 135


In [18]:
# check class was modified correctly: print how many rows carry each class label
print(df_modified['class'].value_counts())

class
c    139
a    126
Name: count, dtype: int64


In [19]:
# hemoglobin check: print the mean of the hemoglobin column after the unit conversion
print(df_modified['hemoglobin'].mean())

130.19847328244273


### Splitting the data
Next, the data frame will be split in two, separating the affected (a) and control (c) individuals.
As "class" contains 139 control and 126 affected rows, the two resulting data frames are expected
to have the same row counts.