In [1]:
from ucimlrepo import fetch_ucirepo 

In [2]:
# fetch dataset
chronic_kidney_disease = fetch_ucirepo(id=336)


In [3]:
# data (as pandas dataframes)
X = chronic_kidney_disease.data.features
y = chronic_kidney_disease.data.targets 

In [4]:
# metadata
print(chronic_kidney_disease.metadata)

{'uci_id': 336, 'name': 'Chronic Kidney Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/336/chronic+kidney+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/336/data.csv', 'abstract': 'This dataset can be used to predict the chronic kidney disease and it can be collected from the hospital nearly 2 months of period.', 'area': 'Other', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 400, 'num_features': 24, 'feature_types': ['Real'], 'demographics': ['Age'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2015, 'last_updated': 'Mon Mar 04 2024', 'dataset_doi': '10.24432/C5G020', 'creators': ['L. Rubini', 'P. Soundarapandian', 'P. Eswaran'], 'intro_paper': None, 'additional_info': {'summary': 'We use the following representation to collect the dataset\r\n                        age\t\t-\tage\t\r\n\t\t\tbp\t\t-\tblood pressure\r\n\t\t\tsg\t

In [5]:
# variable information
print(chronic_kidney_disease.variables) 


     name     role         type demographic              description  \
0     age  Feature      Integer         Age                     None   
1      bp  Feature      Integer        None           blood pressure   
2      sg  Feature  Categorical        None         specific gravity   
3      al  Feature  Categorical        None                  albumin   
4      su  Feature  Categorical        None                    sugar   
5     rbc  Feature       Binary        None          red blood cells   
6      pc  Feature       Binary        None                 pus cell   
7     pcc  Feature       Binary        None          pus cell clumps   
8      ba  Feature       Binary        None                 bacteria   
9     bgr  Feature      Integer        None     blood glucose random   
10     bu  Feature      Integer        None               blood urea   
11     sc  Feature   Continuous        None         serum creatinine   
12    sod  Feature      Integer        None                   so

In [1]:
import pandas as pd
import numpy as np  
from sklearn.impute import SimpleImputer

In [3]:
from ucimlrepo import fetch_ucirepo

# Fetch the dataset
chronic_kidney_disease = fetch_ucirepo(id=336)

# Extract features and targets
X = chronic_kidney_disease.data.features 
y = chronic_kidney_disease.data.targets

# Load the data into a DataFrame
df = X.copy()

# Check the first few rows of the original data
print("Original Data (First few rows):")
print(df.head())

Original Data (First few rows):
    age    bp     sg   al   su     rbc  ...  htn   dm cad  appet   pe  ane
0  48.0  80.0  1.020  1.0  0.0     NaN  ...  yes  yes  no   good   no   no
1   7.0  50.0  1.020  4.0  0.0     NaN  ...   no   no  no   good   no   no
2  62.0  80.0  1.010  2.0  3.0  normal  ...   no  yes  no   poor   no  yes
3  48.0  70.0  1.005  4.0  0.0  normal  ...  yes   no  no   poor  yes  yes
4  51.0  80.0  1.010  2.0  0.0  normal  ...   no   no  no   good   no   no

[5 rows x 24 columns]


In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo

# Fetch the dataset
chronic_kidney_disease = fetch_ucirepo(id=336)

# Convert features into a DataFrame and add the target column
df = pd.DataFrame(chronic_kidney_disease.data.features)
df['class'] = chronic_kidney_disease.data.targets

# Print the first few rows of the data
print("First few rows:")
print(df.head())


First few rows:
    age    bp     sg   al   su     rbc  ...   dm cad appet   pe  ane  class
0  48.0  80.0  1.020  1.0  0.0     NaN  ...  yes  no  good   no   no    ckd
1   7.0  50.0  1.020  4.0  0.0     NaN  ...   no  no  good   no   no    ckd
2  62.0  80.0  1.010  2.0  3.0  normal  ...  yes  no  poor   no  yes    ckd
3  48.0  70.0  1.005  4.0  0.0  normal  ...   no  no  poor  yes  yes    ckd
4  51.0  80.0  1.010  2.0  0.0  normal  ...   no  no  good   no   no    ckd

[5 rows x 25 columns]


In [5]:
# Iterate over the columns
for column in df.columns:
    if df[column].dtype == 'object':  # Check if the column is categorical
        # Fill missing values with the most frequent value (mode)
        df[column] = df[column].fillna(df[column].mode()[0])
    else:  # If the column is numerical
        # Fill missing values with the median value
        df[column] = df[column].fillna(df[column].median())

# Print the missing values after handling
print("Missing values after handling:\n", df.isnull().sum())

# Display the first few rows of the updated DataFrame
import matplotlib.pyplot as plt
from IPython.display import display

display(df.head())


Missing values after handling:
 age      0
bp       0
sg       0
al       0
su       0
rbc      0
pc       0
pcc      0
ba       0
bgr      0
bu       0
sc       0
sod      0
pot      0
hemo     0
pcv      0
wbcc     0
rbcc     0
htn      0
dm       0
cad      0
appet    0
pe       0
ane      0
class    0
dtype: int64


Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,normal,normal,notpresent,notpresent,121.0,36.0,1.2,138.0,4.4,15.4,44.0,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.02,4.0,0.0,normal,normal,notpresent,notpresent,121.0,18.0,0.8,138.0,4.4,11.3,38.0,6000.0,4.8,no,no,no,good,no,no,ckd
2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,423.0,53.0,1.8,138.0,4.4,9.6,31.0,7500.0,4.8,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,106.0,26.0,1.4,138.0,4.4,11.6,35.0,7300.0,4.6,no,no,no,good,no,no,ckd


In [6]:
# List of columns to be mapped
binary_columns = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'class']

# Check unique values in each binary column
for column in binary_columns:
    print(f"{column}: {df[column].unique()}")  # Corrected parenthesis

    # Strip string values if needed
    df[column] = df[column].map(lambda x: x.strip() if isinstance(x, str) else x)

# Binary mappings for categorical columns
binary_mappings = {
    'rbc': {'normal': 0, 'abnormal': 1},
    'pc': {'normal': 0, 'abnormal': 1},
    'pcc': {'notpresent': 0, 'present': 1},
    'ba': {'notpresent': 0, 'present': 1},
    'htn': {'no': 0, 'yes': 1},
    'dm': {'no': 0, 'yes': 1},
    'cad': {'no': 0, 'yes': 1},
    'appet': {'poor': 0, 'good': 1},
    'pe': {'no': 0, 'yes': 1},
    'ane': {'no': 0, 'yes': 1},
    'class': {'notckd': 0, 'ckd': 1}
}

# Applying the binary mappings to the DataFrame
for column, mapping in binary_mappings.items():
    df[column] = df[column].map(mapping)

# Print the first few rows to check the changes
print(df.head())


rbc: ['normal' 'abnormal']
pc: ['normal' 'abnormal']
pcc: ['notpresent' 'present']
ba: ['notpresent' 'present']
htn: ['yes' 'no']
dm: ['yes' 'no' '\tno']
cad: ['no' 'yes']
appet: ['good' 'poor']
pe: ['no' 'yes']
ane: ['no' 'yes']
class: ['ckd' 'ckd\t' 'notckd']
    age    bp     sg   al   su  rbc  pc  ...  htn  dm  cad  appet  pe  ane  class
0  48.0  80.0  1.020  1.0  0.0    0   0  ...    1   1    0      1   0    0      1
1   7.0  50.0  1.020  4.0  0.0    0   0  ...    0   0    0      1   0    0      1
2  62.0  80.0  1.010  2.0  3.0    0   0  ...    0   1    0      0   0    1      1
3  48.0  70.0  1.005  4.0  0.0    0   1  ...    1   0    0      0   1    1      1
4  51.0  80.0  1.010  2.0  0.0    0   0  ...    0   0    0      1   0    0      1

[5 rows x 25 columns]


In [7]:
import numpy as np
from scipy.stats import zscore

# Assuming 'df' is the DataFrame
# Calculate the Z-scores for all numerical columns
z_scores = np.abs(zscore(df.select_dtypes(include=['float64', 'int64'])))  # Only numerical columns

# Define the threshold for outliers (e.g., Z-score > 3)
threshold = 3

# Identify outliers by checking where Z-scores are above the threshold
outliers = (z_scores > threshold)

# Remove rows with any outliers
df_no_outliers = df[~outliers.any(axis=1)]  # '~' is used to negate the boolean mask

# Print the shape of the original and cleaned DataFrame
print(f"Original DataFrame shape: {df.shape}")
print(f"Cleaned DataFrame shape: {df_no_outliers.shape}")

# Show the cleaned DataFrame
from IPython.display import display
display(df_no_outliers.head())


Original DataFrame shape: (400, 25)
Cleaned DataFrame shape: (313, 25)


Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,80.0,1.02,1.0,0.0,0,0,0,0,121.0,36.0,1.2,138.0,4.4,15.4,44.0,7800.0,5.2,1,1,0,1,0,0,1
1,7.0,50.0,1.02,4.0,0.0,0,0,0,0,121.0,18.0,0.8,138.0,4.4,11.3,38.0,6000.0,4.8,0,0,0,1,0,0,1
3,48.0,70.0,1.005,4.0,0.0,0,1,1,0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,1,0,0,0,1,1,1
4,51.0,80.0,1.01,2.0,0.0,0,0,0,0,106.0,26.0,1.4,138.0,4.4,11.6,35.0,7300.0,4.6,0,0,0,1,0,0,1
5,60.0,90.0,1.015,3.0,0.0,0,0,0,0,74.0,25.0,1.1,142.0,3.2,12.2,39.0,7800.0,4.4,1,1,0,1,1,0,1


In [8]:
from sklearn.preprocessing import MinMaxScaler

# Identify binary columns (values are only 0 or 1)
binary_columns = [col for col in df.columns if df[col].nunique() == 2]

# Identify non-binary numeric columns (exclude binary columns)
non_binary_columns = [col for col in df.select_dtypes(include=['float64', 'int64']).columns if col not in binary_columns]

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Apply MinMaxScaler only to non-binary numeric columns
df[non_binary_columns] = scaler.fit_transform(df[non_binary_columns])

# Print the scaled dataset (non-binary columns only)
print("Scaled dataset (non-binary columns only):")
print(df.head())


Scaled dataset (non-binary columns only):
        age        bp    sg   al   su  rbc  ...  dm  cad  appet  pe  ane  class
0  0.522727  0.230769  0.75  0.2  0.0    0  ...   1    0      1   0    0      1
1  0.056818  0.000000  0.75  0.8  0.0    0  ...   0    0      1   0    0      1
2  0.681818  0.230769  0.25  0.4  0.6    0  ...   1    0      0   0    1      1
3  0.522727  0.153846  0.00  0.8  0.0    0  ...   0    0      0   1    1      1
4  0.556818  0.230769  0.25  0.4  0.0    0  ...   0    0      1   0    0      1

[5 rows x 25 columns]


In [9]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import numpy as np

# Example data (replace with your dataset)
X = np.random.rand(400, 24)  # Replace with your feature data
y = np.random.randint(2, size=400)  # Replace with your target labels

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the MLP Classifier
mlp = MLPClassifier(
    hidden_layer_sizes=(10,),  # One hidden layer with 10 neurons
    activation='relu',         # Activation function for the hidden layer
    solver='adam',             # Optimizer
    max_iter=500,              # Maximum number of iterations
    random_state=42
)

# Train the model
mlp.fit(X_train, y_train)

# Make predictions
y_pred = mlp.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[23 10]
 [22 25]]

Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.70      0.59        33
           1       0.71      0.53      0.61        47

    accuracy                           0.60        80
   macro avg       0.61      0.61      0.60        80
weighted avg       0.63      0.60      0.60        80



