Importing the Dependencies

In [87]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

Data Collection and Analysis

PIMA Diabetes Dataset

In [88]:
kidney_dataset = pd.read_csv('/content/kidney_disease.csv')

# Drop 'id' if present
columns_to_drop = ['id', 'rbc', 'pc', 'pcc', 'ba', 'rc', 'ane', 'cad', 'al']

# Drop these columns
kidney_dataset.drop(columns=columns_to_drop, inplace=True)

# Replace common missing values like '?' or empty strings with NaN
kidney_dataset.replace(['?', '\t?', ''], np.nan, inplace=True)

# Binary mappings
binary_mappings = {
    'rbc': {'normal': 1, 'abnormal': 0},
    'pc': {'normal': 1, 'abnormal': 0},
    'pcc': {'present': 1, 'notpresent': 0},
    'ba': {'present': 1, 'notpresent': 0},
    'htn': {'yes': 1, 'no': 0},
    'dm': {'yes': 1, 'no': 0},
    'cad': {'yes': 1, 'no': 0},
    'appet': {'good': 1, 'poor': 0},
    'pe': {'yes': 1, 'no': 0},
    'ane': {'yes': 1, 'no': 0},
    'classification': {'ckd': 1, 'notckd': 0}
}

# Apply binary mappings
for col, mapping in binary_mappings.items():
    if col in kidney_dataset.columns:
        kidney_dataset[col] = kidney_dataset[col].map(mapping)

# Separate numeric and categorical columns
num_cols = kidney_dataset.select_dtypes(include='number').columns
cat_cols = kidney_dataset.select_dtypes(include='object').columns

# Impute missing values in numeric columns
imputer_num = SimpleImputer(strategy='mean')
kidney_dataset[num_cols] = imputer_num.fit_transform(kidney_dataset[num_cols])

# Impute missing values in categorical columns
imputer_cat = SimpleImputer(strategy='most_frequent')
kidney_dataset[cat_cols] = imputer_cat.fit_transform(kidney_dataset[cat_cols])

# Convert numeric columns to integers (after imputation)
kidney_dataset[num_cols] = kidney_dataset[num_cols].round().astype(int)


In [89]:
# printing the first 5 rows of the dataset
kidney_dataset.head()

Unnamed: 0,age,bp,sg,su,bgr,bu,sc,sod,pot,hemo,pcv,wc,htn,dm,appet,pe,classification
0,48,80,1,0,121,36,1,138,5,15,44,7800,1,1,1,0,1
1,7,50,1,0,148,18,1,138,5,11,38,6000,0,0,1,0,1
2,62,80,1,3,423,53,2,138,5,10,31,7500,0,1,0,0,1
3,48,70,1,0,117,56,4,111,2,11,32,6700,1,0,0,1,1
4,51,80,1,0,106,26,1,138,5,12,35,7300,0,0,1,0,1


In [90]:
# number of rows and Columns in this dataset
kidney_dataset.shape

(400, 17)

In [91]:
# getting the statistical measures of the data
kidney_dataset.describe()

Unnamed: 0,age,bp,sg,su,bgr,bu,sc,sod,pot,hemo,htn,dm,appet,pe,classification
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,51.4725,76.455,1.0,0.395,148.0325,57.4075,3.0525,137.63,4.7475,12.5925,0.3675,0.335,0.795,0.19,0.625
std,16.975118,13.476536,0.0,1.040038,74.782635,49.285435,5.616369,9.224469,2.821106,2.720284,0.482728,0.472582,0.404207,0.392792,0.484729
min,2.0,50.0,1.0,0.0,22.0,2.0,0.0,4.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0
25%,42.0,70.0,1.0,0.0,101.0,27.0,1.0,135.0,4.0,11.0,0.0,0.0,1.0,0.0,0.0
50%,54.0,78.0,1.0,0.0,126.0,44.0,1.0,138.0,5.0,13.0,0.0,0.0,1.0,0.0,1.0
75%,64.0,80.0,1.0,0.0,150.0,61.75,3.0,141.0,5.0,15.0,1.0,1.0,1.0,0.0,1.0
max,90.0,180.0,1.0,5.0,490.0,391.0,76.0,163.0,47.0,18.0,1.0,1.0,1.0,1.0,1.0


In [92]:
kidney_dataset['classification'].value_counts()

Unnamed: 0_level_0,count
classification,Unnamed: 1_level_1
1,250
0,150


0 --> No Kidney Disease

1 --> Kidney Disease

In [93]:
kidney_dataset.groupby('classification').mean(numeric_only=True)

Unnamed: 0_level_0,age,bp,sg,su,bgr,bu,sc,sod,pot,hemo,htn,dm,appet,pe
classification,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,46.546667,71.413333,1.0,0.0,109.333333,33.766667,0.913333,141.606667,4.48,15.106667,0.0,0.0,1.0,0.0
1,54.428,79.48,1.0,0.632,171.252,71.592,4.336,135.244,4.908,11.084,0.588,0.536,0.672,0.304


In [94]:
# separating the data and labels
X = kidney_dataset.drop(columns = 'classification', axis=1)
Y = kidney_dataset['classification']

In [95]:
print(X)

     age  bp  sg  su  bgr  bu  sc  sod  pot  hemo pcv    wc  htn  dm  appet  \
0     48  80   1   0  121  36   1  138    5    15  44  7800    1   1      1   
1      7  50   1   0  148  18   1  138    5    11  38  6000    0   0      1   
2     62  80   1   3  423  53   2  138    5    10  31  7500    0   1      0   
3     48  70   1   0  117  56   4  111    2    11  32  6700    1   0      0   
4     51  80   1   0  106  26   1  138    5    12  35  7300    0   0      1   
..   ...  ..  ..  ..  ...  ..  ..  ...  ...   ...  ..   ...  ...  ..    ...   
395   55  80   1   0  140  49   0  150    5    16  47  6700    0   0      1   
396   42  70   1   0   75  31   1  141    4    16  54  7800    0   0      1   
397   12  80   1   0  100  26   1  137    4    16  49  6600    0   0      1   
398   17  60   1   0  114  50   1  135    5    14  51  7200    0   0      1   
399   58  80   1   0  131  18   1  141    4    16  53  6800    0   0      1   

     pe  
0     0  
1     0  
2     0  
3     1  
4

In [107]:
print(Y)

0      1
1      1
2      1
3      1
4      1
      ..
395    0
396    0
397    0
398    0
399    0
Name: classification, Length: 400, dtype: int64


Train Test Split

In [108]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify=Y, random_state=2)

In [109]:
print(X.shape, X_train.shape, X_test.shape)

(400, 16) (320, 16) (80, 16)


Training the Model

In [110]:
classifier = svm.SVC(kernel='linear')

In [111]:
#training the support vector Machine Classifier
classifier.fit(X_train, Y_train)

Model Evaluation

Accuracy Score

In [112]:
# accuracy score on the training data
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [113]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9625


In [114]:
# accuracy score on the test data
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [115]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9375


Making a Predictive System

In [119]:
# Correct input data to have the same number of features as the trained model (16 features)
input_data_reduced = [
    40,    # age
    80,    # bp (Blood Pressure)
    1.025, # sg (Specific Gravity)
    0,     # su (Sugar)
    140,   # bgr (Blood Glucose Random)
    10,    # bu (Blood Urea)
    1.2,   # sc (Serum Creatinine)
    135,   # sod (Sodium)
    5,     # pot (Potassium)
    15,    # hemo (Hemoglobin)
    48,    # pcv (Packed Cell Volume)
    10400, # wc (White Blood Cell Count)
    0,     # htn (no -> 0)
    0,     # dm (no -> 0)
    1,     # appet (good -> 1)
    0      # pe (no -> 0)
]

# Convert the input data to numpy array
input_data_as_numpy_array = np.asarray(input_data_reduced)

# Reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Make the prediction using the classifier
prediction = classifier.predict(input_data_reshaped)

# Display the prediction
if prediction[0] == 0:
    print('The person is not having kidney disease')
else:
    print('The person is having kidney disease')


The person is not having kidney disease




Saving the trained model

In [120]:
import pickle

In [121]:
filename = 'kidney_model.sav'
pickle.dump(classifier, open(filename, 'wb'))

In [122]:
# loading the saved model
loaded_model = pickle.load(open('kidney_model.sav', 'rb'))

In [125]:
# Correct input data to have the same number of features as the trained model (16 features)
input_data = [
    40,    # age
    80,    # bp (Blood Pressure)
    1.025, # sg (Specific Gravity)
    0,     # su (Sugar)
    140,   # bgr (Blood Glucose Random)
    10,    # bu (Blood Urea)
    1.2,   # sc (Serum Creatinine)
    135,   # sod (Sodium)
    5,     # pot (Potassium)
    15,    # hemo (Hemoglobin)
    48,    # pcv (Packed Cell Volume)
    10400, # wc (White Blood Cell Count)
    0,     # htn (no -> 0)
    0,     # dm (no -> 0)
    1,     # appet (good -> 1)
    0      # pe (no -> 0)
]

# changing the input_data_reduced to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Prediction using the loaded model
prediction = loaded_model.predict(input_data_reshaped)

# Output the prediction
print(prediction)

if prediction[0] == 0:
    print('The person is not having kidney disease')
else:
    print('The person is having kidney disease')


[0]
The person is not having kidney disease




In [126]:
for column in X.columns:
  print(column)

age
bp
sg
su
bgr
bu
sc
sod
pot
hemo
pcv
wc
htn
dm
appet
pe
