In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [69]:
df = pd.read_csv("kidney_disease.csv")[["age","wc","htn","dm","classification"]]
df.head()

Unnamed: 0,age,wc,htn,dm,classification
0,48.0,7800,yes,yes,ckd
1,7.0,6000,no,no,ckd
2,62.0,7500,no,yes,ckd
3,48.0,6700,yes,no,ckd
4,51.0,7300,no,no,ckd


In [70]:
df.isnull().sum()

age                 9
wc                105
htn                 2
dm                  2
classification      0
dtype: int64

#### missing data

In [71]:
from sklearn.impute import SimpleImputer
imp_mode = SimpleImputer(missing_values=np.nan,strategy="most_frequent") # just for deployment i am taking this
df_imputed = pd.DataFrame(imp_mode.fit_transform(df))
df_imputed.columns = df.columns
df_imputed

Unnamed: 0,age,wc,htn,dm,classification
0,48.0,7800,yes,yes,ckd
1,7.0,6000,no,no,ckd
2,62.0,7500,no,yes,ckd
3,48.0,6700,yes,no,ckd
4,51.0,7300,no,no,ckd
...,...,...,...,...,...
395,55.0,6700,no,no,notckd
396,42.0,7800,no,no,notckd
397,12.0,6600,no,no,notckd
398,17.0,7200,no,no,notckd


In [72]:
df_imputed.isnull().sum()

age               0
wc                0
htn               0
dm                0
classification    0
dtype: int64

### Lets check for any anomalies

In [73]:
for i in df_imputed.columns:
    print(f"-------------------------------- {i} ----------------------------")
    print(set(df_imputed[i].tolist()))
    print()

-------------------------------- age ----------------------------
{2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 14.0, 15.0, 17.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 90.0}

-------------------------------- wc ----------------------------
{'4700', '9000', '7100', '11300', '6300', '8500', '7300', '16300', '9800', '9200', '10800', '7200', '12200', '10500', '14900', '9600', '8000', '6000', '5900', '9700', '5000', '12700', '2200', '8600', '16700', '11400', '12300', '5500', '8200', '5200', '3800', '6700', '10400', '\t6200', '9900', '5600', '10700', '19100', '6400', '11800', '\t?', '5300', '9400', '11200', '7900', '7700', '5700', '4200', '15200', '14600',

#### we can see wc is not a number 

In [74]:
df_imputed['wc'].astype('int64')

ValueError: invalid literal for int() with base 10: '\t?'

#### first we need to replace \t ones

In [75]:
for i in df_imputed.columns:
    print(f"-------------------------------- {i} ----------------------------")
    print(set(df_imputed[i].tolist()))
    print()

-------------------------------- age ----------------------------
{2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 14.0, 15.0, 17.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 90.0}

-------------------------------- wc ----------------------------
{'4700', '9000', '7100', '11300', '6300', '8500', '7300', '16300', '9800', '9200', '10800', '7200', '12200', '10500', '14900', '9600', '8000', '6000', '5900', '9700', '5000', '12700', '2200', '8600', '16700', '11400', '12300', '5500', '8200', '5200', '3800', '6700', '10400', '\t6200', '9900', '5600', '10700', '19100', '6400', '11800', '\t?', '5300', '9400', '11200', '7900', '7700', '5700', '4200', '15200', '14600',

In [76]:
df_imputed['wc']=df_imputed['wc'].apply(lambda x: '6200' if x=='\t6200' else x)
df_imputed['wc']=df_imputed['wc'].apply(lambda x: '8400' if x=='\t8400' else x)
df_imputed['wc']=df_imputed['wc'].apply(lambda x: '9800' if x=='\t?' else x)

In [77]:
df_imputed['dm']=df_imputed['dm'].apply(lambda x: 'no' if x=='\tno' else x)
df_imputed['dm']=df_imputed['dm'].apply(lambda y: 'yes' if y=='\tyes' else y)
df_imputed['dm']=df_imputed['dm'].apply(lambda z: z.lstrip() if isinstance(z,str) else z) # doing lstrip for all except float
# we can directly do this also for y like df['dm']=df['dm'].apply(lambda z: 'yes' if z==' yes' else z) but just for showing that what issue can come in lstrip as it take values as float in case of NaN

In [78]:
df_imputed['classification'] = df['classification'].replace("ckd\t","ckd")

In [79]:
df_imputed['classification'].unique()

array(['ckd', 'notckd'], dtype=object)

In [80]:
for i in df_imputed.columns:
    print(f"-------------------------------- {i} ----------------------------")
    print(set(df_imputed[i].tolist()))
    print()

-------------------------------- age ----------------------------
{2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 11.0, 12.0, 14.0, 15.0, 17.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 90.0}

-------------------------------- wc ----------------------------
{'4700', '9000', '7100', '11300', '6300', '8500', '7300', '16300', '9800', '9200', '10800', '7200', '12200', '10500', '14900', '9600', '8000', '6000', '5900', '9700', '5000', '12700', '2200', '8600', '16700', '11400', '12300', '5500', '8200', '5200', '3800', '6700', '10400', '9900', '5600', '10700', '19100', '6400', '11800', '5300', '9400', '11200', '7900', '7700', '5700', '4200', '15200', '14600', '7400', '5400', 

In [81]:
df_imputed['wc'] = df_imputed['wc'].astype('int64')

#### Encoding the categorical variables

In [82]:
from sklearn.preprocessing import LabelEncoder

In [83]:
le = LabelEncoder()

In [84]:
df_enco = df_imputed.apply(le.fit_transform)
df_enco

Unnamed: 0,age,wc,htn,dm,classification
0,40,35,1,1,0
1,5,19,0,0,0
2,54,33,0,1,0
3,40,25,1,0,0
4,43,31,0,0,0
...,...,...,...,...,...
395,47,25,0,0,1
396,34,35,0,0,1
397,8,24,0,0,1
398,11,30,0,0,1


In [85]:
x = df_enco.drop(['classification'],axis=1)
y = df_enco['classification']

### Feature Scaling 

In [86]:
from sklearn.preprocessing import MinMaxScaler

In [87]:
scale = MinMaxScaler((-1,1))

In [88]:
x = scale.fit_transform(x)
x

array([[ 0.06666667, -0.20454545,  1.        ,  1.        ],
       [-0.86666667, -0.56818182, -1.        , -1.        ],
       [ 0.44      , -0.25      , -1.        ,  1.        ],
       ...,
       [-0.78666667, -0.45454545, -1.        , -1.        ],
       [-0.70666667, -0.31818182, -1.        , -1.        ],
       [ 0.33333333, -0.40909091, -1.        , -1.        ]])

### Train Test Split

In [89]:
from sklearn.model_selection import train_test_split

In [90]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)

In [91]:
x_train

array([[ 0.36      , -0.47727273, -1.        , -1.        ],
       [ 0.81333333,  0.45454545,  1.        ,  1.        ],
       [ 0.65333333, -0.11363636,  1.        ,  1.        ],
       ...,
       [ 0.01333333,  0.20454545, -1.        , -1.        ],
       [ 0.65333333,  0.20454545, -1.        ,  1.        ],
       [ 0.17333333,  0.15909091,  1.        ,  1.        ]])

In [92]:
x_test

array([[ 0.92      ,  0.11363636,  1.        ,  1.        ],
       [ 0.41333333,  0.06818182,  1.        ,  1.        ],
       [ 0.30666667, -0.45454545, -1.        , -1.        ],
       [ 0.25333333, -0.43181818, -1.        , -1.        ],
       [-0.52      , -0.36363636, -1.        , -1.        ],
       [ 0.28      , -0.70454545,  1.        ,  1.        ],
       [ 0.2       , -0.88636364, -1.        , -1.        ],
       [ 0.65333333, -0.61363636,  1.        ,  1.        ],
       [-0.38666667,  0.11363636, -1.        , -1.        ],
       [-0.70666667, -0.36363636, -1.        , -1.        ],
       [ 0.28      , -0.5       ,  1.        , -1.        ],
       [ 0.78666667,  0.20454545,  1.        , -1.        ],
       [-0.73333333, -0.54545455, -1.        , -1.        ],
       [ 0.6       ,  0.40909091,  1.        ,  1.        ],
       [ 0.04      , -0.36363636, -1.        , -1.        ],
       [ 0.81333333,  0.20454545,  1.        ,  1.        ],
       [-0.97333333,  0.

In [93]:
y_train

380    1
56     0
126    0
371    1
333    1
      ..
154    0
274    1
192    0
85     0
8      0
Name: classification, Length: 320, dtype: int32

In [94]:
y_test

160    0
20     0
392    1
303    1
339    1
      ..
235    0
369    1
258    1
157    0
207    0
Name: classification, Length: 80, dtype: int32

In [95]:
pd.DataFrame(y_train).value_counts()

classification
0                 200
1                 120
dtype: int64

In [96]:
pd.DataFrame(y_test).value_counts()

classification
0                 50
1                 30
dtype: int64

#### model

In [97]:
from sklearn.ensemble import RandomForestClassifier

In [98]:
model = RandomForestClassifier(max_depth=4,random_state=10)
model.fit(x_train,y_train)

In [99]:
from sklearn.metrics import accuracy_score
pred_cv = model.predict(x_test)
accuracy_score(y_test,pred_cv)

0.8125

In [100]:
accuracy_score

<function sklearn.metrics._classification.accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None)>

## Deployment Part

## saving the model

In [108]:
import pickle
# we use pickle_out when saving a model and pickle_in when we use it in .py
# Save the model
with open("classifier.pkl", "wb") as pickle_out:
    pickle.dump(model, pickle_out) # serialization and store trained model

In [109]:
df_enco.columns

Index(['age', 'wc', 'htn', 'dm', 'classification'], dtype='object')

### Start of coding for deployment

##### When we do %%writfile app.py , everything that we did , including creating model etc, gets saved in app.py

In [111]:
%%writefile app.py
import streamlit as st
import pickle

# Load the trained model
with open('classifier.pkl', 'rb') as pickle_in:
    classifier = pickle.load(pickle_in)

# Prediction function
def prediction(age, wc, htn, dm):
    htn = 0 if htn == 'no' else 1
    dm = 0 if dm == 'no' else 1

    pred = classifier.predict([[age, wc, htn, dm]])
    return '✅ Kidney disease not detected' if pred[0] == 0 else '⚠️ Kidney disease found'

# Streamlit UI
st.title("Chronic Kidney Disease Classifier")

age = st.number_input("Age", min_value=1, max_value=120)
wc = st.number_input("White Blood Cell Count (cells/cumm)", min_value=3000, max_value=18000)
htn = st.selectbox("Hypertension", ['yes', 'no'])
dm = st.selectbox("Diabetes Mellitus", ['yes', 'no'])

if st.button("Predict"):
    result = prediction(age, wc, htn, dm)
    st.success(result)


Overwriting app.py


#### We use VS code because python file we cannot deploy in jupyter notebook , streamlit works with .py format only

## Run in terminal of VSCODE

### streamlit run app.py