# GET THE DATASET

- Scraped the dataset that later found that was in a pdf form which had to be converted into a csv file that will later be easier to work on . The Dataset gives a list of all the Hospitals that offer dialysis to people and patients with Renal issues and dialysis procedures . The list has combined all institutions that support government medical help formerly known as NHIF . 

In [1]:
# Import the necessary libraries 
import pdfplumber
import pandas as pd

with pdfplumber.open("Dialysis-Facilities-Comprehensive.pdf") as pdf:
    tables = []
    standardized_columns = None  # Placeholder for consistent column names

    for page in pdf.pages:
        table = page.extract_table()
        if table:
            # Convert table to DataFrame
            df = pd.DataFrame(table[1:], columns=[f"Column_{i}" if col is None or col == "" else col for i, col in enumerate(table[0])])

            # Standardize columns across pages
            if standardized_columns is None:
                standardized_columns = df.columns  # Set the first page’s columns as standard
            else:
                df.columns = standardized_columns  # Rename to match standard columns

            tables.append(df.reset_index(drop=True))  # Reset index to avoid conflicts

# Concatenate all tables
if tables:
    renal_df = pd.concat(tables, ignore_index=True, axis=0)
    print(renal_df.head())

# Save to CSV if needed
renal_df.to_csv("Hospitals.csv", index=False)


                           Column_0     Column_1            Column_2  \
0  COMPREHENSIVE DIALYSIS HOSPITALS         None                None   
1                                           None                None   
2                            COUNTY  NHIF OFFICE  NHIF HOSPITAL CODE   
3                             BOMET        BOMET                4718   
4                             BOMET        BOMET                4119   

                           Column_3  
0                              None  
1                              None  
2                     HOSPITAL NAME  
3  LONGISA COUNTY REFERRAL HOSPITAL  
4     TENWEK HOSPITAL BOMET (SOTIK)  


In [2]:
# Load the CSV file
df = pd.read_csv("Hospitals.csv")
df.head()


Unnamed: 0,Column_0,Column_1,Column_2,Column_3
0,COMPREHENSIVE DIALYSIS HOSPITALS,,,
1,,,,
2,COUNTY,NHIF OFFICE,NHIF HOSPITAL CODE,HOSPITAL NAME
3,BOMET,BOMET,4718,LONGISA COUNTY REFERRAL HOSPITAL
4,BOMET,BOMET,4119,TENWEK HOSPITAL BOMET (SOTIK)


- The Data is correct however not correctly labelled and with the proper columns .Therefore it needs correction and cleaning

In [3]:
import pandas as pd

# Load your DataFrame (after converting from PDF or reading from CSV)
df = pd.read_csv("Hospitals.csv")

# Drop rows that contain "COMPREHENSIVE DIALYSIS HOSPITALS" in the first column
df = df[df['Column_0'] != "COMPREHENSIVE DIALYSIS HOSPITALS"]

# Drop rows where all entries are NaN
df = df.dropna(how="all")

df.reset_index(drop=True, inplace=True)
df = df.drop(index=0)

# Rename specific columns
df = df.rename(columns={"Column_0": "COUNTY", "Column_1": "NHIF OFFICE","Column_2": "NHIF HOSPITAL CODE", "Column_3": "HOSPITAL NAME" })

# Display the cleaned DataFrame
df.head()



Unnamed: 0,COUNTY,NHIF OFFICE,NHIF HOSPITAL CODE,HOSPITAL NAME
1,BOMET,BOMET,4718,LONGISA COUNTY REFERRAL HOSPITAL
2,BOMET,BOMET,4119,TENWEK HOSPITAL BOMET (SOTIK)
3,BUNGOMA,BUNGOMA,4440082,BUNGOMA DISTRICT HOSPITAL
4,NAIROBI,B U R U B U R U,8 0 0 0 1 4 2,E D E L V A L E T R U S T J A M A A H $ M H O ...
5,BUSIA,BUSIA,4446758,BUSIA DISTRICT HOSPITAL


# ANALYSE THE KIDNEY DISEASE DATASET  

In [4]:
# Classic Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Advanced Visualization Libraries
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected = True) #enables plotly plots to be displayed in notebook
cmap1 = "gist_gray"

#Models
from lightgbm import LGBMClassifier
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB      

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

#Metrics, Preprocessing and Tuning Tools
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
import missingno as msno
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

#Customization
import warnings
warnings.filterwarnings("ignore")
from termcolor import colored

In [5]:
# Reading the dataset
df = pd.read_csv('kidney_disease.csv')
df

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [6]:
# see the first couple of observations and transpose 10 observations
# think of it as rolling over your dataset
df.head(10).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
id,0,1,2,3,4,5,6,7,8,9
age,48.0,7.0,62.0,48.0,51.0,60.0,68.0,24.0,52.0,53.0
bp,80.0,50.0,80.0,70.0,80.0,90.0,70.0,,100.0,90.0
sg,1.02,1.02,1.01,1.005,1.01,1.015,1.01,1.015,1.015,1.02
al,1.0,4.0,2.0,4.0,2.0,3.0,0.0,2.0,3.0,2.0
su,0.0,0.0,3.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
rbc,,,normal,normal,normal,,,normal,normal,abnormal
pc,normal,normal,normal,abnormal,normal,,normal,abnormal,abnormal,abnormal
pcc,notpresent,notpresent,notpresent,present,notpresent,notpresent,notpresent,notpresent,present,present
ba,notpresent,notpresent,notpresent,notpresent,notpresent,notpresent,notpresent,notpresent,notpresent,notpresent


In [7]:
# see a concise summary of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

# DATA DICTIONARY

1. age - age
2. bp - blood pressure
3. sg - specific gravity
4. al - albumin
5. su - sugar
6. rbc - red blood cells
7. pc - pus cell
8. pcc - pus cell clumps
9. ba - bacteria
10. bgr - blood glucose random
11. bu - blood urea
12. sc - serum creatinine
13. sod - sodium
14. pot - potassium
15. hemo - hemoglobin
16. pcv - packed cell volume
17. wc - white blood cell count
18. rc - red blood cell count
19. htn - hypertension
20. dm - diabetes mellitus
21. cad - coronary artery disease
22. appet - appetite
23. pe - pedal edema
24. ane - anemia
25. classification - class

In [8]:
df.columns = ['id','age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar', 'red_blood_cells', 'pus_cell',
              'pus_cell_clumps', 'bacteria', 'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium',
              'potassium', 'haemoglobin', 'packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count',
              'hypertension', 'diabetes_mellitus', 'coronary_artery_disease', 'appetite', 'peda_edema',
              'anemia', 'class']
df.head(5)

Unnamed: 0,id,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,...,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,anemia,class
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [9]:
# display summary statistics of each column
# this helps me confirm my assertion on missing data
df.describe(include="all").transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
id,400.0,,,,199.5,115.614301,0.0,99.75,199.5,299.25,399.0
age,391.0,,,,51.483376,17.169714,2.0,42.0,55.0,64.5,90.0
blood_pressure,388.0,,,,76.469072,13.683637,50.0,70.0,80.0,80.0,180.0
specific_gravity,353.0,,,,1.017408,0.005717,1.005,1.01,1.02,1.02,1.025
albumin,354.0,,,,1.016949,1.352679,0.0,0.0,0.0,2.0,5.0
sugar,351.0,,,,0.450142,1.099191,0.0,0.0,0.0,0.0,5.0
red_blood_cells,248.0,2.0,normal,201.0,,,,,,,
pus_cell,335.0,2.0,normal,259.0,,,,,,,
pus_cell_clumps,396.0,2.0,notpresent,354.0,,,,,,,
bacteria,396.0,2.0,notpresent,374.0,,,,,,,


In [10]:
# looking for the number of missing observations 
# In the code below a boolean is being tried on each observation asking if the observation is missing or not
# then add all instances of NaN(Not a number) 
missing_values = df.isnull().sum()

# calculating the percentage of missing values in the dataframe
# simply taking the sum of the values we got above dividing by the no of observations in the df
# you could use len(df) instead df.index.size
missing_count_pct = ((missing_values / df.index.size) * 100)

# see how many observations are missing
print(missing_count_pct)

id                          0.00
age                         2.25
blood_pressure              3.00
specific_gravity           11.75
albumin                    11.50
sugar                      12.25
red_blood_cells            38.00
pus_cell                   16.25
pus_cell_clumps             1.00
bacteria                    1.00
blood_glucose_random       11.00
blood_urea                  4.75
serum_creatinine            4.25
sodium                     21.75
potassium                  22.00
haemoglobin                13.00
packed_cell_volume         17.50
white_blood_cell_count     26.25
red_blood_cell_count       32.50
hypertension                0.50
diabetes_mellitus           0.50
coronary_artery_disease     0.50
appetite                    0.25
peda_edema                  0.25
anemia                      0.25
class                       0.00
dtype: float64


In [11]:
df.value_counts()

id   age   blood_pressure  specific_gravity  albumin  sugar  red_blood_cells  pus_cell  pus_cell_clumps  bacteria    blood_glucose_random  blood_urea  serum_creatinine  sodium  potassium  haemoglobin  packed_cell_volume  white_blood_cell_count  red_blood_cell_count  hypertension  diabetes_mellitus  coronary_artery_disease  appetite  peda_edema  anemia  class 
3    48.0  70.0            1.005             4.0      0.0    normal           abnormal  present          notpresent  117.0                 56.0        3.8               111.0   2.5        11.2         32                  6700                    3.9                   yes           no                 no                       poor      yes         yes     ckd       1
343  37.0  60.0            1.025             0.0      0.0    normal           normal    notpresent       notpresent  111.0                 35.0        0.8               135.0   4.1        16.2         50                  5500                    5.7                   no  

In [12]:
# EXTRACTING CATEGORICAL AND NUMERICAL COLUMNS DATA 

cat_cols = [col for col in df.columns if df[col].dtype == 'object']
num_cols = [col for col in df.columns if df[col].dtype != 'object']

In [13]:
# Looking At unique values in Categorical Columns 

for col in cat_cols:
 print(f"{col} has {df[col].unique()} values\n")

red_blood_cells has [nan 'normal' 'abnormal'] values

pus_cell has ['normal' 'abnormal' nan] values

pus_cell_clumps has ['notpresent' 'present' nan] values

bacteria has ['notpresent' 'present' nan] values

packed_cell_volume has ['44' '38' '31' '32' '35' '39' '36' '33' '29' '28' nan '16' '24' '37' '30'
 '34' '40' '45' '27' '48' '\t?' '52' '14' '22' '18' '42' '17' '46' '23'
 '19' '25' '41' '26' '15' '21' '43' '20' '\t43' '47' '9' '49' '50' '53'
 '51' '54'] values

white_blood_cell_count has ['7800' '6000' '7500' '6700' '7300' nan '6900' '9600' '12100' '4500'
 '12200' '11000' '3800' '11400' '5300' '9200' '6200' '8300' '8400' '10300'
 '9800' '9100' '7900' '6400' '8600' '18900' '21600' '4300' '8500' '11300'
 '7200' '7700' '14600' '6300' '\t6200' '7100' '11800' '9400' '5500' '5800'
 '13200' '12500' '5600' '7000' '11900' '10400' '10700' '12700' '6800'
 '6500' '13600' '10200' '9000' '14900' '8200' '15200' '5000' '16300'
 '12400' '\t8400' '10500' '4200' '4700' '10900' '8100' '9500' '2200'
 '

In [14]:
#MIISING VALUES
df.isnull().sum().sort_values(ascending = False)

red_blood_cells            152
red_blood_cell_count       130
white_blood_cell_count     105
potassium                   88
sodium                      87
packed_cell_volume          70
pus_cell                    65
haemoglobin                 52
sugar                       49
specific_gravity            47
albumin                     46
blood_glucose_random        44
blood_urea                  19
serum_creatinine            17
blood_pressure              12
age                          9
bacteria                     4
pus_cell_clumps              4
hypertension                 2
diabetes_mellitus            2
coronary_artery_disease      2
anemia                       1
appetite                     1
peda_edema                   1
id                           0
class                        0
dtype: int64

In [15]:
#Removing Ambiguous Values In Columns

# replace incorrect values
df['diabetes_mellitus'].replace(to_replace = {'\tno':'no','\tyes':'yes',' yes':'yes'},inplace=True)

df['coronary_artery_disease'] = df['coronary_artery_disease'].replace(to_replace = '\tno', value='no')


In [16]:
df['class'] = df['class'].map({'ckd': 0, 'not ckd': 1})
df['class'] = pd.to_numeric(df['class'], errors='coerce')

- This way we can change the classification column from object variables to numerical variables so as to easily work with Machine Learning Algorithms for the predictive model. In this case:

          *ckd(Chronic Kidney Disease)= 0 
          *not ckd(Not Having Chronic Kidney Disease)=1

In [17]:
cols = ['diabetes_mellitus', 'coronary_artery_disease', 'class']

for col in cols:
    print(f"{col} has {df[col].unique()} values\n")

diabetes_mellitus has ['yes' 'no' nan] values

coronary_artery_disease has ['no' 'yes' nan] values

class has [ 0. nan] values



# VISUALISATION 

In [18]:
fig = px.bar(df, x="red_blood_cells", y="red_blood_cell_count",color='class', barmode='group',height=400)
fig.show()

In [19]:
px.bar(df, x="specific_gravity", y="packed_cell_volume", color='class', barmode='group', template = 'plotly_dark', height = 400)

In [20]:
def outlier_thresholds(dataframe, col_name, q1 = 0.25, q3 = 0.75):
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit

def replace_with_thresholds(dataframe, variable, q1 = 0.25, q3 = 0.75):
    low_limit, up_limit = outlier_thresholds(dataframe, variable, q1 = q1, q3 = q3)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit
    
for col in df[num_cols].columns:
    replace_with_thresholds(df, col, q1 = 0.1, q3 = 0.9)
    
def cat_var_summary(df, cat_var):
    colors = ['#a2b9bc', '#6b5b95', '#b2ad7f', '#feb236', '#b5e7a0', '#878f99',
              '#d64161', '#86af49', '#ff7b25']
    
    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=('Countplot', 'Percentages'),
                        specs=[[{"type": "xy"}, {'type': 'domain'}]])
    
    x = [str(i) for i in df[cat_var].value_counts().index]
    y = df[cat_var].value_counts().values.tolist()
    
    fig.add_trace(go.Bar(x = x, y = y, text = y, 
                         textposition = "auto",
                       showlegend = False,
                        marker=dict(color=colors,
                              line = dict(color = 'black',
                                          width = 2))), row=1, col=1)
    
    fig.add_trace(go.Pie(labels = df[cat_var].value_counts().keys(),
                         values = df[cat_var].value_counts().values, 
                         hoverinfo ='label',
                  textinfo ='percent',
                  textfont_size = 20,
                  textposition ='auto',
                  marker=dict(colors=colors,
                              line = dict(color = 'black',
                                          width = 2))), row=1, col=2)

    
    fig.update_layout(title={'text': cat_var,
                         'y':0.9,
                         'x':0.5,
                         'xanchor': 'center',
                         'yanchor': 'top'},
                  template='plotly_dark')
    
    iplot(fig)    

# DATA PROCESSING

In [21]:
df[num_cols].isnull().sum()

id                       0
age                      9
blood_pressure          12
specific_gravity        47
albumin                 46
sugar                   49
blood_glucose_random    44
blood_urea              19
serum_creatinine        17
sodium                  87
potassium               88
haemoglobin             52
dtype: int64

In [22]:
df[cat_cols].isnull().sum()

red_blood_cells            152
pus_cell                    65
pus_cell_clumps              4
bacteria                     4
packed_cell_volume          70
white_blood_cell_count     105
red_blood_cell_count       130
hypertension                 2
diabetes_mellitus            2
coronary_artery_disease      2
appetite                     1
peda_edema                   1
anemia                       1
class                      152
dtype: int64

In [23]:
df[num_cols].isnull().sum()

id                       0
age                      9
blood_pressure          12
specific_gravity        47
albumin                 46
sugar                   49
blood_glucose_random    44
blood_urea              19
serum_creatinine        17
sodium                  87
potassium               88
haemoglobin             52
dtype: int64

In [24]:
def random_value_imputation(feature):
    random_sample = df[feature].dropna().sample(df[feature].isna().sum())
    random_sample.index = df[df[feature].isnull()].index
    df.loc[df[feature].isnull(), feature] = random_sample
    
def impute_mode(feature):
    mode = df[feature].mode()[0]
    df[feature] = df[feature].fillna(mode)

In [25]:
df[num_cols].isnull().sum()

id                       0
age                      9
blood_pressure          12
specific_gravity        47
albumin                 46
sugar                   49
blood_glucose_random    44
blood_urea              19
serum_creatinine        17
sodium                  87
potassium               88
haemoglobin             52
dtype: int64

In [26]:
random_value_imputation('red_blood_cells')
random_value_imputation('pus_cell')

for col in cat_cols:
    impute_mode(col)

In [27]:
df[cat_cols].isnull().sum()

red_blood_cells            0
pus_cell                   0
pus_cell_clumps            0
bacteria                   0
packed_cell_volume         0
white_blood_cell_count     0
red_blood_cell_count       0
hypertension               0
diabetes_mellitus          0
coronary_artery_disease    0
appetite                   0
peda_edema                 0
anemia                     0
class                      0
dtype: int64

In [28]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for col in cat_cols:
    df[col] = le.fit_transform(df[col])
    
    
    
df.head()    
    

Unnamed: 0,id,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,...,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,peda_edema,anemia,class
0,0.0,48.0,80.0,1.02,1.0,0.0,1,1,0,0,...,32,72,34,1,1,0,0,0,0,0
1,1.0,7.0,50.0,1.02,4.0,0.0,1,1,0,0,...,26,56,34,0,0,0,0,0,0,0
2,2.0,62.0,80.0,1.01,2.0,3.0,1,1,0,0,...,19,70,34,0,1,0,1,0,1,0
3,3.0,48.0,70.0,1.005,4.0,0.0,1,0,1,0,...,20,62,19,1,0,0,1,1,1,0
4,4.0,51.0,80.0,1.01,2.0,0.0,1,1,0,0,...,23,68,27,0,0,0,0,0,0,0


In [31]:
num_cols


['id',
 'age',
 'blood_pressure',
 'specific_gravity',
 'albumin',
 'sugar',
 'blood_glucose_random',
 'blood_urea',
 'serum_creatinine',
 'sodium',
 'potassium',
 'haemoglobin']

# MODEL BUILDING

In [33]:
ind_col = [col for col in num_cols if col != 'class']
dep_col = 'class'


X = df[ind_col]
y = df[dep_col]


In [34]:

from sklearn.model_selection import train_test_split

# Split data while ensuring class balance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1243)


In [41]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")  # Choose "median" or "most_frequent" as needed
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)


# LOGISTIC REGRESSION

In [45]:
log=LogisticRegression(class_weight="balanced", random_state=1243, max_iter=1000)
log.fit(X_train, y_train) 
log.score(X_test, y_test)

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [40]:
from sklearn.ensemble import HistGradientBoostingClassifier

model = HistGradientBoostingClassifier(random_state=1243)
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
print("Test score:", score)


Test score: 1.0


In [47]:
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Pipeline with imputer, SMOTE, and logistic regression
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ('smote', SMOTE(random_state=1243)),
    ('classifier', LogisticRegression(class_weight="balanced", random_state=1243, max_iter=1000))
])

pipeline.fit(X_train, y_train)
score = pipeline.score(X_test, y_test)
print("Test score:", score)


TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'SMOTE(random_state=1243)' (type <class 'imblearn.over_sampling._smote.base.SMOTE'>) doesn't

In [48]:
log=LogisticRegression(class_weight="balanced", random_state=1243, max_iter=1000)
log.fit(X_train, y_train) 
log.score(X_test, y_test) 

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Step 1: Check class distribution
print("Class distribution before balancing:", y.value_counts())

# Step 2: Create train-test split with stratification to ensure class balance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1243)

# Step 3: Define a pipeline with imputation, SMOTE, and logistic regression
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),       # Fill missing values
    ('smote', SMOTE(random_state=1243)),               # Balance classes in y_train
    ('classifier', LogisticRegression(class_weight="balanced", random_state=1243, max_iter=1000))
])

# Step 4: Fit the pipeline and evaluate
pipeline.fit(X_train, y_train)
score = pipeline.score(X_test, y_test)
print("Test score:", score)



Class distribution before balancing: class
0    400
Name: count, dtype: int64


TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'SMOTE(random_state=1243)' (type <class 'imblearn.over_sampling._smote.base.SMOTE'>) doesn't

In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
import pandas as pd

# Split data with stratification to retain class balance
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1243)

# Step 1: Impute missing values
imputer = SimpleImputer(strategy="mean")
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Step 2: Apply SMOTE to balance the classes in the training set
smote = SMOTE(random_state=1243)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Step 3: Train Logistic Regression on the resampled data
log = LogisticRegression(class_weight="balanced", random_state=1243, max_iter=1000)
log.fit(X_train_resampled, y_train_resampled)

# Step 4: Evaluate the model on the test set
score = log.score(X_test, y_test)
print("Test score:", score)


ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead