In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [2]:
file_link = "https://drive.google.com/file/d/1cEHJ7HK8LLNARzJmF3Y1mTU6cPMuRW-I/view?usp=drive_link"
id = file_link.split("/")[-2]
new_link = f'https://drive.google.com/uc?id={id}'
df_20k = pd.read_csv(new_link)

In [3]:
file_link = "https://drive.google.com/file/d/15L8bYRdFb9Hf2oWz3GbMXVTU7Qx3Vjgi/view?usp=drive_link"
id = file_link.split("/")[-2]
new_link = f'https://drive.google.com/uc?id={id}'
df_400 = pd.read_csv(new_link)

In [4]:
merged = pd.concat([df_20k, df_400])

In [5]:
merged['classification'].value_counts()

Unnamed: 0_level_0,count
classification,Unnamed: 1_level_1
0,16582
1,4356


In [6]:
merged.head()

Unnamed: 0,age,blood pressure,specific gravity,albumin,sugar,red blood cells,pus cell,pus cell clumps,bacteria,blood glucose random,...,packed cell volume,white blood cell count,red blood cell count,hypertension,diabetes mellitus,coronary artery disease,appetite,pedal edema,anemia,classification
0,54.0,167.0,1.023,1.0,4.0,0.0,1.0,0.0,0.0,96.0,...,35.0,5791.0,5.6,1.0,1.0,0.0,0.0,0.0,0.0,0
1,42.0,127.0,1.023,3.0,2.0,0.0,0.0,0.0,1.0,73.0,...,25.0,5390.0,4.6,0.0,1.0,0.0,0.0,1.0,1.0,1
2,38.0,148.0,1.016,0.0,0.0,1.0,0.0,0.0,0.0,77.0,...,46.0,12098.0,4.7,0.0,0.0,1.0,0.0,1.0,0.0,0
3,7.0,98.0,1.017,4.0,0.0,1.0,0.0,0.0,1.0,225.0,...,24.0,6747.0,4.8,0.0,0.0,1.0,0.0,0.0,1.0,0
4,67.0,174.0,1.015,1.0,1.0,0.0,1.0,0.0,0.0,376.0,...,46.0,5759.0,5.7,0.0,0.0,0.0,0.0,1.0,1.0,0


In [7]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20938 entries, 0 to 399
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      20929 non-null  float64
 1   blood pressure           20926 non-null  float64
 2   specific gravity         20891 non-null  float64
 3   albumin                  20892 non-null  float64
 4   sugar                    20889 non-null  float64
 5   red blood cells          20786 non-null  float64
 6   pus cell                 20873 non-null  float64
 7   pus cell clumps          20934 non-null  float64
 8   bacteria                 20934 non-null  float64
 9   blood glucose random     20894 non-null  float64
 10  blood urea               20919 non-null  float64
 11  serum creatinine         20921 non-null  float64
 12  sodium                   20851 non-null  float64
 13  potassium                20850 non-null  float64
 14  hemoglobin               2088

In [8]:
binary_cols = []
for col in merged.columns:
    unique_values = merged[col].dropna().unique()
    if np.array_equal(np.sort(unique_values), np.array([0.0, 1.0])):
        binary_cols.append(col)

for col in binary_cols:
    print(f"Value counts for column '{col}':")
    display(merged[col].value_counts())
    print("-" * 20)

Value counts for column 'red blood cells':


Unnamed: 0_level_0,count
red blood cells,Unnamed: 1_level_1
0.0,10473
1.0,10313


--------------------
Value counts for column 'pus cell':


Unnamed: 0_level_0,count
pus cell,Unnamed: 1_level_1
0.0,10523
1.0,10350


--------------------
Value counts for column 'pus cell clumps':


Unnamed: 0_level_0,count
pus cell clumps,Unnamed: 1_level_1
0.0,10594
1.0,10340


--------------------
Value counts for column 'bacteria':


Unnamed: 0_level_0,count
bacteria,Unnamed: 1_level_1
0.0,10727
1.0,10207


--------------------
Value counts for column 'hypertension':


Unnamed: 0_level_0,count
hypertension,Unnamed: 1_level_1
0.0,10478
1.0,10458


--------------------
Value counts for column 'diabetes mellitus':


Unnamed: 0_level_0,count
diabetes mellitus,Unnamed: 1_level_1
0.0,10561
1.0,10375


--------------------
Value counts for column 'coronary artery disease':


Unnamed: 0_level_0,count
coronary artery disease,Unnamed: 1_level_1
0.0,10599
1.0,10337


--------------------
Value counts for column 'appetite':


Unnamed: 0_level_0,count
appetite,Unnamed: 1_level_1
0.0,10701
1.0,10236


--------------------
Value counts for column 'pedal edema':


Unnamed: 0_level_0,count
pedal edema,Unnamed: 1_level_1
0.0,10574
1.0,10363


--------------------
Value counts for column 'anemia':


Unnamed: 0_level_0,count
anemia,Unnamed: 1_level_1
0.0,10475
1.0,10462


--------------------
Value counts for column 'classification':


Unnamed: 0_level_0,count
classification,Unnamed: 1_level_1
0,16582
1,4356


--------------------


In [9]:
for col in binary_cols:
    mode_value = merged[col].mode()[0]
    merged[col] = merged[col].fillna(mode_value)

print("Null values after imputation:")
display(merged[binary_cols].isnull().sum())

Null values after imputation:


Unnamed: 0,0
red blood cells,0
pus cell,0
pus cell clumps,0
bacteria,0
hypertension,0
diabetes mellitus,0
coronary artery disease,0
appetite,0
pedal edema,0
anemia,0


In [10]:
numerical_cols = merged.select_dtypes(include=np.number).columns.tolist()
numerical_cols = [col for col in numerical_cols if col not in binary_cols]

for col in numerical_cols:
    mean_value = merged[col].mean()
    merged[col] = merged[col].fillna(mean_value)

print("Null values after imputing numerical columns:")
display(merged[numerical_cols].isnull().sum())

Null values after imputing numerical columns:


Unnamed: 0,0
age,0
blood pressure,0
specific gravity,0
albumin,0
sugar,0
blood glucose random,0
blood urea,0
serum creatinine,0
sodium,0
potassium,0


In [11]:
merged.head()

Unnamed: 0,age,blood pressure,specific gravity,albumin,sugar,red blood cells,pus cell,pus cell clumps,bacteria,blood glucose random,...,packed cell volume,white blood cell count,red blood cell count,hypertension,diabetes mellitus,coronary artery disease,appetite,pedal edema,anemia,classification
0,54.0,167.0,1.023,1.0,4.0,0.0,1.0,0.0,0.0,96.0,...,35.0,5791.0,5.6,1.0,1.0,0.0,0.0,0.0,0.0,0
1,42.0,127.0,1.023,3.0,2.0,0.0,0.0,0.0,1.0,73.0,...,25.0,5390.0,4.6,0.0,1.0,0.0,0.0,1.0,1.0,1
2,38.0,148.0,1.016,0.0,0.0,1.0,0.0,0.0,0.0,77.0,...,46.0,12098.0,4.7,0.0,0.0,1.0,0.0,1.0,0.0,0
3,7.0,98.0,1.017,4.0,0.0,1.0,0.0,0.0,1.0,225.0,...,24.0,6747.0,4.8,0.0,0.0,1.0,0.0,0.0,1.0,0
4,67.0,174.0,1.015,1.0,1.0,0.0,1.0,0.0,0.0,376.0,...,46.0,5759.0,5.7,0.0,0.0,0.0,0.0,1.0,1.0,0


In [12]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20938 entries, 0 to 399
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      20938 non-null  float64
 1   blood pressure           20938 non-null  float64
 2   specific gravity         20938 non-null  float64
 3   albumin                  20938 non-null  float64
 4   sugar                    20938 non-null  float64
 5   red blood cells          20938 non-null  float64
 6   pus cell                 20938 non-null  float64
 7   pus cell clumps          20938 non-null  float64
 8   bacteria                 20938 non-null  float64
 9   blood glucose random     20938 non-null  float64
 10  blood urea               20938 non-null  float64
 11  serum creatinine         20938 non-null  float64
 12  sodium                   20938 non-null  float64
 13  potassium                20938 non-null  float64
 14  hemoglobin               2093

In [15]:
for col in binary_cols:
    merged[col] = merged[col].astype('int64')

print("Binary columns converted to int64:")
print(merged[binary_cols].dtypes)

Binary columns converted to int64:
red blood cells            int64
pus cell                   int64
pus cell clumps            int64
bacteria                   int64
hypertension               int64
diabetes mellitus          int64
coronary artery disease    int64
appetite                   int64
pedal edema                int64
anemia                     int64
classification             int64
dtype: object


In [16]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20938 entries, 0 to 399
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      20938 non-null  float64
 1   blood pressure           20938 non-null  float64
 2   specific gravity         20938 non-null  float64
 3   albumin                  20938 non-null  float64
 4   sugar                    20938 non-null  float64
 5   red blood cells          20938 non-null  int64  
 6   pus cell                 20938 non-null  int64  
 7   pus cell clumps          20938 non-null  int64  
 8   bacteria                 20938 non-null  int64  
 9   blood glucose random     20938 non-null  float64
 10  blood urea               20938 non-null  float64
 11  serum creatinine         20938 non-null  float64
 12  sodium                   20938 non-null  float64
 13  potassium                20938 non-null  float64
 14  hemoglobin               2093

In [19]:
merged.to_csv('20.4k updated_dataset.csv', index=False)
print("Updated dataset exported successfully as 'updated_dataset.csv'")

Updated dataset exported successfully as 'updated_dataset.csv'
