In [3]:
import pandas as pd
import numpy as np
import warnings
from typing import Literal, Dict, Any
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
import logging

In [10]:
# Read the two source CSV files into DataFrames and preview the first rows of each.
# NOTE(review): the absolute /content/... paths look environment-specific — confirm
# they exist wherever this notebook is re-run.
indian_liver_patient = pd.read_csv(
    "/content/Data/indian_liver_patient.csv",
    encoding="utf-8",
)
cirrhosis = pd.read_csv(
    "/content/Data/cirrhosis.csv",
    encoding="utf-8",
)
print(indian_liver_patient.head())
print(cirrhosis.head())

   Age of the patient Gender of the patient  Total Bilirubin  \
0                65.0                Female              0.7   
1                62.0                  Male             10.9   
2                62.0                  Male              7.3   
3                58.0                  Male              1.0   
4                72.0                  Male              3.9   

   Direct Bilirubin  Alkphos Alkaline Phosphotase  \
0               0.1                         187.0   
1               5.5                         699.0   
2               4.1                         490.0   
3               0.4                         182.0   
4               2.0                         195.0   

   Sgpt Alamine Aminotransferase  Sgot Aspartate Aminotransferase  \
0                           16.0                             18.0   
1                           64.0                            100.0   
2                           60.0                             68.0   
3                   

In [11]:
def print_nan_percentage(df: pd.DataFrame) -> None:
    """Print a per-column summary of missing values in ``df``.

    The summary is a table with one row per column of ``df`` and two
    columns: "NaN Count" (number of missing entries) and "NaN %" (that count
    as a percentage of the row count, rounded to two decimals). The table is
    preceded by a title line and framed by two 80-character ``=`` rules.

    Args:
        df: Frame to inspect. It is not modified.

    Returns:
        None. The summary is written to standard output only.
    """
    rule = "=" * 80
    nan_counts = df.isna().sum()
    nan_share = (nan_counts / len(df)) * 100
    summary = pd.DataFrame({"NaN Count": nan_counts, "NaN %": nan_share.round(2)})
    print(rule, "NaN Analysis Summary:", summary, rule, sep="\n")


def replace_nan(df: pd.DataFrame, option: Literal["mean", "median"] = "mean") -> pd.DataFrame:
    """Return a copy of ``df`` with missing values imputed column by column.

    Numeric columns are filled with their own mean or median (chosen by
    ``option``); every other column is filled with the literal string
    ``"Unknown"``. Columns that contain no missing values are left untouched.

    Args:
        df: Input frame. It is copied first and never modified.
        option: ``"mean"`` (default) or ``"median"`` — the statistic used for
            numeric columns.

    Returns:
        A new DataFrame with the same columns as ``df`` and missing values
        replaced as described above.

    Raises:
        ValueError: If ``option`` is neither ``"mean"`` nor ``"median"``.
            (Previously any unrecognised value silently selected the median.)

    Notes:
        A numeric column that is entirely missing has a NaN mean/median, so it
        stays NaN after filling.
    """
    if option not in ("mean", "median"):
        raise ValueError(f"option must be 'mean' or 'median', got {option!r}")

    df_filled = df.copy()
    for col in df_filled.columns:
        column = df_filled[col]
        if not column.isna().any():
            # Nothing to impute; skipping also avoids touching the dtype.
            continue

        if pd.api.types.is_numeric_dtype(column):
            fill_value = column.mean() if option == "mean" else column.median()
            df_filled[col] = column.fillna(fill_value)
            continue

        # A categorical column only accepts values that are already categories,
        # so register the placeholder before filling with it.
        if isinstance(column.dtype, pd.CategoricalDtype) and "Unknown" not in column.cat.categories:
            column = column.cat.add_categories("Unknown")
        df_filled[col] = column.fillna("Unknown")
    return df_filled


def apply_smote(df: pd.DataFrame, target_col: str, random_state: int = 42) -> pd.DataFrame:
    """Balance the classes of ``df`` by oversampling with SMOTE.

    The feature columns (everything except ``target_col``) are one-hot encoded
    with ``pd.get_dummies(..., drop_first=True)``, resampled together with the
    target via ``SMOTE.fit_resample``, and reassembled into a single frame.
    Class counts before and after resampling are printed.

    Args:
        df: Input frame containing the features and the target column.
        target_col: Name of the class-label column in ``df``.
        random_state: Seed passed to ``SMOTE``. Defaults to 42, the value that
            was previously hard-coded.

    Returns:
        A new DataFrame holding the encoded, resampled feature columns followed
        by the resampled target column named ``target_col``.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df`` (raised by
            ``DataFrame.drop``).
    """
    X = df.drop(columns=[target_col])
    y = df[target_col]
    X_encoded = pd.get_dummies(X, drop_first=True)

    # NOTE(review): SMOTE synthesises samples by interpolation, so the one-hot
    # columns produced above may end up with non-binary values — confirm this
    # is acceptable for downstream use.
    smote = SMOTE(random_state=random_state)
    X_resampled, y_resampled = smote.fit_resample(X_encoded, y)

    # Wrap both outputs explicitly so the code below works whether
    # fit_resample hands back pandas objects or plain arrays.
    features_balanced = pd.DataFrame(X_resampled, columns=X_encoded.columns)
    target_balanced = pd.Series(y_resampled, name=target_col)
    df_balanced = pd.concat([features_balanced, target_balanced], axis=1)

    print("Before SMOTE:", y.value_counts().to_dict())
    print("After SMOTE: ", target_balanced.value_counts().to_dict())
    return df_balanced

In [14]:
# Impute missing values in both datasets (mean strategy passed to replace_nan) ...
indian_liver_patient = replace_nan(indian_liver_patient, option="mean")
cirrhosis = replace_nan(cirrhosis, option="mean")

# ... then print the per-column NaN summary for each, in the same order.
print_nan_percentage(indian_liver_patient)
print_nan_percentage(cirrhosis)

NaN Analysis Summary:
                                      NaN Count  NaN %
Age of the patient                            0    0.0
Gender of the patient                         0    0.0
Total Bilirubin                               0    0.0
Direct Bilirubin                              0    0.0
Alkphos Alkaline Phosphotase                  0    0.0
Sgpt Alamine Aminotransferase                 0    0.0
Sgot Aspartate Aminotransferase               0    0.0
Total Protiens                                0    0.0
ALB Albumin                                   0    0.0
A/G Ratio Albumin and Globulin Ratio          0    0.0
Result                                        0    0.0
NaN Analysis Summary:
               NaN Count  NaN %
id                     0    0.0
N_Days                 0    0.0
Drug                   0    0.0
Age                    0    0.0
Sex                    0    0.0
Ascites                0    0.0
Hepatomegaly           0    0.0
Spiders                0    0.0
Edema   