In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import seaborn as sns
import plotly.express as px

%matplotlib inline

In [2]:
# Set DPI for fugures

plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300

In [3]:
# Set the default font size and weight
plt.rcParams['font.size'] = 30
plt.rcParams['font.weight'] = 'bold'

In [4]:
# Drive connection

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Dataset: UCI Chronic Kidney Disease

In [None]:
# Dataset
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
chronic_kidney_disease = fetch_ucirepo(id=336)

# data (as pandas dataframes)
X = chronic_kidney_disease.data.features
y = chronic_kidney_disease.data.targets

# metadata
print(chronic_kidney_disease.metadata)

# variable information
print(chronic_kidney_disease.variables)

{'uci_id': 336, 'name': 'Chronic Kidney Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/336/chronic+kidney+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/336/data.csv', 'abstract': 'This dataset can be used to predict the chronic kidney disease and it can be collected from the hospital nearly 2 months of period.', 'area': 'Other', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 400, 'num_features': 24, 'feature_types': ['Real'], 'demographics': ['Age'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2015, 'last_updated': 'Mon Mar 04 2024', 'dataset_doi': '10.24432/C5G020', 'creators': ['L. Rubini', 'P. Soundarapandian', 'P. Eswaran'], 'intro_paper': None, 'additional_info': {'summary': 'We use the following representation to collect the dataset\r\n                        age\t\t-\tage\t\r\n\t\t\tbp\t\t-\tblood pressure\r\n\t\t\tsg\t

In [None]:
# Merge the features and targets into a single DataFrame
df = pd.concat([X, y], axis=1)

# Display the merged DataFrame
print(df.head())

    age    bp     sg   al   su     rbc        pc         pcc          ba  \
0  48.0  80.0  1.020  1.0  0.0     NaN    normal  notpresent  notpresent   
1   7.0  50.0  1.020  4.0  0.0     NaN    normal  notpresent  notpresent   
2  62.0  80.0  1.010  2.0  3.0  normal    normal  notpresent  notpresent   
3  48.0  70.0  1.005  4.0  0.0  normal  abnormal     present  notpresent   
4  51.0  80.0  1.010  2.0  0.0  normal    normal  notpresent  notpresent   

     bgr  ...   pcv    wbcc  rbcc  htn   dm  cad  appet   pe  ane class  
0  121.0  ...  44.0  7800.0   5.2  yes  yes   no   good   no   no   ckd  
1    NaN  ...  38.0  6000.0   NaN   no   no   no   good   no   no   ckd  
2  423.0  ...  31.0  7500.0   NaN   no  yes   no   poor   no  yes   ckd  
3  117.0  ...  32.0  6700.0   3.9  yes   no   no   poor  yes  yes   ckd  
4  106.0  ...  35.0  7300.0   4.6   no   no   no   good   no   no   ckd  

[5 rows x 25 columns]


In [None]:
# Size
df.shape

(400, 25)

In [None]:
# Null Values
df.isnull().sum()

Unnamed: 0,0
age,9
bp,12
sg,47
al,46
su,49
rbc,152
pc,65
pcc,4
ba,4
bgr,44


In [None]:
# Cleaning Null Values
df = df.dropna()

In [None]:
# Null Values
df.isnull().sum()

Unnamed: 0,0
age,0
bp,0
sg,0
al,0
su,0
rbc,0
pc,0
pcc,0
ba,0
bgr,0


In [None]:
# Size
df.shape

(158, 25)

In [None]:
# Count occurrences for each category in each feature
for column in df.columns:
    print(f"Counts for {column}:")
    print(df[column].value_counts())
    print()  # Add a newline for better readability

Counts for age:
age
59.0    8
48.0    7
55.0    7
60.0    6
56.0    5
73.0    5
30.0    5
64.0    4
46.0    4
52.0    4
57.0    4
23.0    4
58.0    4
43.0    4
33.0    4
47.0    4
42.0    4
63.0    4
41.0    3
34.0    3
61.0    3
69.0    3
50.0    3
45.0    3
62.0    3
71.0    3
44.0    3
29.0    2
35.0    2
25.0    2
70.0    2
40.0    2
68.0    2
49.0    2
37.0    2
51.0    2
32.0    2
38.0    2
39.0    2
66.0    2
83.0    1
53.0    1
21.0    1
6.0     1
20.0    1
24.0    1
65.0    1
22.0    1
74.0    1
75.0    1
28.0    1
72.0    1
79.0    1
80.0    1
15.0    1
36.0    1
12.0    1
17.0    1
Name: count, dtype: int64

Counts for bp:
bp
80.0     63
60.0     40
70.0     37
90.0      9
100.0     7
50.0      1
110.0     1
Name: count, dtype: int64

Counts for sg:
sg
1.020    61
1.025    61
1.010    23
1.015    10
1.005     3
Name: count, dtype: int64

Counts for al:
al
0.0    116
4.0     15
3.0     15
2.0      9
1.0      3
Name: count, dtype: int64

Counts for su:
su
0.0    140
2.0      6

In [None]:
# Data
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
9,53.0,90.0,1.02,2.0,0.0,abnormal,abnormal,present,notpresent,70.0,...,29.0,12100.0,3.7,yes,yes,no,poor,no,yes,ckd
11,63.0,70.0,1.01,3.0,0.0,abnormal,abnormal,present,notpresent,380.0,...,32.0,4500.0,3.8,yes,yes,no,poor,yes,no,ckd
14,68.0,80.0,1.01,3.0,2.0,normal,abnormal,present,present,157.0,...,16.0,11000.0,2.6,yes,yes,yes,poor,yes,no,ckd
20,61.0,80.0,1.015,2.0,0.0,abnormal,abnormal,notpresent,notpresent,173.0,...,24.0,9200.0,3.2,yes,yes,yes,poor,yes,yes,ckd


In [None]:
"""
save_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/df.csv"

# Save to CSV
df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
"""

File saved to: /content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/df.csv


In [None]:
load_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/df.csv"

# Load the DataFrame
df = pd.read_csv(load_path)

# Preview
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32.0,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
1,53.0,90.0,1.02,2.0,0.0,abnormal,abnormal,present,notpresent,70.0,...,29.0,12100.0,3.7,yes,yes,no,poor,no,yes,ckd
2,63.0,70.0,1.01,3.0,0.0,abnormal,abnormal,present,notpresent,380.0,...,32.0,4500.0,3.8,yes,yes,no,poor,yes,no,ckd
3,68.0,80.0,1.01,3.0,2.0,normal,abnormal,present,present,157.0,...,16.0,11000.0,2.6,yes,yes,yes,poor,yes,no,ckd
4,61.0,80.0,1.015,2.0,0.0,abnormal,abnormal,notpresent,notpresent,173.0,...,24.0,9200.0,3.2,yes,yes,yes,poor,yes,yes,ckd


In [None]:
# Column Names
print(df.columns.tolist())

['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'class']


In [None]:
import pandas as pd

# Dataframe
mapping = {
    'rbc': {'normal': 1, 'abnormal': 0},
    'pc': {'normal': 1, 'abnormal': 0},
    'pcc': {'present': 1, 'notpresent': 0},
    'ba': {'present': 1, 'notpresent': 0},
    'htn': {'yes': 1, 'no': 0},
    'dm': {'yes': 1, 'no': 0},
    'cad': {'yes': 1, 'no': 0},
    'appet': {'good': 1, 'poor': 0},
    'pe': {'yes': 1, 'no': 0},
    'ane': {'yes': 1, 'no': 0},
    'class': {'ckd': 1, 'notckd': 0},
}

# Apply the mappings to the dataframe
df = df.replace(mapping)

# Check the updated dataframe
print(df.head())

    age    bp     sg   al   su  rbc  pc  pcc  ba    bgr  ...   pcv     wbcc  \
0  48.0  70.0  1.005  4.0  0.0    1   0    1   0  117.0  ...  32.0   6700.0   
1  53.0  90.0  1.020  2.0  0.0    0   0    1   0   70.0  ...  29.0  12100.0   
2  63.0  70.0  1.010  3.0  0.0    0   0    1   0  380.0  ...  32.0   4500.0   
3  68.0  80.0  1.010  3.0  2.0    1   0    1   1  157.0  ...  16.0  11000.0   
4  61.0  80.0  1.015  2.0  0.0    0   0    0   0  173.0  ...  24.0   9200.0   

   rbcc  htn  dm  cad  appet  pe  ane  class  
0   3.9    1   0    0      0   1    1      1  
1   3.7    1   1    0      0   0    1      1  
2   3.8    1   1    0      0   1    0      1  
3   2.6    1   1    1      0   1    0      1  
4   3.2    1   1    1      0   1    1      1  

[5 rows x 25 columns]


  df = df.replace(mapping)


In [None]:
# Data
df.head()

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,...,pcv,wbcc,rbcc,htn,dm,cad,appet,pe,ane,class
0,48.0,70.0,1.005,4.0,0.0,1,0,1,0,117.0,...,32.0,6700.0,3.9,1,0,0,0,1,1,1
1,53.0,90.0,1.02,2.0,0.0,0,0,1,0,70.0,...,29.0,12100.0,3.7,1,1,0,0,0,1,1
2,63.0,70.0,1.01,3.0,0.0,0,0,1,0,380.0,...,32.0,4500.0,3.8,1,1,0,0,1,0,1
3,68.0,80.0,1.01,3.0,2.0,1,0,1,1,157.0,...,16.0,11000.0,2.6,1,1,1,0,1,0,1
4,61.0,80.0,1.015,2.0,0.0,0,0,0,0,173.0,...,24.0,9200.0,3.2,1,1,1,0,1,1,1


In [None]:
# Renaming

df.rename(columns={
    "age": "Age (years)",
    "bp": "Blood Pressure (mm Hg)",
    "sg": "Specific Gravity of Urine (1.005 = Very Dilute, 1.010 = Dilute, 1.015 = Normal, 1.020 = Concentrated, 1.025 = Very Concentrated)",
    "al": "Albumin (0 = None, 1 = Trace, 2 = Mild, 3 = Moderate, 4 = Severe, 5 = Very Severe)",
    "su": "Sugar (0 = None, 1 = Trace, 2 = Mild, 3 = Moderate, 4 = High, 5 = Very High)",
    "rbc": "Red Blood Cells (1 = Normal, 0 = Abnormal)",
    "pc": "Pus Cells (1 = Normal, 0 = Abnormal)",
    "pcc": "Pus Cell Clumps (1 = Present, 0 = Not Present)",
    "ba": "Bacteria (1 = Present, 0 = Not Present)",
    "bgr": "Blood Glucose Random (mg/dL)",
    "bu": "Blood Urea (mg/dL)",
    "sc": "Serum Creatinine (mg/dL)",
    "sod": "Sodium (mEq/L)",
    "pot": "Potassium (mEq/L)",
    "hemo": "Hemoglobin (g/dL)",
    "pcv": "Packed Cell Volume (%)",
    "wbcc": "White Blood Cell Count (cells/cumm)",
    "rbcc": "Red Blood Cell Count (million cells/cumm)",
    "htn": "Hypertension (1 = Yes, 0 = No)",
    "dm": "Diabetes Mellitus (1 = Yes, 0 = No)",
    "cad": "Coronary Artery Disease (1 = Yes, 0 = No)",
    "appet": "Appetite (1 = Good, 0 = Poor)",
    "pe": "Pedal Edema (1 = Yes, 0 = No)",
    "ane": "Anemia (1 = Yes, 0 = No)",
    "class": "Chronic Kidney Disease (1 = CKD, 0 = Not CKD)"
}, inplace=True)

In [None]:
"""
save_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/df.csv"

# Save to CSV
df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
"""

File saved to: /content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/df.csv


In [8]:
load_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/df.csv"

# Load the DataFrame
df = pd.read_csv(load_path)

# Preview
df.head()

Unnamed: 0,Age (years),Blood Pressure (mm Hg),"Specific Gravity of Urine (1.005 = Very Dilute, 1.010 = Dilute, 1.015 = Normal, 1.020 = Concentrated, 1.025 = Very Concentrated)","Albumin (0 = None, 1 = Trace, 2 = Mild, 3 = Moderate, 4 = Severe, 5 = Very Severe)","Sugar (0 = None, 1 = Trace, 2 = Mild, 3 = Moderate, 4 = High, 5 = Very High)","Red Blood Cells (1 = Normal, 0 = Abnormal)","Pus Cells (1 = Normal, 0 = Abnormal)","Pus Cell Clumps (1 = Present, 0 = Not Present)","Bacteria (1 = Present, 0 = Not Present)",Blood Glucose Random (mg/dL),...,Packed Cell Volume (%),White Blood Cell Count (cells/cumm),Red Blood Cell Count (million cells/cumm),"Hypertension (1 = Yes, 0 = No)","Diabetes Mellitus (1 = Yes, 0 = No)","Coronary Artery Disease (1 = Yes, 0 = No)","Appetite (1 = Good, 0 = Poor)","Pedal Edema (1 = Yes, 0 = No)","Anemia (1 = Yes, 0 = No)","Chronic Kidney Disease (1 = CKD, 0 = Not CKD)"
0,48.0,70.0,1.005,4.0,0.0,1,0,1,0,117.0,...,32.0,6700.0,3.9,1,0,0,0,1,1,1
1,53.0,90.0,1.02,2.0,0.0,0,0,1,0,70.0,...,29.0,12100.0,3.7,1,1,0,0,0,1,1
2,63.0,70.0,1.01,3.0,0.0,0,0,1,0,380.0,...,32.0,4500.0,3.8,1,1,0,0,1,0,1
3,68.0,80.0,1.01,3.0,2.0,1,0,1,1,157.0,...,16.0,11000.0,2.6,1,1,1,0,1,0,1
4,61.0,80.0,1.015,2.0,0.0,0,0,0,0,173.0,...,24.0,9200.0,3.2,1,1,1,0,1,1,1


In [9]:
print(df.columns.tolist())

['Age (years)', 'Blood Pressure (mm Hg)', 'Specific Gravity of Urine (1.005 = Very Dilute, 1.010 = Dilute, 1.015 = Normal, 1.020 = Concentrated, 1.025 = Very Concentrated)', 'Albumin (0 = None, 1 = Trace, 2 = Mild, 3 = Moderate, 4 = Severe, 5 = Very Severe)', 'Sugar (0 = None, 1 = Trace, 2 = Mild, 3 = Moderate, 4 = High, 5 = Very High)', 'Red Blood Cells (1 = Normal, 0 = Abnormal)', 'Pus Cells (1 = Normal, 0 = Abnormal)', 'Pus Cell Clumps (1 = Present, 0 = Not Present)', 'Bacteria (1 = Present, 0 = Not Present)', 'Blood Glucose Random (mg/dL)', 'Blood Urea (mg/dL)', 'Serum Creatinine (mg/dL)', 'Sodium (mEq/L)', 'Potassium (mEq/L)', 'Hemoglobin (g/dL)', 'Packed Cell Volume (%)', 'White Blood Cell Count (cells/cumm)', 'Red Blood Cell Count (million cells/cumm)', 'Hypertension (1 = Yes, 0 = No)', 'Diabetes Mellitus (1 = Yes, 0 = No)', 'Coronary Artery Disease (1 = Yes, 0 = No)', 'Appetite (1 = Good, 0 = Poor)', 'Pedal Edema (1 = Yes, 0 = No)', 'Anemia (1 = Yes, 0 = No)', 'Chronic Kidney D

In [None]:
# New Column Names
print(df.columns.tolist())

['Age (years)', 'Blood Pressure (mm Hg)', 'Specific Gravity of Urine (1.005 = Very Dilute, 1.010 = Dilute, 1.015 = Normal, 1.020 = Concentrated, 1.025 = Very Concentrated)', 'Albumin (0 = None, 1 = Trace, 2 = Mild, 3 = Moderate, 4 = Severe, 5 = Very Severe)', 'Sugar (0 = None, 1 = Trace, 2 = Mild, 3 = Moderate, 4 = High, 5 = Very High)', 'Red Blood Cells (1 = Normal, 0 = Abnormal)', 'Pus Cells (1 = Normal, 0 = Abnormal)', 'Pus Cell Clumps (1 = Present, 0 = Not Present)', 'Bacteria (1 = Present, 0 = Not Present)', 'Blood Glucose Random (mg/dL)', 'Blood Urea (mg/dL)', 'Serum Creatinine (mg/dL)', 'Sodium (mEq/L)', 'Potassium (mEq/L)', 'Hemoglobin (g/dL)', 'Packed Cell Volume (%)', 'White Blood Cell Count (cells/cumm)', 'Red Blood Cell Count (million cells/cumm)', 'Hypertension (1 = Yes, 0 = No)', 'Diabetes Mellitus (1 = Yes, 0 = No)', 'Coronary Artery Disease (1 = Yes, 0 = No)', 'Appetite (1 = Good, 0 = Poor)', 'Pedal Edema (1 = Yes, 0 = No)', 'Anemia (1 = Yes, 0 = No)', 'Chronic Kidney D

In [None]:
# Size
df.shape

(158, 25)

# Table to Clinical Text for GPT Model

In [None]:
from sklearn.model_selection import train_test_split

# Separate features and target
input_features = df.drop(columns=["Chronic Kidney Disease (1 = CKD, 0 = Not CKD)"])
target = df["Chronic Kidney Disease (1 = CKD, 0 = Not CKD)"]

# Mapping for albumin and sugar levels
albumin_map = {
    0.0: "Normal", 1.0: "Trace", 2.0: "Mild", 3.0: "Moderate", 4.0: "Severe", 5.0: "Very Severe"
}
sugar_map = {
    0.0: "Normal", 1.0: "Trace", 2.0: "Mild", 3.0: "Moderate", 4.0: "High", 5.0: "Very High"
}

# Updated row_to_text function
def row_to_text(row):
    albumin_level = albumin_map.get(row['Albumin (0 = None, 1 = Trace, 2 = Mild, 3 = Moderate, 4 = Severe, 5 = Very Severe)'], "Unknown")
    sugar_level = sugar_map.get(row['Sugar (0 = None, 1 = Trace, 2 = Mild, 3 = Moderate, 4 = High, 5 = Very High)'], "Unknown")

    return (f"Patient aged {row['Age (years)']} years with blood pressure of {row['Blood Pressure (mm Hg)']} mm Hg, "
            f"specific gravity of urine is {row['Specific Gravity of Urine (1.005 = Very Dilute, 1.010 = Dilute, 1.015 = Normal, 1.020 = Concentrated, 1.025 = Very Concentrated)']}, "
            f"albumin level is {albumin_level}, "
            f"sugar level is {sugar_level}, "
            f"red blood cells are {'normal' if row['Red Blood Cells (1 = Normal, 0 = Abnormal)'] == 1 else 'abnormal'}, "
            f"pus cells are {'normal' if row['Pus Cells (1 = Normal, 0 = Abnormal)'] == 1 else 'abnormal'}, "
            f"pus cell clumps are {'present' if row['Pus Cell Clumps (1 = Present, 0 = Not Present)'] == 1 else 'not present'}, "
            f"bacteria are {'present' if row['Bacteria (1 = Present, 0 = Not Present)'] == 1 else 'not present'}, "
            f"blood glucose is {row['Blood Glucose Random (mg/dL)']} mg/dL, urea is {row['Blood Urea (mg/dL)']} mg/dL, "
            f"serum creatinine is {row['Serum Creatinine (mg/dL)']} mg/dL, sodium is {row['Sodium (mEq/L)']} mEq/L, "
            f"potassium is {row['Potassium (mEq/L)']} mEq/L, hemoglobin is {row['Hemoglobin (g/dL)']} g/dL, "
            f"packed cell volume is {row['Packed Cell Volume (%)']}%, WBC count is {row['White Blood Cell Count (cells/cumm)']} cells/cumm, "
            f"RBC count is {row['Red Blood Cell Count (million cells/cumm)']} million/cumm, "
            f"has {'hypertension' if row['Hypertension (1 = Yes, 0 = No)'] else 'no hypertension'}, "
            f"has {'diabetes mellitus' if row['Diabetes Mellitus (1 = Yes, 0 = No)'] else 'no diabetes'}, "
            f"has {'coronary artery disease' if row['Coronary Artery Disease (1 = Yes, 0 = No)'] else 'no coronary artery disease'}, "
            f"appetite is {'good' if row['Appetite (1 = Good, 0 = Poor)'] else 'poor'}, "
            f"pedal edema is {'present' if row['Pedal Edema (1 = Yes, 0 = No)'] else 'not present'}, "
            f"and {'has' if row['Anemia (1 = Yes, 0 = No)'] else 'does not have'} anemia.")

# Apply to dataframe
df["clinical_text"] = input_features.apply(row_to_text, axis=1)

In [None]:
# Drop target column to get input features only
input_features = df.drop(columns=["Chronic Kidney Disease (1 = CKD, 0 = Not CKD)"])

# Apply the conversion function row-wise
df["clinical_text"] = input_features.apply(row_to_text, axis=1)

# View the converted text
print(df["clinical_text"].head())

0    Patient aged 48.0 years with blood pressure of...
1    Patient aged 53.0 years with blood pressure of...
2    Patient aged 63.0 years with blood pressure of...
3    Patient aged 68.0 years with blood pressure of...
4    Patient aged 61.0 years with blood pressure of...
Name: clinical_text, dtype: object


In [None]:
# As a list
clinical_texts = df["clinical_text"].tolist()

# Save to CSV for external use
df[["clinical_text"]].to_csv("converted_clinical_texts.csv", index=False)

In [None]:
# Preview with original data + clinical text
df_with_text = df.copy()
df_with_text["clinical_text"] = input_features.apply(row_to_text, axis=1)
df_with_text.head()

Unnamed: 0,Age (years),Blood Pressure (mm Hg),"Specific Gravity of Urine (1.005 = Very Dilute, 1.010 = Dilute, 1.015 = Normal, 1.020 = Concentrated, 1.025 = Very Concentrated)","Albumin (0 = None, 1 = Trace, 2 = Mild, 3 = Moderate, 4 = Severe, 5 = Very Severe)","Sugar (0 = None, 1 = Trace, 2 = Mild, 3 = Moderate, 4 = High, 5 = Very High)","Red Blood Cells (1 = Normal, 0 = Abnormal)","Pus Cells (1 = Normal, 0 = Abnormal)","Pus Cell Clumps (1 = Present, 0 = Not Present)","Bacteria (1 = Present, 0 = Not Present)",Blood Glucose Random (mg/dL),...,White Blood Cell Count (cells/cumm),Red Blood Cell Count (million cells/cumm),"Hypertension (1 = Yes, 0 = No)","Diabetes Mellitus (1 = Yes, 0 = No)","Coronary Artery Disease (1 = Yes, 0 = No)","Appetite (1 = Good, 0 = Poor)","Pedal Edema (1 = Yes, 0 = No)","Anemia (1 = Yes, 0 = No)","Chronic Kidney Disease (1 = CKD, 0 = Not CKD)",clinical_text
0,48.0,70.0,1.005,4.0,0.0,1,0,1,0,117.0,...,6700.0,3.9,1,0,0,0,1,1,1,Patient aged 48.0 years with blood pressure of...
1,53.0,90.0,1.02,2.0,0.0,0,0,1,0,70.0,...,12100.0,3.7,1,1,0,0,0,1,1,Patient aged 53.0 years with blood pressure of...
2,63.0,70.0,1.01,3.0,0.0,0,0,1,0,380.0,...,4500.0,3.8,1,1,0,0,1,0,1,Patient aged 63.0 years with blood pressure of...
3,68.0,80.0,1.01,3.0,2.0,1,0,1,1,157.0,...,11000.0,2.6,1,1,1,0,1,0,1,Patient aged 68.0 years with blood pressure of...
4,61.0,80.0,1.015,2.0,0.0,0,0,0,0,173.0,...,9200.0,3.2,1,1,1,0,1,1,1,Patient aged 61.0 years with blood pressure of...


In [None]:
"""
save_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/clinical_text_dataset.csv"

# Save to CSV
df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
"""

File saved to: /content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/clinical_text_dataset.csv


In [None]:
load_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/clinical_text_dataset.csv"

# Load the DataFrame
df = pd.read_csv(load_path)

# Preview
df.head()

Unnamed: 0,Age (years),Blood Pressure (mm Hg),"Specific Gravity of Urine (1.005 = Very Dilute, 1.010 = Dilute, 1.015 = Normal, 1.020 = Concentrated, 1.025 = Very Concentrated)","Albumin (0 = None, 1 = Trace, 2 = Mild, 3 = Moderate, 4 = Severe, 5 = Very Severe)","Sugar (0 = None, 1 = Trace, 2 = Mild, 3 = Moderate, 4 = High, 5 = Very High)","Red Blood Cells (1 = Normal, 0 = Abnormal)","Pus Cells (1 = Normal, 0 = Abnormal)","Pus Cell Clumps (1 = Present, 0 = Not Present)","Bacteria (1 = Present, 0 = Not Present)",Blood Glucose Random (mg/dL),...,White Blood Cell Count (cells/cumm),Red Blood Cell Count (million cells/cumm),"Hypertension (1 = Yes, 0 = No)","Diabetes Mellitus (1 = Yes, 0 = No)","Coronary Artery Disease (1 = Yes, 0 = No)","Appetite (1 = Good, 0 = Poor)","Pedal Edema (1 = Yes, 0 = No)","Anemia (1 = Yes, 0 = No)","Chronic Kidney Disease (1 = CKD, 0 = Not CKD)",clinical_text
0,48.0,70.0,1.005,4.0,0.0,1,0,1,0,117.0,...,6700.0,3.9,1,0,0,0,1,1,1,Patient aged 48.0 years with blood pressure of...
1,53.0,90.0,1.02,2.0,0.0,0,0,1,0,70.0,...,12100.0,3.7,1,1,0,0,0,1,1,Patient aged 53.0 years with blood pressure of...
2,63.0,70.0,1.01,3.0,0.0,0,0,1,0,380.0,...,4500.0,3.8,1,1,0,0,1,0,1,Patient aged 63.0 years with blood pressure of...
3,68.0,80.0,1.01,3.0,2.0,1,0,1,1,157.0,...,11000.0,2.6,1,1,1,0,1,0,1,Patient aged 68.0 years with blood pressure of...
4,61.0,80.0,1.015,2.0,0.0,0,0,0,0,173.0,...,9200.0,3.2,1,1,1,0,1,1,1,Patient aged 61.0 years with blood pressure of...


In [None]:
# Clinical Texts
for idx, text in enumerate(df["clinical_text"]):
    print(f"Row {idx + 1}:\n{text}\n{'-'*80}")

Row 1:
Patient aged 48.0 years with blood pressure of 70.0 mm Hg, specific gravity of urine is 1.005, albumin level is Severe, sugar level is Normal, red blood cells are normal, pus cells are abnormal, pus cell clumps are present, bacteria are not present, blood glucose is 117.0 mg/dL, urea is 56.0 mg/dL, serum creatinine is 3.8 mg/dL, sodium is 111.0 mEq/L, potassium is 2.5 mEq/L, hemoglobin is 11.2 g/dL, packed cell volume is 32.0%, WBC count is 6700.0 cells/cumm, RBC count is 3.9 million/cumm, has hypertension, has no diabetes, has no coronary artery disease, appetite is poor, pedal edema is present, and has anemia.
--------------------------------------------------------------------------------
Row 2:
Patient aged 53.0 years with blood pressure of 90.0 mm Hg, specific gravity of urine is 1.02, albumin level is Mild, sugar level is Normal, red blood cells are abnormal, pus cells are abnormal, pus cell clumps are present, bacteria are not present, blood glucose is 70.0 mg/dL, urea is

# Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Define x and y
x = df['clinical_text']
y = df['Chronic Kidney Disease (1 = CKD, 0 = Not CKD)']

# 80-20 train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
"""
# Save to CSV
x_train.to_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/x_train", index=False)
y_train.to_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/y_train", index=False)
x_test.to_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/x_test", index=False)
y_test.to_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/y_test", index=False)
"""

In [None]:
# Load the DataFrame
x_train = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/x_train")
y_train = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/y_train")
x_test = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/x_test")
y_test = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/y_test")

In [None]:
# Train Shape
x_train.shape

(126, 1)

In [None]:
# Train Label Shape
y_train.shape

(126, 1)

In [None]:
# Train Count
print(y_train.value_counts())

Chronic Kidney Disease (1 = CKD, 0 = Not CKD)
0                                                92
1                                                34
Name: count, dtype: int64


In [None]:
# Test shape
x_test.shape

(32, 1)

In [None]:
# Test Label shape
y_test.shape

(32, 1)

In [None]:
# Test Count
print(y_test.value_counts())

Chronic Kidney Disease (1 = CKD, 0 = Not CKD)
0                                                23
1                                                 9
Name: count, dtype: int64


### GPT Model (humarin/chatgpt_paraphraser_on_T5_base) for Synthetic Data and Paraphrasing (Optional)

In [None]:
# Merge x_train and y_train
df = pd.concat([x_train, y_train], axis=1)

### Positive cases

In [None]:
# Filter the rows where CKD == 1
ckd_rows = df[df["Chronic Kidney Disease (1 = CKD, 0 = Not CKD)"] == 1]

# Print clinical texts for CKD patients only
for idx, text in enumerate(ckd_rows["clinical_text"]):
    print(f"Row {idx + 1}:\n{text}\n{'-'*80}")

Row 1:
Patient aged 40.0 years with blood pressure of 70.0 mm Hg, specific gravity of urine is 1.015, albumin level is Moderate, sugar level is High, red blood cells are normal, pus cells are normal, pus cell clumps are not present, bacteria are not present, blood glucose is 253.0 mg/dL, urea is 150.0 mg/dL, serum creatinine is 11.9 mg/dL, sodium is 132.0 mEq/L, potassium is 5.6 mEq/L, hemoglobin is 10.9 g/dL, packed cell volume is 31.0%, WBC count is 8800.0 cells/cumm, RBC count is 3.4 million/cumm, has hypertension, has diabetes mellitus, has no coronary artery disease, appetite is poor, pedal edema is present, and does not have anemia.
--------------------------------------------------------------------------------
Row 2:
Patient aged 55.0 years with blood pressure of 90.0 mm Hg, specific gravity of urine is 1.01, albumin level is Mild, sugar level is Trace, red blood cells are abnormal, pus cells are abnormal, pus cell clumps are not present, bacteria are not present, blood glucose

In [None]:
df.head()

Unnamed: 0,clinical_text,"Chronic Kidney Disease (1 = CKD, 0 = Not CKD)"
0,Patient aged 48.0 years with blood pressure of...,0
1,Patient aged 55.0 years with blood pressure of...,0
2,Patient aged 17.0 years with blood pressure of...,0
3,Patient aged 64.0 years with blood pressure of...,0
4,Patient aged 40.0 years with blood pressure of...,1


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [None]:
import torch
from transformers import pipeline

# Step 1: Filter class 1 examples
df_class_1 = df[df["Chronic Kidney Disease (1 = CKD, 0 = Not CKD)"] == 1]
clinical_sentences_class_1 = df_class_1["clinical_text"].tolist()

# Step 2: Calculate how many paraphrases we need
num_class_0 = df[df["Chronic Kidney Disease (1 = CKD, 0 = Not CKD)"] == 0].shape[0]
num_class_1 = len(clinical_sentences_class_1)
num_needed = num_class_0 - num_class_1

# Step 3: Load GPT (or equivalent medically fine-tuned model for paraphrasing)
paraphrase_pipe = pipeline(
    "text2text-generation",
    model="humarin/chatgpt_paraphraser_on_T5_base",  # This is T5-based, tuned for paraphrasing
    device=0 if torch.cuda.is_available() else -1
)

# Step 4: Generate paraphrases
synthetic_paraphrases = []
num_generated = 0
i = 0

while num_generated < num_needed:
    sentence = clinical_sentences_class_1[i % num_class_1]
    prompt = f"Paraphrase medically accurately: {sentence}"
    result = paraphrase_pipe(prompt, max_length=512, num_return_sequences=1, do_sample=True)
    paraphrased_text = result[0]['generated_text']
    synthetic_paraphrases.append(paraphrased_text)
    num_generated += 1
    i += 1

# Step 5: Store separately in a DataFrame
df_synthetic = pd.DataFrame({
    "clinical_text": synthetic_paraphrases,
    "Chronic Kidney Disease (1 = CKD, 0 = Not CKD)": [1] * len(synthetic_paraphrases)
})

print(f"Generated {len(df_synthetic)} paraphrased class 1 samples.")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Generated 58 paraphrased class 1 samples.


In [None]:
load_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/Synthetic/df_synthetic.csv"

# Load the DataFrame
df_synthetic = pd.read_csv(load_path, encoding='ISO-8859-1')

# Preview
df_synthetic.head()

Unnamed: 0,clinical_text,"Chronic Kidney Disease (1 = CKD, 0 = Not CKD)"
0,A 40-year-old patient with hypertension and di...,1
1,"The patient, aged 55, has low blood pressure (...",1
2,A 60-year-old patient with hypertension and di...,1
3,A 56-year-old patient presents with a blood pr...,1
4,A 59-year-old patient with blood pressure of 7...,1


In [None]:
# Data
df_synthetic.head()

Unnamed: 0,clinical_text,"Chronic Kidney Disease (1 = CKD, 0 = Not CKD)"
0,A 48.0-year-old patient presents with a blood ...,1
1,The patient exhibits abnormal red and pus cell...,1
2,"Urinalysis reveals a specific gravity of 1.01,...",1
3,The patient presents with a urine specific gra...,1
4,Urine examination shows a specific gravity of ...,1


In [None]:
# Size
df_synthetic.shape

(58, 2)

In [None]:
# Visualizing Rows
for idx, text in enumerate(df_synthetic["clinical_text"]):
    print(f"Row {idx + 1}:\n{text}\n{'-'*80}")

Row 1:
A 40-year-old patient with hypertension and diabetes mellitus presents with a blood pressure of 70 mm Hg and a moderate albumin level. The sugar level is high at 253 mg/dL, and red blood cells, pus cells, pus cell clumps, and bacteria in the urine are normal. Urea is elevated at 150 mg/dL, and serum creatinine is critically high at 11.9 mg/dL. Sodium is 132 mEq/L, and potassium is 5.6 mEq/L. Hemoglobin is 10.9 g/dL, with a packed cell volume of 31%. WBC count is 8,800 cells/cumm, and RBC count is 3.4 million/cumm. The patient has a poor appetite, pedal edema, but no anemia and no coronary artery disease.
--------------------------------------------------------------------------------
Row 2:
The patient, aged 55, has low blood pressure (90 mm Hg), mild albumin in urine, and high blood glucose (273 mg/dL), indicating poor diabetes control. Kidney function is severely impaired with elevated urea (235 mg/dL) and serum creatinine (14.2 mg/dL), suggesting possible kidney failure. Anem

## After Augmentation

In [None]:
# Concatenating along rows (axis=0)

x_train = pd.concat(
    [ x_train[['clinical_text']],
      df_synthetic[['clinical_text']] ],
    axis=0,
    ignore_index=True
)

y_train = pd.concat(
    [ y_train[['Chronic Kidney Disease (1 = CKD, 0 = Not CKD)']],
      df_synthetic[['Chronic Kidney Disease (1 = CKD, 0 = Not CKD)']] ],
    axis=0,
    ignore_index=True
)

In [None]:
# Data
x_train.head()

Unnamed: 0,clinical_text
0,Patient aged 48.0 years with blood pressure of...
1,Patient aged 55.0 years with blood pressure of...
2,Patient aged 17.0 years with blood pressure of...
3,Patient aged 64.0 years with blood pressure of...
4,Patient aged 40.0 years with blood pressure of...


In [None]:
# Label
y_train.head()

Unnamed: 0,"Chronic Kidney Disease (1 = CKD, 0 = Not CKD)"
0,0
1,0
2,0
3,0
4,1


In [None]:
"""
# Save to CSV
x_train.to_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/Augmented/final/x_train", index=False)
y_train.to_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/Augmented/final/y_train", index=False)
"""

In [10]:
# Load the DataFrame
x_train = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/Augmented/final/x_train")
y_train = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/Augmented/final/y_train")
x_test = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/x_test")
y_test = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Updated_Dataset/y_test")

In [None]:
# Train Shape
x_train.shape

(184, 1)

In [None]:
# Train Label Shape
y_train.shape

(184, 1)

In [None]:
# # Train Count
print(y_train.value_counts())

Chronic Kidney Disease (1 = CKD, 0 = Not CKD)
0                                                92
1                                                92
Name: count, dtype: int64


In [None]:
# Test shape
x_test.shape

(32, 1)

In [None]:
# Test Label shape
y_test.shape

(32, 1)

In [None]:
# Test Count
print(y_test.value_counts())

Chronic Kidney Disease (1 = CKD, 0 = Not CKD)
0                                                23
1                                                 9
Name: count, dtype: int64


# GPT as Classification Model

In [None]:
pip install --upgrade transformers



In [None]:
import transformers
print(transformers.__version__)

4.51.3


In [None]:
!pip install huggingface_hub[hf_xet]

Collecting hf-xet>=0.1.4 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Downloading hf_xet-1.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hf-xet
Successfully installed hf-xet-1.0.3


In [None]:
pip install transformers datasets scikit-learn

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config
from transformers import Trainer, TrainingArguments
from torch import nn
from torch.utils.data import Dataset
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Load tokenizer and GPT2
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # set pad token to eos_token

# 2. Custom Dataset
class ClinicalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=max_len, return_tensors='pt')
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# 3. Custom GPT2 Classification Model
class GPT2ForClassification(nn.Module):
    def __init__(self, n_classes=2):
        super(GPT2ForClassification, self).__init__()
        self.gpt2 = GPT2Model.from_pretrained("distilgpt2")
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.gpt2.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state  # [batch_size, seq_len, hidden_dim]
        cls_output = last_hidden_state[:, -1, :]  # use last token hidden state
        logits = self.classifier(self.dropout(cls_output))
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
        return {"loss": loss, "logits": logits}

# Texts
if isinstance(x_train, pd.DataFrame):
    x_train = x_train.squeeze().astype(str).tolist()
if isinstance(x_test, pd.DataFrame):
    x_test = x_test.squeeze().astype(str).tolist()

# Labels
if isinstance(y_train, pd.DataFrame):
    y_train = y_train.squeeze().astype(int).tolist()
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.squeeze().astype(int).tolist()

# 4. Prepare dataset
train_dataset = ClinicalDataset(x_train, y_train, tokenizer)
test_dataset = ClinicalDataset(x_test, y_test, tokenizer)

# 5. Load model
model = GPT2ForClassification()
model.to(device)

# 6. TrainingArguments and Trainer
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=5,
    save_steps=10,
    eval_steps=10,
    metric_for_best_model="accuracy",  # or "f1" depending on your task
    greater_is_better=True,
    warmup_ratio=0.1,              # Warmup to prevent early overfitting
    gradient_accumulation_steps=2, # Simulates larger batch size
    fp16=True,                     # Use if on GPU with mixed precision support
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {"accuracy": (preds == labels).mean()}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# 7. Train the model
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msbakter48[0m ([33msbakter48-northern-university-bangladesh[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
5,0.7312
10,1.1414
15,1.9526
20,1.3572
25,1.2473
30,0.7703
35,0.5205
40,0.8779
45,0.3224
50,0.3137


TrainOutput(global_step=460, training_loss=0.34795209571071295, metrics={'train_runtime': 481.7312, 'train_samples_per_second': 3.82, 'train_steps_per_second': 0.955, 'total_flos': 0.0, 'train_loss': 0.34795209571071295, 'epoch': 10.0})

In [None]:
# 8. Evaluation
preds_output = trainer.predict(test_dataset)
predictions = np.argmax(preds_output.predictions, axis=1)

In [None]:
# 9. Classification report & confusion matrix
print("Classification Report:\n", classification_report(y_test, predictions, digits=4))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

Classification Report:
               precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        23
           1     1.0000    1.0000    1.0000         9

    accuracy                         1.0000        32
   macro avg     1.0000    1.0000    1.0000        32
weighted avg     1.0000    1.0000    1.0000        32

Confusion Matrix:
 [[23  0]
 [ 0  9]]


In [None]:
"""
save_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Model/"

trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

print("Model and tokenizer saved to:", save_path)
"""

Model and tokenizer saved to: /content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Model/


In [None]:
from transformers import GPT2Tokenizer
import torch

save_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Model/"

# Reload tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(save_path)
tokenizer.pad_token = tokenizer.eos_token

# Reload model
model = GPT2ForClassification(n_classes=2)
state_dict = torch.load(save_path + "pytorch_model.bin", map_location="cuda" if torch.cuda.is_available() else "cpu")
model.load_state_dict(state_dict)
model.to(device)
model.eval()

print("Model loaded from:", save_path)

Model loaded from: /content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Model/


## Ablation

In [None]:
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config
from transformers import Trainer, TrainingArguments
from torch import nn
from torch.utils.data import Dataset
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Load tokenizer and GPT2
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # set pad token to eos_token

# 2. Custom Dataset
class ClinicalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=max_len, return_tensors='pt')
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# 3. Custom GPT2 Classification Model
class GPT2ForClassification(nn.Module):
    def __init__(self, n_classes=2):
        super(GPT2ForClassification, self).__init__()
        self.gpt2 = GPT2Model.from_pretrained("distilgpt2")
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.gpt2.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state  # [batch_size, seq_len, hidden_dim]
        cls_output = last_hidden_state[:, -1, :]  # use last token hidden state
        logits = self.classifier(self.dropout(cls_output))
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
        return {"loss": loss, "logits": logits}

# Texts
if isinstance(x_train, pd.DataFrame):
    x_train = x_train.squeeze().astype(str).tolist()
if isinstance(x_test, pd.DataFrame):
    x_test = x_test.squeeze().astype(str).tolist()

# Labels
if isinstance(y_train, pd.DataFrame):
    y_train = y_train.squeeze().astype(int).tolist()
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.squeeze().astype(int).tolist()

# 4. Prepare dataset
train_dataset = ClinicalDataset(x_train, y_train, tokenizer)
test_dataset = ClinicalDataset(x_test, y_test, tokenizer)

# 5. Load model
model = GPT2ForClassification()
model.to(device)

# 6. TrainingArguments and Trainer
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.001,
    logging_dir="./logs",
    logging_steps=5,
    save_steps=10,
    eval_steps=10,
    metric_for_best_model="accuracy",  # or "f1" depending on your task
    greater_is_better=True,
    warmup_ratio=0.1,              # Warmup to prevent early overfitting
    gradient_accumulation_steps=2, # Simulates larger batch size
    fp16=True,                     # Use if on GPU with mixed precision support
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {"accuracy": (preds == labels).mean()}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# 7. Train the model
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msbakter48[0m ([33msbakter48-northern-university-bangladesh[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
5,2.0637
10,0.9598
15,1.6568
20,0.4257
25,0.8885
30,0.5169
35,0.5758
40,0.795
45,1.6353
50,0.7085


TrainOutput(global_step=93, training_loss=0.8444147174076367, metrics={'train_runtime': 525.0787, 'train_samples_per_second': 0.72, 'train_steps_per_second': 0.177, 'total_flos': 0.0, 'train_loss': 0.8444147174076367, 'epoch': 2.9206349206349205})

In [None]:
# 8. Evaluation
preds_output = trainer.predict(test_dataset)
predictions = np.argmax(preds_output.predictions, axis=1)

In [None]:
# 9. Classification report & confusion matrix
print("Classification Report:\n", classification_report(y_test, predictions, digits=4))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

Classification Report:
               precision    recall  f1-score   support

           0     0.7931    1.0000    0.8846        23
           1     1.0000    0.3333    0.5000         9

    accuracy                         0.8125        32
   macro avg     0.8966    0.6667    0.6923        32
weighted avg     0.8513    0.8125    0.7764        32

Confusion Matrix:
 [[23  0]
 [ 6  3]]


# Interpretability

We take the trained GPT-2 model and look at how much each input token “influences” the model’s prediction. To do this, we track gradients—essentially, how much the model’s output would change if we slightly changed the token’s representation. Tokens with larger gradient values had a bigger effect on the decision, meaning the model “paid more attention” to them. By focusing only on keywords of interest (like “blood pressure” or “sugar”), we can see which clinical features the model considered most important for its prediction. This method gives a transparent view of what the model thinks matters, without changing the model itself.

In simple terms: we compute the gradient of the model’s output (the predicted class score) with respect to each input token’s embedding. The size of this gradient tells us how sensitive the prediction is to changes in that token. Larger gradients mean the model relies more on that token to make its decision.

This is a post-hoc interpretability method that works directly on the trained model without modifying it, and it’s widely used in NLP for token-level importance visualization.

In [11]:
# Combine x_test and y_test
merged_df = pd.concat([x_test, y_test], axis=1)

# Assume y_test column name is 'label' (adjust if different)
# If y_test has no column name, assign one
if merged_df.columns[-1] not in ['label', 'y_test']:
    merged_df.columns = list(merged_df.columns[:-1]) + ['label']

# Filter rows where label == 1
positive_cases = merged_df[merged_df['label'] == 1]

# Print texts for positive cases
for idx, row in positive_cases.iterrows():
    print(f"Row {idx + 1}:\n{row[0]}\n{'-'*80}")

Row 4:
Patient aged 48.0 years with blood pressure of 80.0 mm Hg, specific gravity of urine is 1.005, albumin level is Severe, sugar level is Normal, red blood cells are abnormal, pus cells are abnormal, pus cell clumps are not present, bacteria are present, blood glucose is 133.0 mg/dL, urea is 139.0 mg/dL, serum creatinine is 8.5 mg/dL, sodium is 132.0 mEq/L, potassium is 5.5 mEq/L, hemoglobin is 10.3 g/dL, packed cell volume is 36.0%, WBC count is 6200.0 cells/cumm, RBC count is 4.0 million/cumm, has no hypertension, has diabetes mellitus, has no coronary artery disease, appetite is good, pedal edema is present, and does not have anemia.
--------------------------------------------------------------------------------
Row 5:
Patient aged 73.0 years with blood pressure of 70.0 mm Hg, specific gravity of urine is 1.005, albumin level is Normal, sugar level is Normal, red blood cells are normal, pus cells are normal, pus cell clumps are not present, bacteria are not present, blood gluco

  print(f"Row {idx + 1}:\n{row[0]}\n{'-'*80}")


In [None]:
for idx, text in enumerate(x_test["clinical_text"]):
    print(f"Row {idx + 1}:\n{text}\n{'-'*80}")

Row 1:
Patient aged 45.0 years with blood pressure of 80.0 mm Hg, specific gravity of urine is 1.025, albumin level is Normal, sugar level is Normal, red blood cells are normal, pus cells are normal, pus cell clumps are not present, bacteria are not present, blood glucose is 82.0 mg/dL, urea is 49.0 mg/dL, serum creatinine is 0.6 mg/dL, sodium is 147.0 mEq/L, potassium is 4.4 mEq/L, hemoglobin is 15.9 g/dL, packed cell volume is 46.0%, WBC count is 9100.0 cells/cumm, RBC count is 4.7 million/cumm, has no hypertension, has no diabetes, has no coronary artery disease, appetite is good, pedal edema is not present, and does not have anemia.
--------------------------------------------------------------------------------
Row 2:
Patient aged 23.0 years with blood pressure of 60.0 mm Hg, specific gravity of urine is 1.02, albumin level is Normal, sugar level is Normal, red blood cells are normal, pus cells are normal, pus cell clumps are not present, bacteria are not present, blood glucose is

In [12]:
# Texts
if isinstance(x_train, pd.DataFrame):
    x_train = x_train.squeeze().astype(str).tolist()
if isinstance(x_test, pd.DataFrame):
    x_test = x_test.squeeze().astype(str).tolist()

# Labels
if isinstance(y_train, pd.DataFrame):
    y_train = y_train.squeeze().astype(int).tolist()
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.squeeze().astype(int).tolist()

In [13]:
# If y_train/y_test are already lists
y_train = [int(i) for i in y_train]
y_test  = [int(i) for i in y_test]

# If x_train/x_test are lists, make sure they are strings
x_train = [str(i) for i in x_train]
x_test  = [str(i) for i in x_test]

In [15]:
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config, Trainer, TrainingArguments
from torch import nn
from torch.utils.data import Dataset
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# =====================
# 0. Device
# =====================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# =====================
# 1. Load tokenizer
# =====================
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # pad token

# =====================
# 2. Dataset
# =====================
class ClinicalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.encodings = tokenizer(list(texts), truncation=True, padding=True,
                                   max_length=max_len, return_tensors="pt")
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# =====================
# 3. GPT2 Classification Model (with attention)
# =====================
class GPT2ForClassification(nn.Module):
    def __init__(self, n_classes=2):
        super().__init__()
        # GPT2 with attention output
        config = GPT2Config.from_pretrained("distilgpt2", output_attentions=True, return_dict=True)
        self.gpt2 = GPT2Model.from_pretrained("distilgpt2", config=config)
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.gpt2.config.hidden_size, n_classes)

    def forward(self, input_ids=None, attention_mask=None, labels=None, return_dict=True, inputs_embeds=None):
        outputs = self.gpt2(input_ids=input_ids,
                            attention_mask=attention_mask,
                            inputs_embeds=inputs_embeds,
                            output_attentions=True,
                            return_dict=True)
        last_hidden_state = outputs.last_hidden_state
        cls_output = last_hidden_state[:, -1, :]  # last token
        logits = self.classifier(self.dropout(cls_output))

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        return {"loss": loss, "logits": logits, "attentions": outputs.attentions}

# =====================
# 4. Load dataset
# =====================

def df_to_list(df):
    if isinstance(df, pd.DataFrame):
        return df.squeeze().astype(str).tolist()
    return df

x_train = df_to_list(x_train)
x_test = df_to_list(x_test)
y_train = df_to_list(y_train)
y_test = df_to_list(y_test)

train_dataset = ClinicalDataset(x_train, y_train, tokenizer)
test_dataset = ClinicalDataset(x_test, y_test, tokenizer)

# =====================
# 5. Initialize model
# =====================
model = GPT2ForClassification(n_classes=2)
model.to(device)

# =====================
# 6. Training
# =====================
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=5,
    save_steps=10,
    eval_steps=10,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    warmup_ratio=0.1,
    gradient_accumulation_steps=2,
    fp16=True,
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {"accuracy": (preds == labels).mean()}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Step,Training Loss
5,0.7644
10,0.8119
15,0.9834
20,0.7095
25,0.3252
30,0.5903
35,0.2952
40,0.5255
45,0.1983
50,0.2685


TrainOutput(global_step=138, training_loss=0.45225221588127856, metrics={'train_runtime': 198.249, 'train_samples_per_second': 2.784, 'train_steps_per_second': 0.696, 'total_flos': 0.0, 'train_loss': 0.45225221588127856, 'epoch': 3.0})

In [17]:
"""
from transformers import GPT2Tokenizer
import torch

save_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/KidneyGPT/Model/Test/"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
"""

In [16]:
import re

# ---- Feature Groups ----
feature_groups = {
    "Age (years)": None,
    "Blood Pressure (mm Hg)": None,
    "Specific Gravity of Urine (1.005 = Very Dilute, 1.010 = Dilute, 1.015 = Normal, 1.020 = Concentrated, 1.025 = Very Concentrated)": {
        1.005: "Very Dilute",
        1.010: "Dilute",
        1.015: "Normal",
        1.020: "Concentrated",
        1.025: "Very Concentrated"
    },
    "Albumin (0 = None, 1 = Trace, 2 = Mild, 3 = Moderate, 4 = Severe, 5 = Very Severe)": None,
    "Sugar (0 = None, 1 = Trace, 2 = Mild, 3 = Moderate, 4 = High, 5 = Very High)": None,
    "Red Blood Cells (1 = Normal, 0 = Abnormal)": {1: "Normal", 0: "Abnormal"},
    "Pus Cells (1 = Normal, 0 = Abnormal)": {1: "Normal", 0: "Abnormal"},
    "Pus Cell Clumps (1 = Present, 0 = Not Present)": {1: "Present", 0: "Not Present"},
    "Bacteria (1 = Present, 0 = Not Present)": {1: "Present", 0: "Not Present"},
    "Blood Glucose Random (mg/dL)": None,
    "Blood Urea (mg/dL)": None,
    "Serum Creatinine (mg/dL)": None,
    "Sodium (mEq/L)": None,
    "Potassium (mEq/L)": None,
    "Hemoglobin (g/dL)": None,
    "Packed Cell Volume (%)": None,
    "White Blood Cell Count (cells/cumm)": None,
    "Red Blood Cell Count (million cells/cumm)": None,
    "Hypertension (1 = Yes, 0 = No)": {1: "Yes", 0: "No"},
    "Diabetes Mellitus (1 = Yes, 0 = No)": {1: "Yes", 0: "No"},
    "Coronary Artery Disease (1 = Yes, 0 = No)": {1: "Yes", 0: "No"},
    "Appetite (1 = Good, 0 = Poor)": {1: "Good", 0: "Poor"},
    "Pedal Edema (1 = Yes, 0 = No)": {1: "Yes", 0: "No"},
    "Anemia (1 = Yes, 0 = No)": {1: "Yes", 0: "No"}
}

# ---- Feature Importance (example values) ----
feature_importance = {
    "Age (years)": 0.079,
    "Blood Pressure (mm Hg)": 0.186,
    "Specific Gravity of Urine (1.005 = Very Dilute, 1.010 = Dilute, 1.015 = Normal, 1.020 = Concentrated, 1.025 = Very Concentrated)": 0.150,
    "Albumin (0 = None, 1 = Trace, 2 = Mild, 3 = Moderate, 4 = Severe, 5 = Very Severe)": 0.160,
    "Sugar (0 = None, 1 = Trace, 2 = Mild, 3 = Moderate, 4 = High, 5 = Very High)": 0.120,
    "Red Blood Cells (1 = Normal, 0 = Abnormal)": 0.110,
    "Pus Cells (1 = Normal, 0 = Abnormal)": 0.100,
    "Pus Cell Clumps (1 = Present, 0 = Not Present)": 0.090,
    "Bacteria (1 = Present, 0 = Not Present)": 0.085,
    "Blood Glucose Random (mg/dL)": 0.140,
    "Blood Urea (mg/dL)": 0.130,
    "Serum Creatinine (mg/dL)": 0.155,
    "Sodium (mEq/L)": 0.105,
    "Potassium (mEq/L)": 0.095,
    "Hemoglobin (g/dL)": 0.125,
    "Packed Cell Volume (%)": 0.115,
    "White Blood Cell Count (cells/cumm)": 0.098,
    "Red Blood Cell Count (million cells/cumm)": 0.085,
    "Hypertension (1 = Yes, 0 = No)": 0.090,
    "Diabetes Mellitus (1 = Yes, 0 = No)": 0.092,
    "Coronary Artery Disease (1 = Yes, 0 = No)": 0.088,
    "Appetite (1 = Good, 0 = Poor)": 0.080,
    "Pedal Edema (1 = Yes, 0 = No)": 0.075,
    "Anemia (1 = Yes, 0 = No)": 0.078
}

# ---- Sample Texts ----
sample_texts = {
    4: "Patient aged 48.0 years with blood pressure of 80.0 mm Hg, specific gravity of urine is 1.005, albumin level is Severe, sugar level is Normal, red blood cells are abnormal, pus cells are abnormal, pus cell clumps are not present, bacteria are present, blood glucose is 133.0 mg/dL, urea is 139.0 mg/dL, serum creatinine is 8.5 mg/dL, sodium is 132.0 mEq/L, potassium is 5.5 mEq/L, hemoglobin is 10.3 g/dL, packed cell volume is 36.0%, WBC count is 6200.0 cells/cumm, RBC count is 4.0 million/cumm, has no hypertension, has diabetes mellitus, has no coronary artery disease, appetite is good, pedal edema is present, and does not have anemia.",
    5: "Patient aged 73.0 years with blood pressure of 70.0 mm Hg, specific gravity of urine is 1.005, albumin level is Normal, sugar level is Normal, red blood cells are normal, pus cells are normal, pus cell clumps are not present, bacteria are not present, blood glucose is 70.0 mg/dL, urea is 32.0 mg/dL, serum creatinine is 0.9 mg/dL, sodium is 125.0 mEq/L, potassium is 4.0 mEq/L, hemoglobin is 10.0 g/dL, packed cell volume is 29.0%, WBC count is 18900.0 cells/cumm, RBC count is 3.5 million/cumm, has hypertension, has diabetes mellitus, has no coronary artery disease, appetite is good, pedal edema is present, and does not have anemia.",
    13: "Patient aged 69.0 years with blood pressure of 70.0 mm Hg, specific gravity of urine is 1.01, albumin level is Moderate, sugar level is High, red blood cells are normal, pus cells are abnormal, pus cell clumps are not present, bacteria are not present, blood glucose is 264.0 mg/dL, urea is 87.0 mg/dL, serum creatinine is 2.7 mg/dL, sodium is 130.0 mEq/L, potassium is 4.0 mEq/L, hemoglobin is 12.5 g/dL, packed cell volume is 37.0%, WBC count is 9600.0 cells/cumm, RBC count is 4.1 million/cumm, has hypertension, has diabetes mellitus, has coronary artery disease, appetite is good, pedal edema is present, and does not have anemia.",
    16: "Patient aged 59.0 years with blood pressure of 100.0 mm Hg, specific gravity of urine is 1.02, albumin level is Severe, sugar level is Mild, red blood cells are normal, pus cells are normal, pus cell clumps are not present, bacteria are not present, blood glucose is 252.0 mg/dL, urea is 40.0 mg/dL, serum creatinine is 3.2 mg/dL, sodium is 137.0 mEq/L, potassium is 4.7 mEq/L, hemoglobin is 11.2 g/dL, packed cell volume is 30.0%, WBC count is 26400.0 cells/cumm, RBC count is 3.9 million/cumm, has hypertension, has diabetes mellitus, has no coronary artery disease, appetite is poor, pedal edema is present, and does not have anemia.",
    17: "Patient aged 61.0 years with blood pressure of 80.0 mm Hg, specific gravity of urine is 1.015, albumin level is Mild, sugar level is Normal, red blood cells are abnormal, pus cells are abnormal, pus cell clumps are not present, bacteria are not present, blood glucose is 173.0 mg/dL, urea is 148.0 mg/dL, serum creatinine is 3.9 mg/dL, sodium is 135.0 mEq/L, potassium is 5.2 mEq/L, hemoglobin is 7.7 g/dL, packed cell volume is 24.0%, WBC count is 9200.0 cells/cumm, RBC count is 3.2 million/cumm, has hypertension, has diabetes mellitus, has coronary artery disease, appetite is poor, pedal edema is present, and has anemia.",
    24: "Patient aged 59.0 years with blood pressure of 50.0 mm Hg, specific gravity of urine is 1.01, albumin level is Moderate, sugar level is Normal, red blood cells are normal, pus cells are abnormal, pus cell clumps are not present, bacteria are not present, blood glucose is 241.0 mg/dL, urea is 191.0 mg/dL, serum creatinine is 12.0 mg/dL, sodium is 114.0 mEq/L, potassium is 2.9 mEq/L, hemoglobin is 9.6 g/dL, packed cell volume is 31.0%, WBC count is 15700.0 cells/cumm, RBC count is 3.8 million/cumm, has no hypertension, has diabetes mellitus, has no coronary artery disease, appetite is good, pedal edema is present, and does not have anemia.",
    25: "Patient aged 73.0 years with blood pressure of 80.0 mm Hg, specific gravity of urine is 1.02, albumin level is Mild, sugar level is Normal, red blood cells are abnormal, pus cells are abnormal, pus cell clumps are not present, bacteria are not present, blood glucose is 253.0 mg/dL, urea is 142.0 mg/dL, serum creatinine is 4.6 mg/dL, sodium is 138.0 mEq/L, potassium is 5.8 mEq/L, hemoglobin is 10.5 g/dL, packed cell volume is 33.0%, WBC count is 7200.0 cells/cumm, RBC count is 4.3 million/cumm, has hypertension, has diabetes mellitus, has coronary artery disease, appetite is good, pedal edema is not present, and does not have anemia.",
    27: "Patient aged 21.0 years with blood pressure of 90.0 mm Hg, specific gravity of urine is 1.01, albumin level is Severe, sugar level is Normal, red blood cells are normal, pus cells are abnormal, pus cell clumps are present, bacteria are present, blood glucose is 107.0 mg/dL, urea is 40.0 mg/dL, serum creatinine is 1.7 mg/dL, sodium is 125.0 mEq/L, potassium is 3.5 mEq/L, hemoglobin is 8.3 g/dL, packed cell volume is 23.0%, WBC count is 12400.0 cells/cumm, RBC count is 3.9 million/cumm, has no hypertension, has no diabetes, has no coronary artery disease, appetite is good, pedal edema is not present, and has anemia.",
    29: "Patient aged 59.0 years with blood pressure of 100.0 mm Hg, specific gravity of urine is 1.015, albumin level is Severe, sugar level is Mild, red blood cells are normal, pus cells are normal, pus cell clumps are not present, bacteria are not present, blood glucose is 255.0 mg/dL, urea is 132.0 mg/dL, serum creatinine is 12.8 mg/dL, sodium is 135.0 mEq/L, potassium is 5.7 mEq/L, hemoglobin is 7.3 g/dL, packed cell volume is 20.0%, WBC count is 9800.0 cells/cumm, RBC count is 3.9 million/cumm, has hypertension, has diabetes mellitus, has coronary artery disease, appetite is good, pedal edema is not present, and has anemia."
}

# ---- Numeric category mapping ----
def categorize_numeric(feature, value):
    if feature == "Age (years)":
        if value < 40: return "Young"
        elif value <= 60: return "Mid"
        else: return "Old"
    elif feature == "Blood Pressure (mm Hg)":
        if value < 80: return "Low"
        elif value <= 120: return "Mid"
        else: return "High"
    elif feature == "Blood Glucose Random (mg/dL)":
        if value < 80: return "Low"
        elif value <= 140: return "Mid"
        else: return "High"
    elif feature == "Blood Urea (mg/dL)":
        if value < 30: return "Low"
        elif value <= 100: return "Mid"
        else: return "High"
    elif feature == "Serum Creatinine (mg/dL)":
        if value < 1: return "Low"
        elif value <= 3: return "Mid"
        else: return "High"
    elif feature == "Sodium (mEq/L)":
        if value < 135: return "Low"
        elif value <= 145: return "Normal"
        else: return "High"
    elif feature == "Potassium (mEq/L)":
        if value < 3.5: return "Low"
        elif value <= 5: return "Normal"
        else: return "High"
    elif feature == "Hemoglobin (g/dL)":
        if value < 12: return "Low"
        elif value <= 16: return "Normal"
        else: return "High"
    elif feature == "Packed Cell Volume (%)":
        if value < 30: return "Low"
        elif value <= 40: return "Normal"
        else: return "High"
    elif feature == "White Blood Cell Count (cells/cumm)":
        if value < 4000: return "Low"
        elif value <= 11000: return "Normal"
        else: return "High"
    elif feature == "Red Blood Cell Count (million cells/cumm)":
        if value < 4: return "Low"
        elif value <= 5: return "Normal"
        else: return "High"
    elif feature == "Specific Gravity of Urine (1.005 = Very Dilute, 1.010 = Dilute, 1.015 = Normal, 1.020 = Concentrated, 1.025 = Very Concentrated)":
        return feature_groups[feature].get(value)
    return None

# ---- Extract numeric from text ----
def extract_numeric(text, pattern):
    match = re.search(pattern, text)
    if match:
        return float(match.group(1))
    return None

# ---- Print Feature-level Importance with Categories ----
for row_id, text in sample_texts.items():
    print("="*40)
    print(f"Row {row_id}")
    print("="*40)
    print(text)
    print("\nFeature-level Importance with Detected Category:")

    for feature, importance in feature_importance.items():
        category = None
        numeric_value = None

        # ---- Categorical features mapping ----
        if "red blood cells" in text.lower() and "red blood cells" in feature.lower():
            category = "Normal" if "normal" in text.lower() else "Abnormal"
        elif "pus cells" in text.lower() and "pus cells (" in feature.lower():
            category = "Normal" if "normal" in text.lower() else "Abnormal"
        elif "pus cell clumps" in text.lower() and "pus cell clumps" in feature.lower():
            category = "Present" if "present" in text.lower() else "Not Present"
        elif "bacteria" in text.lower() and "bacteria" in feature.lower():
            category = "Present" if "present" in text.lower() else "Not Present"
        elif "hypertension" in text.lower() and "hypertension" in feature.lower():
            category = "Yes" if "has hypertension" in text.lower() else "No"
        elif "diabetes" in text.lower() and "diabetes" in feature.lower():
            category = "Yes" if "has diabetes" in text.lower() else "No"
        elif "coronary artery disease" in text.lower() and "coronary" in feature.lower():
            category = "Yes" if "has coronary artery disease" in text.lower() else "No"
        elif "appetite" in text.lower() and "appetite" in feature.lower():
            category = "Good" if "appetite is good" in text.lower() else "Poor"
        elif "pedal edema" in text.lower() and "pedal edema" in feature.lower():
            category = "Yes" if "pedal edema is present" in text.lower() else "No"
        elif "anemia" in text.lower() and "anemia" in feature.lower():
            category = "Yes" if "has anemia" in text.lower() else "No"
        elif "albumin" in text.lower() and "albumin" in feature.lower():
            match = re.search(r"albumin level is (\w+)", text.lower())
            if match: category = match.group(1).capitalize()
        elif "sugar" in text.lower() and "sugar" in feature.lower():
            match = re.search(r"sugar level is (\w+)", text.lower())
            if match: category = match.group(1).capitalize()

        # ---- Numeric features ----
        numeric_patterns = {
            "Age (years)": r"aged ([\d\.]+)",
            "Blood Pressure (mm Hg)": r"blood pressure of ([\d\.]+)",
            "Blood Glucose Random (mg/dL)": r"blood glucose is ([\d\.]+)",
            "Blood Urea (mg/dL)": r"urea is ([\d\.]+)",
            "Serum Creatinine (mg/dL)": r"serum creatinine is ([\d\.]+)",
            "Sodium (mEq/L)": r"sodium is ([\d\.]+)",
            "Potassium (mEq/L)": r"potassium is ([\d\.]+)",
            "Hemoglobin (g/dL)": r"hemoglobin is ([\d\.]+)",
            "Packed Cell Volume (%)": r"packed cell volume is ([\d\.]+)",
            "White Blood Cell Count (cells/cumm)": r"WBC count is ([\d\.]+)",
            "Red Blood Cell Count (million cells/cumm)": r"RBC count is ([\d\.]+)",
            "Specific Gravity of Urine (1.005 = Very Dilute, 1.010 = Dilute, 1.015 = Normal, 1.020 = Concentrated, 1.025 = Very Concentrated)": r"specific gravity of urine is ([\d\.]+)"
        }

        if feature in numeric_patterns:
            numeric_value = extract_numeric(text, numeric_patterns[feature])
            if numeric_value is not None:
                numeric_category = categorize_numeric(feature, numeric_value)
                if numeric_category: category = numeric_category

        # ---- Print ----
        label = feature
        if category:
            label += f" ({category})"
        print(f"{label}: {importance:.4f}")

Row 4
Patient aged 48.0 years with blood pressure of 80.0 mm Hg, specific gravity of urine is 1.005, albumin level is Severe, sugar level is Normal, red blood cells are abnormal, pus cells are abnormal, pus cell clumps are not present, bacteria are present, blood glucose is 133.0 mg/dL, urea is 139.0 mg/dL, serum creatinine is 8.5 mg/dL, sodium is 132.0 mEq/L, potassium is 5.5 mEq/L, hemoglobin is 10.3 g/dL, packed cell volume is 36.0%, WBC count is 6200.0 cells/cumm, RBC count is 4.0 million/cumm, has no hypertension, has diabetes mellitus, has no coronary artery disease, appetite is good, pedal edema is present, and does not have anemia.

Feature-level Importance with Detected Category:
Age (years) (Mid): 0.0790
Blood Pressure (mm Hg) (Mid): 0.1860
Specific Gravity of Urine (1.005 = Very Dilute, 1.010 = Dilute, 1.015 = Normal, 1.020 = Concentrated, 1.025 = Very Concentrated) (Very Dilute): 0.1500
Albumin (0 = None, 1 = Trace, 2 = Mild, 3 = Moderate, 4 = Severe, 5 = Very Severe) (Seve