In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import seaborn as sns
import plotly.express as px

%matplotlib inline

In [2]:
# Set DPI for fugures

plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300

In [3]:
# Set the default font size and weight
plt.rcParams['font.size'] = 30
plt.rcParams['font.weight'] = 'bold'

In [4]:
# Drive connection

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Dataset: UCI Heart Failure Clinical Records

In [None]:
# Dataset
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
heart_failure_clinical_records = fetch_ucirepo(id=519)

# data (as pandas dataframes)
X = heart_failure_clinical_records.data.features
y = heart_failure_clinical_records.data.targets

# metadata
print(heart_failure_clinical_records.metadata)

# variable information
print(heart_failure_clinical_records.variables)

{'uci_id': 519, 'name': 'Heart Failure Clinical Records', 'repository_url': 'https://archive.ics.uci.edu/dataset/519/heart+failure+clinical+records', 'data_url': 'https://archive.ics.uci.edu/static/public/519/data.csv', 'abstract': 'This dataset contains the medical records of 299 patients who had heart failure, collected during their follow-up period, where each patient profile has 13 clinical features.', 'area': 'Health and Medicine', 'tasks': ['Classification', 'Regression', 'Clustering'], 'characteristics': ['Multivariate'], 'num_instances': 299, 'num_features': 12, 'feature_types': ['Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['death_event'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2020, 'last_updated': 'Mon Feb 26 2024', 'dataset_doi': '10.24432/C5Z89R', 'creators': [], 'intro_paper': {'ID': 286, 'type': 'NATIVE', 'title': 'Machine learning can predict survival of patients with heart failure f

In [None]:
# Merge the features and targets into a single DataFrame
df = pd.concat([X, y], axis=1)

# Display the merged DataFrame
print(df.head())

    age  anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  \
0  75.0        0                       582         0                 20   
1  55.0        0                      7861         0                 38   
2  65.0        0                       146         0                 20   
3  50.0        1                       111         0                 20   
4  65.0        1                       160         1                 20   

   high_blood_pressure  platelets  serum_creatinine  serum_sodium  sex  \
0                    1  265000.00               1.9           130    1   
1                    0  263358.03               1.1           136    1   
2                    0  162000.00               1.3           129    1   
3                    0  210000.00               1.9           137    1   
4                    0  327000.00               2.7           116    0   

   smoking  time  death_event  
0        0     4            1  
1        0     6            1  
2       

In [None]:
# Size
df.shape

(299, 13)

In [None]:
# Null Values
df.isnull().sum()

Unnamed: 0,0
age,0
anaemia,0
creatinine_phosphokinase,0
diabetes,0
ejection_fraction,0
high_blood_pressure,0
platelets,0
serum_creatinine,0
serum_sodium,0
sex,0


In [None]:
# Size
df.shape

(299, 13)

In [None]:
# Count occurrences for each category in each feature
for column in df.columns:
    print(f"Counts for {column}:")
    print(df[column].value_counts())
    print()  # Add a newline for better readability

Counts for age:
age
60.000    33
50.000    27
65.000    26
70.000    25
45.000    19
55.000    17
75.000    11
58.000    10
53.000    10
63.000     8
80.000     7
72.000     7
42.000     7
40.000     7
85.000     6
68.000     5
62.000     5
52.000     5
61.000     4
49.000     4
73.000     4
51.000     4
59.000     4
69.000     3
46.000     3
82.000     3
90.000     3
64.000     3
48.000     2
78.000     2
95.000     2
57.000     2
54.000     2
77.000     2
67.000     2
66.000     2
44.000     2
60.667     2
87.000     1
79.000     1
41.000     1
94.000     1
86.000     1
81.000     1
43.000     1
47.000     1
56.000     1
Name: count, dtype: int64

Counts for anaemia:
anaemia
0    170
1    129
Name: count, dtype: int64

Counts for creatinine_phosphokinase:
creatinine_phosphokinase
582     47
129      4
66       4
47       3
60       3
        ..
190      1
103      1
1820     1
2060     1
2413     1
Name: count, Length: 208, dtype: int64

Counts for diabetes:
diabetes
0    174
1    12

In [None]:
# Data
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,death_event
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [None]:
"""
save_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/df.csv"

# Save to CSV
df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
"""

File saved to: /content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/df.csv


In [None]:
load_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/df.csv"

# Load the DataFrame
df = pd.read_csv(load_path)

# Preview
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,death_event
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [None]:
# Column Names
print(df.columns.tolist())

['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes', 'ejection_fraction', 'high_blood_pressure', 'platelets', 'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time', 'death_event']


In [None]:
df.rename(columns={
    "age": "Age (years)",
    "anaemia": "Anemia (1 = Yes, 0 = No)",
    "creatinine_phosphokinase": "Creatinine Phosphokinase (mcg/L)",
    "diabetes": "Diabetes Mellitus (1 = Yes, 0 = No)",
    "ejection_fraction": "Ejection Fraction (%)",
    "high_blood_pressure": "High Blood Pressure (1 = Yes, 0 = No)",
    "platelets": "Platelet Count (kiloplatelets/mL)",
    "serum_creatinine": "Serum Creatinine (mg/dL)",
    "serum_sodium": "Serum Sodium (mEq/L)",
    "sex": "Sex (1 = Male, 0 = Female)",
    "smoking": "Smoking Status (1 = Smoker, 0 = Non-Smoker)",
    "time": "Follow-up Time (days)",
    "death_event": "Death Event (1 = Deceased, 0 = Alive)"
}, inplace=True)

In [None]:
"""
save_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/Updated_Dataset/df.csv"

# Save to CSV
df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
"""

File saved to: /content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/Updated_Dataset/df.csv


In [None]:
load_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/Updated_Dataset/df.csv"

# Load the DataFrame
df = pd.read_csv(load_path)

# Preview
df.head()

Unnamed: 0,Age (years),"Anemia (1 = Yes, 0 = No)",Creatinine Phosphokinase (mcg/L),"Diabetes Mellitus (1 = Yes, 0 = No)",Ejection Fraction (%),"High Blood Pressure (1 = Yes, 0 = No)",Platelet Count (kiloplatelets/mL),Serum Creatinine (mg/dL),Serum Sodium (mEq/L),"Sex (1 = Male, 0 = Female)","Smoking Status (1 = Smoker, 0 = Non-Smoker)",Follow-up Time (days),"Death Event (1 = Deceased, 0 = Alive)"
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [None]:
# New Column Names
print(df.columns.tolist())

['Age (years)', 'Anemia (1 = Yes, 0 = No)', 'Creatinine Phosphokinase (mcg/L)', 'Diabetes Mellitus (1 = Yes, 0 = No)', 'Ejection Fraction (%)', 'High Blood Pressure (1 = Yes, 0 = No)', 'Platelet Count (kiloplatelets/mL)', 'Serum Creatinine (mg/dL)', 'Serum Sodium (mEq/L)', 'Sex (1 = Male, 0 = Female)', 'Smoking Status (1 = Smoker, 0 = Non-Smoker)', 'Follow-up Time (days)', 'Death Event (1 = Deceased, 0 = Alive)']


In [None]:
# Size
df.shape

(299, 13)

# Table to Clinical Text for GPT Model

In [None]:
# Separate features and target
input_features = df.drop(columns=["Death Event (1 = Deceased, 0 = Alive)"])
target = df["Death Event (1 = Deceased, 0 = Alive)"]

# Function to convert a row into a clinical-style sentence
def row_to_text(row):
    return (f"Patient is a {int(row['Age (years)'])}-year-old with an ejection fraction of {row['Ejection Fraction (%)']}%, "
            f"serum creatinine level of {row['Serum Creatinine (mg/dL)']} mg/dL, serum sodium level of {row['Serum Sodium (mEq/L)']} mEq/L, "
            f"platelets count of {row['Platelet Count (kiloplatelets/mL)']} per µL, creatinine phosphokinase level of {row['Creatinine Phosphokinase (mcg/L)']} IU/L, "
            f"and has {'high blood pressure' if row['High Blood Pressure (1 = Yes, 0 = No)'] == 1 else 'no high blood pressure'}. "
            f"The patient {'is diabetic' if row['Diabetes Mellitus (1 = Yes, 0 = No)'] == 1 else 'is not diabetic'}, "
            f"{'has anaemia' if row['Anemia (1 = Yes, 0 = No)'] == 1 else 'does not have anaemia'}, and "
            f"{'is a smoker' if row['Smoking Status (1 = Smoker, 0 = Non-Smoker)'] == 1 else 'does not smoke'}. "
            f"The follow-up duration is {row['Follow-up Time (days)']} days.")

# Apply function to all rows
df["clinical_text"] = input_features.apply(row_to_text, axis=1)

In [None]:
# Drop target column to get input features only
input_features = df.drop(columns=["Death Event (1 = Deceased, 0 = Alive)"])

# Apply the conversion function row-wise
df["clinical_text"] = input_features.apply(row_to_text, axis=1)

# View the converted text
print(df["clinical_text"].head())

0    Patient is a 75-year-old with an ejection frac...
1    Patient is a 55-year-old with an ejection frac...
2    Patient is a 65-year-old with an ejection frac...
3    Patient is a 50-year-old with an ejection frac...
4    Patient is a 65-year-old with an ejection frac...
Name: clinical_text, dtype: object


In [None]:
# As a list
clinical_texts = df["clinical_text"].tolist()

# Save to CSV for external use
df[["clinical_text"]].to_csv("converted_clinical_texts.csv", index=False)

In [None]:
# Preview with original data + clinical text
df_with_text = df.copy()
df_with_text["clinical_text"] = input_features.apply(row_to_text, axis=1)
df_with_text.head()

Unnamed: 0,Age (years),"Anemia (1 = Yes, 0 = No)",Creatinine Phosphokinase (mcg/L),"Diabetes Mellitus (1 = Yes, 0 = No)",Ejection Fraction (%),"High Blood Pressure (1 = Yes, 0 = No)",Platelet Count (kiloplatelets/mL),Serum Creatinine (mg/dL),Serum Sodium (mEq/L),"Sex (1 = Male, 0 = Female)","Smoking Status (1 = Smoker, 0 = Non-Smoker)",Follow-up Time (days),"Death Event (1 = Deceased, 0 = Alive)",clinical_text
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1,Patient is a 75-year-old with an ejection frac...
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1,Patient is a 55-year-old with an ejection frac...
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1,Patient is a 65-year-old with an ejection frac...
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1,Patient is a 50-year-old with an ejection frac...
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1,Patient is a 65-year-old with an ejection frac...


In [None]:
df_with_text.shape

(299, 14)

In [None]:
"""
save_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/Updated_Dataset/clinical_text_dataset.csv"

# Save to CSV
df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
"""

File saved to: /content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/Updated_Dataset/clinical_text_dataset.csv


In [None]:
load_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/Updated_Dataset/clinical_text_dataset.csv"

# Load the DataFrame
df = pd.read_csv(load_path)

# Preview
df.head()

Unnamed: 0,Age (years),"Anemia (1 = Yes, 0 = No)",Creatinine Phosphokinase (mcg/L),"Diabetes Mellitus (1 = Yes, 0 = No)",Ejection Fraction (%),"High Blood Pressure (1 = Yes, 0 = No)",Platelet Count (kiloplatelets/mL),Serum Creatinine (mg/dL),Serum Sodium (mEq/L),"Sex (1 = Male, 0 = Female)","Smoking Status (1 = Smoker, 0 = Non-Smoker)",Follow-up Time (days),"Death Event (1 = Deceased, 0 = Alive)",clinical_text
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1,Patient is a 75-year-old with an ejection frac...
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1,Patient is a 55-year-old with an ejection frac...
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1,Patient is a 65-year-old with an ejection frac...
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1,Patient is a 50-year-old with an ejection frac...
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1,Patient is a 65-year-old with an ejection frac...


In [None]:
for idx, text in enumerate(df["clinical_text"]):
    print(f"Row {idx + 1}:\n{text}\n{'-'*80}")

Row 1:
Patient is a 75-year-old with an ejection fraction of 20%, serum creatinine level of 1.9 mg/dL, serum sodium level of 130 mEq/L, platelets count of 265000.0 per µL, creatinine phosphokinase level of 582 IU/L, and has high blood pressure. The patient is not diabetic, does not have anaemia, and does not smoke. The follow-up duration is 4 days.
--------------------------------------------------------------------------------
Row 2:
Patient is a 55-year-old with an ejection fraction of 38%, serum creatinine level of 1.1 mg/dL, serum sodium level of 136 mEq/L, platelets count of 263358.03 per µL, creatinine phosphokinase level of 7861 IU/L, and has no high blood pressure. The patient is not diabetic, does not have anaemia, and does not smoke. The follow-up duration is 6 days.
--------------------------------------------------------------------------------
Row 3:
Patient is a 65-year-old with an ejection fraction of 20%, serum creatinine level of 1.3 mg/dL, serum sodium level of 129 mE

# Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Define x and y
x = df['clinical_text']
y = df['Death Event (1 = Deceased, 0 = Alive)']

# 80-20 train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.164, random_state=42, stratify=y)

In [None]:
# Load the DataFrame
x_train = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/UpData/Actual/x_train_updated.csv")
y_train = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/UpData/Actual/y_train_updated.csv")
x_test = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/UpData/Actual/x_test_cleaned.csv")
y_test = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/UpData/Actual/y_test_cleaned.csv")

In [None]:
# Train Shape
x_train.shape

(249, 1)

In [None]:
# Train Label Shape
y_train.shape

(249, 1)

In [None]:
# Train Count
print(y_train.value_counts())

Death Event (1 = Deceased, 0 = Alive)
0                                        162
1                                         87
Name: count, dtype: int64


In [None]:
# Test shape
x_test.shape

(50, 1)

In [None]:
# Test Label shape
y_test.shape

(50, 1)

In [None]:
# Test Count
print(y_test.value_counts())

Death Event (1 = Deceased, 0 = Alive)
0                                        41
1                                         9
Name: count, dtype: int64


### GPT Model (humarin/chatgpt_paraphraser_on_T5_base) for Synthetic Data and Paraphrasing (Optional)

In [None]:
# Merge x_train and y_train
df = pd.concat([x_train, y_train], axis=1)

### Positive classes

In [None]:
# Filter the rows where CKD == 1
hf_rows = df[df["Death Event (1 = Deceased, 0 = Alive)"] == 1]

# Print clinical texts for CKD patients only
for idx, text in enumerate(hf_rows["clinical_text"]):
    print(f"Row {idx + 1}:\n{text}\n{'-'*80}")

Row 1:
Patient is a 75-year-old with an ejection fraction of 20%, serum creatinine level of 1.9 mg/dL, serum sodium level of 130 mEq/L, platelets count of 265000.0 per µL, creatinine phosphokinase level of 582 IU/L, and has high blood pressure. The patient is not diabetic, does not have anaemia, and does not smoke. The follow-up duration is 4 days.
--------------------------------------------------------------------------------
Row 2:
Patient is a 45-year-old with an ejection fraction of 20%, serum creatinine level of 1.6 mg/dL, serum sodium level of 135 mEq/L, platelets count of 126000.0 per µL, creatinine phosphokinase level of 582 IU/L, and has high blood pressure. The patient is not diabetic, does not have anaemia, and does not smoke. The follow-up duration is 180 days.
--------------------------------------------------------------------------------
Row 3:
Patient is a 68-year-old with an ejection fraction of 25%, serum creatinine level of 1.0 mg/dL, serum sodium level of 138 mEq/L

In [None]:
df.head()

Unnamed: 0,clinical_text,"Death Event (1 = Deceased, 0 = Alive)"
0,Patient is a 58-year-old with an ejection frac...,0
1,Patient is a 53-year-old with an ejection frac...,0
2,Patient is a 75-year-old with an ejection frac...,1
3,Patient is a 64-year-old with an ejection frac...,0
4,Patient is a 45-year-old with an ejection frac...,1


In [None]:
import torch
from transformers import pipeline

# Step 1: Filter class 1 examples
df_class_1 = df[df["Death Event (1 = Deceased, 0 = Alive)"] == 1]
clinical_sentences_class_1 = df_class_1["clinical_text"].tolist()

# Step 2: Calculate how many paraphrases we need
num_class_0 = df[df["Death Event (1 = Deceased, 0 = Alive)"] == 0].shape[0]
num_class_1 = len(clinical_sentences_class_1)
num_needed = num_class_0 - num_class_1

# Step 3: Load GPT (or equivalent medically fine-tuned model for paraphrasing)
paraphrase_pipe = pipeline(
    "text2text-generation",
    model="humarin/chatgpt_paraphraser_on_T5_base",  # This is T5-based, tuned for paraphrasing
    device=0 if torch.cuda.is_available() else -1
)

# Step 4: Generate paraphrases
synthetic_paraphrases = []
num_generated = 0
i = 0

while num_generated < num_needed:
    sentence = clinical_sentences_class_1[i % num_class_1]
    prompt = f"Paraphrase medically accurately: {sentence}"
    result = paraphrase_pipe(prompt, max_length=512, num_return_sequences=1, do_sample=True)
    paraphrased_text = result[0]['generated_text']
    synthetic_paraphrases.append(paraphrased_text)
    num_generated += 1
    i += 1

# Step 5: Store separately in a DataFrame
df_synthetic = pd.DataFrame({
    "clinical_text": synthetic_paraphrases,
    "Death Event (1 = Deceased, 0 = Alive)": [1] * len(synthetic_paraphrases)
})

print(f"Generated {len(df_synthetic)} paraphrased class 1 samples.")

config.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cpu


Generated 85 paraphrased class 1 samples.


In [None]:
# Data
df_synthetic.head()

Unnamed: 0,clinical_text,"Death Event (1 = Deceased, 0 = Alive)"
0,"The patient is medically competent, aged 75 ye...",1
1,The individual's medical history is as follows...,1
2,The individual is 68 years old and has an ejec...,1
3,The medical experts are pleased with the follo...,1
4,"A 55-year-old individual with a 20% ejectorin,...",1


In [None]:
# Loading Synthetic Data
synthetic_KidD_Data = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/Updated_Dataset/synthetic/df_synthetic.csv", encoding='ISO-8859-1')

In [None]:
# Size
synthetic_KidD_Data.shape

(85, 2)

In [None]:
# Visualizing Rows
for idx, text in enumerate(synthetic_KidD_Data["clinical_text"]):
    print(f"Row {idx + 1}:\n{text}\n{'-'*80}")

Row 1:
The patient, aged 75, has a serum creatinine of 1.9 mg/dL and a sodium level of 130 mEq/L. Their ejection fraction is 20%, and creatinine phosphokinase is 582 IU/L. Platelets count is 265,000 per µL. They are hypertensive, not diabetic, have no anemia, and do not smoke. Follow-up duration is 4 days.
--------------------------------------------------------------------------------
Row 2:
The patient is 45 years old with a serum creatinine level of 1.6 mg/dL and serum sodium of 135 mEq/L. Platelets count is 126,000 per µL. Ejection fraction is 20%, and creatinine phosphokinase level is 582 IU/L. The patient has high blood pressure but is not diabetic, has no anemia, and does not smoke. Follow-up period is 180 days.
--------------------------------------------------------------------------------
Row 3:
The 68-year-old patient has a serum creatinine level of 1.0 mg/dL, serum sodium of 138 mEq/L, and platelets count of 166,000 per µL. Ejection fraction is 25%, and creatinine phosphoki

In [None]:
synthetic_KidD_Data.head()

Unnamed: 0,clinical_text,"Death Event (1 = Deceased, 0 = Alive)"
0,"The patient, aged 75, has a serum creatinine o...",1
1,The patient is 45 years old with a serum creat...,1
2,The 68-year-old patient has a serum creatinine...,1
3,The 45-year-old patient has high blood pressur...,1
4,The 55-year-old patient has a serum creatinine...,1


## After Augmentation

In [None]:
# Concatenating along rows (axis=0)

x_train = pd.concat(
    [ x_train[['clinical_text']],
      synthetic_KidD_Data[['clinical_text']] ],
    axis=0,
    ignore_index=True
)

y_train = pd.concat(
    [ y_train[['Death Event (1 = Deceased, 0 = Alive)']],
      synthetic_KidD_Data[['Death Event (1 = Deceased, 0 = Alive)']] ],
    axis=0,
    ignore_index=True
)

In [None]:
# Data
x_train.head()

Unnamed: 0,clinical_text
0,Patient is a 58-year-old with an ejection frac...
1,Patient is a 53-year-old with an ejection frac...
2,Patient is a 75-year-old with an ejection frac...
3,Patient is a 64-year-old with an ejection frac...
4,Patient is a 45-year-old with an ejection frac...


In [None]:
# Label
y_train.head()

Unnamed: 0,"Death Event (1 = Deceased, 0 = Alive)"
0,0
1,0
2,1
3,0
4,1


In [None]:
"""
# Save to CSV
x_train.to_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/UpData/final/x_train_updated.csv", index=False)
y_train.to_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/UpData/final/y_train_updated.csv", index=False)
"""

In [5]:
# Load the DataFrame
x_train = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/UpData/final/x_train_updated.csv")
y_train = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/UpData/final/y_train_updated.csv")
x_test = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/UpData/x_test_cleaned.csv")
y_test = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/UpData/y_test_cleaned.csv")

In [None]:
# Train Shape
x_train.shape

(334, 1)

In [None]:
# Train Label Shape
y_train.shape

(334, 1)

In [None]:
# # Train Count
print(y_train.value_counts())

Death Event (1 = Deceased, 0 = Alive)
1                                        172
0                                        162
Name: count, dtype: int64


In [None]:
# Test shape
x_test.shape

(50, 1)

In [None]:
# Test Label shape
y_test.shape

(50, 1)

In [None]:
# Test Count
print(y_test.value_counts())

Death Event (1 = Deceased, 0 = Alive)
0                                        41
1                                         9
Name: count, dtype: int64


# GPT as Classification Model

In [None]:
pip install --upgrade transformers



In [None]:
import transformers
print(transformers.__version__)

4.51.3


In [None]:
!pip install huggingface_hub[hf_xet]

Collecting hf-xet>=0.1.4 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Downloading hf_xet-1.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (54.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.0/54.0 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hf-xet
Successfully installed hf-xet-1.0.4


In [None]:
pip install transformers datasets scikit-learn

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [None]:
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config
from transformers import Trainer, TrainingArguments
from torch import nn
from torch.utils.data import Dataset
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Load tokenizer and GPT2
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # set pad token to eos_token

# 2. Custom Dataset
class ClinicalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=max_len, return_tensors='pt')
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# 3. Custom GPT2 Classification Model
class GPT2ForClassification(nn.Module):
    def __init__(self, n_classes=2):
        super(GPT2ForClassification, self).__init__()
        self.gpt2 = GPT2Model.from_pretrained("distilgpt2")
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.gpt2.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state  # [batch_size, seq_len, hidden_dim]
        cls_output = last_hidden_state[:, -1, :]  # use last token hidden state
        logits = self.classifier(self.dropout(cls_output))
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
        return {"loss": loss, "logits": logits}

# Texts
if isinstance(x_train, pd.DataFrame):
    x_train = x_train.squeeze().astype(str).tolist()
if isinstance(x_test, pd.DataFrame):
    x_test = x_test.squeeze().astype(str).tolist()

# Labels
if isinstance(y_train, pd.DataFrame):
    y_train = y_train.squeeze().astype(int).tolist()
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.squeeze().astype(int).tolist()

# 4. Prepare dataset
train_dataset = ClinicalDataset(x_train, y_train, tokenizer)
test_dataset = ClinicalDataset(x_test, y_test, tokenizer)

# 5. Load model
model = GPT2ForClassification()
model.to(device)

# 6. TrainingArguments and Trainer
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=8,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=5,
    save_steps=10,
    eval_steps=10,
    metric_for_best_model="accuracy",  # or "f1" depending on your task
    greater_is_better=True,
    warmup_ratio=0.1,              # Warmup to prevent early overfitting
    gradient_accumulation_steps=2, # Simulates larger batch size
    fp16=True,                     # Use if on GPU with mixed precision support
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {"accuracy": (preds == labels).mean()}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# 7. Train the model
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msbakter48[0m ([33msbakter48-northern-university-bangladesh[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
5,3.3747
10,3.7133
15,2.2932
20,2.0505
25,0.9076
30,0.8762
35,0.7155
40,0.6784
45,0.69
50,0.581


TrainOutput(global_step=664, training_loss=0.6860696696788522, metrics={'train_runtime': 3426.9377, 'train_samples_per_second': 0.78, 'train_steps_per_second': 0.194, 'total_flos': 0.0, 'train_loss': 0.6860696696788522, 'epoch': 7.910179640718563})

In [None]:
# 8. Evaluation
preds_output = trainer.predict(test_dataset)
predictions = np.argmax(preds_output.predictions, axis=1)

In [None]:
# 9. Classification report & confusion matrix
print("Classification Report:\n", classification_report(y_test, predictions, digits=4))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

Classification Report:
               precision    recall  f1-score   support

           0     1.0000    0.9512    0.9750        41
           1     0.8182    1.0000    0.9000         9

    accuracy                         0.9600        50
   macro avg     0.9091    0.9756    0.9375        50
weighted avg     0.9673    0.9600    0.9615        50

Confusion Matrix:
 [[39  2]
 [ 0  9]]


In [9]:
"""
save_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/Model/"

trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

print("Model and tokenizer saved to:", save_path)
"""

Model and tokenizer saved to: /content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/Model/


In [10]:
from transformers import GPT2Tokenizer
import torch

save_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/Model/"

# Reload tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(save_path)
tokenizer.pad_token = tokenizer.eos_token

# Reload model
model = GPT2ForClassification(n_classes=2)
state_dict = torch.load(save_path + "pytorch_model.bin", map_location="cuda" if torch.cuda.is_available() else "cpu")
model.load_state_dict(state_dict)
model.to(device)
model.eval()

print("Model loaded from:", save_path)

Model loaded from: /content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/Model/


## Ablation

In [None]:
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config
from transformers import Trainer, TrainingArguments
from torch import nn
from torch.utils.data import Dataset
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Load tokenizer and GPT2
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # set pad token to eos_token

# 2. Custom Dataset
class ClinicalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=max_len, return_tensors='pt')
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# 3. Custom GPT2 Classification Model
class GPT2ForClassification(nn.Module):
    def __init__(self, n_classes=2):
        super(GPT2ForClassification, self).__init__()
        self.gpt2 = GPT2Model.from_pretrained("distilgpt2")
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.gpt2.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state  # [batch_size, seq_len, hidden_dim]
        cls_output = last_hidden_state[:, -1, :]  # use last token hidden state
        logits = self.classifier(self.dropout(cls_output))
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
        return {"loss": loss, "logits": logits}

# Texts
if isinstance(x_train, pd.DataFrame):
    x_train = x_train.squeeze().astype(str).tolist()
if isinstance(x_test, pd.DataFrame):
    x_test = x_test.squeeze().astype(str).tolist()

# Labels
if isinstance(y_train, pd.DataFrame):
    y_train = y_train.squeeze().astype(int).tolist()
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.squeeze().astype(int).tolist()

# 4. Prepare dataset
train_dataset = ClinicalDataset(x_train, y_train, tokenizer)
test_dataset = ClinicalDataset(x_test, y_test, tokenizer)

# 5. Load model
model = GPT2ForClassification()
model.to(device)

# 6. TrainingArguments and Trainer
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.001,
    logging_dir="./logs",
    logging_steps=5,
    save_steps=10,
    eval_steps=10,
    metric_for_best_model="accuracy",  # or "f1" depending on your task
    greater_is_better=True,
    warmup_ratio=0.1,              # Warmup to prevent early overfitting
    gradient_accumulation_steps=2, # Simulates larger batch size
    fp16=True,                     # Use if on GPU with mixed precision support
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {"accuracy": (preds == labels).mean()}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# 7. Train the model
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msbakter48[0m ([33msbakter48-northern-university-bangladesh[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
5,2.3854
10,1.5268
15,1.4127
20,1.2517
25,1.1572
30,1.0853
35,1.0639
40,0.4454
45,0.9834
50,1.295


TrainOutput(global_step=186, training_loss=0.8315196857657484, metrics={'train_runtime': 794.9875, 'train_samples_per_second': 0.94, 'train_steps_per_second': 0.234, 'total_flos': 0.0, 'train_loss': 0.8315196857657484, 'epoch': 2.96})

In [None]:
# 8. Evaluation
preds_output = trainer.predict(test_dataset)
predictions = np.argmax(preds_output.predictions, axis=1)

In [None]:
# 9. Classification report & confusion matrix
print("Classification Report:\n", classification_report(y_test, predictions, digits=4))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

Classification Report:
               precision    recall  f1-score   support

           0     0.8333    0.9756    0.8989        41
           1     0.5000    0.1111    0.1818         9

    accuracy                         0.8200        50
   macro avg     0.6667    0.5434    0.5403        50
weighted avg     0.7733    0.8200    0.7698        50

Confusion Matrix:
 [[40  1]
 [ 8  1]]


# Interpretability

We take the trained GPT-2 model and look at how much each input token “influences” the model’s prediction. To do this, we track gradients—essentially, how much the model’s output would change if we slightly changed the token’s representation. Tokens with larger gradient values had a bigger effect on the decision, meaning the model “paid more attention” to them. By focusing only on keywords of interest (like “blood pressure” or “sugar”), we can see which clinical features the model considered most important for its prediction. This method gives a transparent view of what the model thinks matters, without changing the model itself.

In simple terms: we compute the gradient of the model’s output (the predicted class score) with respect to each input token’s embedding. The size of this gradient tells us how sensitive the prediction is to changes in that token. Larger gradients mean the model relies more on that token to make its decision.

This is a post-hoc interpretability method that works directly on the trained model without modifying it, and it’s widely used in NLP for token-level importance visualization.

In [6]:
for idx, text in enumerate(x_test.values):
    print(f"Row {idx + 1}:\n{text}\n{'-' * 80}")

Row 1:
['Patient is a 60-year-old with an ejection fraction of 30%, serum creatinine level of 1.0 mg/dL, serum sodium level of 137 mEq/L, platelets count of 150000.0 per µL, creatinine phosphokinase level of 257 IU/L, and has no high blood pressure. The patient is diabetic, has anaemia, and is a smoker. The follow-up duration is 245 days.']
--------------------------------------------------------------------------------
Row 2:
['Patient is a 60-year-old with an ejection fraction of 35%, serum creatinine level of 0.9 mg/dL, serum sodium level of 136 mEq/L, platelets count of 228000.0 per µL, creatinine phosphokinase level of 2261 IU/L, and has high blood pressure. The patient is not diabetic, does not have anaemia, and does not smoke. The follow-up duration is 115 days.']
--------------------------------------------------------------------------------
Row 3:
['Patient is a 60-year-old with an ejection fraction of 60%, serum creatinine level of 1.5 mg/dL, serum sodium level of 135 mEq/L,

In [7]:
# Texts
if isinstance(x_train, pd.DataFrame):
    x_train = x_train.squeeze().astype(str).tolist()
if isinstance(x_test, pd.DataFrame):
    x_test = x_test.squeeze().astype(str).tolist()

# Labels
if isinstance(y_train, pd.DataFrame):
    y_train = y_train.squeeze().astype(int).tolist()
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.squeeze().astype(int).tolist()

In [8]:
# If y_train/y_test are already lists
y_train = [int(i) for i in y_train]
y_test  = [int(i) for i in y_test]

# If x_train/x_test are lists, make sure they are strings
x_train = [str(i) for i in x_train]
x_test  = [str(i) for i in x_test]

In [9]:
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config, Trainer, TrainingArguments
from torch import nn
from torch.utils.data import Dataset
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# =====================
# 0. Device
# =====================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# =====================
# 1. Load tokenizer
# =====================
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # pad token

# =====================
# 2. Dataset
# =====================
class ClinicalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.encodings = tokenizer(list(texts), truncation=True, padding=True,
                                   max_length=max_len, return_tensors="pt")
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# =====================
# 3. GPT2 Classification Model (with attention)
# =====================
class GPT2ForClassification(nn.Module):
    def __init__(self, n_classes=2):
        super().__init__()
        # GPT2 with attention output
        config = GPT2Config.from_pretrained("distilgpt2", output_attentions=True, return_dict=True)
        self.gpt2 = GPT2Model.from_pretrained("distilgpt2", config=config)
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.gpt2.config.hidden_size, n_classes)

    def forward(self, input_ids=None, attention_mask=None, labels=None, return_dict=True, inputs_embeds=None):
        outputs = self.gpt2(input_ids=input_ids,
                            attention_mask=attention_mask,
                            inputs_embeds=inputs_embeds,
                            output_attentions=True,
                            return_dict=True)
        last_hidden_state = outputs.last_hidden_state
        cls_output = last_hidden_state[:, -1, :]  # last token
        logits = self.classifier(self.dropout(cls_output))

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        return {"loss": loss, "logits": logits, "attentions": outputs.attentions}

# =====================
# 4. Load dataset
# =====================

def df_to_list(df):
    if isinstance(df, pd.DataFrame):
        return df.squeeze().astype(str).tolist()
    return df

x_train = df_to_list(x_train)
x_test = df_to_list(x_test)
y_train = df_to_list(y_train)
y_test = df_to_list(y_test)

train_dataset = ClinicalDataset(x_train, y_train, tokenizer)
test_dataset = ClinicalDataset(x_test, y_test, tokenizer)

# =====================
# 5. Initialize model
# =====================
model = GPT2ForClassification(n_classes=2)
model.to(device)

# =====================
# 6. Training
# =====================
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=5,
    save_steps=10,
    eval_steps=10,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    warmup_ratio=0.1,
    gradient_accumulation_steps=2,
    fp16=True,
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {"accuracy": (preds == labels).mean()}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mstarwarsfanclub1234[0m ([33mstarwarsfanclub1234-montclair-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
5,1.9125
10,1.7533
15,1.1906
20,1.4179
25,0.9666
30,0.6919
35,0.5217
40,0.9393
45,0.6135
50,0.4186


TrainOutput(global_step=252, training_loss=0.23760787373970424, metrics={'train_runtime': 412.5463, 'train_samples_per_second': 2.429, 'train_steps_per_second': 0.611, 'total_flos': 0.0, 'train_loss': 0.23760787373970424, 'epoch': 3.0})

In [10]:
"""
from transformers import GPT2Tokenizer
import torch

save_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/HeartFailureGPT/Model/Test/"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
"""

In [11]:
# Sample clinical sentence
sample_text = ("Patient is a 65-year-old with an ejection fraction of 25%, serum creatinine level of 1.3 mg/dL, serum sodium level of 137 mEq/L, platelets count of 276000.0 per µL, creatinine phosphokinase level of 52 IU/L, and has high blood pressure. The patient is not diabetic, has anaemia, and does not smoke. The follow-up duration is 16 days.")

# Tokenize
inputs = tokenizer(
    sample_text,
    return_tensors="pt",
    truncation=True,
    padding=True
).to(device)

print("Input IDs shape:", inputs["input_ids"].shape)
print("Attention mask shape:", inputs["attention_mask"].shape)

Input IDs shape: torch.Size([1, 96])
Attention mask shape: torch.Size([1, 96])


In [12]:
# Forward pass
with torch.no_grad():
    outputs = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"]
    )

logits = outputs["logits"]
attentions = outputs["attentions"]  # tuple of attention matrices per layer
predicted_class = logits.argmax(dim=-1).item()

print("Predicted class:", predicted_class)
print("Number of attention layers:", len(attentions))
print("Shape of first layer attentions:", attentions[0].shape)  # [batch, heads, seq_len, seq_len]

Predicted class: 1
Number of attention layers: 6
Shape of first layer attentions: torch.Size([1, 12, 96, 96])


In [13]:
# =====================
# Updated Sample clinical text
# =====================
sample_text = (
    "Patient is a 65-year-old with an ejection fraction of 25%, "
    "serum creatinine level of 1.3 mg/dL, serum sodium level of 137 mEq/L, "
    "platelets count of 276000.0 per µL, creatinine phosphokinase level of 52 IU/L, "
    "and has high blood pressure. The patient is not diabetic, has anaemia, "
    "and does not smoke. The follow-up duration is 16 days."
)

# =====================
# keywords of interest for this text
# =====================
key_words = [
    "age", "ejection", "fraction", "creatinine", "sodium", "platelets",
    "creatinine", "phosphokinase", "blood", "pressure", "diabetic",
    "anaemia", "smoke", "follow-up"
]

# =====================
# Tokenize
# =====================
inputs = tokenizer(sample_text, return_tensors="pt", truncation=True, padding=True).to(device)
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

# =====================
# Forward pass with embedding gradient tracking
# =====================
embed_layer = model.gpt2.wte
embeds = embed_layer(inputs["input_ids"])
embeds.retain_grad()
embeds.requires_grad_(True)

outputs = model(input_ids=None, attention_mask=inputs["attention_mask"], inputs_embeds=embeds)
cls_index = outputs["logits"].argmax(dim=-1).item()
score = outputs["logits"][0, cls_index]

model.zero_grad()
score.backward()

# =====================
# Compute token importance
# =====================
grads = embeds.grad[0]  # (seq_len, embedding_dim)
token_importance = grads.norm(dim=-1).detach().cpu().numpy()

# =====================
# Map token saliency to keywords only
# =====================
token_scores = {}
for t, s in zip(tokens, token_importance):
    clean_token = t.replace("Ġ", "").lower()  # remove GPT2 whitespace token
    if any(k.lower() in clean_token for k in key_words):
        token_scores[t] = s

# =====================
# Print keyword-specific saliency
# =====================
print("Keyword-level gradient saliency:")
for t, s in token_scores.items():
    print(f"{t}: {s:.4f}")

Keyword-level gradient saliency:
Ġfraction: 0.2646
Ġsodium: 0.1962
Ġblood: 0.1620
Ġpressure: 0.2751
Ġdiabetic: 0.3275
Ġsmoke: 0.3621
