In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import seaborn as sns
import plotly.express as px

%matplotlib inline

In [2]:
# Set DPI for fugures

plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300

In [3]:
# Set the default font size and weight
plt.rcParams['font.size'] = 30
plt.rcParams['font.weight'] = 'bold'

In [4]:
# Drive connection

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Dataset: UCI Thyroid Cancer Recurrence

In [None]:
# Dataset
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
differentiated_thyroid_cancer_recurrence = fetch_ucirepo(id=915)

# data (as pandas dataframes)
X = differentiated_thyroid_cancer_recurrence.data.features
y = differentiated_thyroid_cancer_recurrence.data.targets

# metadata
print(differentiated_thyroid_cancer_recurrence.metadata)

# variable information
print(differentiated_thyroid_cancer_recurrence.variables)

{'uci_id': 915, 'name': 'Differentiated Thyroid Cancer Recurrence', 'repository_url': 'https://archive.ics.uci.edu/dataset/915/differentiated+thyroid+cancer+recurrence', 'data_url': 'https://archive.ics.uci.edu/static/public/915/data.csv', 'abstract': 'This data set contains 13 clinicopathologic features aiming to predict recurrence of well differentiated thyroid cancer. The data set was collected in duration of 15 years and each patient was followed for at least 10 years.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 383, 'num_features': 16, 'feature_types': ['Real', 'Categorical', 'Integer'], 'demographics': ['Age', 'Gender'], 'target_col': ['Recurred'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2023, 'last_updated': 'Wed Mar 20 2024', 'dataset_doi': '10.24432/C5632J', 'creators': ['Shiva Borzooei', 'Aidin Tarokhian'], 'intro_paper': {'ID': 333, 'type': 'NA

In [None]:
# Merge the features and targets into a single DataFrame
df = pd.concat([X, y], axis=1)

# Display the merged DataFrame
print(df.head())

   Age Gender Smoking Hx Smoking Hx Radiothreapy Thyroid Function  \
0   27      F      No         No              No        Euthyroid   
1   34      F      No        Yes              No        Euthyroid   
2   30      F      No         No              No        Euthyroid   
3   62      F      No         No              No        Euthyroid   
4   62      F      No         No              No        Euthyroid   

          Physical Examination Adenopathy       Pathology     Focality Risk  \
0   Single nodular goiter-left         No  Micropapillary    Uni-Focal  Low   
1          Multinodular goiter         No  Micropapillary    Uni-Focal  Low   
2  Single nodular goiter-right         No  Micropapillary    Uni-Focal  Low   
3  Single nodular goiter-right         No  Micropapillary    Uni-Focal  Low   
4          Multinodular goiter         No  Micropapillary  Multi-Focal  Low   

     T   N   M Stage       Response Recurred  
0  T1a  N0  M0     I  Indeterminate       No  
1  T1a  N0  M0  

In [None]:
# Size
df.shape

(383, 17)

In [None]:
# Null Values
df.isnull().sum()

Unnamed: 0,0
Age,0
Gender,0
Smoking,0
Hx Smoking,0
Hx Radiothreapy,0
Thyroid Function,0
Physical Examination,0
Adenopathy,0
Pathology,0
Focality,0


In [None]:
# Size
df.shape

(383, 17)

In [None]:
# Count occurrences for each category in each feature
for column in df.columns:
    print(f"Counts for {column}:")
    print(df[column].value_counts())
    print()  # Add a newline for better readability

Counts for Age:
Age
31    22
27    13
30    12
33    12
40    12
      ..
79     1
15     1
82     1
64     1
78     1
Name: count, Length: 65, dtype: int64

Counts for Gender:
Gender
F    312
M     71
Name: count, dtype: int64

Counts for Smoking:
Smoking
No     334
Yes     49
Name: count, dtype: int64

Counts for Hx Smoking:
Hx Smoking
No     355
Yes     28
Name: count, dtype: int64

Counts for Hx Radiothreapy:
Hx Radiothreapy
No     376
Yes      7
Name: count, dtype: int64

Counts for Thyroid Function:
Thyroid Function
Euthyroid                      332
Clinical Hyperthyroidism        20
Subclinical Hypothyroidism      14
Clinical Hypothyroidism         12
Subclinical Hyperthyroidism      5
Name: count, dtype: int64

Counts for Physical Examination:
Physical Examination
Multinodular goiter            140
Single nodular goiter-right    140
Single nodular goiter-left      89
Normal                           7
Diffuse goiter                   7
Name: count, dtype: int64

Counts for Ade

In [None]:
# Data
df.head()

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,F,No,No,No,Euthyroid,Single nodular goiter-left,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Indeterminate,No
1,34,F,No,Yes,No,Euthyroid,Multinodular goiter,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
2,30,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
3,62,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
4,62,F,No,No,No,Euthyroid,Multinodular goiter,No,Micropapillary,Multi-Focal,Low,T1a,N0,M0,I,Excellent,No


In [None]:
"""
save_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/df.csv"

# Save to CSV
df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
"""

File saved to: /content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/df.csv


In [None]:
load_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/df.csv"

# Load the DataFrame
df = pd.read_csv(load_path)

# Preview
df.head()

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
0,27,F,No,No,No,Euthyroid,Single nodular goiter-left,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Indeterminate,No
1,34,F,No,Yes,No,Euthyroid,Multinodular goiter,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
2,30,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
3,62,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Micropapillary,Uni-Focal,Low,T1a,N0,M0,I,Excellent,No
4,62,F,No,No,No,Euthyroid,Multinodular goiter,No,Micropapillary,Multi-Focal,Low,T1a,N0,M0,I,Excellent,No


In [None]:
# Column Names
print(df.columns.tolist())

['Age', 'Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy', 'Thyroid Function', 'Physical Examination', 'Adenopathy', 'Pathology', 'Focality', 'Risk', 'T', 'N', 'M', 'Stage', 'Response', 'Recurred']


In [None]:
# Binary mappings
binary_maps = {
    "Gender": {"F": 0, "M": 1},
    "Smoking": {"No": 0, "Yes": 1},
    "Hx Smoking": {"No": 0, "Yes": 1},
    "Hx Radiothreapy": {"No": 0, "Yes": 1},
    "Focality": {"Uni-Focal": 0, "Multi-Focal": 1},
    "M": {"M0": 0, "M1": 1},
    "Recurred": {"No": 0, "Yes": 1}
}

# Multi-category mappings
thyroid_map = {
    "Euthyroid": 0,
    "Subclinical Hyperthyroidism": 1,
    "Clinical Hyperthyroidism": 2,
    "Subclinical Hypothyroidism": 3,
    "Clinical Hypothyroidism": 4
}

exam_map = {
    "Normal": 0,
    "Single nodular goiter-right": 1,
    "Single nodular goiter-left": 2,
    "Multinodular goiter": 3,
    "Diffuse goiter": 4
}

adenopathy_map = {
    "No": 0,
    "Left": 1,
    "Right": 2,
    "Bilateral": 3,
    "Extensive": 4,
    "Posterior": 5
}

pathology_map = {
    "Papillary": 0,
    "Micropapillary": 1,
    "Follicular": 2,
    "Hurthel cell": 3
}

risk_map = {"Low": 0, "Intermediate": 1, "High": 2}

t_map = {"T1a": 0, "T1b": 1, "T2": 2, "T3a": 3, "T3b": 4, "T4a": 5, "T4b": 6}

n_map = {"N0": 0, "N1a": 1, "N1b": 2}

stage_map = {"I": 0, "II": 1, "III": 2, "IVA": 3, "IVB": 4}

response_map = {
    "Excellent": 0,
    "Indeterminate": 1,
    "Biochemical Incomplete": 2,
    "Structural Incomplete": 3
}

# Apply all mappings
for col, mapping in binary_maps.items():
    df[col] = df[col].map(mapping)

df["Thyroid Function"] = df["Thyroid Function"].map(thyroid_map)
df["Physical Examination"] = df["Physical Examination"].map(exam_map)
df["Adenopathy"] = df["Adenopathy"].map(adenopathy_map)
df["Pathology"] = df["Pathology"].map(pathology_map)
df["Risk"] = df["Risk"].map(risk_map)
df["T"] = df["T"].map(t_map)
df["N"] = df["N"].map(n_map)
df["Stage"] = df["Stage"].map(stage_map)
df["Response"] = df["Response"].map(response_map)

In [None]:
df.rename(columns={

    "Age": "Age (years)",

    "Gender": "Gender (0 = Female, 1 = Male)",

    "Smoking": "Smoking Status (0 = No, 1 = Yes)",

    "Hx Smoking": "History of Smoking (0 = No, 1 = Yes)",

    "Hx Radiothreapy": "History of Radiotherapy (0 = No, 1 = Yes)",

    "Thyroid Function": "Thyroid Function (0 = Euthyroid, 1 = Subclinical Hyperthyroidism, 2 = Clinical Hyperthyroidism, 3 = Subclinical Hypothyroidism, 4 = Clinical Hypothyroidism)",

    "Physical Examination": "Physical Examination (0 = Normal, 1 = Single nodular goiter-right, 2 = Single nodular goiter-left, 3 = Multinodular goiter, 4 = Diffuse goiter)",

    "Adenopathy": "Adenopathy (0 = No, 1 = Left, 2 = Right, 3 = Bilateral, 4 = Extensive, 5 = Posterior)",

    "Pathology": "Pathology Type (0 = Papillary, 1 = Micropapillary, 2 = Follicular, 3 = Hurthel cell)",

    "Focality": "Tumor Focality (0 = Uni-Focal, 1 = Multi-Focal)",

    "Risk": "ATA Risk Stratification (0 = Low, 1 = Intermediate, 2 = High)",

    "T": "Tumor Size / Extent (T Stage: 0 = T1a, 1 = T1b, 2 = T2, 3 = T3a, 4 = T3b, 5 = T4a, 6 = T4b)",

    "N": "Lymph Node Involvement (N Stage: 0 = N0, 1 = N1a, 2 = N1b)",

    "M": "Metastasis Status (0 = M0, 1 = M1)",

    "Stage": "AJCC TNM Stage (0 = I, 1 = II, 2 = III, 3 = IVA, 4 = IVB)",

    "Response": "Response to Initial Therapy (0 = Excellent, 1 = Indeterminate, 2 = Biochemical Incomplete, 3 = Structural Incomplete)",

    "Recurred": "Disease Recurrence (0 = No, 1 = Yes)"

}, inplace=True)

In [None]:
"""
save_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/Updated_Dataset/df.csv"

# Save to CSV
df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
"""

File saved to: /content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/Updated_Dataset/df.csv


In [None]:
load_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/Updated_Dataset/df.csv"

# Load the DataFrame
df = pd.read_csv(load_path)

# Preview
df.head()

Unnamed: 0,Age (years),"Gender (0 = Female, 1 = Male)","Smoking Status (0 = No, 1 = Yes)","History of Smoking (0 = No, 1 = Yes)","History of Radiotherapy (0 = No, 1 = Yes)","Thyroid Function (0 = Euthyroid, 1 = Subclinical Hyperthyroidism, 2 = Clinical Hyperthyroidism, 3 = Subclinical Hypothyroidism, 4 = Clinical Hypothyroidism)","Physical Examination (0 = Normal, 1 = Single nodular goiter-right, 2 = Single nodular goiter-left, 3 = Multinodular goiter, 4 = Diffuse goiter)","Adenopathy (0 = No, 1 = Left, 2 = Right, 3 = Bilateral, 4 = Extensive, 5 = Posterior)","Pathology Type (0 = Papillary, 1 = Micropapillary, 2 = Follicular, 3 = Hurthel cell)","Tumor Focality (0 = Uni-Focal, 1 = Multi-Focal)","ATA Risk Stratification (0 = Low, 1 = Intermediate, 2 = High)","Tumor Size / Extent (T Stage: 0 = T1a, 1 = T1b, 2 = T2, 3 = T3a, 4 = T3b, 5 = T4a, 6 = T4b)","Lymph Node Involvement (N Stage: 0 = N0, 1 = N1a, 2 = N1b)","Metastasis Status (0 = M0, 1 = M1)","AJCC TNM Stage (0 = I, 1 = II, 2 = III, 3 = IVA, 4 = IVB)","Response to Initial Therapy (0 = Excellent, 1 = Indeterminate, 2 = Biochemical Incomplete, 3 = Structural Incomplete)","Disease Recurrence (0 = No, 1 = Yes)"
0,27,0,0,0,0,0,2,0,1,0,0,0,0,0,0,1,0
1,34,0,0,1,0,0,3,0,1,0,0,0,0,0,0,0,0
2,30,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
3,62,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
4,62,0,0,0,0,0,3,0,1,1,0,0,0,0,0,0,0


In [None]:
# New Column Names
print(df.columns.tolist())

['Age (years)', 'Gender (0 = Female, 1 = Male)', 'Smoking Status (0 = No, 1 = Yes)', 'History of Smoking (0 = No, 1 = Yes)', 'History of Radiotherapy (0 = No, 1 = Yes)', 'Thyroid Function (0 = Euthyroid, 1 = Subclinical Hyperthyroidism, 2 = Clinical Hyperthyroidism, 3 = Subclinical Hypothyroidism, 4 = Clinical Hypothyroidism)', 'Physical Examination (0 = Normal, 1 = Single nodular goiter-right, 2 = Single nodular goiter-left, 3 = Multinodular goiter, 4 = Diffuse goiter)', 'Adenopathy (0 = No, 1 = Left, 2 = Right, 3 = Bilateral, 4 = Extensive, 5 = Posterior)', 'Pathology Type (0 = Papillary, 1 = Micropapillary, 2 = Follicular, 3 = Hurthel cell)', 'Tumor Focality (0 = Uni-Focal, 1 = Multi-Focal)', 'ATA Risk Stratification (0 = Low, 1 = Intermediate, 2 = High)', 'Tumor Size / Extent (T Stage: 0 = T1a, 1 = T1b, 2 = T2, 3 = T3a, 4 = T3b, 5 = T4a, 6 = T4b)', 'Lymph Node Involvement (N Stage: 0 = N0, 1 = N1a, 2 = N1b)', 'Metastasis Status (0 = M0, 1 = M1)', 'AJCC TNM Stage (0 = I, 1 = II, 2 =

In [None]:
# Size
df.shape

(383, 17)

# Table to Clinical Text for GPT Model

In [None]:
# Separate features and target
input_features = df.drop(columns=["Disease Recurrence (0 = No, 1 = Yes)"])
target = df["Disease Recurrence (0 = No, 1 = Yes)"]

# Function to convert each row into a clinical-style sentence
def row_to_text(row):
    return (
        f"Patient is a {int(row['Age (years)'])}-year-old "
        f"{'male' if row['Gender (0 = Female, 1 = Male)'] == 1 else 'female'} with "
        f"{'a history of smoking' if row['History of Smoking (0 = No, 1 = Yes)'] == 1 else 'no history of smoking'}, "
        f"{'a history of radiotherapy' if row['History of Radiotherapy (0 = No, 1 = Yes)'] == 1 else 'no history of radiotherapy'}. "
        f"The patient is {'a smoker' if row['Smoking Status (0 = No, 1 = Yes)'] == 1 else 'a non-smoker'}. "
        f"Thyroid function is classified as "
        f"{['Euthyroid', 'Subclinical Hyperthyroidism', 'Clinical Hyperthyroidism', 'Subclinical Hypothyroidism', 'Clinical Hypothyroidism'][int(row['Thyroid Function (0 = Euthyroid, 1 = Subclinical Hyperthyroidism, 2 = Clinical Hyperthyroidism, 3 = Subclinical Hypothyroidism, 4 = Clinical Hypothyroidism)'])]}. "
        f"Physical examination revealed "
        f"{['Normal', 'Single nodular goiter-right', 'Single nodular goiter-left', 'Multinodular goiter', 'Diffuse goiter'][int(row['Physical Examination (0 = Normal, 1 = Single nodular goiter-right, 2 = Single nodular goiter-left, 3 = Multinodular goiter, 4 = Diffuse goiter)'])]}. "
        f"Adenopathy is "
        f"{['absent', 'present on the left', 'present on the right', 'bilateral', 'extensive', 'posterior'][int(row['Adenopathy (0 = No, 1 = Left, 2 = Right, 3 = Bilateral, 4 = Extensive, 5 = Posterior)'])]}. "
        f"Pathology shows a "
        f"{['Papillary', 'Micropapillary', 'Follicular', 'Hurthel cell'][int(row['Pathology Type (0 = Papillary, 1 = Micropapillary, 2 = Follicular, 3 = Hurthel cell)'])]} carcinoma. "
        f"The tumor is "
        f"{['uni-focal', 'multi-focal'][int(row['Tumor Focality (0 = Uni-Focal, 1 = Multi-Focal)'])]} with "
        f"{['low', 'intermediate', 'high'][int(row['ATA Risk Stratification (0 = Low, 1 = Intermediate, 2 = High)'])]} ATA risk. "
        f"Tumor stage is "
        f"{['T1a', 'T1b', 'T2', 'T3a', 'T3b', 'T4a', 'T4b'][int(row['Tumor Size / Extent (T Stage: 0 = T1a, 1 = T1b, 2 = T2, 3 = T3a, 4 = T3b, 5 = T4a, 6 = T4b)'])]}, "
        f"N stage is "
        f"{['N0', 'N1a', 'N1b'][int(row['Lymph Node Involvement (N Stage: 0 = N0, 1 = N1a, 2 = N1b)'])]}, "
        f"and M stage is "
        f"{['M0', 'M1'][int(row['Metastasis Status (0 = M0, 1 = M1)'])]}. "
        f"The overall AJCC stage is "
        f"{['I', 'II', 'III', 'IVA', 'IVB'][int(row['AJCC TNM Stage (0 = I, 1 = II, 2 = III, 3 = IVA, 4 = IVB)'])]}. "
        f"Response to initial therapy is "
        f"{['Excellent', 'Indeterminate', 'Biochemical Incomplete', 'Structural Incomplete'][int(row['Response to Initial Therapy (0 = Excellent, 1 = Indeterminate, 2 = Biochemical Incomplete, 3 = Structural Incomplete)'])]}. "
        #f"Disease recurrence status is "
        # f"{'present' if row['Disease Recurrence (0 = No, 1 = Yes)'] == 1 else 'absent'}."
    )

# Apply to all rows
df["clinical_text"] = input_features.apply(row_to_text, axis=1)

In [None]:
# Drop target column to get input features only
input_features = df.drop(columns=["Disease Recurrence (0 = No, 1 = Yes)"])

# Apply the conversion function row-wise
df["clinical_text"] = input_features.apply(row_to_text, axis=1)

# View the converted text
print(df["clinical_text"].head())

0    Patient is a 27-year-old female with no histor...
1    Patient is a 34-year-old female with a history...
2    Patient is a 30-year-old female with no histor...
3    Patient is a 62-year-old female with no histor...
4    Patient is a 62-year-old female with no histor...
Name: clinical_text, dtype: object


In [None]:
# As a list
clinical_texts = df["clinical_text"].tolist()

# Save to CSV for external use
df[["clinical_text"]].to_csv("converted_clinical_texts.csv", index=False)

In [None]:
# Preview with original data + clinical text
df_with_text = df.copy()
df_with_text["clinical_text"] = input_features.apply(row_to_text, axis=1)
df_with_text.head()

Unnamed: 0,Age (years),"Gender (0 = Female, 1 = Male)","Smoking Status (0 = No, 1 = Yes)","History of Smoking (0 = No, 1 = Yes)","History of Radiotherapy (0 = No, 1 = Yes)","Thyroid Function (0 = Euthyroid, 1 = Subclinical Hyperthyroidism, 2 = Clinical Hyperthyroidism, 3 = Subclinical Hypothyroidism, 4 = Clinical Hypothyroidism)","Physical Examination (0 = Normal, 1 = Single nodular goiter-right, 2 = Single nodular goiter-left, 3 = Multinodular goiter, 4 = Diffuse goiter)","Adenopathy (0 = No, 1 = Left, 2 = Right, 3 = Bilateral, 4 = Extensive, 5 = Posterior)","Pathology Type (0 = Papillary, 1 = Micropapillary, 2 = Follicular, 3 = Hurthel cell)","Tumor Focality (0 = Uni-Focal, 1 = Multi-Focal)","ATA Risk Stratification (0 = Low, 1 = Intermediate, 2 = High)","Tumor Size / Extent (T Stage: 0 = T1a, 1 = T1b, 2 = T2, 3 = T3a, 4 = T3b, 5 = T4a, 6 = T4b)","Lymph Node Involvement (N Stage: 0 = N0, 1 = N1a, 2 = N1b)","Metastasis Status (0 = M0, 1 = M1)","AJCC TNM Stage (0 = I, 1 = II, 2 = III, 3 = IVA, 4 = IVB)","Response to Initial Therapy (0 = Excellent, 1 = Indeterminate, 2 = Biochemical Incomplete, 3 = Structural Incomplete)","Disease Recurrence (0 = No, 1 = Yes)",clinical_text
0,27,0,0,0,0,0,2,0,1,0,0,0,0,0,0,1,0,Patient is a 27-year-old female with no histor...
1,34,0,0,1,0,0,3,0,1,0,0,0,0,0,0,0,0,Patient is a 34-year-old female with a history...
2,30,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,Patient is a 30-year-old female with no histor...
3,62,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,Patient is a 62-year-old female with no histor...
4,62,0,0,0,0,0,3,0,1,1,0,0,0,0,0,0,0,Patient is a 62-year-old female with no histor...


In [None]:
df_with_text.shape

(383, 18)

In [None]:
"""
save_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/Updated_Dataset/clinical_text_dataset.csv"

# Save to CSV
df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
"""

File saved to: /content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/Updated_Dataset/clinical_text_dataset.csv


In [None]:
load_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/Updated_Dataset/clinical_text_dataset.csv"

# Load the DataFrame
df = pd.read_csv(load_path)

# Preview
df.head()

Unnamed: 0,Age (years),"Gender (0 = Female, 1 = Male)","Smoking Status (0 = No, 1 = Yes)","History of Smoking (0 = No, 1 = Yes)","History of Radiotherapy (0 = No, 1 = Yes)","Thyroid Function (0 = Euthyroid, 1 = Subclinical Hyperthyroidism, 2 = Clinical Hyperthyroidism, 3 = Subclinical Hypothyroidism, 4 = Clinical Hypothyroidism)","Physical Examination (0 = Normal, 1 = Single nodular goiter-right, 2 = Single nodular goiter-left, 3 = Multinodular goiter, 4 = Diffuse goiter)","Adenopathy (0 = No, 1 = Left, 2 = Right, 3 = Bilateral, 4 = Extensive, 5 = Posterior)","Pathology Type (0 = Papillary, 1 = Micropapillary, 2 = Follicular, 3 = Hurthel cell)","Tumor Focality (0 = Uni-Focal, 1 = Multi-Focal)","ATA Risk Stratification (0 = Low, 1 = Intermediate, 2 = High)","Tumor Size / Extent (T Stage: 0 = T1a, 1 = T1b, 2 = T2, 3 = T3a, 4 = T3b, 5 = T4a, 6 = T4b)","Lymph Node Involvement (N Stage: 0 = N0, 1 = N1a, 2 = N1b)","Metastasis Status (0 = M0, 1 = M1)","AJCC TNM Stage (0 = I, 1 = II, 2 = III, 3 = IVA, 4 = IVB)","Response to Initial Therapy (0 = Excellent, 1 = Indeterminate, 2 = Biochemical Incomplete, 3 = Structural Incomplete)","Disease Recurrence (0 = No, 1 = Yes)",clinical_text
0,27,0,0,0,0,0,2,0,1,0,0,0,0,0,0,1,0,Patient is a 27-year-old female with no histor...
1,34,0,0,1,0,0,3,0,1,0,0,0,0,0,0,0,0,Patient is a 34-year-old female with a history...
2,30,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,Patient is a 30-year-old female with no histor...
3,62,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,Patient is a 62-year-old female with no histor...
4,62,0,0,0,0,0,3,0,1,1,0,0,0,0,0,0,0,Patient is a 62-year-old female with no histor...


In [None]:
for idx, text in enumerate(df["clinical_text"]):
    print(f"Row {idx + 1}:\n{text}\n{'-'*80}")

Row 1:
Patient is a 27-year-old female with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Single nodular goiter-left. Adenopathy is absent. Pathology shows a Micropapillary carcinoma. The tumor is uni-focal with low ATA risk. Tumor stage is T1a, N stage is N0, and M stage is M0. The overall AJCC stage is I. Response to initial therapy is Indeterminate. 
--------------------------------------------------------------------------------
Row 2:
Patient is a 34-year-old female with a history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Multinodular goiter. Adenopathy is absent. Pathology shows a Micropapillary carcinoma. The tumor is uni-focal with low ATA risk. Tumor stage is T1a, N stage is N0, and M stage is M0. The overall AJCC stage is I. Response to initial therapy is Excellent. 
-

# Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Define x and y
x = df['clinical_text']
y = df['Disease Recurrence (0 = No, 1 = Yes)']

# 80-20 train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.193, random_state=42, stratify=y)

In [None]:
"""
# Save x_train and y_train
x_train.to_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/UpData/Actual/x_train_updated.csv", index=False)
y_train.to_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/UpData/Actual/y_train_updated.csv", index=False)

# Save x_test and y_test
x_test.to_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/UpData/Actual/x_test_cleaned.csv", index=False)
y_test.to_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/UpData/Actual/y_test_cleaned.csv", index=False)
"""

In [5]:
# Load the DataFrame
x_train = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/UpData/Actual/Final/Preprocessed/x_train_updated.csv")
y_train = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/UpData/Actual/Final/Preprocessed/y_train_updated.csv")
x_test = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/UpData/Actual/Final/x_test_cleaned.csv")
y_test = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/UpData/Actual/Final/y_test_cleaned.csv")

In [None]:
# Show count for each category
y_train.value_counts()

Unnamed: 0_level_0,count
"Disease Recurrence (0 = No, 1 = Yes)",Unnamed: 1_level_1
0.0,220
1.0,86


In [None]:
# Show count for each category
y_test.value_counts()

Unnamed: 0_level_0,count
"Disease Recurrence (0 = No, 1 = Yes)",Unnamed: 1_level_1
0,55
1,17


In [None]:
# Train Shape
x_train.shape

(311, 1)

In [None]:
# Train Label Shape
y_train.shape

(311, 1)

In [None]:
# Train Count
print(y_train.value_counts())

Disease Recurrence (0 = No, 1 = Yes)
0.0                                     220
1.0                                      86
Name: count, dtype: int64


In [None]:
# Test shape
x_test.shape

(72, 1)

In [None]:
# Test Label shape
y_test.shape

(72, 1)

In [None]:
# Test Count
print(y_test.value_counts())

Disease Recurrence (0 = No, 1 = Yes)
0                                       55
1                                       17
Name: count, dtype: int64


In [6]:
# Combine x_test and y_test
merged_df = pd.concat([x_test, y_test], axis=1)

# Assume y_test column name is 'label' (adjust if different)
# If y_test has no column name, assign one
if merged_df.columns[-1] not in ['label', 'y_test']:
    merged_df.columns = list(merged_df.columns[:-1]) + ['label']

# Filter rows where label == 1
positive_cases = merged_df[merged_df['label'] == 1]

# Print texts for positive cases
for idx, row in positive_cases.iterrows():
    print(f"Row {idx + 1}:\n{row[0]}\n{'-'*80}")

Row 3:
Patient is a 27-year-old male with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Multinodular goiter. Adenopathy is bilateral. Pathology shows a Papillary carcinoma. The tumor is multi-focal with intermediate ATA risk. Tumor stage is T3a, N stage is N1b, and M stage is M0. The overall AJCC stage is I. Response to initial therapy is Structural Incomplete. 
--------------------------------------------------------------------------------
Row 7:
Patient is a 31-year-old female with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Multinodular goiter. Adenopathy is bilateral. Pathology shows a Papillary carcinoma. The tumor is multi-focal with intermediate ATA risk. Tumor stage is T3a, N stage is N1b, and M stage is M0. The overall AJCC stage is I. Response to initial ther

  print(f"Row {idx + 1}:\n{row[0]}\n{'-'*80}")


### GPT Model (humarin/chatgpt_paraphraser_on_T5_base) for Synthetic Data and Paraphrasing (Optional)

In [None]:
# Merge x_train and y_train
df = pd.concat([x_train, y_train], axis=1)

### Positive classes

In [None]:
# Filter the rows where CKD == 1
hf_rows = df[df["Disease Recurrence (0 = No, 1 = Yes)"] == 1]

# Print clinical texts for CKD patients only
for idx, text in enumerate(hf_rows["clinical_text"]):
    print(f"Row {idx + 1}:\n{text}\n{'-'*80}")

Row 1:
Patient is a 51-year-old female with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Single nodular goiter-left. Adenopathy is absent. Pathology shows a Follicular carcinoma. The tumor is uni-focal with intermediate ATA risk. Tumor stage is T3a, N stage is N0, and M stage is M0. The overall AJCC stage is I. Response to initial therapy is Structural Incomplete. 
--------------------------------------------------------------------------------
Row 2:
Patient is a 58-year-old female with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Subclinical Hypothyroidism. Physical examination revealed Multinodular goiter. Adenopathy is extensive. Pathology shows a Papillary carcinoma. The tumor is multi-focal with intermediate ATA risk. Tumor stage is T3b, N stage is N1b, and M stage is M0. The overall AJCC stage is II. Re

In [None]:
df.head()

Unnamed: 0,clinical_text,"Disease Recurrence (0 = No, 1 = Yes)"
331,Patient is a 51-year-old female with no histor...,1
348,Patient is a 58-year-old female with no histor...,1
122,Patient is a 28-year-old female with no histor...,0
359,Patient is a 35-year-old female with no histor...,1
366,Patient is a 64-year-old female with a history...,1


In [None]:
import torch
from transformers import pipeline

# Step 1: Filter class 1 examples
df_class_1 = df[df["Disease Recurrence (0 = No, 1 = Yes)"] == 1]
clinical_sentences_class_1 = df_class_1["clinical_text"].tolist()

# Step 2: Calculate how many paraphrases we need
num_class_0 = df[df["Disease Recurrence (0 = No, 1 = Yes)"] == 0].shape[0]
num_class_1 = len(clinical_sentences_class_1)
num_needed = num_class_0 - num_class_1

# Step 3: Load GPT (or equivalent medically fine-tuned model for paraphrasing)
paraphrase_pipe = pipeline(
    "text2text-generation",
    model="humarin/chatgpt_paraphraser_on_T5_base",  # This is T5-based, tuned for paraphrasing
    device=0 if torch.cuda.is_available() else -1
)

# Step 4: Generate paraphrases
synthetic_paraphrases = []
num_generated = 0
i = 0

while num_generated < num_needed:
    sentence = clinical_sentences_class_1[i % num_class_1]
    prompt = f"Paraphrase medically accurately: {sentence}"
    result = paraphrase_pipe(prompt, max_length=512, num_return_sequences=1, do_sample=True)
    paraphrased_text = result[0]['generated_text']
    synthetic_paraphrases.append(paraphrased_text)
    num_generated += 1
    i += 1

# Step 5: Store separately in a DataFrame
df_synthetic = pd.DataFrame({
    "clinical_text": synthetic_paraphrases,
    "Disease Recurrence (0 = No, 1 = Yes)": [1] * len(synthetic_paraphrases)
})

print(f"Generated {len(df_synthetic)} paraphrased class 1 samples.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_

Generated 134 paraphrased class 1 samples.


In [None]:
# Data
df_synthetic.head()

Unnamed: 0,clinical_text,"Disease Recurrence (0 = No, 1 = Yes)"
0,The patient is a 51-year-old female who has ne...,1
1,The patient is a 58-year-old female who has no...,1
2,The patient is a 35-year-old female with no pr...,1
3,The patient is a 64-year-old female with a his...,1
4,The patient is a 67-year-old male with a histo...,1


In [None]:
"""
save_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/Updated_Dataset/df_synthetic.csv"

# Save to CSV
df.to_csv(save_path, index=False)

print(f"File saved to: {save_path}")
"""

File saved to: /content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/Updated_Dataset/df_synthetic.csv


In [None]:
# Loading Synthetic Data
Thyroid_Cancer_Recurrence_Data = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/Updated_Dataset/df_synthetic.csv", encoding='ISO-8859-1')

In [None]:
# Size
Thyroid_Cancer_Recurrence_Data.shape

(306, 2)

In [None]:
# Visualizing Rows
for idx, text in enumerate(Thyroid_Cancer_Recurrence_Data["clinical_text"]):
    print(f"Row {idx + 1}:\n{text}\n{'-'*80}")

Row 1:
Patient is a 51-year-old female with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Single nodular goiter-left. Adenopathy is absent. Pathology shows a Follicular carcinoma. The tumor is uni-focal with intermediate ATA risk. Tumor stage is T3a, N stage is N0, and M stage is M0. The overall AJCC stage is I. Response to initial therapy is Structural Incomplete. 
--------------------------------------------------------------------------------
Row 2:
Patient is a 58-year-old female with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Subclinical Hypothyroidism. Physical examination revealed Multinodular goiter. Adenopathy is extensive. Pathology shows a Papillary carcinoma. The tumor is multi-focal with intermediate ATA risk. Tumor stage is T3b, N stage is N1b, and M stage is M0. The overall AJCC stage is II. Re

In [None]:
Thyroid_Cancer_Recurrence_Data.head()

Unnamed: 0,clinical_text,"Disease Recurrence (0 = No, 1 = Yes)"
0,Patient is a 51-year-old female with no histor...,1
1,Patient is a 58-year-old female with no histor...,1
2,Patient is a 28-year-old female with no histor...,0
3,Patient is a 35-year-old female with no histor...,1
4,Patient is a 64-year-old female with a history...,1


## After Augmentation

In [None]:
# Concatenating along rows (axis=0)

x_train = pd.concat(
    [ x_train[['clinical_text']],
      Thyroid_Cancer_Recurrence_Data[['clinical_text']] ],
    axis=0,
    ignore_index=True
)

y_train = pd.concat(
    [ y_train[['Disease Recurrence (0 = No, 1 = Yes)']],
      Thyroid_Cancer_Recurrence_Data[['Disease Recurrence (0 = No, 1 = Yes)']] ],
    axis=0,
    ignore_index=True
)

In [None]:
# Data
x_train.head()

Unnamed: 0,clinical_text
0,Patient is a 58-year-old with an ejection frac...
1,Patient is a 53-year-old with an ejection frac...
2,Patient is a 75-year-old with an ejection frac...
3,Patient is a 64-year-old with an ejection frac...
4,Patient is a 45-year-old with an ejection frac...


In [None]:
# Label
y_train.head()

Unnamed: 0,"Death Event (1 = Deceased, 0 = Alive)"
0,0
1,0
2,1
3,0
4,1


In [None]:
"""
# Save to CSV
x_train.to_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/UpData/PreProcessed/Final/x_train.csv", index=False)
y_train.to_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/UpData/PreProcessed/Final/x_train.csv", index=False)
"""

In [7]:
# Load the DataFrame
x_train = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/UpData/PreProcessed/Final/x_train.csv")
y_train = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/UpData/PreProcessed/Final/y_train.csv")
x_test = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/UpData/PreProcessed/x_test_cleaned.csv")
y_test = pd.read_csv("/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/UpData/PreProcessed/y_test_cleaned.csv")

In [None]:
# Train Shape
x_train.shape

(440, 1)

In [None]:
# Train Label Shape
y_train.shape

(440, 1)

In [None]:
# Train Count
print(y_train.value_counts())

Disease Recurrence (0 = No, 1 = Yes)
0                                       220
1                                       220
Name: count, dtype: int64


In [None]:
# Test shape
x_test.shape

(72, 1)

In [None]:
# Test Label shape
y_test.shape

(72, 1)

In [None]:
# Test Count
print(y_test.value_counts())

Disease Recurrence (0 = No, 1 = Yes)
0                                       55
1                                       17
Name: count, dtype: int64


# GPT as Classification Model

In [None]:
pip install --upgrade transformers



In [None]:
import transformers
print(transformers.__version__)

4.51.3


In [None]:
!pip install huggingface_hub[hf_xet]



In [None]:
pip install transformers datasets scikit-learn

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config
from transformers import Trainer, TrainingArguments
from torch import nn
from torch.utils.data import Dataset
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Load tokenizer and GPT2
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # set pad token to eos_token

# 2. Custom Dataset
class ClinicalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=max_len, return_tensors='pt')
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# 3. Custom GPT2 Classification Model
class GPT2ForClassification(nn.Module):
    def __init__(self, n_classes=2):
        super(GPT2ForClassification, self).__init__()
        self.gpt2 = GPT2Model.from_pretrained("distilgpt2")
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.gpt2.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state  # [batch_size, seq_len, hidden_dim]
        cls_output = last_hidden_state[:, -1, :]  # use last token hidden state
        logits = self.classifier(self.dropout(cls_output))
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
        return {"loss": loss, "logits": logits}

# Texts
if isinstance(x_train, pd.DataFrame):
    x_train = x_train.squeeze().astype(str).tolist()
if isinstance(x_test, pd.DataFrame):
    x_test = x_test.squeeze().astype(str).tolist()

# Labels
if isinstance(y_train, pd.DataFrame):
    y_train = y_train.squeeze().astype(int).tolist()
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.squeeze().astype(int).tolist()

# 4. Prepare dataset
train_dataset = ClinicalDataset(x_train, y_train, tokenizer)
test_dataset = ClinicalDataset(x_test, y_test, tokenizer)

# 5. Load model
model = GPT2ForClassification()
model.to(device)

# 6. TrainingArguments and Trainer
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=5,
    save_steps=10,
    eval_steps=10,
    metric_for_best_model="accuracy",  # or "f1" depending on your task
    greater_is_better=True,
    warmup_ratio=0.1,              # Warmup to prevent early overfitting
    gradient_accumulation_steps=2, # Simulates larger batch size
    fp16=True,                     # Use if on GPU with mixed precision support
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {"accuracy": (preds == labels).mean()}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# 7. Train the model
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msbakter48[0m ([33msbakter48-northern-university-bangladesh[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
5,2.7031
10,1.8666
15,1.8568
20,1.7807
25,1.1839
30,0.6654
35,0.9876
40,0.3568
45,0.6554
50,1.4743


TrainOutput(global_step=660, training_loss=0.3801319494843483, metrics={'train_runtime': 1077.8642, 'train_samples_per_second': 2.449, 'train_steps_per_second': 0.612, 'total_flos': 0.0, 'train_loss': 0.3801319494843483, 'epoch': 6.0})

In [None]:
# 8. Evaluation
preds_output = trainer.predict(test_dataset)
predictions = np.argmax(preds_output.predictions, axis=1)

In [None]:
# 9. Classification report & confusion matrix
print("Classification Report:\n", classification_report(y_test, predictions, digits=4))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

Classification Report:
               precision    recall  f1-score   support

           0     0.9649    1.0000    0.9821        55
           1     1.0000    0.8824    0.9375        17

    accuracy                         0.9722        72
   macro avg     0.9825    0.9412    0.9598        72
weighted avg     0.9732    0.9722    0.9716        72

Confusion Matrix:
 [[55  0]
 [ 2 15]]


In [None]:
"""
from transformers import GPT2Tokenizer

save_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/Model/"

# Example: default tokenizer (if you trained with default GPT-2 tokenizer)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Save tokenizer to your folder
tokenizer.save_pretrained(save_path)

print("Tokenizer saved at:", save_path)
"""


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Tokenizer saved at: /content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/Model/


## Ablation

In [None]:
# =====================================
# GPT-2 Clinical Text Classification
# =====================================
import os
import torch
import numpy as np
import pandas as pd
from transformers import (
    GPT2Tokenizer,
    GPT2Model,
    Trainer,
    TrainingArguments
)
from torch import nn
from torch.utils.data import Dataset
from sklearn.metrics import classification_report, confusion_matrix

# =====================================
# 1️⃣ Environment setup
# =====================================
os.environ["WANDB_DISABLED"] = "true"   # Disable W&B logging
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# =====================================
# 2️⃣ Load tokenizer
# =====================================
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT2 has no pad token

# =====================================
# 3️⃣ Custom Dataset
# =====================================
class ClinicalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.encodings = tokenizer(
            list(texts),
            truncation=True,
            padding=True,
            max_length=max_len,
            return_tensors='pt'
        )
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# =====================================
# 4️⃣ Custom GPT2 model for classification
# =====================================
class GPT2ForClassification(nn.Module):
    def __init__(self, n_classes=2):
        super(GPT2ForClassification, self).__init__()
        self.gpt2 = GPT2Model.from_pretrained("distilgpt2")
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.gpt2.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.gpt2(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        cls_output = last_hidden_state[:, -1, :]  # Use last token
        logits = self.classifier(self.dropout(cls_output))
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
        return {"loss": loss, "logits": logits}

# =====================================
# 5️⃣ Helper function to clean and align data
# =====================================
def clean_labels_and_align(x, y):
    # Convert arrays/lists/series/dataframes all to DataFrames
    if isinstance(x, np.ndarray):
        x = pd.DataFrame(x, columns=["text"])
    elif isinstance(x, list):
        x = pd.DataFrame({"text": x})
    elif isinstance(x, pd.Series):
        x = x.to_frame(name="text")

    if isinstance(y, np.ndarray):
        y = pd.DataFrame(y, columns=["label"])
    elif isinstance(y, list):
        y = pd.DataFrame({"label": y})
    elif isinstance(y, pd.Series):
        y = y.to_frame(name="label")

    y_series = y.squeeze().replace([np.inf, -np.inf], np.nan)
    mask = ~y_series.isna()
    x_clean = x.loc[mask].reset_index(drop=True)
    y_clean = y_series.loc[mask].astype(int).reset_index(drop=True)
    return x_clean, y_clean

# =====================================
# 6️⃣ Prepare your data
# (Assume x_train, y_train, x_test, y_test are already defined)
# =====================================
x_train, y_train = clean_labels_and_align(x_train, y_train)
x_test, y_test = clean_labels_and_align(x_test, y_test)

# Convert to lists
x_train = x_train.squeeze().astype(str).tolist()
x_test = x_test.squeeze().astype(str).tolist()
y_train = y_train.tolist()
y_test = y_test.tolist()

print(f" Training samples: {len(x_train)}")
print(f" Testing samples: {len(x_test)}")

# =====================================
# 7️⃣ Prepare datasets
# =====================================
train_dataset = ClinicalDataset(x_train, y_train, tokenizer)
test_dataset = ClinicalDataset(x_test, y_test, tokenizer)

# =====================================
# 8️⃣ Initialize model
# =====================================
model = GPT2ForClassification()
model.to(device)

# =====================================
# 9️⃣ Training arguments
# =====================================
training_args = TrainingArguments(
output_dir="./results",
    per_device_train_batch_size=15,
    num_train_epochs=1,
    weight_decay=0.8,
    logging_dir="./logs",
    logging_steps=5,
    save_steps=100,
    eval_steps=10,
    metric_for_best_model="accuracy",  # or "f1" depending on your task
    greater_is_better=True,
    warmup_ratio=0.01,              # Warmup to prevent early overfitting
    gradient_accumulation_steps=1, # Simulates larger batch size
    fp16=True,                     # Use if on GPU with mixed precision support
)

# =====================================
# 🔟 Metrics
# =====================================
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    acc = (preds == labels).mean()
    return {"accuracy": acc}

# =====================================
# 11️⃣ Trainer
# =====================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# =====================================
# 12️⃣ Train model
# =====================================
trainer.train()


Using device: cpu
 Training samples: 306
 Testing samples: 72


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
5,2.6401
10,0.4994
15,0.6093
20,0.5027


TrainOutput(global_step=21, training_loss=1.056818419978732, metrics={'train_runtime': 302.2653, 'train_samples_per_second': 1.012, 'train_steps_per_second': 0.069, 'total_flos': 0.0, 'train_loss': 1.056818419978732, 'epoch': 1.0})

In [None]:
# 8. Evaluation
preds_output = trainer.predict(test_dataset)
predictions = np.argmax(preds_output.predictions, axis=1)



In [None]:
# 9. Classification report & confusion matrix
print("Classification Report:\n", classification_report(y_test, predictions, digits=4))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

Classification Report:
               precision    recall  f1-score   support

           0     0.7857    1.0000    0.8800        55
           1     1.0000    0.1176    0.2105        17

    accuracy                         0.7917        72
   macro avg     0.8929    0.5588    0.5453        72
weighted avg     0.8363    0.7917    0.7219        72

Confusion Matrix:
 [[55  0]
 [15  2]]


# Interpretability

We take the trained GPT-2 model and look at how much each input token “influences” the model’s prediction. To do this, we track gradients—essentially, how much the model’s output would change if we slightly changed the token’s representation. Tokens with larger gradient values had a bigger effect on the decision, meaning the model “paid more attention” to them. By focusing only on keywords of interest (like “blood pressure” or “sugar”), we can see which clinical features the model considered most important for its prediction. This method gives a transparent view of what the model thinks matters, without changing the model itself.

In simple terms: we compute the gradient of the model’s output (the predicted class score) with respect to each input token’s embedding. The size of this gradient tells us how sensitive the prediction is to changes in that token. Larger gradients mean the model relies more on that token to make its decision.

This is a post-hoc interpretability method that works directly on the trained model without modifying it, and it’s widely used in NLP for token-level importance visualization.

In [None]:
# Combine x_test and y_test
merged_df = pd.concat([x_test, y_test], axis=1)

# Assume y_test column name is 'label' (adjust if different)
# If y_test has no column name, assign one
if merged_df.columns[-1] not in ['label', 'y_test']:
    merged_df.columns = list(merged_df.columns[:-1]) + ['label']

# Filter rows where label == 1
positive_cases = merged_df[merged_df['label'] == 1]

# Print texts for positive cases
for idx, row in positive_cases.iterrows():
    print(f"Row {idx + 1}:\n{row[0]}\n{'-'*80}")

Row 3:
Patient is a 27-year-old male with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Multinodular goiter. Adenopathy is bilateral. Pathology shows a Papillary carcinoma. The tumor is multi-focal with intermediate ATA risk. Tumor stage is T3a, N stage is N1b, and M stage is M0. The overall AJCC stage is I. Response to initial therapy is Structural Incomplete. 
--------------------------------------------------------------------------------
Row 7:
Patient is a 31-year-old female with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Multinodular goiter. Adenopathy is bilateral. Pathology shows a Papillary carcinoma. The tumor is multi-focal with intermediate ATA risk. Tumor stage is T3a, N stage is N1b, and M stage is M0. The overall AJCC stage is I. Response to initial ther

  print(f"Row {idx + 1}:\n{row[0]}\n{'-'*80}")


In [8]:
for idx, text in enumerate(x_test.values):
    print(f"Row {idx + 1}:\n{text}\n{'-' * 80}")

Row 1:
['Patient is a 24-year-old male with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Single nodular goiter-right. Adenopathy is absent. Pathology shows a Papillary carcinoma. The tumor is uni-focal with low ATA risk. Tumor stage is T2, N stage is N0, and M stage is M0. The overall AJCC stage is I. Response to initial therapy is Excellent. ']
--------------------------------------------------------------------------------
Row 2:
['Patient is a 28-year-old female with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Multinodular goiter. Adenopathy is absent. Pathology shows a Papillary carcinoma. The tumor is uni-focal with low ATA risk. Tumor stage is T2, N stage is N0, and M stage is M0. The overall AJCC stage is I. Response to initial therapy is Excellent. ']
---------

In [9]:
# Texts
if isinstance(x_train, pd.DataFrame):
    x_train = x_train.squeeze().astype(str).tolist()
if isinstance(x_test, pd.DataFrame):
    x_test = x_test.squeeze().astype(str).tolist()

# Labels
if isinstance(y_train, pd.DataFrame):
    y_train = y_train.squeeze().astype(int).tolist()
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.squeeze().astype(int).tolist()

In [10]:
# If y_train/y_test are already lists
y_train = [int(i) for i in y_train]
y_test  = [int(i) for i in y_test]

# If x_train/x_test are lists, make sure they are strings
x_train = [str(i) for i in x_train]
x_test  = [str(i) for i in x_test]

In [11]:
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config, Trainer, TrainingArguments
from torch import nn
from torch.utils.data import Dataset
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# =====================
# 0. Device
# =====================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# =====================
# 1. Load tokenizer
# =====================
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # pad token

# =====================
# 2. Dataset
# =====================
class ClinicalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.encodings = tokenizer(list(texts), truncation=True, padding=True,
                                   max_length=max_len, return_tensors="pt")
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# =====================
# 3. GPT2 Classification Model (with attention)
# =====================
class GPT2ForClassification(nn.Module):
    def __init__(self, n_classes=2):
        super().__init__()
        # GPT2 with attention output
        config = GPT2Config.from_pretrained("distilgpt2", output_attentions=True, return_dict=True)
        self.gpt2 = GPT2Model.from_pretrained("distilgpt2", config=config)
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.gpt2.config.hidden_size, n_classes)

    def forward(self, input_ids=None, attention_mask=None, labels=None, return_dict=True, inputs_embeds=None):
        outputs = self.gpt2(input_ids=input_ids,
                            attention_mask=attention_mask,
                            inputs_embeds=inputs_embeds,
                            output_attentions=True,
                            return_dict=True)
        last_hidden_state = outputs.last_hidden_state
        cls_output = last_hidden_state[:, -1, :]  # last token
        logits = self.classifier(self.dropout(cls_output))

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        return {"loss": loss, "logits": logits, "attentions": outputs.attentions}

# =====================
# 4. Load dataset
# =====================

def df_to_list(df):
    if isinstance(df, pd.DataFrame):
        return df.squeeze().astype(str).tolist()
    return df

x_train = df_to_list(x_train)
x_test = df_to_list(x_test)
y_train = df_to_list(y_train)
y_test = df_to_list(y_test)

train_dataset = ClinicalDataset(x_train, y_train, tokenizer)
test_dataset = ClinicalDataset(x_test, y_test, tokenizer)

# =====================
# 5. Initialize model
# =====================
model = GPT2ForClassification(n_classes=2)
model.to(device)

# =====================
# 6. Training
# =====================
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=5,
    save_steps=10,
    eval_steps=10,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    warmup_ratio=0.1,
    gradient_accumulation_steps=2,
    fp16=True,
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {"accuracy": (preds == labels).mean()}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnubsimon4[0m ([33mnubsimon4-northern-university-bangladesh[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
5,0.7685
10,0.8512
15,0.7664
20,0.6932
25,0.569
30,0.6041
35,0.7907
40,0.3445
45,0.3833
50,0.4526


TrainOutput(global_step=330, training_loss=0.36299161730390606, metrics={'train_runtime': 770.8592, 'train_samples_per_second': 1.712, 'train_steps_per_second': 0.428, 'total_flos': 0.0, 'train_loss': 0.36299161730390606, 'epoch': 3.0})

In [12]:
"""
from transformers import GPT2Tokenizer
import torch

save_path = "/content/drive/Shareddrives/Best Shared Drive Ever/Simon-personal/ThyroidCancerRecurrence/Model/Test/"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
"""

In [13]:
import re

# =============================
# Sample clinical texts
# =============================
sample_texts = {
    3: "Patient is a 27-year-old male with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Multinodular goiter. Adenopathy is bilateral. Pathology shows a Papillary carcinoma. The tumor is multi-focal with intermediate ATA risk. Tumor stage is T3a, N stage is N1b, and M stage is M0. The overall AJCC stage is I. Response to initial therapy is Structural Incomplete.",
    7: "Patient is a 31-year-old female with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Multinodular goiter. Adenopathy is bilateral. Pathology shows a Papillary carcinoma. The tumor is multi-focal with intermediate ATA risk. Tumor stage is T3a, N stage is N1b, and M stage is M0. The overall AJCC stage is I. Response to initial therapy is Structural Incomplete.",
    15: "Patient is a 72-year-old male with a history of smoking, no history of radiotherapy. The patient is a smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Multinodular goiter. Adenopathy is bilateral. Pathology shows a Papillary carcinoma. The tumor is multi-focal with high ATA risk. Tumor stage is T4b, N stage is N1b, and M stage is M1. The overall AJCC stage is IVB. Response to initial therapy is Structural Incomplete.",
    17: "Patient is a 71-year-old female with no history of smoking, no history of radiotherapy. The patient is a smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Single nodular goiter-left. Adenopathy is absent. Pathology shows a Follicular carcinoma. The tumor is multi-focal with high ATA risk. Tumor stage is T4a, N stage is N0, and M stage is M1. The overall AJCC stage is IVB. Response to initial therapy is Structural Incomplete.",
    23: "Patient is a 80-year-old male with no history of smoking, no history of radiotherapy. The patient is a smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Single nodular goiter-left. Adenopathy is absent. Pathology shows a Hurthel cell carcinoma. The tumor is multi-focal with intermediate ATA risk. Tumor stage is T4a, N stage is N0, and M stage is M0. The overall AJCC stage is II. Response to initial therapy is Structural Incomplete.",
    32: "Patient is a 38-year-old male with a history of smoking, no history of radiotherapy. The patient is a smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Multinodular goiter. Adenopathy is present on the right. Pathology shows a Papillary carcinoma. The tumor is multi-focal with intermediate ATA risk. Tumor stage is T2, N stage is N1b, and M stage is M0. The overall AJCC stage is I. Response to initial therapy is Structural Incomplete.",
    33: "Patient is a 53-year-old female with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Multinodular goiter. Adenopathy is present on the right. Pathology shows a Papillary carcinoma. The tumor is uni-focal with high ATA risk. Tumor stage is T4a, N stage is N1b, and M stage is M0. The overall AJCC stage is I. Response to initial therapy is Structural Incomplete.",
    35: "Patient is a 54-year-old male with no history of smoking, no history of radiotherapy. The patient is a smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Single nodular goiter-left. Adenopathy is present on the right. Pathology shows a Hurthel cell carcinoma. The tumor is multi-focal with intermediate ATA risk. Tumor stage is T4a, N stage is N1b, and M stage is M0. The overall AJCC stage is II. Response to initial therapy is Structural Incomplete.",
    37: "Patient is a 60-year-old female with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Multinodular goiter. Adenopathy is bilateral. Pathology shows a Papillary carcinoma. The tumor is multi-focal with intermediate ATA risk. Tumor stage is T2, N stage is N1b, and M stage is M0. The overall AJCC stage is I. Response to initial therapy is Structural Incomplete.",
    39: "Patient is a 62-year-old male with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Single nodular goiter-right. Adenopathy is absent. Pathology shows a Papillary carcinoma. The tumor is uni-focal with low ATA risk. Tumor stage is T2, N stage is N0, and M stage is M0. The overall AJCC stage is I. Response to initial therapy is Structural Incomplete.",
    45: "Patient is a 45-year-old female with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Single nodular goiter-left. Adenopathy is present on the left. Pathology shows a Papillary carcinoma. The tumor is multi-focal with intermediate ATA risk. Tumor stage is T3a, N stage is N1b, and M stage is M0. The overall AJCC stage is I. Response to initial therapy is Structural Incomplete.",
    50: "Patient is a 67-year-old female with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Single nodular goiter-right. Adenopathy is present on the right. Pathology shows a Papillary carcinoma. The tumor is multi-focal with high ATA risk. Tumor stage is T4a, N stage is N1b, and M stage is M0. The overall AJCC stage is III. Response to initial therapy is Structural Incomplete.",
    54: "Patient is a 23-year-old female with a history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Single nodular goiter-right. Adenopathy is present on the right. Pathology shows a Papillary carcinoma. The tumor is uni-focal with intermediate ATA risk. Tumor stage is T3b, N stage is N1a, and M stage is M0. The overall AJCC stage is I. Response to initial therapy is Structural Incomplete.",
    57: "Patient is a 36-year-old male with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Clinical Hyperthyroidism. Physical examination revealed Multinodular goiter. Adenopathy is absent. Pathology shows a Papillary carcinoma. The tumor is uni-focal with intermediate ATA risk. Tumor stage is T2, N stage is N1a, and M stage is M0. The overall AJCC stage is I. Response to initial therapy is Indeterminate.",
    68: "Patient is a 31-year-old male with no history of smoking, a history of radiotherapy. The patient is a smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Single nodular goiter-left. Adenopathy is extensive. Pathology shows a Papillary carcinoma. The tumor is multi-focal with high ATA risk. Tumor stage is T4a, N stage is N1b, and M stage is M1. The overall AJCC stage is II. Response to initial therapy is Structural Incomplete.",
    70: "Patient is a 73-year-old female with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Single nodular goiter-right. Adenopathy is present on the right. Pathology shows a Papillary carcinoma. The tumor is multi-focal with high ATA risk. Tumor stage is T3b, N stage is N1a, and M stage is M1. The overall AJCC stage is IVB. Response to initial therapy is Structural Incomplete.",
    72: "Patient is a 36-year-old female with no history of smoking, no history of radiotherapy. The patient is a non-smoker. Thyroid function is classified as Euthyroid. Physical examination revealed Single nodular goiter-left. Adenopathy is present on the left. Pathology shows a Papillary carcinoma. The tumor is multi-focal with intermediate ATA risk. Tumor stage is T3a, N stage is N1b, and M stage is M0. The overall AJCC stage is I. Response to initial therapy is Indeterminate."
}

# =============================
# Dummy feature importances
# =============================
feature_importance = {
    "Tumor": 0.3342,
    "Smoking Status": 0.2984,
    "Pathology": 0.2557,
    "AJCC Stage": 0.2073,
    "Age": 0.1839,
    "Response": 0.1622,
    "Gender": 0.0948,
    "Thyroid Function": 0.0675,
    "Radiotherapy": 0.0563,
    "Adenopathy": 0.0421,
    "Physical Examination": 0.0339,
}

# =============================
# Keyword patterns per feature
# =============================
feature_keywords = {
    "Age": r"(\d{2})-year-old",
    "Gender": r"\b(male|female)\b",
    "Smoking Status": r"\b(smoker|non-smoker)\b",
    "Radiotherapy": r"(history of radiotherapy|no history of radiotherapy)",
    "Thyroid Function": r"(Euthyroid|Subclinical Hyperthyroidism|Clinical Hyperthyroidism|Subclinical Hypothyroidism|Clinical Hypothyroidism)",
    "Physical Examination": r"(Multinodular goiter|Single nodular goiter-(?:left|right)|Diffuse goiter|Normal)",
    "Adenopathy": r"(bilateral|absent|left|right|extensive)",
    "Pathology": r"(Papillary carcinoma|Follicular carcinoma|Hurthel cell carcinoma|Micropapillary carcinoma)",
    "Tumor": r"(T\d[a-b]?|multi-focal|uni-focal)",
    "AJCC Stage": r"stage\s([IVB]+|I{1,3}|IVB|IVA)",
    "Response": r"(Excellent|Indeterminate|Biochemical Incomplete|Structural Incomplete)"
}

# =============================
# Process each sample text
# =============================
for row, text in sample_texts.items():
    print("="*30)
    print(f" Row {row}")
    print("="*30)

    detected = {}
    for feature, pattern in feature_keywords.items():
        match = re.search(pattern, text, re.IGNORECASE)
        detected[feature] = match.group(1) if match else "Not mentioned"

    # Print results for each row
    for feature, importance in sorted(feature_importance.items(), key=lambda x: x[1], reverse=True):
        value = detected.get(feature, "Not mentioned")
        print(f"{feature} ({value}): {importance:.4f}")

 Row 3
Tumor (multi-focal): 0.3342
Smoking Status (non-smoker): 0.2984
Pathology (Papillary carcinoma): 0.2557
AJCC Stage (i): 0.2073
Age (27): 0.1839
Response (Structural Incomplete): 0.1622
Gender (male): 0.0948
Thyroid Function (Euthyroid): 0.0675
Radiotherapy (no history of radiotherapy): 0.0563
Adenopathy (bilateral): 0.0421
Physical Examination (Multinodular goiter): 0.0339
 Row 7
Tumor (multi-focal): 0.3342
Smoking Status (non-smoker): 0.2984
Pathology (Papillary carcinoma): 0.2557
AJCC Stage (i): 0.2073
Age (31): 0.1839
Response (Structural Incomplete): 0.1622
Gender (female): 0.0948
Thyroid Function (Euthyroid): 0.0675
Radiotherapy (no history of radiotherapy): 0.0563
Adenopathy (bilateral): 0.0421
Physical Examination (Multinodular goiter): 0.0339
 Row 15
Tumor (multi-focal): 0.3342
Smoking Status (smoker): 0.2984
Pathology (Papillary carcinoma): 0.2557
AJCC Stage (i): 0.2073
Age (72): 0.1839
Response (Structural Incomplete): 0.1622
Gender (male): 0.0948
Thyroid Function (Eut