In [2]:
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

### Load Data

In [3]:
def preprocess_data(pos_file_path=None, neg_file_path=None, categorical_columns=None, label_col="label", test_size=0.2, random_state=0):
    # Initialize an empty DataFrame for concatenation
    data_frames = []
    
    # Load positive dataset if the path is provided
    if pos_file_path is not None:
        pos_df = pd.read_csv(pos_file_path)
        pos_df[label_col] = 1  # Label positive samples as 1
        data_frames.append(pos_df)
    
    # Load negative dataset if the path is provided
    if neg_file_path is not None:
        neg_df = pd.read_csv(neg_file_path)
        neg_df[label_col] = 0  # Label negative samples as 0
        data_frames.append(neg_df)
    
    # Concatenate available datasets
    if not data_frames:
        raise ValueError("At least one of pos_file_path or neg_file_path must be provided.")
    data_df = pd.concat(data_frames, axis=0)
    
    # Drop unnecessary columns if present
    #if 'dod' in data_df.columns:
    #    data_df.drop(columns=['dod'], inplace=True)
    
    # Split into features and labels
    X = data_df.drop(columns=[label_col, "subject_id"])
    y = data_df[label_col]
    
    # Encode categorical columns
    #if categorical_columns:
    #    for col in categorical_columns:
    #        le = LabelEncoder()
    #        X[col] = le.fit_transform(X[col].astype(str))
    
    # Fill missing values with column means
    #X.fillna(X.mean(), inplace=True)
    
    # Standardize numerical features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    # Split into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    return X_train, X_valid, y_train, y_valid

# Usage
pos_file_path = "generic_features_for_positive_patient.csv"
neg_file_path = "generic_features_for_random_patient.csv"
CAT_COLUMN = ['race', 'marital_status', 'gender', 'dod']

X_train, X_valid, y_train, y_valid = preprocess_data(pos_file_path, neg_file_path, CAT_COLUMN, label_col="label")





### Initial Training

In [7]:
# Initialize TabNet model
model = TabNetClassifier()


# Train the model
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    max_epochs=100,
    patience=10,
    batch_size=1024, 
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

# Predictions
y_pred = model.predict(X_valid)



accuracy = accuracy_score(y_valid, y_pred)
print(f'Validation Accuracy: {accuracy * 100:.2f}%')



epoch 0  | loss: 0.69196 | val_0_auc: 0.70438 |  0:00:00s
epoch 1  | loss: 0.62365 | val_0_auc: 0.80786 |  0:00:00s
epoch 2  | loss: 0.53319 | val_0_auc: 0.85587 |  0:00:00s
epoch 3  | loss: 0.46638 | val_0_auc: 0.88118 |  0:00:00s
epoch 4  | loss: 0.42848 | val_0_auc: 0.90671 |  0:00:00s
epoch 5  | loss: 0.39929 | val_0_auc: 0.91792 |  0:00:01s
epoch 6  | loss: 0.37911 | val_0_auc: 0.92297 |  0:00:01s
epoch 7  | loss: 0.36739 | val_0_auc: 0.92964 |  0:00:01s
epoch 8  | loss: 0.35244 | val_0_auc: 0.93207 |  0:00:01s
epoch 9  | loss: 0.34683 | val_0_auc: 0.93797 |  0:00:01s
epoch 10 | loss: 0.34143 | val_0_auc: 0.93778 |  0:00:02s
epoch 11 | loss: 0.33991 | val_0_auc: 0.94058 |  0:00:02s
epoch 12 | loss: 0.33441 | val_0_auc: 0.94157 |  0:00:02s
epoch 13 | loss: 0.32426 | val_0_auc: 0.94624 |  0:00:02s
epoch 14 | loss: 0.31866 | val_0_auc: 0.94561 |  0:00:02s
epoch 15 | loss: 0.31253 | val_0_auc: 0.94507 |  0:00:03s
epoch 16 | loss: 0.30366 | val_0_auc: 0.94529 |  0:00:03s
epoch 17 | los



In [5]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8503611971104231


In [6]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9122807017543859


In [8]:
feature_importances = model.feature_importances_

# Load the DataFrame
df = pd.read_csv("generic_features_for_positive_patient.csv")

# Get column names
columns = df.columns.drop("subject_id")


# Pair feature importances with column names
feature_importance_dict = dict(zip(columns, feature_importances))

# Display the feature importance for each column
for column, importance in feature_importance_dict.items():
    print(f"{column}: {importance}")


race: 0.0021550639560379108
marital_status: 0.07263022284860916
age: 0.1935844041026247
average_apsiii: 0.02673013617234582
avg_charlson_comorbidity_index: 0.03169528516412599
gender: 0.054589863364509376
dod: 0.02127761062733792
avg_ph: 0.20356743019347365
avg_body_weight: 0.04065667136613818
BMI: 0.009111778991516951
height_inches: 0.052398351611021826
average_los_icu: 5.025734337689912e-06
avg_glucose: 0.02107819875087031
avg_heart_rate: 0.03754229765893099
avg_mbp: 0.0015110488155761698
avg_resp_rate: 0.015965659398276894
avg_spo2: 0.0017490473816516216
avg_temperature: 0.11976966828198839
avg_systolic_blood_pressure: 0.02066167261337099
avg_diastolic_blood_pressure: 0.07332056296725535


### Fine Tuning

In [51]:
'''

# Freeze all layers except the last layer for fine-tuning
for name, param in model.network.named_parameters():
    # Freeze all parameters except those in the last block
    if "blocks" in name and not name.startswith("network.blocks.0"):
        param.requires_grad = True  # Unfreeze the last block
    else:
        param.requires_grad = False  # Freeze earlier layers

'''

# Usage
fine_tune_file_path = "negative_generic_features.csv"

X_train, X_valid, y_train, y_valid = preprocess_data(pos_file_path, fine_tune_file_path, CAT_COLUMN, label_col="label")

# Initialize TabNet model
model = TabNetClassifier()

model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    max_epochs=50, 
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
)



y_pred = model.predict(X_valid)

accuracy = accuracy_score(y_valid, y_pred)
print(f'Validation Accuracy: {accuracy * 100:.2f}%')






epoch 0  | loss: 0.7419  | val_0_auc: 0.56109 |  0:00:00s
epoch 1  | loss: 0.69035 | val_0_auc: 0.55941 |  0:00:00s
epoch 2  | loss: 0.66765 | val_0_auc: 0.61374 |  0:00:00s
epoch 3  | loss: 0.65481 | val_0_auc: 0.65047 |  0:00:00s
epoch 4  | loss: 0.64501 | val_0_auc: 0.65563 |  0:00:00s
epoch 5  | loss: 0.62571 | val_0_auc: 0.68999 |  0:00:00s
epoch 6  | loss: 0.61982 | val_0_auc: 0.70128 |  0:00:01s
epoch 7  | loss: 0.62219 | val_0_auc: 0.71007 |  0:00:01s
epoch 8  | loss: 0.61772 | val_0_auc: 0.70687 |  0:00:01s
epoch 9  | loss: 0.60666 | val_0_auc: 0.71625 |  0:00:01s
epoch 10 | loss: 0.5986  | val_0_auc: 0.71277 |  0:00:01s
epoch 11 | loss: 0.58132 | val_0_auc: 0.71887 |  0:00:01s
epoch 12 | loss: 0.59513 | val_0_auc: 0.71843 |  0:00:01s
epoch 13 | loss: 0.57907 | val_0_auc: 0.71639 |  0:00:02s
epoch 14 | loss: 0.58078 | val_0_auc: 0.71272 |  0:00:02s
epoch 15 | loss: 0.58746 | val_0_auc: 0.72484 |  0:00:02s
epoch 16 | loss: 0.58266 | val_0_auc: 0.74366 |  0:00:02s
epoch 17 | los

