In [2]:
import numpy as np
import pandas as pd
import time
import pandas as pd
from fasterrisk.fasterrisk import RiskScoreOptimizer, RiskScoreClassifier

from sklearn.metrics import roc_auc_score, confusion_matrix

Load your dataset

In [3]:
# Load your dataset
df = pd.read_csv('data_train_infection.csv')
# df = pd.read_csv('data_train_rebleeding.csv')
# df = pd.read_csv('data_train_mortality.csv')

# Define target column
target_col = "infection"
# target_col = "rebleeding"
# target_col = "mortality"

# Identify the feature columns
feature_cols = [col for col in df.columns if col != target_col]

# Create the feature matrix X and target vector Y
X = df[feature_cols].values
y = df[target_col].values

# Create the data dictionary
data = {
    'variable_names': feature_cols,
    'X': X,
    'Y': y,
    'outcome_name': target_col 
}

print("Variable names:", data['variable_names'])
print("X shape:", data['X'].shape)
print("Y shape:", data['Y'].shape)

Variable names: ['Antibiotic_prophylaxis', 'Sex', 'HCC', 'Ascites', 'Hepatic_encephalopathy', 'Prior_SBP', 'ICU_admission', 'Age_bin_0', 'Age_bin_1', 'Blood_transfused_in_48_hours_u__bin_0', 'Blood_transfused_in_48_hours_u__bin_1', 'Platelet_count_x10_3_uL__bin_0', 'Platelet_count_x10_3_uL__bin_1', 'WBC_x10_3_uL__bin_0', 'WBC_x10_3_uL__bin_1', 'Hemoglobin_g_L__bin_0', 'Hemoglobin_g_L__bin_1', 'INR_bin_0', 'INR_bin_1', 'Na_mEq_L__bin_0', 'Na_mEq_L__bin_1', 'Creatinine_mg_L__bin_0', 'Creatinine_mg_L__bin_1', 'Bilirubin_mg_dL__bin_0', 'Bilirubin_mg_dL__bin_1', 'ALT_IU_L__bin_0', 'ALT_IU_L__bin_1', 'Albumin_g_dL__bin_0', 'Albumin_g_dL__bin_1', 'Systolic_blood_pressure_mmHg__bin_0', 'Systolic_blood_pressure_mmHg__bin_1', 'Heart_rate_beats_min__bin_0', 'Heart_rate_beats_min__bin_1', 'Hospitalization_day__bin_0', 'Hospitalization_day__bin_1', 'Etiology_of_cirrhosis_BC', 'Etiology_of_cirrhosis_HBV', 'Etiology_of_cirrhosis_HCV', 'Etiology_of_cirrhosis_NBNC', 'Etiology_of_bleeding_peptic_ulcer',

In [4]:
data['X'][0]

array([1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 1., 0., 1.,
       1., 0., 1., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0.,
       1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0.])

In [5]:
data['Y']

array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,  1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1.,  1., -1., -1.,  1., -1., -1.,  1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1.,  1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1.,  1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1.,  1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1.,  1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1

In [6]:
sparsity = 20       # no. of parameters
parent_size = 50    # beam search no. to retain
lb=-5
ub=5

RiskScoreOptimizer_m = RiskScoreOptimizer(X = data['X'], y = data['Y'], k = sparsity, lb=lb, ub=ub, parent_size = parent_size)

In [7]:
start_time = time.time()
RiskScoreOptimizer_m.optimize()
print("Optimization takes {:.2f} seconds.".format(time.time() - start_time))

Optimization takes 283.43 seconds.


In [8]:
multipliers, intercepts, coefficients = RiskScoreOptimizer_m.get_models()
print("We generate {} risk score models from the sparse diverse pool".format(len(multipliers)))

We generate 10 risk score models from the sparse diverse pool


In [9]:
X_train = data['X']
y_train = data['Y']

best_results = []

for i in range(len(coefficients)):
    beta = coefficients[i]
    intercept = intercepts[i]

    # Compute scores (including intercept)
    scores_train = X_train @ beta + intercept

    # Determine threshold range
    max_thresh = int(np.ceil(intercept + np.sum(np.maximum(beta, 0))))
    min_thresh = int(np.floor(intercept + np.sum(np.minimum(beta, 0))))
    print(f"Model {i}: Theoretical score range from {min_thresh} to {max_thresh}")

    best_youden = -np.inf
    best_thresh = None
    best_metrics = {}

    # Loop through possible thresholds
    for thresh in range(min_thresh, max_thresh + 1):
        preds = np.where(scores_train >= thresh, 1, -1)

        tn, fp, fn, tp = confusion_matrix(y_train, preds, labels=[-1, 1]).ravel()

        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        youden_index = sensitivity + specificity - 1

        if youden_index > best_youden:
            best_youden = youden_index
            best_thresh = thresh
            best_metrics = {
                'tp': tp,
                'fp': fp,
                'tn': tn,
                'fn': fn,
                'sensitivity': sensitivity,
                'specificity': specificity,
                'threshold': thresh,
                'model_idx': i,
                'intercept': intercept,
                'beta': beta
            }

    best_results.append(best_metrics)

# Select model with best Youden index overall
best_model = max(best_results, key=lambda x: x['sensitivity'] + x['specificity'])

print(f"Best model index: {best_model['model_idx']}, threshold: {best_model['threshold']}")

thresh = best_model['threshold']
beta = coefficients[best_model['model_idx']]
intercept = intercepts[best_model['model_idx']]

Model 0: Theoretical score range from -17 to 6
Model 1: Theoretical score range from -16 to 4
Model 2: Theoretical score range from -16 to 4
Model 3: Theoretical score range from -16 to 4
Model 4: Theoretical score range from -17 to 5
Model 5: Theoretical score range from -16 to 4
Model 6: Theoretical score range from -16 to 4
Model 7: Theoretical score range from -16 to 4
Model 8: Theoretical score range from -16 to 4
Model 9: Theoretical score range from -15 to 4
Best model index: 0, threshold: -2


In [10]:
scores_train = X_train @ beta + intercept

preds_train = np.where(scores_train >= thresh, 1, -1)

tn, fp, fn, tp = confusion_matrix(y_train, preds_train, labels=[-1, 1]).ravel()

accuracy = (tp + tn) / (tp + tn + fp + fn)
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0  # Recall
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
npv = tn / (tn + fn) if (tn + fn) > 0 else 0

# F1 scores for both classes
f1_pos = 2 * (precision * sensitivity) / (precision + sensitivity) if (precision + sensitivity) > 0 else 0
f1_neg = 2 * (specificity * npv) / (specificity + npv) if (specificity + npv) > 0 else 0
f1_macro = (f1_pos + f1_neg) / 2

# AUC Score
# labels_test should be {1, -1}, convert to {1, 0} for AUC
labels_binary = (y_train == 1).astype(int)
auc = roc_auc_score(labels_binary, scores_train)

print("=== Train Set Evaluation ===")
print(f"Best model index: {best_model['model_idx']}")
print(f"Threshold: {thresh}")
print(f"TP: {tp}, FP: {fp}, TN: {tn}, FN: {fn}")
print(f"Accuracy: {accuracy:.3f}")
print(f"Sensitivity (Recall): {sensitivity:.3f}")
print(f"Specificity: {specificity:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Negative Predictive Value: {npv:.3f}")
print(f"F1 Macro Score: {f1_macro:.3f}")
print(f"AUC Score: {auc:.3f}")

=== Train Set Evaluation ===
Best model index: 0
Threshold: -2
TP: 22, FP: 100, TN: 459, FN: 7
Accuracy: 0.818
Sensitivity (Recall): 0.759
Specificity: 0.821
Precision: 0.180
Negative Predictive Value: 0.985
F1 Macro Score: 0.594
AUC Score: 0.862


In [11]:
# Load your test dataset
test_df = pd.read_csv('data_test_infection.csv')
# test_df = pd.read_csv('data_train_rebleeding.csv')
# test_df = pd.read_csv('data_train_mortality.csv')

# Define target column
target_col = "infection"
# target_col = "rebleeding"
# target_col = "mortality"

# Identify the feature columns
feature_cols = [col for col in test_df.columns if col != target_col]

# Create the feature matrix X and target vector Y
X = test_df[feature_cols].values
y = test_df[target_col].values

# Create the data dictionary as expected by RiskSLIM
data_test = {
    'variable_names': feature_cols,
    'X': X,
    'Y': y,
    'outcome_name': target_col 
}

print("Variable names:", data_test['variable_names'])
print("X shape:", data_test['X'].shape)
print("Y shape:", data_test['Y'].shape)

Variable names: ['Antibiotic_prophylaxis', 'Sex', 'HCC', 'Ascites', 'Hepatic_encephalopathy', 'Prior_SBP', 'ICU_admission', 'Age_bin_0', 'Age_bin_1', 'Blood_transfused_in_48_hours_u__bin_0', 'Blood_transfused_in_48_hours_u__bin_1', 'Platelet_count_x10_3_uL__bin_0', 'Platelet_count_x10_3_uL__bin_1', 'WBC_x10_3_uL__bin_0', 'WBC_x10_3_uL__bin_1', 'Hemoglobin_g_L__bin_0', 'Hemoglobin_g_L__bin_1', 'INR_bin_0', 'INR_bin_1', 'Na_mEq_L__bin_0', 'Na_mEq_L__bin_1', 'Creatinine_mg_L__bin_0', 'Creatinine_mg_L__bin_1', 'Bilirubin_mg_dL__bin_0', 'Bilirubin_mg_dL__bin_1', 'ALT_IU_L__bin_0', 'ALT_IU_L__bin_1', 'Albumin_g_dL__bin_0', 'Albumin_g_dL__bin_1', 'Systolic_blood_pressure_mmHg__bin_0', 'Systolic_blood_pressure_mmHg__bin_1', 'Heart_rate_beats_min__bin_0', 'Heart_rate_beats_min__bin_1', 'Hospitalization_day__bin_0', 'Hospitalization_day__bin_1', 'Etiology_of_cirrhosis_BC', 'Etiology_of_cirrhosis_HBV', 'Etiology_of_cirrhosis_HCV', 'Etiology_of_cirrhosis_NBNC', 'Etiology_of_bleeding_peptic_ulcer',

In [12]:
X_test = data_test['X']
y_test = data_test['Y']
scores_test = X_test @ beta + intercept

preds_test = np.where(scores_test >= thresh, 1, -1)

tn, fp, fn, tp = confusion_matrix(y_test, preds_test, labels=[-1, 1]).ravel()

accuracy = (tp + tn) / (tp + tn + fp + fn)
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0  # Recall
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
npv = tn / (tn + fn) if (tn + fn) > 0 else 0

# F1 scores for both classes
f1_pos = 2 * (precision * sensitivity) / (precision + sensitivity) if (precision + sensitivity) > 0 else 0
f1_neg = 2 * (specificity * npv) / (specificity + npv) if (specificity + npv) > 0 else 0
f1_macro = (f1_pos + f1_neg) / 2

# AUC Score
# labels_test should be {1, -1}, convert to {1, 0} for AUC
labels_binary = (y_test == 1).astype(int)
auc = roc_auc_score(labels_binary, scores_test)

print("=== Test Set Evaluation ===")
print(f"Best model index: {best_model['model_idx']}")
print(f"Threshold: {thresh}")
print(f"TP: {tp}, FP: {fp}, TN: {tn}, FN: {fn}")
print(f"Accuracy: {accuracy:.3f}")
print(f"Sensitivity (Recall): {sensitivity:.3f}")
print(f"Specificity: {specificity:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Negative Predictive Value: {npv:.3f}")
print(f"F1 Macro Score: {f1_macro:.3f}")
print(f"AUC Score: {auc:.3f}")

=== Test Set Evaluation ===
Best model index: 0
Threshold: -2
TP: 5, FP: 41, TN: 199, FN: 8
Accuracy: 0.806
Sensitivity (Recall): 0.385
Specificity: 0.829
Precision: 0.109
Negative Predictive Value: 0.961
F1 Macro Score: 0.530
AUC Score: 0.684


# fasterrisk method of prediction

    predict(self, X):
        y_score = (self.intercept + X.dot(self.coefficients)) / self.multiplier
        y_pred = 2 * (y_score > 0) - 1
        return y_pred

not used as it assumes the threshold to be 0

left here as reference

In [19]:
model_index = best_model['model_idx']
multiplier = multipliers[model_index]
intercept = intercepts[model_index]
coefficient = coefficients[model_index]

# print(model_index)
# print(multiplier)
# print(intercept)
# print(coefficient)

RiskScoreClassifier_m = RiskScoreClassifier(multiplier, intercept, coefficient, X_train = X_train)
y_test_pred = RiskScoreClassifier_m.predict(data_test['X'])
print("y_test are predicted to be {}".format(y_test_pred))

y_test are predicted to be [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]


In [20]:
diff = np.setdiff1d(y_test_pred, preds_test)
print(diff)

[]
