In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from opacus import PrivacyEngine
import os
import pandas as pd
import numpy as np
from utils import preprocess_data, train_private_logreg, train_logreg, test_private_logreg
import matplotlib.pyplot as plt
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import shutup

# Train/Test data

In [2]:
def _record_ids(csv_path):
    """Return the record ID for every row of the cleaned-source CSV at ``csv_path``.

    The ID is derived from the ``path`` column: the last '/'-separated
    component, cut at its first '.', then cut at its first '_'.
    """
    paths = pd.read_csv(csv_path)['path']
    # Iterating the column directly avoids the per-row Series that iterrows() builds.
    return [p.split('/')[-1].split('.')[0].split('_')[0] for p in paths]


# Record IDs listed in the cleaned CSVs, pooled over all training sources.
train_sources = ['chapman', 'cpsc', 'ptb', 'sph']
train_ids = []
for source in train_sources:
    train_ids.extend(_record_ids(f'data/sources_clean_zoher/clean_all_{source}.csv'))

# Record IDs of the single source used for testing.
test_source = 'g12ec'
test_ids = _record_ids(f'data/sources_clean_zoher/clean_all_{test_source}.csv')

# Read every training feature table and concatenate once; growing a frame with
# concat inside a loop re-copies all previously accumulated rows each time.
train_feats = pd.concat(
    [pd.read_csv(f'data/{source}_feats.csv')
     for source in ['ChapmanShaoxing_Ningbo', 'CPSC_CPSC-Extra', 'PTB_PTBXL', 'SPH']],
    ignore_index=True,
)
# Keep only the feature rows whose ID appears in the cleaned training lists.
train_feats = train_feats[train_feats['id'].isin(train_ids)]

df_test = pd.read_csv('data/G12EC_feats.csv')
test_feats = df_test[df_test['id'].isin(test_ids)]

X_train, y_train = preprocess_data(train_feats)
X_test, y_test = preprocess_data(test_feats)

print(X_train.shape, X_test.shape)

(94192, 22) (8827, 22)


# Training/Testing

In [None]:
# Hyperparameters shared by every model trained in this cell.
num_epochs = 50
batch_size = 20000
lr = 50

# One private model per privacy budget, stored under the key 'e_<epsilon>'.
models = {}
for epsilon in (1, 10, 100):
    models[f'e_{epsilon}'] = train_private_logreg(
        X_train, y_train,
        epsilon=epsilon, num_epochs=num_epochs, batch_size=batch_size, lr=lr,
    )

# Baseline trained with train_logreg, using the same hyperparameters.
models['e_inf'] = train_logreg(
    X_train, y_train, num_epochs=num_epochs, batch_size=batch_size, lr=lr
)

# Score each model on the training data, then on the test data.
train_scores = {}
test_scores = {}
for name, fitted in models.items():
    train_scores[name] = test_private_logreg(fitted, X_train, y_train)
    test_scores[name] = test_private_logreg(fitted, X_test, y_test)

# Two-column report: one row per model.
print('         train scores                                    test scores')
for name in train_scores:
    label = f'{name}  '[:5]  # pad with two spaces, then cut to five characters
    print(f'{label}   {train_scores[name]}      {test_scores[name]}')


Epoch: 1, Loss: 0.4046, Epsilon: 99.99

In [4]:
import pickle
import random

import numpy as np

# Ten fixed seeds; each one drives a full train/evaluate repetition below.
fixed_seeds = [42, 7, 123, 2023, 999, 314, 0, 88, 17, 555]

num_epochs = 500
batch_size = int(0.5*len(X_train))  # half of the training rows per batch
lr = 20
epsilons = [1, 10, 100]

all_results = []

for seed in fixed_seeds:
    # Seed every RNG the training helpers may draw from. Seeding NumPy alone
    # leaves torch's generator unseeded, so repeated runs with the same seed
    # would not reproduce.
    # NOTE(review): assumes train_private_logreg / train_logreg rely on the
    # global torch RNG (the notebook imports torch and opacus) - confirm in utils.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    models = {}

    # One private model per privacy budget, keyed 'e_<epsilon>'.
    for epsilon in epsilons:
        model = train_private_logreg(X_train, y_train, epsilon=epsilon,
                                     num_epochs=num_epochs, batch_size=batch_size, lr=lr)
        models[f'e_{epsilon}'] = model

    # Baseline trained with train_logreg on the same hyperparameters.
    models['e_inf'] = train_logreg(X_train, y_train, num_epochs=num_epochs,
                                   batch_size=batch_size, lr=lr)

    train_scores = {}
    test_scores = {}

    for k in models:
        train_scores[k] = test_private_logreg(models[k], X_train, y_train)
        test_scores[k] = test_private_logreg(models[k], X_test, y_test)

    all_results.append({
        'seed': seed,
        'train_scores': train_scores,
        'test_scores': test_scores
    })

# Persist the per-seed scores so they can be analysed without retraining.
with open('logreg_results.pkl', 'wb') as f:
    pickle.dump(all_results, f)


Epoch: 500, Loss: 0.1690, Epsilon: 100.00

In [6]:
# Display the list of per-seed result dicts (seed, train_scores, test_scores).
all_results

[{'seed': 42,
  'train_scores': {'e_1': {'macro_auc': 0.7787, 'micro_auc': 0.9031},
   'e_10': {'macro_auc': 0.7981, 'micro_auc': 0.9079},
   'e_100': {'macro_auc': 0.7975, 'micro_auc': 0.908},
   'e_inf': {'macro_auc': 0.7985, 'micro_auc': 0.9081}},
  'test_scores': {'e_1': {'macro_auc': 0.7092, 'micro_auc': 0.7958},
   'e_10': {'macro_auc': 0.7192, 'micro_auc': 0.815},
   'e_100': {'macro_auc': 0.718, 'micro_auc': 0.8114},
   'e_inf': {'macro_auc': 0.7177, 'micro_auc': 0.8063}}},
 {'seed': 7,
  'train_scores': {'e_1': {'macro_auc': 0.7768, 'micro_auc': 0.903},
   'e_10': {'macro_auc': 0.798, 'micro_auc': 0.908},
   'e_100': {'macro_auc': 0.7982, 'micro_auc': 0.908},
   'e_inf': {'macro_auc': 0.7987, 'micro_auc': 0.908}},
  'test_scores': {'e_1': {'macro_auc': 0.7032, 'micro_auc': 0.7533},
   'e_10': {'macro_auc': 0.7185, 'micro_auc': 0.8154},
   'e_100': {'macro_auc': 0.7182, 'micro_auc': 0.8111},
   'e_inf': {'macro_auc': 0.7214, 'micro_auc': 0.8105}}},
 {'seed': 123,
  'train_score

47096

In [11]:
# One line per seed: the test-set scores of every model trained for that seed.
for seed_result in all_results:
    print(seed_result['test_scores'])

{'e_1': {'macro_auc': 0.7092, 'micro_auc': 0.7958}, 'e_10': {'macro_auc': 0.7192, 'micro_auc': 0.815}, 'e_100': {'macro_auc': 0.718, 'micro_auc': 0.8114}, 'e_inf': {'macro_auc': 0.7177, 'micro_auc': 0.8063}}
{'e_1': {'macro_auc': 0.7032, 'micro_auc': 0.7533}, 'e_10': {'macro_auc': 0.7185, 'micro_auc': 0.8154}, 'e_100': {'macro_auc': 0.7182, 'micro_auc': 0.8111}, 'e_inf': {'macro_auc': 0.7214, 'micro_auc': 0.8105}}
{'e_1': {'macro_auc': 0.7043, 'micro_auc': 0.7448}, 'e_10': {'macro_auc': 0.7198, 'micro_auc': 0.8136}, 'e_100': {'macro_auc': 0.7203, 'micro_auc': 0.8151}, 'e_inf': {'macro_auc': 0.719, 'micro_auc': 0.8097}}
{'e_1': {'macro_auc': 0.712, 'micro_auc': 0.8078}, 'e_10': {'macro_auc': 0.7177, 'micro_auc': 0.8152}, 'e_100': {'macro_auc': 0.7162, 'micro_auc': 0.8105}, 'e_inf': {'macro_auc': 0.7206, 'micro_auc': 0.8092}}
{'e_1': {'macro_auc': 0.7097, 'micro_auc': 0.7651}, 'e_10': {'macro_auc': 0.72, 'micro_auc': 0.8102}, 'e_100': {'macro_auc': 0.7189, 'micro_auc': 0.8099}, 'e_inf': 

In [14]:
# Half the number of training rows, truncated to an int - the same expression
# assigned to batch_size for the seeded runs.
int(0.5*len(X_train))

47096

# Grid search

In [None]:
# NOTE(review): preprocess_data is called here without a feature frame, unlike
# the data-loading cell - confirm this matches its signature in utils.
X, y = preprocess_data(test_size=0)
param_grid = {
    'num_epochs': [10, 20, 50, 100, 200, 500],
    'batch_size': [0.05, 0.1, 0.2, 0.5],  # fraction of the fold's training rows
    'lr': [0.1, 1, 5, 10, 20, 50]
}

mskf = MultilabelStratifiedKFold(n_splits=3, shuffle=True, random_state=42)
scores = {}       # '<num_epochs>_<batch_size>_<lr>' -> list of per-fold macro AUCs
best_score = 0
best_prms = None
# Loop and fold variables use their own names so the notebook-level
# X_train/X_test/y_train/y_test and num_epochs/batch_size/lr are not overwritten
# by grid values or CV folds when this cell runs.
for n_epochs in param_grid['num_epochs']:
    for batch_frac in param_grid['batch_size']:
        for step_size in param_grid['lr']:
            key = f'{n_epochs}_{batch_frac}_{step_size}'
            scores[key] = []
            for train_index, test_index in mskf.split(X, y):
                X_fold_train, X_fold_val = X.iloc[train_index], X.iloc[test_index]
                y_fold_train, y_fold_val = y.iloc[train_index], y.iloc[test_index]
                fold_model = train_private_logreg(
                    X_fold_train, y_fold_train, epsilon=1, num_epochs=n_epochs,
                    batch_size=int(batch_frac*len(X_fold_train)), lr=step_size)
                macro_auc = test_private_logreg(fold_model, X_fold_val, y_fold_val)['macro_auc']
                scores[key].append(macro_auc)
            # Track the best mean macro AUC across the folds seen so far.
            mean_score = np.mean(scores[key])
            if mean_score > best_score:
                best_score = mean_score
                best_prms = {'num_epochs': n_epochs,
                             'batch_size': batch_frac,
                             'lr': step_size}
                print(best_prms, best_score)