© 2024 Nokia
Licensed under the BSD 3 Clause Clear License  
SPDX-License-Identifier: BSD-3-Clause-Clear

In [1]:
import os
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt

from definitions import ROOT_DIR

In [2]:
sns.set(font_scale = 1.5)

seed = 2
tf.random.set_seed(seed)
np.random.seed(seed)

# MODEL

In [3]:
# Dataset-specific
data_folder = 'MESA'
working_directory = os.path.join('SimCLR', data_folder)
# SimCLR finetuned model
subfolder = '20230330-110822_l2_hs128_e100_esFalse_bs128_wTrue_rFalse'
model_name = 'simclr.finetuned.0.80.hdf5'
frozen_layers = ''
added_layers = 2
tag = 'finetuned2'
pretrained_model = tf.keras.models.load_model(os.path.join(working_directory, subfolder, model_name), compile=False)  # compile=False as we use the model only for inference
image_folder = os.path.join(data_folder, 'img', subfolder)
if not os.path.exists(image_folder):
    os.makedirs(image_folder)
pretrained_model.summary()

Model: "base_model_simclrlinear"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 101, 5)]          0         
_________________________________________________________________
conv1d (Conv1D)              (None, 78, 32)            3872      
_________________________________________________________________
dropout (Dropout)            (None, 78, 32)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 63, 64)            32832     
_________________________________________________________________
dropout_1 (Dropout)          (None, 63, 64)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 56, 96)            49248     
_________________________________________________________________
dropout_2 (Dropout)          (None, 56, 96)

# DATA

In [4]:
# Load preprocessed data
np_train = (np.load(os.path.join(working_directory, 'train_x.npy')),
           np.load(os.path.join(working_directory, 'train_y.npy')))
np_val = (np.load(os.path.join(working_directory, 'val_x.npy')),
           np.load(os.path.join(working_directory, 'val_y.npy')))
np_test = (np.load(os.path.join(working_directory, 'test_x.npy')),
           np.load(os.path.join(working_directory, 'test_y.npy')))

probs = pretrained_model.predict(np_test[0])
predictions = np.argmax(probs, axis=1)
# print(simclr_utitlities.evaluate_model_simple(pretrained_model.predict(np_test[0]), np_test[1], return_dict=True))

In [5]:
print("Train - Validation - Test Set Shapes:")
print("Train X: {} - Val X: {} - Test X: {}".format(np_train[0].shape, np_val[0].shape, np_test[0].shape))
print("Train y: {} - Val y: {} - Test y: {}".format(np_train[1].shape, np_val[1].shape, np_test[1].shape))

Train - Validation - Test Set Shapes:
Train X: (1449147, 101, 5) - Val X: (365497, 101, 5) - Test X: (452015, 101, 5)
Train y: (1449147, 2) - Val y: (365497, 2) - Test y: (452015, 2)


In [6]:
# os.chdir(ROOT_DIR)
subjects = pd.read_csv(os.path.join('../../datasets', data_folder, 'demographics.csv'), delimiter=';')
# subject IDs per train-validation-test set
train_listfile = pd.read_csv(os.path.join('datasets', data_folder, 'train_listfile.csv'))
val_listfile = pd.read_csv(os.path.join('datasets', data_folder, 'val_listfile.csv'))
test_listfile = pd.read_csv(os.path.join('datasets', data_folder, 'test_listfile.csv'))
# change back the working directory
os.chdir(os.path.join('code', 'baselines', 'SimCLR'))
subjects.head()

FileNotFoundError: [Errno 2] No such file or directory: 'datasets\\MESA\\train_listfile.csv'

In [None]:
test_listfile

In [None]:
# adding predictions
test_listfile.loc[:, "y_pred"] = predictions
test_listfile.loc[:, "y_prob"] = probs
test_listfile.head()

# HOW MANY SAMPLES PER USER

In [None]:
test_listfile.mesaid.value_counts()

In [None]:
print("Unique users: {} vs. Total samples: {}".format(len(test_listfile.mesaid.unique()), test_listfile.shape[0]))

In [None]:
print("Max samples per user: {} vs. Min samples per user: {}".format(test_listfile.mesaid.value_counts().max(), test_listfile.mesaid.value_counts().min()))

# AGGREGATION
Options:
Get the mode (y_true, y_pred) per user
Get the weighted fairness metric based on the #samples per user

## MODE (w/ argmax)

In [None]:
test_listfile_aggregated = test_listfile.groupby('mesaid')[['wake', 'y_pred']].agg(pd.Series.mode)
test_listfile_aggregated.head()

In [None]:
# indeed the rows are as many as the users
test_listfile_aggregated.shape[0]

## MEDIAN (w/ probabilities)

In [None]:
test_listfile_aggregated_median = test_listfile.groupby('mesaid')[['y_prob']].agg(pd.Series.median)
test_listfile_aggregated.head()

In [None]:
# merge demographics in test df
test_listfile_aggregated_median = test_listfile_aggregated_median.merge(subjects, on="mesaid", how="left")
test_listfile_aggregated_median.head()

In [None]:
# adding new age attribute
test_listfile_aggregated_median.loc[:, 'nsrr_age_gt65'] = test_listfile_aggregated_median.nsrr_age.map(lambda age: 'no' if age < 65 else 'yes')
test_listfile_aggregated_median.head()

# HOW DIFFERENT ARE THE PREDICTED DISTRIBUTIONS NOW?

In [None]:
plt.hist(test_listfile_aggregated_median.loc[test_listfile_aggregated_median.nsrr_sex == "female", "y_prob"], alpha=0.5, label='Aggregated')
plt.hist(test_listfile_aggregated_median.loc[test_listfile_aggregated_median.nsrr_sex == "female", "y_prob"], alpha=0.5, label='Raw')
plt.legend(loc='upper right')
plt.show()