In [None]:

    
def __getitem__(self, item):
    """
    Returns:
        tensors (dict)
            'rolling_ftseq', [batch_size, 119]
            'ts', [batch_size, 239, 184]
            'statics', [batch_size, 54]
            'next_timepoint', [batch_size, 56]
            'next_timepoint_was_measured', [batch_size, 56]
            'disch_24h', [batch_size, 1]
            'disch_48h', [batch_size, 1]
            'Final Acuity Outcome', [batch_size, 1]
            'ts_mask', [batch_size, 239]
            'tasks_binary_multilabel', [batch_size, 7]
            'note_ids', [batch_size, 239, 512]
            'note_masks', [batch_size, 239, 512]
            'note_segment_ids', [batch_size, 239, 512]
            'note_hours_idx', [batch_size, 239]
            'note_hours_num', [batch_size]
    """
    # We'll use these for a bit of special processing surrounding our masked imputation task, so we
    # define them now.
    ts_vals_key, ts_is_measured_key, imputation_mask_key = 'ts_vals', 'ts_is_measured', 'ts_mask'
    temptime = time.time()
    # Loading data
    try:
        tensors = None
        if self.epoch_cached:
            tensors = self.cached_epoch[item]
        else:
            loaded, cached_item = self.load_save_path(item=item)
            if loaded: tensors = cached_item
        # print("gettime = ", time.time()-temptime)
        return tensors
        if tensors is not None:
            if self.save_data_only: return {'null': torch.zeros((1, 1))}
            tensors.update({k:v.float() for k,v in tensors.items() if v.dtype==torch.float16})

            if 'rolling_fts' in tensors:
                tensors['rolling_ftseq'] = tensors.pop('rolling_fts')

            # Now adding the mask key.
            if self.imputation_mask_rate > 0:
                any_masked = False
                while not any_masked:
                    mask_prob = np.random.uniform(size=(self.max_seq_len, 1))
                    any_masked = ((mask_prob < self.imputation_mask_rate).sum() > 0)
                tensors[imputation_mask_key] = torch.Tensor(np.where(
                    mask_prob < self.imputation_mask_rate,
                    np.ones_like(mask_prob), np.zeros_like(mask_prob)
                ))
            elif 'ts_mask' in tensors: del tensors['ts_mask']
            print("gettime = ", time.time()-temptime)
            return tensors
            # If the item was cached we have returned by now; otherwise we fall through to the dfs-based construction below.
    except:
        print(f"Failed to load item {item}")
        print(f"Save path: {self.get_save_path(item=item)}")
        raise

    # Now we actually need to create the item, but we may not have bothered to load the dataframes yet. If
    # not, we'll do that now.
    try:
        self.dfs
        # NOTE: self.dfs may not exist yet; in that case the AttributeError branch below runs.
        print_shapes = False
    except AttributeError as e:
        print(f"Failed to load item from {self.get_save_path(item=item)}. Reloading dfs and creating it.")
        assert hasattr(self, 'reload_self_dir'), f"Can't build items as lacks dfs or reload_self_dir!"

        full_self_path = os.path.join(self.reload_self_dir, f"{self.train_tune_test}_dataset.pkl")
        assert os.path.isfile(full_self_path), f"{full_self_path} doesn't exist! Can't reload dfs."

        full_dataset = depickle(full_self_path)
        self.dfs = full_dataset.dfs
        self.subjects = full_dataset.subjects
        self.orig_subjects = full_dataset.orig_subjects

        self.reset_index()
        print_shapes = True
        print("Reloaded dfs. Continuing.")

    # Icustay id is always first.
    idx = self.index[item]
    if type(idx) is tuple:
        icustay_id, end_time = idx
        start_time = max(end_time - self.max_seq_len, 0)
        seq_len = end_time - start_time
    else:
        icustay_id = idx
        if self.sequence_len:
            end_time   = self.sequence_len
            start_time = max(end_time - self.max_seq_len, 0)
            seq_len    = end_time - start_time
        else:
            max_seq_len = min(self.max_hours[item], self.max_seq_len)
            end_time    = random.randint(self.min_seq_len, self.max_hours[item]) # the end time for this patient
            start_time  = max(end_time - max_seq_len, 0) # the start time corresponding to the random_end_time
            seq_len     = end_time - start_time

    assert seq_len <= self.max_seq_len, f"seq_len is {seq_len}, which is not less than or equal to max seq_length=={max_seq_len}"

    correction_attempts = 0
    while 'rolling_fts' in self.dfs and 'rolling_ftseq' not in self.dfs:
        try:
            # print("Amending dfs to include rolling_ftseq")
            self.dfs['rolling_ftseq'] = self.dfs['rolling_fts']
            self.dfs.pop('rolling_fts', None)
        except: pass
        correction_attempts += 1
        if correction_attempts > 10:
            raise ValueError(f"Failed to correct dataframes fts v. ftseq bug!")


    # collect the indices for the patient
    idxs = {k: (df.index.get_level_values('icustay_id') == icustay_id) for k, df in self.dfs.items()}
    # We'll piggy back on our "next_timepoint" task for this imputation task. A more elegant solution
    # would be to just store the measurement indicators and use them for both this task and the
    # next timepoint prediction, but that's not how things are implemented for now.
    idxs[ts_is_measured_key] = idxs['next_timepoint_was_measured'].copy()

    # get the indices for each df between start_time and end_time
    # Note for our special case of `ts_is_measured_key` & `next_timepoint_was_measured`, we still have it
    # the case that the input features end at *<* end_time, whereas the target extractions are
    # *==* end_time, so this should be valid.
    for idxs_k, dfs_k in (
        ('ts', 'ts'), ('notes', 'notes'), (ts_is_measured_key, 'next_timepoint_was_measured'),
    ):
        if idxs_k in idxs:
            hours_in = self.dfs[dfs_k].index.get_level_values('hours_in')
            idxs[idxs_k] &= ((hours_in >= start_time) & (hours_in < end_time))


    # get the next task for predictions
    for k in [
        'rolling_tasks_binary_multilabel', 'rolling_tasks_multiclass', 'rolling_ftseq', 'next_timepoint',
        'next_timepoint_was_measured',
    ]:
        if k not in self.dfs or self.dfs[k] is None: continue
        if k in idxs: idxs[k] &= (self.dfs[k].index.get_level_values('hours_in') == end_time)

    # get the correct subset of the dfs
    dfs = {k: df.loc[idxs[k]].copy() for k, df in self.dfs.items() if df is not None}
    dfs[ts_is_measured_key] = self.dfs['next_timepoint_was_measured'].loc[idxs[ts_is_measured_key]].copy()

    # break up all of these dataframes that were processed as one into individual dfs
    for k in ('rolling_tasks_multiclass', 'static_tasks_multiclass'):
        df = dfs[k]
        for c in df.columns:
            dfs[c] = df[[c]]

        del dfs[k]

    if seq_len != len(dfs['ts']):
        print(idx, start_time, end_time, self.sequence_len)
        raise AssertionError("Length mismatch! %d v %d" % (seq_len, len(dfs['ts'])))

    # For the next timepoint, we only want the means of measured labs.
    # TODO(mmd): Is this the right place for this logic? Or should it go earlier?
    cols = dfs['next_timepoint'].columns
    # print(cols)
    mean_labs_cols = [c for c in cols if type(c) is tuple and c[1] == 'mean']
    dfs['next_timepoint'] = dfs['next_timepoint'][mean_labs_cols].fillna(value=-1)

    # Here, we pull out data for a masked imputation task. We want to store separately the continuous TS
    # values (not imputed, as we don't want to predict imputed values), indicators of whether TS vals were
    # measured (to mask out values we don't want to include in our imputation value and to predict what
    # values should be imputed at any masked timepoint), and a mask key for the entire timeseries to
    # indicate which timepoints are actually masked.

    dfs[ts_vals_key] = dfs['ts'].loc[:, mean_labs_cols].copy().fillna(0)
    # dfs[ts_is_measured_key] is already defined, based on the logic above.
    # dfs[imputation_mask_key] we'll actually construct later, in the numpy arrays directly, as it doesn't have the
    # same structure (e.g., column names) as the real dfs, we just need to match shape.


    # TS continuous is the only remaining actual timeseries feature.
    # It needs to be imputed, padded, and reshaped.
    dfs['ts'].loc[:, self.ts_continuous_cols] = self.impute_fn(
        dfs['ts'].loc[:, self.ts_continuous_cols]
    ).fillna(0) # First impute, then fill w/ 0.


    if self.using_pretrained_notes:
        dfs['notes'].loc[:, self.notes_cols] = self.impute_fn(
            dfs['notes'].loc[:, self.notes_cols]
        ).fillna(0) # First impute, then fill w/ 0.  # this is producing nans

    np_arrays = {k: df.values for k, df in dfs.items()}
    # We will deal with notes separately if we are integrating them instead of simply using pretrained embeddings
    if not self.using_pretrained_notes:
        np_arrays.pop('notes', None)

    # Now adding the mask key.
    if self.imputation_mask_rate > 0:
        any_masked = False
        while not any_masked:
            mask_prob = np.random.uniform(size=(self.max_seq_len, 1))
            any_masked = ((mask_prob < self.imputation_mask_rate).sum() > 0)
        np_arrays[imputation_mask_key] = np.where(
            mask_prob < self.imputation_mask_rate, np.ones_like(mask_prob), np.zeros_like(mask_prob)
        )

    # Padding
    for k in ('ts', ts_vals_key, ts_is_measured_key, 'notes'):
        if k in np_arrays:
            num_features = np_arrays[k].shape[1]
            if np_arrays[k].shape[0] != self.max_seq_len:
                if self.max_seq_len > seq_len:
                    pad = np.zeros((self.max_seq_len - seq_len, num_features))
                    np_arrays[k] = np.expand_dims(np.concatenate((np_arrays[k], pad)), 0)
            elif self.max_seq_len == seq_len:
                np_arrays[k] = np.expand_dims(np_arrays[k], 0)

    try:
        np_arrays['tasks_binary_multilabel'] = np.concatenate(
            [np_arrays[k] for k in self.binary_multilabel_task_concat_order], axis=1
        )
        del np_arrays['rolling_tasks_binary_multilabel']
        del np_arrays['static_tasks_binary_multilabel']
    except ValueError as e:
        print(idx, start_time, end_time, self.sequence_len)
        for k in self.binary_multilabel_task_concat_order:
            print(f"{k}: {np_arrays[k].shape}")
        raise

    # Notes
    if self.using_integrated_notes:
        raise NotImplementedError("Doesn't support notes at present.")

    tensors = {}
    for k, arr in np_arrays.items():
        #assert arr.shape[0] == 1, f"Must only have one first dimension for {k}! Got {arr.shape}"
        # print(k, arr.shape)
        if arr.shape[0] == 1: tensors[k] = torch.tensor(arr[0])
        else: tensors[k] = torch.tensor(arr)

    if self.imputation_mask_rate == 0:
        assert 'ts_mask' not in tensors, f"{item}, {idx}, {k: t.shape for k, t in tensors.items()}"
    else: