# Encode data

This notebook is part of the Multi-Omics Variational autoEncoder (MOVE) framework, which uses the structure identified by the VAE to extract categorical data associations across all continuous datasets. In the MOVE paper we used it for identifying drug associations in clinical and multi-omics data. This part is a guide for encoding the data so that it can be used as input to MOVE.

In [19]:
# Import functions

import numpy as np
from collections import defaultdict
import pandas as pd
import yaml


In [20]:
# path = './data/'

In [21]:
# Functions for encoding

def encode_cat(raw_input, na='NA'):
    """
    Encodes categorical data into one-hot encoding

    Categories are expected to be coded as the integers 0 .. K-1, where K is
    the number of distinct non-missing values found in the whole input.
    Missing entries are encoded as an all-zero vector.

    inputs:
        raw_input: a list of source data sorted by IDs from baseline_ids.txt
            file (n_samples rows, each holding n_labels category codes)
        na: a string that marks missing values when raw_input holds text;
            NaN values are always treated as missing
    returns:
        data_input: one hot encoded data, a float numpy array of shape
            (n_samples, n_labels, num_classes)
    raises:
        ValueError: if raw_input is not 2-dimensional, or if a category code
            lies outside 0 .. num_classes-1
    """
    matrix = np.array(raw_input)
    if matrix.ndim != 2:
        raise ValueError(
            "encode_cat expects a 2-dimensional (samples x labels) input, "
            f"got an array with shape {matrix.shape}"
        )

    if matrix.dtype.kind in 'OU':
        # Text/object input: translate the NA marker to NaN, parse the rest.
        na_token = str(na)
        values = np.array(
            [[np.nan if str(cell).strip() == na_token else float(cell)
              for cell in row]
             for row in matrix],
            dtype=float,
        ).reshape(matrix.shape)
    else:
        values = matrix.astype(float)

    present = ~np.isnan(values)
    n_samples, n_labels = values.shape
    num_classes = np.unique(values[present]).size

    # Positions and class codes of every non-missing cell. astype(int)
    # truncates toward zero, like int(float(x)).
    rows, cols = np.nonzero(present)
    codes = values[rows, cols].astype(int)
    if codes.size and (codes.min() < 0 or codes.max() >= num_classes):
        raise ValueError(
            f"categorical codes must be integers in 0..{num_classes - 1} "
            f"(found {num_classes} distinct values, codes range from "
            f"{codes.min()} to {codes.max()})"
        )

    # Missing cells are never indexed, so they stay all-zero.
    data_input = np.zeros((n_samples, n_labels, num_classes))
    data_input[rows, cols, codes] = 1
    return data_input

def encode_con(raw_input):
    """
    Log transforms and z-normalizes the data

    NaN entries are ignored when computing the per-column mean and standard
    deviation and remain NaN in the output. A column whose standard
    deviation is NaN (e.g. all values missing) is not removed, because only
    columns with a standard deviation equal to 0 are masked out.

    Input:
        raw_input: a list of source data sorted by IDs from baseline_ids.txt file
    Returns:
        data_input: numpy array with log transformed and z-score normalized
            data; zero-variance columns are dropped
        mask_col: a np.array vector of Boolean values, one per input column,
            True for the columns that were kept
    """
    matrix = np.array(raw_input)

    # log2(x + 1) transform
    data_input = np.log2(matrix + 1)

    # remove 0 variance columns; dividing by their std would give 0/0
    mask_col = np.nanstd(data_input, axis=0) != 0
    data_input = data_input[:, mask_col]

    # z-score normalize each remaining column
    mean = np.nanmean(data_input, axis=0)
    std = np.nanstd(data_input, axis=0)
    data_input -= mean
    data_input /= std
    return data_input, mask_col


def sort_data(data, ids, labels):
    """
    Sorts data based on the ids file

    Samples listed in ids but absent from data are filled with a row of
    zeros, so the output always has one row per ID.

    Inputs:
        data: a dictionary with the data to encode (ID -> list of values)
        ids: a list of personal identfiers (ID) from baseline_ids.txt file
        labels: a list of column names from the source data file; only used
            to size the placeholder row when data is empty
    Returns:
        sorted_data: a list of source data sorted by IDs from baseline_ids.txt file
    """
    # Size the placeholder from an actual data row: the header may also
    # contain the ID column, which would make the filler one value too long
    # and the result ragged.
    if data:
        n_labels = len(next(iter(data.values())))
    else:
        n_labels = len(labels)

    sorted_data = []
    for sample_id in ids:
        if sample_id in data:
            sorted_data.append(data[sample_id])
        else:
            # A fresh list per missing sample, so rows never alias each other.
            sorted_data.append([0] * n_labels)
    return sorted_data

def read_files(path, data_type, ids_file_name, na):
    """
    Function reads the input file into the dictionary

    Blank lines in either file are skipped.

    Inputs:
        path: a string prefix prepended to both file names as-is, so a
            directory must end with a path separator
        data_type: a string that defines a name of .tsv file to encode
        ids_file_name: a string that defines a name of .txt file with one
            ID per line
        na: a string that defines how NA values are defined in the source data file
    Returns:
        ids: a list of personal identfiers (ID) from baseline_ids.txt file
        raw_input: a dictionary with the data to encode; maps the first
            column of each row to the remaining columns as floats, with NA
            values replaced by NaN
        header: a list of column names from the source data file (first
            line of the .tsv file, including the name of the ID column)
    Raises:
        ValueError: if a value is neither the NA marker nor a number
    """

    ids = []
    with open(path + f"{ids_file_name}.txt", "r") as f:
        for line in f:
            sample_id = line.rstrip()
            if sample_id:
                ids.append(sample_id)

    raw_input = {}
    with open(path + f"{data_type}.tsv", "r") as f:
        # Strip only the line terminator so the last column name is clean.
        header = f.readline().rstrip("\r\n").split("\t")
        for line in f:
            if not line.strip():
                continue
            # Keep trailing tabs: an empty last field is still a column.
            sample_id, *cells = line.rstrip("\r\n").split("\t")
            # Compare in plain Python: writing NaN into a fixed-width numpy
            # string array can truncate it to an unparsable token.
            vals = []
            for cell in cells:
                cell = cell.strip()
                vals.append(np.nan if cell == na else float(cell))
            raw_input[sample_id] = vals

    return ids, raw_input, header

def generate_file(var_type, path, data_type, ids_file_name, na='NA'):
    """
    Function encodes source data type and saves the file

    Reads {path}{data_type}.tsv and {path}{ids_file_name}.txt, orders the
    samples by the IDs file, encodes them and writes the result to
    {path}{data_type}.npy.

    inputs:
        var_type: a string out of ['categorical', 'continuous'], defines input data type to encode
        path: a string that defines a path to the directory the input data is stored
        data_type: a string that defines a name of .tsv file to encode
        ids_file_name: a string that defines a name of .txt file with the IDs
        na: a string that defines how NA values are defined in the source data file
    raises:
        ValueError: if var_type is not 'categorical' or 'continuous'
    """
    # Validate before any file I/O, so a typo fails fast and clearly.
    if var_type not in ('categorical', 'continuous'):
        raise ValueError(
            "var_type must be 'categorical' or 'continuous', "
            f"got {var_type!r}"
        )

    ids, raw_input, header = read_files(path, data_type, ids_file_name, na)
    sorted_data = sort_data(raw_input, ids, header)

    if var_type == 'categorical':
        # read_files has already turned the NA marker into float NaN, hence
        # 'nan' and not the original marker.
        data_input = encode_cat(sorted_data, 'nan')
    else:
        # The zero-variance column mask is not needed here.
        data_input, _ = encode_con(sorted_data)

    np.save(path + f"{data_type}.npy", data_input)

To encode the data, each dataset/data type must be in an N x M format, where N is the number of samples/individuals and M is the number of features. To use the dataset-specific weighting when training the VAE, you need to process the datasets individually or split them when you read them in. The continuous data is z-score normalised and the categorical data is one-hot encoded. Below is an example of processing a continuous dataset and two categorical datasets with different numbers of categories. To ensure the correct order, the IDs are used for sorting the data accordingly.

In [30]:
if __name__ == "__main__":

    # Reads the data as dictionary
    # NOTE(review): yaml.load with FullLoader accepts more than plain YAML
    # types. data.yaml is presumably a trusted local config file; if it can
    # come from elsewhere, switch to yaml.safe_load.
    with open(r'data.yaml') as file:
        data_dict = yaml.load(file, Loader=yaml.FullLoader)

    # Takes variables from the read file
    path = data_dict['path']
    ids_file_name = data_dict['ids_file_name']
    na_encoding = data_dict['na_encoding']

    # A missing or empty list means "nothing of this kind to encode", so a
    # config with only one kind of data still works.
    categorical_files = data_dict.get('categorical_data_files') or []
    continuous_files = data_dict.get('continuous_data_files') or []

    # Encodes categorical data
    for cat_data in categorical_files:
        generate_file('categorical', path,
                      cat_data, ids_file_name, na_encoding)
        print(f'Encoded {cat_data}')

    # Encodes continuous data
    for con_data in continuous_files:
        generate_file('continuous', path, con_data, ids_file_name, na_encoding)
        print(f'Encoded {con_data}')
    

Encoded diabetes_genotypes
Encoded baseline_drugs
Encoded baseline_categorical
Encoded baseline_continuous
Encoded baseline_transcriptomics
Encoded baseline_diet_wearables
Encoded baseline_proteomic_antibodies
Encoded baseline_target_metabolomics
Encoded baseline_untarget_metabolomics
Encoded baseline_metagenomics
