# Encode data

This notebook runs part of the Multi-Omics Variational autoEncoder (MOVE) framework, which uses the structure identified by the VAE to extract associations of categorical data across all continuous datasets. In the MOVE paper we used it for identifying drug associations in clinical and multi-omics data. This part is a guide for encoding the data so that it can be used as input to MOVE.

In [2]:
# Import functions

import numpy as np
from collections import defaultdict

In [3]:
import pandas as pd
# Load two of the raw tab-separated tables (first row is the header) so they
# can be inspected side by side.
file_bad, file_good = (
    pd.read_csv(tsv_path, sep='\t', header=0)
    for tsv_path in (
        './data/baseline_metagenomics.tsv',
        './data/baseline_untarget_metabolomics.tsv',
    )
)

In [4]:
# print(file_bad)

In [5]:
# Base directory of the project. File locations are built by plain string
# concatenation (e.g. path + "data/baseline_ids.txt"), so the value must end
# with a path separator.
path = './'

In [6]:
# Functions for encoding

def encode_cat(raw_input, num_classes=None, uniques=None, na='NA'):
    """One-hot encode a 2-D table of categorical values.

    Parameters
    ----------
    raw_input : array-like, 2-D (samples x labels)
        Categorical values. They may be strings or numbers.
    num_classes : int, optional
        Length of each one-hot vector. When ``uniques`` is given and
        ``num_classes`` is None, the number of non-missing entries in
        ``uniques`` is used. When ``uniques`` is None, the widest column
        decides the length and ``num_classes`` only acts as a lower bound.
    uniques : sequence, optional
        Category codes shared by every column, in the order of their one-hot
        position. Entries equal to ``na`` are ignored. When omitted, the
        categories are derived independently for every column from the sorted
        unique values found in that column.
    na : str
        Marker of a missing value. A value is treated as missing when its
        string form equals ``na`` or when it is a floating point NaN.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_samples, n_labels, num_classes). Missing values are
        encoded as all-zero vectors.

    Raises
    ------
    ValueError
        If ``uniques`` holds more categories than ``num_classes``, or if a
        value that is not listed in ``uniques`` is encountered.
    """
    matrix = np.array(raw_input)
    print(f'unique: {np.unique(matrix)}')
    n_samples = matrix.shape[0]
    n_labels = matrix.shape[1]

    def is_missing(value):
        # The marker is a string, but numeric input carries real NaN values
        # which never compare equal to anything, so both forms are checked.
        if str(value) == str(na):
            return True
        try:
            return bool(np.isnan(value))
        except TypeError:
            return False

    def category_key(value):
        # 1, 1.0 and '1.0' must all address the same category.
        try:
            return int(float(value))
        except (TypeError, ValueError):
            return value

    if uniques is None:
        # Per-column vocabularies: position = rank among the sorted,
        # non-missing unique values of that column.
        positions = []
        for col in range(n_labels):
            classes = sorted(
                value for value in np.unique(matrix[:, col])
                if not is_missing(value)
            )
            positions.append({value: idx for idx, value in enumerate(classes)})

        # All columns share one tensor, so it has to fit the widest column.
        widest = max((len(mapping) for mapping in positions), default=0)
        num_classes = max(num_classes or 0, widest)

        data_input = np.zeros((n_samples, n_labels, num_classes))
        for i, patient in enumerate(matrix):
            for j, lab in enumerate(patient):
                if is_missing(lab):
                    continue  # missing -> all-zero vector
                data_input[i, j, positions[j][lab]] = 1
        return data_input

    # Shared vocabulary supplied by the caller.
    classes = [category_key(u) for u in uniques if not is_missing(u)]
    if num_classes is None:
        num_classes = len(classes)
    if len(classes) > num_classes:
        raise ValueError(
            f'{len(classes)} categories were given but num_classes is {num_classes}'
        )
    position = {cls: idx for idx, cls in enumerate(classes)}

    data_input = np.zeros((n_samples, n_labels, num_classes))
    for i, patient in enumerate(matrix):
        for j, lab in enumerate(patient):
            if is_missing(lab):
                continue  # missing -> all-zero vector
            key = category_key(lab)
            if key not in position:
                raise ValueError(
                    f'value {lab!r} (sample {i}, label {j}) is not one of the '
                    f'expected categories {classes}'
                )
            data_input[i, j, position[key]] = 1

    return data_input

def encode_con(raw_input):
    """Log-transform and z-score normalise a table of continuous values.

    The input is transformed with ``log2(x + 1)``, columns whose standard
    deviation (ignoring NaN) is exactly zero are dropped, and every remaining
    column is centred and scaled with its NaN-ignoring mean and standard
    deviation.

    Parameters
    ----------
    raw_input : array-like, 2-D (samples x features)
        Numeric values; NaN marks a missing measurement.

    Returns
    -------
    data_input : numpy.ndarray
        Normalised values of the retained columns.
    mask_col : numpy.ndarray of bool
        One entry per input column, True where the column was kept.
    """
    matrix = np.array(raw_input)
    print(f'unique: {np.unique(matrix)}')

    data_input = np.log2(matrix + 1)

    # remove 0 variance (such columns would divide by zero below)
    mask_col = np.nanstd(data_input, axis=0) != 0
    data_input = data_input[:, mask_col]

    # z-score normalize; both statistics are taken before the data is modified
    mean = np.nanmean(data_input, axis=0)
    std = np.nanstd(data_input, axis=0)
    data_input -= mean
    data_input /= std
    return data_input, mask_col

def sort_data(data, ids, labels):
    """Arrange the rows of ``data`` in the order given by ``ids``.

    Parameters
    ----------
    data : mapping
        Maps an ID to its row of values.
    ids : iterable
        IDs in the desired output order.
    labels : sized
        Only its length is used: it is the number of zeros in the placeholder
        row that stands in for an ID missing from ``data``.

    Returns
    -------
    list
        One entry per ID: the stored row when the ID is known, otherwise a
        list of ``len(labels)`` zeros.
    """
    width = len(labels)
    return [
        data[sample_id] if sample_id in data else [0] * width
        for sample_id in ids
    ]

To encode the data, each dataset/data type must be in an N x M format, where N is the number of samples/individuals and M is the number of features. To use the dataset-specific weighting when training the VAE, you need to process the datasets individually or split them when you read them in. The continuous data is z-score normalised and the categorical data is one-hot encoded. Below is an example of processing a continuous dataset and two categorical datasets with different numbers of categories. To ensure the correct order, the IDs are used to sort the data accordingly.

In [25]:
def generate_cat_file(data_type, num_classes): #Todo make separate get IDs
    """One-hot encode a categorical dataset and save it as a ``.npy`` file.

    Reads the sample IDs from ``data/baseline_ids.txt`` and the table from
    ``data/<data_type>.tsv`` (both below the global ``path``), orders the rows
    by sample ID, encodes them with ``encode_cat`` and writes the result to
    ``data/<data_type>.npy``.

    Parameters
    ----------
    data_type : str
        Base name of the TSV file. Its first row is a header and the first
        column of each row holds the sample ID; ``NA`` marks a missing value.
    num_classes : int
        Number of categories; the valid codes are ``0 .. num_classes - 1``.

    Raises
    ------
    ValueError
        If the table contains a value other than the valid codes or a
        missing value.
    """
    # The order of the IDs defines the row order of the saved array.
    with open(path + "data/baseline_ids.txt", "r") as f:
        ids = [line.rstrip() for line in f]

    raw_input = dict()
    with open(path + f"data/{data_type}.tsv", "r") as f:
        # Drop the line break and the ID column so that the header length
        # equals the number of values per row; sort_data uses that length for
        # the placeholder rows of samples that are absent from the table.
        header = f.readline().rstrip().split("\t")[1:]
        for line in f:
            fields = line.rstrip().split("\t")
            if not fields[0]:
                continue  # blank line
            raw_input[fields[0]] = [
                float('nan') if value == 'NA' else float(value)
                for value in fields[1:]
            ]

    # Sort the data
    sorted_data = sort_data(raw_input, ids, header)
    matrix = np.array(sorted_data, dtype=float)

    # Every observed code has to be a valid one. Missing values are allowed,
    # and a valid code does not have to occur in the data.
    observed = np.unique(matrix[~np.isnan(matrix)])
    if not np.isin(observed, np.arange(num_classes)).all():
        raise ValueError(f'In generate_cat_file() provided num_classes does not correspond to given dataset for {data_type} datatype')

    print(f'unique: {np.unique(matrix)}')

    # encode_cat recognises missing values by comparing against the marker
    # string, so the matrix is handed over as text ('1.0', 'nan', ...); a
    # floating point NaN would never compare equal to the marker.
    uniques = [*range(num_classes), 'nan']
    data_input = encode_cat(matrix.astype(str), num_classes, uniques, 'nan')
    np.save(path + f"data/{data_type}.npy", data_input)
    
    
# Encode every categorical dataset; the number is the category count of the
# respective dataset.
for dataset_name, n_categories in (
    ('diabetes_genotypes', 3),
    ('baseline_drugs', 2),
    ('baseline_categorical', 5),
):
    generate_cat_file(dataset_name, n_categories)


ValueError: In generate_cat_file() provided num_classes does not correspond to given dataset for diabetes_genotypes datatype

In [None]:
def generate_con_file(data_type):
    """Normalise a continuous dataset and save it as a ``.npy`` file.

    Reads the sample IDs from ``data/baseline_ids.txt`` and the table from
    ``data/<data_type>.tsv`` (both below the global ``path``), orders the rows
    by sample ID, encodes them with ``encode_con`` and writes the encoded
    values to ``data/<data_type>.npy``.

    Parameters
    ----------
    data_type : str
        Base name of the TSV file. Its first row is a header and the first
        column of each row holds the sample ID; ``NA`` marks a missing value.
    """
    # The order of the IDs defines the row order of the saved array.
    with open(path + "data/baseline_ids.txt", "r") as f:
        ids = [line.rstrip() for line in f]

    raw_input = dict()
    with open(path + f"data/{data_type}.tsv", "r") as f:
        # Drop the line break and the ID column so that the header length
        # equals the number of values per row; sort_data uses that length for
        # the placeholder rows of samples that are absent from the table.
        header = f.readline().rstrip().split("\t")[1:]
        for line in f:
            fields = line.rstrip().split("\t")
            if not fields[0]:
                continue  # blank line
            raw_input[fields[0]] = [
                float('nan') if value == 'NA' else float(value)
                for value in fields[1:]
            ]

    sorted_data = sort_data(raw_input, ids, header)

    # Encode continuous; the column mask returned alongside is not needed here.
    data_input, _ = encode_con(sorted_data)

    # Store the encoded values, not the raw table.
    np.save(path + f"data/{data_type}.npy", data_input)

# Encode every continuous dataset, in this order.
for dataset_name in (
    'baseline_continuous',
    'baseline_transcriptomics',
    'baseline_diet_wearables',
    'baseline_proteomic_antibodies',
    'baseline_target_metabolomics',
    'baseline_untarget_metabolomics',
    'baseline_metagenomics',
):
    generate_con_file(dataset_name)
