# Encode data

This notebook runs part of the Multi-Omics Variational autoEncoder (MOVE) framework, which uses the structure the VAE has identified to extract categorical data associations across all continuous datasets. In the MOVE paper we used it to identify drug associations in clinical and multi-omics data. This part is a guide to encoding the data so that it can be used as input to MOVE.

In [None]:
# Import functions
import numpy as np
from collections import defaultdict

In [None]:
# Functions for encoding

def encode_cat(raw_input, num_classes=None, uniques=None, na='NA'):
   """One-hot encode a samples x labels table of categorical values.

   Two modes are supported:

   * ``uniques is None``: the categories of every column are inferred from
     the data (sorted order, ``na`` excluded), so each column gets its own
     value-to-index mapping. The one-hot width is the largest number of
     categories found in any column; narrower columns are zero-padded.
   * ``uniques`` given: the same categories are used for all columns. The
     position of a value among the non-``na`` entries of ``uniques`` is its
     one-hot index, and every non-``na`` cell is converted with
     ``int(float(value))`` before it is looked up.

   Cells equal to ``na`` are encoded as an all-zero vector in both modes.

   Args:
      raw_input: 2-D array-like of shape (n_samples, n_labels).
      num_classes: Width of the one-hot axis. When None it is derived from
         the categories; when given it must be at least as large as the
         number of categories.
      uniques: Optional sequence of allowed categories (may include ``na``).
      na: Marker used for missing values.

   Returns:
      numpy.ndarray of shape (n_samples, n_labels, num_classes).

   Raises:
      ValueError: If the input is not 2-D, ``num_classes`` is too small, or
         a cell holds a value that is not among the known categories.
   """
   matrix = np.array(raw_input)
   if matrix.ndim != 2:
      raise ValueError("raw_input must be a 2-D samples x labels table")
   n_samples, n_labels = matrix.shape

   # Build one {category: one-hot index} mapping per column
   if uniques is None:
      class_index = []
      for lab in range(n_labels):
         categories = [u for u in sorted(np.unique(matrix[:, lab])) if u != na]
         class_index.append({u: k for k, u in enumerate(categories)})
      required = max((len(index) for index in class_index), default=0)
   else:
      categories = [u for u in uniques if u != na]
      shared_index = {u: k for k, u in enumerate(categories)}
      class_index = [shared_index] * n_labels
      required = len(categories)

   if num_classes is None:
      num_classes = required
   elif num_classes < required:
      raise ValueError(
         "num_classes=%d is smaller than the %d categories to encode"
         % (num_classes, required))

   # Encode the data; missing values keep their all-zero row
   data_input = np.zeros((n_samples, n_labels, num_classes))
   for i, patient in enumerate(matrix):
      for j, lab in enumerate(patient):
         if lab == na:
            continue
         if uniques is not None:
            # Cells may arrive as strings such as '1' or '1.0'
            lab = int(float(lab))
         try:
            data_input[i, j, class_index[j][lab]] = 1
         except KeyError:
            raise ValueError(
               "unknown category %r in row %d, column %d" % (lab, i, j)
            ) from None

   return data_input

def encode_con(raw_input):
   """Log-transform and z-score normalise a samples x features matrix.

   The values are transformed with ``log2(x + 1)``, features whose standard
   deviation (ignoring NaN) is exactly zero are dropped, and the remaining
   features are centred and scaled per column using NaN-aware mean and
   standard deviation.

   Args:
      raw_input: 2-D array-like of numeric values, shape
         (n_samples, n_features). NaN marks missing values.

   Returns:
      A tuple ``(data_input, mask_col)`` where ``data_input`` is the
      normalised array restricted to the kept features and ``mask_col`` is a
      boolean array over the original features that is True for every
      feature that was kept.
   """
   matrix = np.array(raw_input, dtype=float)

   data_input = np.log2(matrix + 1)

   # Remove features with zero variance; they cannot be scaled
   std = np.nanstd(data_input, axis=0)
   mask_col = std != 0
   data_input = data_input[:, mask_col]

   # z-score normalise each remaining feature
   mean = np.nanmean(data_input, axis=0)
   std = np.nanstd(data_input, axis=0)
   data_input = (data_input - mean) / std

   return data_input, mask_col

def sort_data(data, ids, labels):
   """Return the rows of ``data`` arranged in the order given by ``ids``.

   Args:
      data: Mapping from sample ID to that sample's row of values.
      ids: Sample IDs in the desired output order.
      labels: Feature labels; only their count is used.

   Returns:
      A list with one entry per ID: the stored row when the ID is present in
      ``data``, otherwise a new list of ``len(labels)`` zeros.
   """
   width = len(labels)
   # Samples without an entry in `data` are padded with zeros
   return [data[key] if key in data else [0] * width for key in ids]

For encoding the data you need to have each dataset/data type in an N x M format, where N is the number of samples/individuals and M is the number of features. To use the dataset-specific weighting when training the VAE you need to process the datasets individually or split them when you read them in. The continuous data is z-score normalised and the categorical data is one-hot encoded. Below is an example of processing a continuous dataset and two categorical datasets with different numbers of categories. To ensure the correct order, the IDs are used to sort the data accordingly.

In [None]:
# Get ids to sort the data
with open(path + "data/baseline_ids.txt", "r") as f:
   ids = [line.rstrip() for line in f]

# Encode continuous
# Rows are stored in a dict keyed by sample ID (first column) so that
# sort_data can look each sample up by its ID.
raw_input = dict()
with open(path + "data/baseline_transcriptomics.tsv", "r") as f:
   header = f.readline()
   for line in f:
      tmp = line.rstrip().split("\t")
      # 'NA' marks a missing measurement and is kept as NaN
      raw_input[tmp[0]] = [np.nan if v == 'NA' else float(v) for v in tmp[1:]]

# Keep the feature names only, so their count matches the values per row.
# NOTE(review): assumes the header row also names the ID column first, like
# the data rows do - confirm against the file.
header = header.rstrip("\n").split("\t")[1:]

# Pass the dict itself: wrapping it in np.array() makes every ID lookup fail
sorted_data = sort_data(raw_input, ids, header)
data_input, mask = encode_con(np.array(sorted_data))
# Save the normalised data, not the raw sorted values
np.save(path + "data/baseline_transcriptomics.npy", data_input)

In [None]:
# Encode categorical data with two categories
# Rows are stored in a dict keyed by sample ID (first column) so that
# sort_data can look each sample up by its ID.
raw_input = dict()
with open(path + "data/baseline_drugs.tsv", "r") as f:
   header = f.readline()
   for line in f:
      tmp = line.rstrip().split("\t")
      raw_input[tmp[0]] = tmp[1:]

# Keep the feature names only, so their count matches the values per row.
# NOTE(review): assumes the header row also names the ID column first, like
# the data rows do - confirm against the file.
header = header.rstrip("\n").split("\t")[1:]

# Set the number of classes and categories
num_classes = 2
uniques = [0, 1, 'nan']

# Sort and encode the data
# NOTE(review): sort_data fills samples that are absent from the file with 0,
# which encode_cat then encodes as category 0 rather than as missing -
# confirm this is intended.
sorted_data = sort_data(raw_input, ids, header)
data_input = encode_cat(np.array(sorted_data), num_classes, uniques, 'nan')
np.save(path + "data/baseline_drugs.npy", data_input)

In [None]:
# Encode categorical data with three categories
# Rows are stored in a dict keyed by sample ID (first column) so that
# sort_data can look each sample up by its ID.
raw_input = dict()
with open(path + "data/diabetes_genotypes_all.tsv", "r") as f:
   header = f.readline()
   for line in f:
      tmp = line.rstrip().split("\t")
      raw_input[tmp[0]] = tmp[1:]

# Keep the feature names only, so their count matches the values per row.
# NOTE(review): assumes the header row also names the ID column first, like
# the data rows do - confirm against the file.
header = header.rstrip("\n").split("\t")[1:]

# Set the number of classes and categories
num_classes = 3
uniques = [0, 1, 2, 'nan']

# Sort and encode the data
# NOTE(review): sort_data fills samples that are absent from the file with 0,
# which encode_cat then encodes as category 0 rather than as missing -
# confirm this is intended.
sorted_data = sort_data(raw_input, ids, header)
data_input = encode_cat(np.array(sorted_data), num_classes, uniques, 'nan')
np.save(path + "data/diabetes_genotypes.npy", data_input)