## Unsupervised Exploratory approach with TensorFlow

### Setup

In [39]:
import os
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd

### Paths

In [40]:
# Python file holding the project-wide configuration
glob_conf_path = '../../config/global_config_paper.py'

# Folder the input CSV files are read from
input_path = '../output/'

# Root folder for the files written by this notebook
output_path = '../../4_unsupervised_learning/exploratory_approach/'

## Load global variables

In [41]:
# Execute the global configuration file inside this notebook's namespace so that the
# names it defines become notebook globals (presumably where label_id_to_short_label,
# used further down, comes from -- TODO confirm).
# NOTE(review): exec runs arbitrary code; glob_conf_path must only ever point to a
# trusted project file.
with open(glob_conf_path) as config_file:  # the context manager closes the handle
    config_source = config_file.read()

# Compiling with the real file name makes tracebacks point at the configuration file
exec(compile(config_source, glob_conf_path, 'exec'))

## Create tsv files
To explore data using https://projector.tensorflow.org/

* Vectors:  
    Example of 3 vectors with dimension 4:  
    0.1\t0.2\t0.5\t0.9  
    0.2\t0.1\t5.0\t0.2  
    0.4\t0.1\t7.0\t0.8  
        
    
* Metadata  
    Example of 3 data points and 2 columns.  
    Note: If there is more than one column, the first row will be parsed as column labels.  
    actor_id\temotion_id  
    1\ttri  
    8\tint  
    6\tsur

In [42]:
import csv

# Read the encoded multimodal features and the metadata table
multi_df = pd.read_csv(input_path + 'multimodal_norm_encoded.csv')
metadata_df = pd.read_csv(input_path + 'metadata_df.csv')
metadata_df = metadata_df.drop(columns='sex.1')

# Important categorical features from Early Fusion
imp_features = multi_df.loc[:, [
    'ob_diag',
    'marital_status_single',
    'hc_inst_private',
    'has_paid_TV',
]]

# Place the selected features next to the metadata columns
metadata_df = pd.concat([metadata_df, imp_features], axis=1)

# Convert BMI_SD to a weight-status label (Underweight, Normal, Overweight or Obese)
# Function that returns the interpretation of the cutoffs based on the number of standard deviations (SD).
def interpret_BMI_z_score(SD:float) -> str:
    """Return the weight-status label for a BMI-for-age z-score.

    The score is the number of Standard Deviations (SD) taken from the BMI-for-age
    tables for children and adolescents from 5 to 19 years published by the
    World Health Organization (WHO).
    Available: https://www.who.int/tools/growth-reference-data-for-5to19-years/indicators/bmi-for-age.

    Cutoffs applied by this function:
    - SD < -1.0         -> "Underweight"
    - -1.0 <= SD < 1.0  -> "Normal"
    - 1.0 <= SD < 2.0   -> "Overweight"
    - SD >= 2.0         -> "Obese"

    NOTE(review): the WHO reference defines thinness as SD < -2; this function uses
    -1.0 as the lower cutoff -- confirm that this is intended.

    Argument:
    - SD: standard deviation calculated from the z-score tables.

    Returns:
    - One of "Underweight", "Normal", "Overweight" or "Obese".

    Raises:
    - ValueError: if SD is NaN, which cannot be compared against the cutoffs.
    """
    # Guard clauses checked from the lowest cutoff upwards; each test only has to
    # look at its upper bound because the lower ranges were already ruled out.
    if SD < -1.0:
        return "Underweight"
    if SD < 1.0:
        return "Normal"
    if SD < 2.0:
        return "Overweight"
    if SD >= 2.0:
        return "Obese"

    # A value that fails every comparison above is NaN. Fail with an explicit
    # message instead of the UnboundLocalError a missing label would cause.
    raise ValueError(f"Cannot interpret BMI z-score: {SD!r}")

# Function that uses a list comprehension to go through the BMI_SD column of a DataFrame and return the SD interpretation of each value.
def list_comp_SD(df: pd.DataFrame) -> pd.Series:
    """Return the BMI interpretation of every value in the "BMI_SD" column of df.

    Argument:
    - df: DataFrame with a "BMI_SD" column.

    Returns:
    - A Series of labels produced by interpret_BMI_z_score, carrying the same
      index as df["BMI_SD"].
    """
    bmi_sd = df["BMI_SD"]
    # Reuse the column's index: a Series with a fresh 0..n-1 index would be aligned
    # by label when assigned back to df, which misplaces the values (or fills NaN)
    # as soon as df does not have a default index.
    return pd.Series(
        [interpret_BMI_z_score(SD) for SD in bmi_sd],
        index=bmi_sd.index,
    )
  
# Swap the numeric BMI_SD column for its categorical interpretation
metadata_df = (
    metadata_df
    .assign(SD_label=list_comp_SD(metadata_df))
    .drop(columns="BMI_SD")
)
metadata_df

Unnamed: 0,sex,region,strata,locality_size,locality_type,age_years,ob_diag,marital_status_single,hc_inst_private,has_paid_TV,SD_label
0,female,Centre,3rd_strata,">100,000",urban,10,1.0,0.0,0.0,0,Normal
1,female,Centre,2nd_strata,">100,000",urban,12,1.0,0.0,1.0,1,Normal
2,female,Centre,2nd_strata,">100,000",urban,6,1.0,0.0,0.0,1,Normal
3,female,Centre,3rd_strata,">100,000",urban,15,0.0,0.0,0.0,1,Obese
4,male,Centre,3rd_strata,">100,000",urban,9,0.0,0.0,0.0,1,Obese
...,...,...,...,...,...,...,...,...,...,...,...
2462,male,Centre,1st_strata,"<2,500",rural,13,0.0,0.0,0.0,0,Normal
2463,male,Centre,2nd_strata,"<2,500",rural,16,1.0,0.0,0.0,0,Overweight
2464,male,Centre,2nd_strata,"<2,500",rural,5,1.0,0.0,0.0,0,Normal
2465,female,Centre,2nd_strata,"<2,500",rural,14,0.0,0.0,0.0,0,Normal


## Multimodality

In [43]:
# Folder that receives the projector files; create it so that the first write
# does not fail when the folder does not exist yet
tsv_dir = os.path.join(output_path, 'tsv')
os.makedirs(tsv_dir, exist_ok=True)

# Prepare vectors: every column of multi_df except 'label' and 'group'
vectors_df = multi_df.drop(columns=['label', 'group'])

# Prepare label
# The mapping is applied to a new one-column frame and multi_df['label'] is left
# untouched: overwriting it would make any second mapping (for instance a re-run
# of this cell) operate on already mapped values, presumably yielding NaN if
# label_id_to_short_label is a dict keyed by the original ids -- TODO confirm.
label = multi_df['label'].map(label_id_to_short_label).to_frame()

# Prepare demographics as metadata
# Any 'label' column left by a previous run is dropped first, so the column is
# never duplicated.
metadata_df = pd.concat(
    [metadata_df.drop(columns='label', errors='ignore'), label],
    axis=1,
)

# Vectors are written without header, metadata with header
vectors_df.to_csv(os.path.join(tsv_dir, 'vectors.tsv'), sep='\t', index=False, header=False, quoting=csv.QUOTE_NONE)
metadata_df.to_csv(os.path.join(tsv_dir, 'metadata.tsv'), sep='\t', index=False, header=True, quoting=csv.QUOTE_NONE)

In [44]:
# Display the metadata table after the 'label' column has been appended
metadata_df

Unnamed: 0,sex,region,strata,locality_size,locality_type,age_years,ob_diag,marital_status_single,hc_inst_private,has_paid_TV,SD_label,label
0,female,Centre,3rd_strata,">100,000",urban,10,1.0,0.0,0.0,0,Normal,Norm
1,female,Centre,2nd_strata,">100,000",urban,12,1.0,0.0,1.0,1,Normal,Norm
2,female,Centre,2nd_strata,">100,000",urban,6,1.0,0.0,0.0,1,Normal,Norm
3,female,Centre,3rd_strata,">100,000",urban,15,0.0,0.0,0.0,1,Obese,OwOb
4,male,Centre,3rd_strata,">100,000",urban,9,0.0,0.0,0.0,1,Obese,OwOb
...,...,...,...,...,...,...,...,...,...,...,...,...
2462,male,Centre,1st_strata,"<2,500",rural,13,0.0,0.0,0.0,0,Normal,Norm
2463,male,Centre,2nd_strata,"<2,500",rural,16,1.0,0.0,0.0,0,Overweight,OwOb
2464,male,Centre,2nd_strata,"<2,500",rural,5,1.0,0.0,0.0,0,Normal,Norm
2465,female,Centre,2nd_strata,"<2,500",rural,14,0.0,0.0,0.0,0,Normal,Norm


## Unimodality

### Categoricals

In [45]:
# Folder that receives the projector files; create it if it does not exist yet
tsv_dir = os.path.join(output_path, 'tsv')
os.makedirs(tsv_dir, exist_ok=True)

# Prepare vectors
# TODO(review): this is the same selection as in the Multimodality section, so
# categorical_vectors.tsv holds every column of multi_df except 'label' and 'group'.
# Restrict it to the categorical columns once their list is confirmed.
vectors_df = multi_df.drop(columns=['label', 'group'])

# Prepare label
# The Multimodality section already appended the short 'label' column to
# metadata_df. The label is therefore neither mapped nor concatenated again here:
# a second mapping would work on already mapped values and a second concat would
# duplicate the column.
if 'label' in metadata_df.columns:
    # Keep only the first 'label' column in case several were accumulated
    extra_label = (metadata_df.columns == 'label') & metadata_df.columns.duplicated()
    metadata_df = metadata_df.loc[:, ~extra_label]
else:
    # Fallback when the Multimodality section was not run before this cell
    metadata_df = metadata_df.assign(label=multi_df['label'].map(label_id_to_short_label))

# Prepare demographics as metadata
label = metadata_df[['label']]

# Vectors are written without header, metadata with header
vectors_df.to_csv(os.path.join(tsv_dir, 'categorical_vectors.tsv'), sep='\t', index=False, header=False, quoting=csv.QUOTE_NONE)
metadata_df.to_csv(os.path.join(tsv_dir, 'categorical_metadata.tsv'), sep='\t', index=False, header=True, quoting=csv.QUOTE_NONE)

### Numericals

In [46]:
# Folder that receives the projector files; create it if it does not exist yet
tsv_dir = os.path.join(output_path, 'tsv')
os.makedirs(tsv_dir, exist_ok=True)

# Prepare vectors
# TODO(review): this is the same selection as in the Multimodality section, so
# numerical_vectors.tsv holds every column of multi_df except 'label' and 'group'.
# Restrict it to the numerical columns once their list is confirmed.
vectors_df = multi_df.drop(columns=['label', 'group'])

# Prepare label
# The Multimodality section already appended the short 'label' column to
# metadata_df. The label is therefore neither mapped nor concatenated again here:
# a second mapping would work on already mapped values and a second concat would
# duplicate the column.
if 'label' in metadata_df.columns:
    # Keep only the first 'label' column in case several were accumulated
    extra_label = (metadata_df.columns == 'label') & metadata_df.columns.duplicated()
    metadata_df = metadata_df.loc[:, ~extra_label]
else:
    # Fallback when the Multimodality section was not run before this cell
    metadata_df = metadata_df.assign(label=multi_df['label'].map(label_id_to_short_label))

# Prepare demographics as metadata
label = metadata_df[['label']]

# Vectors are written without header, metadata with header
vectors_df.to_csv(os.path.join(tsv_dir, 'numerical_vectors.tsv'), sep='\t', index=False, header=False, quoting=csv.QUOTE_NONE)
metadata_df.to_csv(os.path.join(tsv_dir, 'numerical_metadata.tsv'), sep='\t', index=False, header=True, quoting=csv.QUOTE_NONE)