# Create Phenopackets for TWOC demonstrator

Read input data from `TWOC-MultiOmics-Studies-COVID_300Samples.xlsx`. Requires `pandas` and `openpyxl`.

In [32]:
# Input spreadsheets, given relative to the notebook's working directory.
# Phenotype master table: its first row holds ontology terms and its second
# row holds the column labels (see the loading cells below).
phenotype_data_file = "../data/TWOC-MultiOmics-Studies-COVID_300Samples.xlsx"
# Table linking study subject IDs to sample IDs.
biosample_data_file = "../data/IDs_Individuals-vs-Samples_incl-all-CONTROLS.xlsx"

In [33]:
import pandas as pd
# Phenotype table: the first spreadsheet row carries ontology terms, so it is
# skipped and the row below it supplies the column names.
phenotype_data = pd.read_excel(
    phenotype_data_file,
    sheet_name=0,
    header=[0],
    skiprows=1,
)

# Biosample table: nothing is skipped, the first spreadsheet row is the header.
biosample_data = pd.read_excel(
    biosample_data_file,
    sheet_name=0,
    header=[0],
    skiprows=None,
)
biosample_data.head()


Unnamed: 0,Study Subject ID,Sample ID,BD Time-Point,Transcript.,Metabol.,Prote.,Sex,Age,Unnamed: 8,* ref = see earlier Master Table ; ** T2 Transcriptomics samples are available but were not included in our FAIRification process
0,INCOV001,INCOV001-BL,T1,Y,Y,Y,ref,ref,,
1,INCOV001,INCOV001-AC,T2,N,Y,Y,ref,ref,,
2,INCOV002,INCOV002-BL,T1,Y,Y,Y,ref,ref,,
3,INCOV002,INCOV002-AC,T2,N,Y,Y,ref,ref,,
4,INCOV003,INCOV003-BL,T1,Y,Y,Y,ref,ref,,


Get ontology terms from first row.

In [34]:
# Read only the top of the sheet: the first spreadsheet row (ontology term
# IRIs) becomes the header and the single row below it (the column labels)
# becomes the data.
column_info = pd.read_excel(
    phenotype_data_file,
    sheet_name=0,
    header=[0],
    nrows=1,   # only the row directly below the header
).iloc[0]      # that single row as a Series indexed by ontology term

# Invert the Series into {column label: ontology term IRI}.
# Series.iteritems() was removed in pandas 2.0; Series.items() is the
# equivalent call and also exists in older pandas versions.
ontology_terms = {label: term for term, label in column_info.items()}
ontology_terms

{'Study Subject': 'http://purl.obolibrary.org/obo/NCIT_C41189',
 'Patient': 'http://purl.obolibrary.org/obo/NCIT_C16960',
 'Symptoms Consistent with COVID-19': 'http://purl.obolibrary.org/obo/NCIT_C173069',
 'COVID-19': 'http://purl.obolibrary.org/obo/MONDO_0100096',
 'Sex': 'http://purl.obolibrary.org/obo/NCIT_C28421',
 'Age-Years': 'http://purl.obolibrary.org/obo/NCIT_C37908',
 'Patient Status': 'http://purl.obolibrary.org/obo/NCIT_C166244',
 'ICU': 'http://purl.obolibrary.org/obo/NCIT_C53511',
 'Study DOI': 'http://edamontology.org/data_1188',
 'Ethnicity': 'http://purl.obolibrary.org/obo/GECKO_0000061',
 'BMI': 'http://purl.obolibrary.org/obo/ExO_0000105',
 'Smoking': 'http://purl.obolibrary.org/obo/NCIT_C154329',
 'COVID-19 Disease Severity (WHO Ordinal) Scale': 'http://purl.obolibrary.org/obo/NCIT_C178899',
 'Charlson Comorbidity Index': 'http://purl.obolibrary.org/obo/NCIT_C176422',
 'APACHE II Score': 'http://purl.obolibrary.org/obo/NCIT_C121113',
 'Mechanical Ventilation': 'ht

Define functions to convert rows to Phenopackets Individuals

In [35]:
from math import nan
import numpy as np
import phenopackets 
from datetime import date
from calendar import timegm
from google.protobuf.timestamp_pb2 import Timestamp
from google.protobuf.json_format import Parse, MessageToJson
import json

def get_timestamp(date_str):
    """Build a protobuf Timestamp from a 'year-month-day' date string.

    The three '-'-separated fields are read as integers and the resulting
    calendar date is converted with ``calendar.timegm`` to seconds since the
    Unix epoch, e.g. '1970-01-01' gives 0 seconds.
    """
    year, month, day = [int(part) for part in date_str.split("-")]
    epoch_seconds = timegm(date(year, month, day).timetuple())
    return Timestamp(seconds=epoch_seconds)


def get_age(years):
    """Wrap an age in years in a Phenopackets TimeElement.

    The age is rounded to whole years and written as an ISO 8601 duration
    ('P<years>Y'). A NaN age yields None instead of a TimeElement.
    """
    if np.isnan(years):
        return None
    duration = f"P{round(years)}Y"
    return phenopackets.TimeElement(
        age=phenopackets.Age(iso8601duration=duration))


def get_sex(sex):
    """Translate a 'Sex' cell into the name of a Phenopackets Sex value.
    https://phenopacket-schema.readthedocs.io/en/latest/sex.html#rstsex

    Strings are upper-cased, NaN becomes "UNKNOWN_SEX", and any other
    input raises TypeError.
    """
    if isinstance(sex, str):
        return sex.upper()
    if np.isnan(sex):
        return "UNKNOWN_SEX"
    raise TypeError("Unknown input type", type(sex), sex)


def create_individual(row):
    """Build a Phenopackets Individual from one row of the phenotype table.

    Reads the columns "Study Subject" (id), "Age-Years" (converted with
    get_age) and "Sex" (converted with get_sex); the taxonomy is always
    set to Homo sapiens.
    Schema documentation:
    https://phenopacket-schema.readthedocs.io/en/latest/individual.html#rstindividual
    """
    human = phenopackets.OntologyClass(
        id="NCBITaxon:9606", label="Homo sapiens")
    age_at_last_encounter = get_age(row["Age-Years"])
    phenotypic_sex = get_sex(row["Sex"])
    # vital_status and karyotypic_sex are not populated yet.
    return phenopackets.Individual(
        id=row["Study Subject"],
        time_at_last_encounter=age_at_last_encounter,
        sex=phenotypic_sex,
        taxonomy=human)

def create_biosamples(row):
    """Build a Phenopackets Biosample from one row of the biosample table.

    The "Sample ID" cell is converted to a string and used as the biosample
    id; the "Study Subject ID" cell is stored as individual_id.
    """
    sample_id = str(row["Sample ID"])
    subject_id = row["Study Subject ID"]
    # The "BD Time-Point" column is not mapped to time_of_collection yet.
    return phenopackets.Biosample(id=sample_id, individual_id=subject_id)

def create_phenotypic_features(row, n_leading_columns=6):
    """Create one Phenopackets PhenotypicFeature per phenotype column of a row.

    row is a pandas Series holding one row of the phenotype table, indexed
    by column label. The first n_leading_columns entries are skipped
    (default 6, the offset that used to be hard-coded); every remaining
    column becomes one PhenotypicFeature.

    For each feature the OntologyClass label is the last path segment of
    the column's ontology term IRI (looked up in the module-level
    ontology_terms dict) and the OntologyClass id is the cell value as a
    string, or None when the cell is missing.

    Returns a list of PhenotypicFeature objects in column order; the list
    is empty when the row has no columns beyond the skipped ones.
    Raises KeyError when a column label has no entry in ontology_terms.
    """
    phenotypic_features = []
    for column_name, value in row.iloc[n_leading_columns:].items():
        # Keep only the term's local name, e.g. ".../obo/NCIT_C53511" -> "NCIT_C53511".
        term_name = ontology_terms[column_name].split("/")[-1]
        # pd.isna recognises every pandas missing-value marker (NaN, None,
        # NaT, pd.NA), not only values whose string form is 'nan'.
        feature_id = None if pd.isna(value) else str(value)
        phenotypic_features.append(
            phenopackets.PhenotypicFeature(
                type=phenopackets.OntologyClass(
                    id=feature_id, label=term_name)))
    return phenotypic_features
            

Apply functions defined above to data frame with phenotype information

In [36]:
# Each converter is applied row by row; the three results are independent.
# One Phenopackets Individual per row of the phenotype table.
all_individuals = phenotype_data.apply(create_individual, axis="columns")
# One list of PhenotypicFeature objects per row of the phenotype table.
phenotypics = phenotype_data.apply(create_phenotypic_features, axis="columns")
# One Phenopackets Biosample per row of the biosample table.
biosamples = biosample_data.apply(create_biosamples, axis="columns")

Next steps: 
- add Individual to Phenopacket, see https://phenopacket-schema.readthedocs.io/en/latest/phenopacket.html
- add Biosamples, PhenotypicFeatures, etc. (information from remaining columns)
- reuse existing building blocks where possible
- use suitable ontology terms for labels and values (if possible)
- save as json

Create Phenopackets objects 
Add individuals to Phenopackets objects

In [37]:
def create_phenopackets(individuals, biosampless, phenotypics):
    """Assemble one Phenopacket per individual that has at least one biosample.

    individuals is an iterable of Phenopackets Individual objects,
    biosampless an iterable of Biosample objects, and phenotypics a
    sequence whose i-th entry holds the phenotypic features of the i-th
    individual.

    A biosample belongs to an individual when its individual_id equals the
    individual's id. (Comparing against the biosample's own id, as before,
    matches nothing when sample IDs differ from subject IDs, e.g.
    "INCOV001-BL" vs "INCOV001".) All biosamples of an individual are
    collected in that individual's single Phenopacket, so no two returned
    packets share an id. Individuals without any biosample are skipped.

    Returns the list of Phenopackets in the order of individuals.
    """
    # Index the biosamples by owner once instead of rescanning them for
    # every individual.
    samples_by_individual = {}
    for biosample in biosampless:
        samples_by_individual.setdefault(
            biosample.individual_id, []).append(biosample)

    pheno_pkts = []
    for i, indiv in enumerate(individuals):
        own_samples = samples_by_individual.get(indiv.id)
        if not own_samples:
            continue
        new_pheno = phenopackets.Phenopacket(
            id=indiv.id,
            subject=indiv,
            phenotypic_features=phenotypics[i])
        new_pheno.biosamples.extend(own_samples)
        pheno_pkts.append(new_pheno)
    return pheno_pkts



Convert Phenopackets objects to JSON, JSON-LD, or Turtle RDF format

In [40]:
import rdflib
pheno_pkts = create_phenopackets(all_individuals, biosamples, phenotypics)

# MessageToJson returns a JSON *string*. Dumping a list of those strings
# would write an array of escaped strings, so each one is parsed back into a
# dict first and the file then holds an array of JSON objects.
pkts = [json.loads(MessageToJson(pheno_pkt)) for pheno_pkt in pheno_pkts]

with open('../data/phenopackets_TWOC_json.json', 'w') as jsfile:
    json.dump(pkts, jsfile)

"\n\nprint(pkt)\ndata = {[]}\n\nfor pkt in pkts:\n        data['records'].append(json.loads(line))\n        \nprint(json.dumps(data))"