In [1]:
# auto reloading of local scripts under dev
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

from google.cloud import storage
storage_client = storage.Client()

In [3]:
# load local lib
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.terra.table_utils import *
from src.terra.expt_design.upload_init_tables_from_local import construct_bare_family_table_from_individual_table

from src.gcs_utils import *
from src.utils import *

In [4]:
# http, re and dateutil are mostly for debugging; pandas and the FireCloud API are used throughout
import http
import re
import dateutil
import pandas as pd
from firecloud import api as fapi

# Prep local table for uploading

## Load minimal table, holding BAM path and individual only

In [5]:
# Load the per-flowcell metadata (tab-separated) and preview the first rows.
minimal_flowcell_table = pd.read_csv('hgsvc2/minimal.flowcell.metadata.table.tsv', sep='\t')
minimal_flowcell_table.head()

Unnamed: 0,input_bam,alias,individual
0,gs://broad-dsde-methods-long-reads/datasets/HG...,GM18534,NA18534
1,gs://broad-dsde-methods-long-reads/datasets/HG...,GM18534,NA18534
2,gs://broad-dsde-methods-long-reads/datasets/HG...,GM18939,NA18939
3,gs://broad-dsde-methods-long-reads/datasets/HG...,GM18939,NA18939
4,gs://broad-dsde-methods-long-reads/datasets/HG...,GM18989,NA18989


In [6]:
def filter_away_bam(bam_gs: str) -> bool:
    """
    Decide whether a BAM should be dropped from the flowcell table.

    The path is compared case-insensitively. It is flagged when it mentions
    'ccs' or 'hifi' and also contains 'subreads.bam' or 'scraps.bam'.

    :param bam_gs: path of the BAM
    :return: True when the BAM should be filtered away, False otherwise
    """
    lowered = bam_gs.lower()
    mentions_hifi = any(tag in lowered for tag in ('ccs', 'hifi'))
    is_raw_output = any(name in lowered for name in ('subreads.bam', 'scraps.bam'))
    return mentions_hifi and is_raw_output

In [7]:
# Keep only the BAMs that filter_away_bam does not flag.
bams_to_drop = minimal_flowcell_table['input_bam'].apply(filter_away_bam)
minimal_flowcell_table = minimal_flowcell_table.loc[~bams_to_drop, :]

## Load pedigree info

In [8]:
# The pedigree file is read without a header row (tab-separated); its six columns are named here.
pedigree = pd.read_csv('hgsvc2/1KGP_2504_and_698.ped', sep='\t', header=None)
pedigree.columns = ['family', 'individual', 'father', 'mother', 'sex', 'phenotype']

In [9]:
def determine_identity(row) -> str:
    """
    Classify one pedigree row as 'son', 'daughter', 'father', 'mother' or 'singlet'.

    Relies on two notebook-level tables: ``minimal_flowcell_table`` (its
    'individual' column lists who has sequencing data) and ``pedigree``
    (maps individuals to families).

    * A row with at least one recorded parent ('0' means "not recorded") is a
      child: 'son'/'daughter' (sex code 1 is male) when at least one parent has
      sequencing data, otherwise 'singlet'.
    * A row without recorded parents is 'father'/'mother' when its pedigree
      family has more than one member and more than one of them has sequencing
      data, otherwise 'singlet'.

    :param row: a pedigree row exposing 'family', 'father', 'mother' and 'sex'
    :return: the identity label
    """
    # Built once per call instead of materialising the column as a list twice.
    sequenced = set(minimal_flowcell_table['individual'].tolist())

    # The previous test compared 'father' against '0' twice and never looked at
    # 'mother', so a child whose only recorded parent was the mother fell through
    # to the parent branch and could be labelled 'father'/'mother'.
    has_recorded_parent = '0' != str(row['father']) or '0' != str(row['mother'])
    if has_recorded_parent:
        if row['father'] in sequenced or row['mother'] in sequenced:
            return 'son' if 1 == row['sex'] else 'daughter'
        return 'singlet'

    family_members = set(pedigree.loc[pedigree['family'] == row['family'], 'individual'].unique().tolist())
    # A parent label needs a multi-member family with more than one sequenced member.
    if 1 < len(family_members) and 1 < len(family_members & sequenced):
        return 'father' if 1 == row['sex'] else 'mother'
    return 'singlet'

# Label every pedigree row. determine_identity compares 'sex' against the numeric code 1,
# so this must run before 'sex' is recoded to 'M'/'F'.
pedigree['identity'] = pedigree.apply(determine_identity, axis=1)

In [10]:
# Recode pedigree placeholders in a way that is safe to re-run. Previously a second
# execution turned every 'sex' value into 'F' (because 'M' != 1), and the in-place
# drop raised KeyError once 'phenotype' was already gone.
pedigree['father'] = pedigree['father'].apply(lambda s: 'NA' if '0' == str(s) else str(s))
pedigree['mother'] = pedigree['mother'].apply(lambda s: 'NA' if '0' == str(s) else str(s))
# Code 1 becomes 'M', anything else 'F'; values already recoded to 'M'/'F' pass through.
pedigree['sex'] = pedigree['sex'].apply(
    lambda i: i if (isinstance(i, str) and i in ('M', 'F')) else ('M' if 1 == i else 'F'))

pedigree.drop(['phenotype'], axis=1, inplace=True, errors='ignore')

##### Annotate flowcell table with pedigree information

In [11]:
# Left join: every flowcell row is kept; a flowcell whose individual is absent from
# the pedigree gets NaN in the pedigree columns.
enriched_flowcell_table = pd.merge(minimal_flowcell_table, pedigree, on='individual', how='left')

## Existing table, to get population information

In [12]:
# Fetch the 'sample' table of workspace broad-firecloud-dsde-methods/lr-hgsvc2.
# Only its 'family' and 'pop' columns are used below. The helper comes from the star
# imports at the top (presumably src.terra.table_utils — confirm).
existing_table = fetch_existing_root_table('broad-firecloud-dsde-methods', 'lr-hgsvc2', 'sample')

In [13]:
# Build a family -> population-code lookup from the existing table.
family_pop_pairs = (
    existing_table[['family', 'pop']]
    .sort_values(by=['family'])
    .reset_index(drop=True)
    .drop_duplicates(ignore_index=True)
)
population_info_from_existing_table = dict(zip(family_pop_pairs['family'].tolist(),
                                               family_pop_pairs['pop'].tolist()))

##### Further annotate flowcell table with population information

In [14]:
# Look up each family's population code; families without one are marked 'NotAvailable'.
enriched_flowcell_table['population'] = enriched_flowcell_table['family'].map(
    lambda family_id: population_info_from_existing_table.get(family_id, 'NotAvailable')
)

##### Families without population information (to be looked up manually)

In [15]:
# List, in sorted order, the distinct families that still lack a population code.
enriched_flowcell_table.loc[enriched_flowcell_table['population'] == 'NotAvailable','family'].sort_values().unique()

array(['2484a', 'BB67', 'BD19', 'CLM29', 'CLM51', 'GB116', 'GB40', 'GB57',
       'HG00171', 'HG00268', 'IT003', 'IT220', 'LWK001', 'NA18534',
       'NA18939', 'NA18989', 'NA19317', 'NA19320', 'NA19347', 'NA19384',
       'NA20509', 'NA20847', 'NG16', 'PEL38', 'SL05', 'SL50', 'ST012',
       'ST116', 'VN002', 'VN061', 'm001'], dtype=object)

## After manually finding information for these families, annotate

In [16]:
# Load the hand-curated population codes (tab-separated); the 'Family' and 'Code' columns are used below.
manually_annotated_families = pd.read_csv('hgsvc2/missing.family.pop.code.tsv', sep='\t')

In [17]:
# Turn the table into a Family -> Code dict. This rebinds the name from a DataFrame to a
# dict, so re-run the read_csv cell before re-running this one.
manually_annotated_families = dict(zip(manually_annotated_families['Family'].tolist(), manually_annotated_families['Code'].tolist()))

In [18]:
# Fill in the manually curated code wherever the population is still 'NotAvailable'.
# A family missing from the curated dict raises KeyError.
enriched_flowcell_table['population'] = [
    manually_annotated_families[family_id] if 'NotAvailable' == population_code else population_code
    for family_id, population_code in zip(enriched_flowcell_table['family'],
                                          enriched_flowcell_table['population'])
]

In [19]:
# Shape check after pedigree and population annotation: (rows, columns).
enriched_flowcell_table.shape

(198, 9)

In [20]:
# Count distinct singlet individuals and distinct families among the non-singlet rows.
idx = enriched_flowcell_table['identity'] == 'singlet'
n_singlet_individuals = enriched_flowcell_table.loc[idx, 'individual'].nunique(dropna=False)
n_families = enriched_flowcell_table.loc[~idx, 'family'].nunique(dropna=False)
print(f"{n_singlet_individuals} singlet individuals.")
print(f"{n_families} families.")

41 singlet individuals.
4 families.


In [21]:
# Sanity check: a family that contains a 'singlet' row must not have more than one
# distinct individual in the flowcell table.
for family_id in enriched_flowcell_table.loc[idx, 'family'].unique():
    family_individuals = enriched_flowcell_table.loc[enriched_flowcell_table['family'] == family_id, 'individual']
    if family_individuals.nunique(dropna=False) > 1:
        raise ValueError(f"{family_id} incorrectly classified as singlet family")

## Derive simple columns: PBI path, library type, flowcell ID

In [22]:
# Index path = BAM path + '.pbi' (assumes the index sits next to the BAM — TODO confirm).
enriched_flowcell_table['input_pbi'] = enriched_flowcell_table['input_bam'].apply(lambda s: s+'.pbi')

In [23]:
def classify_bam(bam_gs: str) -> str:
    """
    Infer the library type from the BAM file name (case-insensitive).

    :param bam_gs: path of the BAM
    :return: 'CCS' for names ending in 'ccs.bam' or 'hifi_reads.bam',
             'CLR' for names ending in 'subreads.bam' or 'clr.bam'
    :raises ValueError: when the name matches none of the known suffixes
    """
    lowered = bam_gs.lower()
    if lowered.endswith(('ccs.bam', 'hifi_reads.bam')):
        return 'CCS'
    if lowered.endswith(('subreads.bam', 'clr.bam')):
        return 'CLR'
    raise ValueError(f"Assumption on BAMs naming convention following library type is broken\n{bam_gs}")

In [24]:
# Tag each flowcell as 'CCS' or 'CLR'; classify_bam raises ValueError on an unrecognised file name.
enriched_flowcell_table['library_type'] = enriched_flowcell_table['input_bam'].apply(classify_bam)

In [25]:
# Flowcell ID = third-from-last '.'-separated field of the file name ('<id>.ccs.bam' -> '<id>').
# Assumes the name has at least three fields and the ID itself contains no '.' — TODO confirm.
enriched_flowcell_table['flowcell_id'] = enriched_flowcell_table['input_bam'].apply(lambda gs: gs.split('/')[-1].split('.')[-3])

In [26]:
# Split the annotated table by library type and report how many flowcells each has.
library_type_column = enriched_flowcell_table['library_type']
ccs_flows = enriched_flowcell_table.loc[library_type_column == 'CCS', :].reset_index(drop=True)
clr_flows = enriched_flowcell_table.loc[library_type_column == 'CLR', :].reset_index(drop=True)
for n_flowcells, library_label in ((len(ccs_flows), 'CCS'), (len(clr_flows), 'CLR')):
    print(f"{n_flowcells} {library_label} flowcells")

133 CCS flowcells
65 CLR flowcells


In [27]:
# Shape check after adding input_pbi, library_type and flowcell_id.
enriched_flowcell_table.shape

(198, 12)

# Data table organization

We will organize the data tables into the following structure:

* `ccs-flowcell`
* `clr-flowcell`
* `ccs-sample`
* `clr-sample`
* `individual`
* `singlet`
* `family`
* `population`

In [28]:
# Namespace and workspace that the tables below are uploaded to.
target_namespace = 'broad-firecloud-dsde-methods'
target_workspace = 'HGSVC2-unified'

In [29]:
# Alphabetical list of the columns available for the flowcell tables.
sorted(ccs_flows.columns)

['alias',
 'family',
 'father',
 'flowcell_id',
 'identity',
 'individual',
 'input_bam',
 'input_pbi',
 'library_type',
 'mother',
 'population',
 'sex']

### Format and upload flowcell level tables

In [33]:
# Select and order the columns of the ccs-flowcell table; flowcell_id is left out here
# because it is inserted as the leading ID column in the following cell.
ccs_flowcells = ccs_flows[['individual', 'sex', 'family', 'identity', 'father', 'mother', 'population', 'input_bam', 'input_pbi', 'library_type', 'alias']]

In [34]:
# Put the flowcell ID first, under the 'entity:ccs-flowcell_id' header.
# insert() mutates in place and raises ValueError if the column already exists, so
# re-create ccs_flowcells (previous cell) before re-running this one.
ccs_flowcells.insert(loc=0, column = 'entity:ccs-flowcell_id', value=ccs_flows['flowcell_id'])
ccs_flowcells.head()

Unnamed: 0,entity:ccs-flowcell_id,individual,sex,family,identity,father,mother,population,input_bam,input_pbi,library_type,alias
0,GM18989_20210510_Lee_m64119_210501_214725,NA18989,M,NA18989,singlet,,,JPT,gs://broad-dsde-methods-long-reads/datasets/HG...,gs://broad-dsde-methods-long-reads/datasets/HG...,CCS,GM18989
1,GM18989_20210513_Lee_m64039_210506_192609,NA18989,M,NA18989,singlet,,,JPT,gs://broad-dsde-methods-long-reads/datasets/HG...,gs://broad-dsde-methods-long-reads/datasets/HG...,CCS,GM18989
2,GM18989_20210513_Lee_m64039_210508_014218,NA18989,M,NA18989,singlet,,,JPT,gs://broad-dsde-methods-long-reads/datasets/HG...,gs://broad-dsde-methods-long-reads/datasets/HG...,CCS,GM18989
3,GM18989_20210624_Lee_m64039_210616_161955,NA18989,M,NA18989,singlet,,,JPT,gs://broad-dsde-methods-long-reads/datasets/HG...,gs://broad-dsde-methods-long-reads/datasets/HG...,CCS,GM18989
4,m54329U_201019_191314,NA19320,F,NA19320,singlet,,,LWK,gs://broad-dsde-methods-long-reads/datasets/HG...,gs://broad-dsde-methods-long-reads/datasets/HG...,CCS,GM19320


In [35]:
# Upload the ccs-flowcell table to the target workspace. upload_root_table comes from the
# star imports at the top; its behaviour is not visible in this notebook.
upload_root_table(target_namespace, target_workspace, ccs_flowcells)

In [36]:
# Same three steps for CLR: select the columns, insert the flowcell ID as the leading
# 'entity:clr-flowcell_id' column, then upload.
clr_flowcells = clr_flows[['individual', 'sex', 'family', 'identity', 'father', 'mother', 'population', 'input_bam', 'input_pbi', 'library_type', 'alias']]
clr_flowcells.insert(loc=0, column = 'entity:clr-flowcell_id', value=clr_flows['flowcell_id'])
upload_root_table(target_namespace, target_workspace, clr_flowcells)

### Make and upload library-type-specific sample tables

In [37]:
# Group the CCS flowcells by individual. Judging from the displayed result, each row
# carries the individual's list of flowcell IDs plus its family and identity.
ccs_samples = construct_bare_family_table_from_individual_table(individual_fc_table=ccs_flowcells,
                                                                flowcell_id_col='entity:ccs-flowcell_id',
                                                                group_by='individual',
                                                                family_id_col='family', relation_col='identity')
ccs_samples

Unnamed: 0,individual,entity:ccs-flowcell_id,family,identity
0,HG00268,"[HG00268_20210121_Lee_m64119_210113_164007, HG...",HG00268,singlet
1,HG00512,[HG00512_PacBio_Sequel2_CCS__EDEVI_20190920_S6...,SH032,father
2,HG00513,[HG00513_PacBio_Sequel2_CCS__EDEVI_20190920_S6...,SH032,mother
3,HG00514,"[m54329U_200715_194535, m54329U_200717_235548,...",SH032,daughter
4,HG00731,"[HG00731_20190925_EEE_m54329U_190528_231241, H...",PR05,father
5,HG00732,"[HG00732_20200722_EEE_m54329U_200528_200534, H...",PR05,mother
6,HG00733,"[HG00733_20190925_EEE_m54329U_190607_185248, H...",PR05,daughter
7,HG01352,"[m54329U_200819_192918, m54329U_200824_191532,...",CLM29,singlet
8,HG01457,"[HG01457_20210706_Lee_m64039_210627_040723, HG...",CLM51,singlet
9,HG02059,"[m54329U_210430_224654, m54329U_210505_001822,...",VN061,singlet


In [38]:
# Upload as set table 'ccs-sample' whose members are 'ccs-flowcell' entities.
# RESET presumably replaces any existing membership — confirm against upload_set_table.
upload_set_table(target_namespace, target_workspace, table=ccs_samples,
                 current_set_type_name='individual', desired_set_type_name='ccs-sample',
                 current_membership_col_name='entity:ccs-flowcell_id', desired_membership_col_name='ccs-flowcell',
                 operation=MembersOperationType.RESET)

2021-12-30 11:46:58,014 - INFO - uploaded set level table, next fill-in members...


In [39]:
# CLR counterpart: group the CLR flowcells by individual, then upload the result as set
# table 'clr-sample' whose members are 'clr-flowcell' entities.
clr_samples = construct_bare_family_table_from_individual_table(individual_fc_table=clr_flowcells,
                                                                flowcell_id_col='entity:clr-flowcell_id',
                                                                group_by='individual',
                                                                family_id_col='family', relation_col='identity')
upload_set_table(target_namespace, target_workspace, table=clr_samples,
                 current_set_type_name='individual', desired_set_type_name='clr-sample',
                 current_membership_col_name='entity:clr-flowcell_id', desired_membership_col_name='clr-flowcell',
                 operation=MembersOperationType.RESET)

2021-12-30 11:47:16,979 - INFO - uploaded set level table, next fill-in members...


### Format and upload remaining tables (these tables are not suitable to be used for WDL computations)

#### Prep

In [40]:
# Rebuild the per-individual tables and give the flowcell-list columns plain names
# ('ccs-flowcells' / 'clr-flowcells') for the merge that follows.
ccs_samples = construct_bare_family_table_from_individual_table(
    individual_fc_table=ccs_flowcells,
    flowcell_id_col='entity:ccs-flowcell_id',
    group_by='individual',
    family_id_col='family',
    relation_col='identity',
).rename(columns={'entity:ccs-flowcell_id': 'ccs-flowcells'})
clr_samples = construct_bare_family_table_from_individual_table(
    individual_fc_table=clr_flowcells,
    flowcell_id_col='entity:clr-flowcell_id',
    group_by='individual',
    family_id_col='family',
    relation_col='identity',
).rename(columns={'entity:clr-flowcell_id': 'clr-flowcells'})

In [41]:
# Outer join keeps individuals that have only one library type. The shared 'family' and
# 'identity' columns come back suffixed: _x from ccs_samples, _y from clr_samples.
individuals_table = pd.merge(ccs_samples, clr_samples, on='individual', how='outer')
individuals_table.shape

(52, 7)

In [42]:
def resolve_pedigree(row) -> pd.Series:
    """
    Pick the family/identity pair for one merged row.

    The '_x' pair is used unless both 'family_x' and 'identity_x' are missing,
    in which case the '_y' pair is used instead.

    :param row: row exposing 'family_x', 'identity_x', 'family_y' and 'identity_y'
    :return: two-element Series holding [family, identity]
    """
    x_side_missing = pd.isna(row['family_x']) and pd.isna(row['identity_x'])
    suffix = 'y' if x_side_missing else 'x'
    return pd.Series([row[f'family_{suffix}'], row[f'identity_{suffix}']])

In [43]:
# Resolve family/identity row by row; the two values of each returned Series become
# the two columns named here.
resolved_pedigree = individuals_table.apply(resolve_pedigree, axis=1)
resolved_pedigree.columns = ['family', 'identity']

In [44]:
# Inspect the resolved family/identity pairs.
resolved_pedigree

Unnamed: 0,family,identity
0,HG00268,singlet
1,SH032,father
2,SH032,mother
3,SH032,daughter
4,PR05,father
5,PR05,mother
6,PR05,daughter
7,CLM29,singlet
8,CLM51,singlet
9,VN061,singlet


In [45]:
# Swap the suffixed merge columns for the resolved 'family'/'identity' columns in a
# single assignment. Previously a second run first rebound individuals_table with
# duplicate 'family'/'identity' columns and then failed inside the in-place drop,
# leaving the table in a corrupted state. Dropping with errors='ignore' on both sides
# makes the cell safe to re-run while producing the same result on the first run.
individuals_table = pd.concat(
    [individuals_table.drop(columns=['family', 'identity'], errors='ignore'), resolved_pedigree],
    axis=1,
).drop(columns=['family_x', 'identity_x', 'family_y', 'identity_y'], errors='ignore')
individuals_table.head()

Unnamed: 0,individual,ccs-flowcells,clr-flowcells,family,identity
0,HG00268,"[HG00268_20210121_Lee_m64119_210113_164007, HG...",,HG00268,singlet
1,HG00512,[HG00512_PacBio_Sequel2_CCS__EDEVI_20190920_S6...,"[EDEVI_20200115_S64049_PL100139347A-1_A01, EDE...",SH032,father
2,HG00513,[HG00513_PacBio_Sequel2_CCS__EDEVI_20190920_S6...,"[EDEVI_20200115_S64049_PL100139348A-1_B01, EDE...",SH032,mother
3,HG00514,"[m54329U_200715_194535, m54329U_200717_235548,...","[m54329U_200723_213143, m54329U_200724_231822]",SH032,daughter
4,HG00731,"[HG00731_20190925_EEE_m54329U_190528_231241, H...","[20191208_r64076_20191206_232343_B01, 20191210...",PR05,father


In [47]:
# family -> population lookup taken from the annotated flowcell table, then applied per
# individual. A family missing from the lookup raises KeyError.
family_2_pop = dict(zip(enriched_flowcell_table['family'].tolist(), enriched_flowcell_table['population'].tolist()))
individuals_table['population'] = individuals_table['family'].apply(lambda f: family_2_pop[f])
individuals_table.head()

Unnamed: 0,individual,ccs-flowcells,clr-flowcells,family,identity,population
0,HG00268,"[HG00268_20210121_Lee_m64119_210113_164007, HG...",,HG00268,singlet,FIN
1,HG00512,[HG00512_PacBio_Sequel2_CCS__EDEVI_20190920_S6...,"[EDEVI_20200115_S64049_PL100139347A-1_A01, EDE...",SH032,father,CHS
2,HG00513,[HG00513_PacBio_Sequel2_CCS__EDEVI_20190920_S6...,"[EDEVI_20200115_S64049_PL100139348A-1_B01, EDE...",SH032,mother,CHS
3,HG00514,"[m54329U_200715_194535, m54329U_200717_235548,...","[m54329U_200723_213143, m54329U_200724_231822]",SH032,daughter,CHS
4,HG00731,"[HG00731_20190925_EEE_m54329U_190528_231241, H...","[20191208_r64076_20191206_232343_B01, 20191210...",PR05,father,PUR


##### Table: `individual`

In [48]:
# Scalar columns only (no flowcell lists), with the ID column renamed to
# 'entity:individual_id' for the upload.
individuals_table_barebone = (
    individuals_table[['individual', 'family', 'identity', 'population']]
    .copy(deep=True)
    .rename(columns={'individual': 'entity:individual_id'})
)
individuals_table_barebone.head()

Unnamed: 0,entity:individual_id,family,identity,population
0,HG00268,HG00268,singlet,FIN
1,HG00512,SH032,father,CHS
2,HG00513,SH032,mother,CHS
3,HG00514,SH032,daughter,CHS
4,HG00731,PR05,father,PUR


In [49]:
# Upload the bare 'individual' entities as TSV; memberships are filled in afterwards.
response = fapi.upload_entities(target_namespace, target_workspace,
                                entity_data=individuals_table_barebone.to_csv(sep='\t', index=False),
                                model='flexible')
# Stop on failure instead of merely displaying False and letting the following cells
# run against entities that were never created (same handling as __upload_one_set).
if not response.ok:
    raise FireCloudServerError(response.status_code, response.text)
response.ok

True

In [50]:
def __upload_one_set(ns: str, ws: str,
                     etype: str, ename: str,
                     member_type: str, members: List[str],
                     operation: MembersOperationType,
                     attribute_name: str = None) -> None:
    """
    For a given set identified by etype and ename, fill in its members,
    assuming the member entities already exist on Terra.
    :param ns: namespace
    :param ws: workspace
    :param etype: entity type
    :param ename: entity UUID
    :param member_type: entity type of the members
    :param members: list of member uuids
    :param operation: MERGE adds only the members not yet in the list;
                      any other value replaces the existing membership list
    :param attribute_name: name of the reference-list attribute on the set entity;
                           defaults to f"{member_type}s" (so 'family' yields 'familys')
    :raises FireCloudServerError: when fetching or updating the entity fails
    :return:
    """
    list_name = f'{member_type}s' if attribute_name is None else attribute_name

    response = fapi.get_entity(ns, ws, etype, ename)
    if not response.ok:
        raise FireCloudServerError(response.status_code, response.text)

    # A payload without an 'attributes' entry is treated as having no attributes,
    # instead of failing on the membership test against None.
    attributes = response.json().get('attributes') or {}

    operations = list()
    if list_name not in attributes:
        # The list does not exist yet: create it, then add everything.
        operations.append({
            "op": "CreateAttributeEntityReferenceList",
            "attributeListName": list_name
        })
        members_to_upload = members
    else:
        old_members = [e['entityName'] for e in attributes[list_name]['items']]
        if operation == MembersOperationType.MERGE:
            # Add only what is new, de-duplicated and in the caller's order
            # (a plain set difference made the request order arbitrary).
            already_present = set(old_members)
            members_to_upload = [m for m in dict.fromkeys(members) if m not in already_present]
        else:
            # Replace: remove every current member, then add the requested ones.
            operations.extend({
                "op": "RemoveListMember",
                "attributeListName": list_name,
                "removeMember": {"entityType": f"{member_type}",
                                 "entityName": f"{member_id}"}
            } for member_id in old_members)
            members_to_upload = members

    operations.extend({
        "op": "AddListMember",
        "attributeListName": list_name,
        "newMember": {"entityType": f"{member_type}",
                      "entityName": f"{member_id}"}
    } for member_id in members_to_upload)
    logger.debug(operations)

    if not operations:
        # Nothing to change (e.g. MERGE with no new members): skip the round trip.
        return

    response = fapi.update_entity(ns, ws,
                                  etype=etype,
                                  ename=ename,
                                  updates=operations)
    if not response.ok:
        raise FireCloudServerError(response.status_code, response.text)

In [51]:
# Attach flowcell memberships to each 'individual' entity.
# Columns are addressed by name rather than by position, and the missing-value test is
# explicit: individuals lacking one library type carry a float NaN after the outer merge.
for _, individual_row in individuals_table.iterrows():
    set_uuid = individual_row['individual']

    for library_label, flowcell_type in (('ccs', 'ccs-flowcell'), ('clr', 'clr-flowcell')):
        flowcell_members = individual_row[f'{flowcell_type}s']
        if isinstance(flowcell_members, float) and pd.isna(flowcell_members):
            continue  # no flowcells of this library type for this individual
        try:
            __upload_one_set(target_namespace, target_workspace,
                             etype='individual', ename=set_uuid,
                             member_type=flowcell_type, members=flowcell_members,
                             operation=MembersOperationType.RESET)
        except FireCloudServerError:
            logger.error(f"Failed to upload {library_label} membership information for {set_uuid}")
            raise

##### Tables `singlet` and `family`

In [52]:
# One row per family: the family ID, the list of its individuals, and the population
# of its first row.
per_family_aggregations = {
    'family': lambda column: column.tolist()[0],
    'individual': lambda column: column.tolist(),
    'population': lambda column: column.tolist()[0],
}
singlets_and_families = (
    individuals_table
    .groupby('family')
    .agg(per_family_aggregations)
    .reset_index(drop=True)
)

In [53]:
# Families with exactly one individual are singlets; those with more are families.
individuals_per_family = singlets_and_families['individual'].apply(len)
singlets = singlets_and_families.loc[individuals_per_family == 1, :]
families = singlets_and_families.loc[individuals_per_family > 1, :]

In [54]:
# (number of singlets, columns)
singlets.shape

(41, 3)

In [55]:
# (number of multi-member families, columns)
families.shape

(4, 3)

In [56]:
# Upload the one-member families as set table 'singlet'; the membership column
# 'individual' is uploaded under the name 'individuals'.
upload_set_table(target_namespace, target_workspace, singlets,
                 current_set_type_name='family', desired_set_type_name='singlet',
                 current_membership_col_name='individual', desired_membership_col_name='individuals', operation=MembersOperationType.RESET)

2021-12-30 11:56:23,461 - INFO - uploaded set level table, next fill-in members...


In [57]:
# Upload the multi-member families as set table 'family'; the membership column
# 'individual' is uploaded under the name 'individuals'.
upload_set_table(target_namespace, target_workspace, families,
                 current_set_type_name='family', desired_set_type_name='family',
                 current_membership_col_name='individual', desired_membership_col_name='individuals', operation=MembersOperationType.RESET)

2021-12-30 11:56:39,160 - INFO - uploaded set level table, next fill-in members...


##### Table `population`

In [61]:
# Preview the per-family table that the population table is built from.
singlets_and_families.head()

Unnamed: 0,family,individual,population
0,1328,[NA12329],CEPH
1,2436,[NA19983],ASW
2,2484a,[NA20355],ASW
3,BB13,[HG02011],ACB
4,BB67,[HG02554],ACB


In [80]:
# One row per population: its code, the list of its families, and the parallel list of
# each family's individuals.
per_population_aggregations = {
    'population': lambda column: column.tolist()[0],
    'family': lambda column: column.tolist(),
    'individual': lambda column: column.tolist(),
}
populations = (
    singlets_and_families
    .groupby('population')
    .agg(per_population_aggregations)
    .reset_index(drop=True)
)
populations

Unnamed: 0,population,family,individual
0,ACB,"[BB13, BB67]","[[HG02011], [HG02554]]"
1,ASW,"[2436, 2484a]","[[NA19983], [NA20355]]"
2,BEB,"[BD19, HG03009]","[[HG03807], [HG03009]]"
3,CDX,[HG00864],[[HG00864]]
4,CEPH,[1328],[[NA12329]]
5,CHS,"[NA18534, SH032]","[[NA18534], [HG00512, HG00513, HG00514]]"
6,CLM,"[CLM03, CLM29, CLM51]","[[HG01114], [HG01352], [HG01457]]"
7,ESN,"[NG16, NG98]","[[HG02953], [HG03371]]"
8,FIN,"[HG00171, HG00268]","[[HG00171], [HG00268]]"
9,GBR,[HG00096],[[HG00096]]


In [81]:
def separate_family_singlet(row) -> pd.Series:
    """
    Split the families of one population row into multi-member families and singlets.

    :param row: row exposing 'family' (list of family IDs) and 'individual'
                (list of per-family member lists, parallel to 'family')
    :return: two-element Series: [IDs of families whose member list does not have
             exactly one entry, IDs of families with exactly one member]
    """
    family_ids = row['family']
    member_lists = row['individual']
    assert len(family_ids) == len(member_lists),\
        f'family array and individual array assumed to be same length, not so with {row}'

    paired = list(zip(family_ids, member_lists))
    multi_member = [family_id for family_id, persons in paired if len(persons) != 1]
    single_member = [family_id for family_id, persons in paired if len(persons) == 1]

    return pd.Series([multi_member, single_member])

In [82]:
# Per population: column 0 holds the multi-member families, column 1 the singlets.
separated_families_singlets = populations.apply(separate_family_singlet, axis=1)
separated_families_singlets

Unnamed: 0,0,1
0,[],"[BB13, BB67]"
1,[],"[2436, 2484a]"
2,[],"[BD19, HG03009]"
3,[],[HG00864]
4,[],[1328]
5,[SH032],[NA18534]
6,[],"[CLM03, CLM29, CLM51]"
7,[],"[NG16, NG98]"
8,[],"[HG00171, HG00268]"
9,[],[HG00096]


In [83]:
# Name the two columns returned by separate_family_singlet, in that order.
separated_families_singlets.columns = ['families', 'singlets']

In [84]:
# Replace the raw 'family'/'individual' list columns with the separated 'families'/'singlets'.
# This rebinds `populations`; re-create it from singlets_and_families before re-running,
# because a second run would find 'family' and 'individual' already dropped.
populations = pd.concat([populations, separated_families_singlets], axis=1).drop(['family', 'individual'], axis=1)
populations.head()

Unnamed: 0,population,families,singlets
0,ACB,[],"[BB13, BB67]"
1,ASW,[],"[2436, 2484a]"
2,BEB,[],"[BD19, HG03009]"
3,CDX,[],[HG00864]
4,CEPH,[],[1328]


In [86]:
# Inspect the final population table: code, families, singlets.
populations

Unnamed: 0,population,families,singlets
0,ACB,[],"[BB13, BB67]"
1,ASW,[],"[2436, 2484a]"
2,BEB,[],"[BD19, HG03009]"
3,CDX,[],[HG00864]
4,CEPH,[],[1328]
5,CHS,[SH032],[NA18534]
6,CLM,[],"[CLM03, CLM29, CLM51]"
7,ESN,[],"[NG16, NG98]"
8,FIN,[],"[HG00171, HG00268]"
9,GBR,[],[HG00096]


In [88]:
# ID column only, renamed to 'entity:population_id' for the upload.
populations_barebone = (
    populations[['population']]
    .copy(deep=True)
    .rename(columns={'population': 'entity:population_id'})
)
populations_barebone.head()

Unnamed: 0,entity:population_id
0,ACB
1,ASW
2,BEB
3,CDX
4,CEPH


In [89]:
# Upload the bare 'population' entities as TSV; memberships are filled in afterwards.
response = fapi.upload_entities(target_namespace, target_workspace,
                                entity_data=populations_barebone.to_csv(sep='\t', index=False),
                                model='flexible')
# Stop on failure instead of merely displaying False and letting the membership
# upload run against entities that were never created.
if not response.ok:
    raise FireCloudServerError(response.status_code, response.text)
response.ok

True

In [91]:
# Attach family and singlet memberships to each 'population' entity.
# Loop-local names are used so the notebook-level `families` / `singlets` DataFrames are
# no longer overwritten with plain lists, and columns are read by name, not position.
# Note: __upload_one_set derives the attribute name as f"{member_type}s", so family
# members are stored under 'familys'.
for _, population_row in populations.iterrows():
    set_uuid = population_row['population']

    for member_type, members_column in (('family', 'families'), ('singlet', 'singlets')):
        member_ids = population_row[members_column]
        if 0 < len(member_ids):
            try:
                __upload_one_set(target_namespace, target_workspace,
                                 etype='population', ename=set_uuid,
                                 member_type=member_type, members=member_ids,
                                 operation=MembersOperationType.RESET)
            except FireCloudServerError:
                logger.error(f"Failed to upload {member_type} membership information for {set_uuid}")
                raise