In [1]:
#PROLOGUE
# Import packages
import pyspark
import dxpy
import dxdata
import pandas as pd
import re

# dxdata version in use. As a bare expression in the middle of a cell this is
# evaluated but not rendered; move it to the end of a cell to see the value.
dxdata.__version__

# Spark initialization. getOrCreate() hands back the context that is already
# running in this kernel (or starts one), so re-running this cell no longer
# fails with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)


# Find the dispensed database object in folder '/' (name matched as a glob)
# and keep the name reported in its describe payload.
dispensed_database = dxpy.find_one_data_object(
    classname='database', name='app*', name_mode='glob', folder='/', describe=True
)
dispensed_database_name = dispensed_database['describe']['name']

# Find the dispensed Dataset object the same way and keep its object id.
dispensed_dataset = dxpy.find_one_data_object(
    typename='Dataset', name='app*.dataset', name_mode='glob', folder='/'
)
dispensed_dataset_id = dispensed_dataset['id']

# Load the dataset by id and pick out its 'participant' entity.
dataset = dxdata.load_dataset(id=dispensed_dataset_id)
participant = dataset['participant']

# Whitelist cohort that has already been QCed:
# 1. FAST & FIRST & FreeSurfer
# 2. no sex chromosome aneuploidy
# 3. genetic ethnicity is Caucasian
# 4. genetic kinship to other participants is not "ten or more third-degree relatives identified" (if kinship is restricted to 0, many will be eliminated)
# 5. not an outlier for heterozygosity or missing rate
cohort = dxdata.load_cohort(folder="/cohorts/", name="whitelist_strict")
# Primary entity of the dataset; the retrieval loop further down calls
# retrieve_fields() on it with cohort.sql as the row filter.
main_entity = dataset.primary_entity

In [2]:
# select fields
# 31 - sex
# 21022 - age at recruitment
# 22001 - genetic sex
# 22019 - sex chromosome aneuploidy 
# 22021 - genetic kinship to other participants
# 22027 - outliers for heterozygosity or missing rate (968 participants of poor genotype quality)
# 189   - townsend deprivation index at recruitment - error: sd=0 in regenie step1
# 22006 - genetic ethnic grouping - error: sd=0 in regenie step1
# 22009 - genetic principal components
def field_names_for_ids(field_ids):
    """Return the names of all participant fields belonging to the given field ids.

    Each id is turned into the column prefix ``p<id>`` and matched together
    with its optional instance (``_i<n>``) and array (``_a<n>``) suffixes via
    ``participant.find_fields``.

    Args:
        field_ids: iterable of field ids, given as ints or strings.

    Returns:
        List of matching field names in natural order: runs of digits compare
        as numbers, so ``p189`` sorts after ``p31`` and ``p22009_a2`` before
        ``p22009_a10``. Ids without any matching field contribute nothing.
    """
    def natural_key(name):
        # re.split with a capturing group alternates text (even positions) and
        # digit runs (odd positions), so two keys never compare str with int.
        # This replaces distutils' LooseVersion, which is gone in Python 3.12.
        tokens = re.split(r'(\d+)', name)
        return [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]

    fields = []
    for field_id in field_ids:
        pattern = r'^p{}(_i\d+)?(_a\d+)?$'.format(field_id)
        fields.extend(participant.find_fields(name_regex=pattern))
    return sorted((field.name for field in fields), key=natural_key)

# One table per FIRST phenotype field, keyed by field id:
#   d[field_id]   - frame returned by retrieve_fields() for the cohort
#   pdf[field_id] - the same table converted with toPandas()
d = {}
pdf = {}
FIRST_range = range(25011, 25025)  # field ids 25011..25024 inclusive

# Covariate / QC fields retrieved alongside every phenotype (described in the
# comment list at the top of this cell).
covariate_ids = ['31', '189', '21022', '22001', '22019', '22021', '22027', '22009']

# Open the query engine once and reuse it for every retrieval instead of
# calling dxdata.connect() again on each loop iteration.
engine = dxdata.connect()

for FIRST_id in FIRST_range:
    field_ids = covariate_ids + [FIRST_id]
    field_names = ['eid'] + field_names_for_ids(field_ids)
    d[FIRST_id] = main_entity.retrieve_fields(names=field_names, filter_sql=cohort.sql, engine=engine)
    pdf[FIRST_id] = d[FIRST_id].toPandas()

pdf[25011].head(10)

Unnamed: 0,eid,p31,p189,p21022,p22001,p22009_a1,p22009_a2,p22009_a3,p22009_a4,p22009_a5,...,p22009_a36,p22009_a37,p22009_a38,p22009_a39,p22009_a40,p22019,p22021,p22027,p25011_i2,p25011_i3
0,1000107,0,-2.19084,64,0,-10.3224,3.43553,-4.80312,3.12216,7.66955,...,-5.14209,2.66901,-4.07529,-0.22906,2.18121,,0,,6834.0,
1,1000199,1,-3.01883,63,1,-9.38779,3.69347,-1.44656,-0.770351,0.518569,...,1.36982,-1.96747,-7.26091,-0.539832,3.08557,,0,,8367.0,
2,1004260,0,-0.07086,60,0,-12.4291,4.23242,-1.28228,5.20862,6.06866,...,-1.08125,-2.58157,-2.45652,-1.67039,-1.17261,,0,,7584.0,
3,1005693,1,-2.02879,64,1,-12.5637,5.01062,0.385386,-1.83014,-0.142879,...,5.26375,1.38544,1.09697,4.44081,-3.69251,,0,,8189.0,
4,1007247,0,-5.1054,66,0,-14.4005,0.258867,1.37215,6.62336,-4.23639,...,0.418678,3.77214,1.64619,-1.10726,-7.64322,,0,,7137.0,
5,1011109,0,-2.34627,49,0,-12.5958,5.90764,-1.72736,1.9688,-5.6551,...,-9.54361,6.11407,-0.540703,-0.427796,-7.29792,,0,,7912.0,
6,1012364,1,-1.99924,65,1,-12.6569,3.93515,-1.15924,1.0294,-7.63154,...,-5.11869,-3.34524,-4.81801,-7.0687,-0.125541,,0,,7097.0,
7,1014412,0,-2.22032,52,0,-12.9918,4.05698,-1.68971,5.66593,9.80612,...,2.70361,0.344521,2.67441,-0.873471,2.2038,,0,,8849.0,
8,1015078,0,-4.266,59,0,-11.1765,2.33954,-0.370975,0.844696,2.38532,...,-2.30561,3.44098,2.28155,-1.36966,4.05663,,0,,7705.0,
9,1015486,0,-4.26072,65,0,-12.7825,3.66302,0.257334,2.70395,-2.50511,...,0.422122,1.32738,-1.76446,-2.87155,-0.511488,,0,,7418.0,


In [3]:
# Sanity check: (rows, columns) of the last phenotype table (field 25024)
# before any filtering is applied.
pdf[25024].shape

(24361, 50)

In [4]:
# Keep only the participants whose recorded sex (p31) agrees with their
# genetic sex (p22001).
for first_id in list(pdf):
    table = pdf[first_id]
    sex_agrees = table['p31'] == table['p22001']
    pdf[first_id] = table[sex_agrees]

pdf[25023].shape

(24350, 50)

In [7]:
# Give the covariate / QC columns readable names.
readable_names = {
    'eid': 'IID',
    'p31': 'sex',
    'p189': 'deprivation_index',
    'p21022': 'age',
    'p22001': 'genetic_sex',
    'p22019': 'sex_chromosome_aneuploidy',
    'p22021': 'kinship_to_other_participants',
    'p22027': 'outliers_for_heterozygosity_or_missing',
}

for name, df in pdf.items():
    # FID is a required input column for regenie; it is a copy of IID here.
    renamed = df.rename(columns=readable_names).assign(FID=lambda t: t['IID'])
    # Put FID first by name. The previous "move the last column to the front"
    # relied on FID having just been appended, so re-running the cell pushed a
    # different column to the front each time.
    ordered_cols = ['FID'] + [col for col in renamed.columns if col != 'FID']
    pdf[name] = renamed[ordered_cols]

pdf[25023].head(10)

Unnamed: 0,FID,IID,sex,deprivation_index,age,genetic_sex,pc1,pc2,pc3,pc4,...,pc36,pc37,pc38,pc39,pc40,sex_chromosome_aneuploidy,kinship_to_other_participants,outliers_for_heterozygosity_or_missing,p25023_i2,p25023_i3
0,1000107,1000107,0,-2.19084,64,0,-10.3224,3.43553,-4.80312,3.12216,...,-5.14209,2.66901,-4.07529,-0.22906,2.18121,,0,,464.0,
1,1000199,1000199,1,-3.01883,63,1,-9.38779,3.69347,-1.44656,-0.770351,...,1.36982,-1.96747,-7.26091,-0.539832,3.08557,,0,,367.0,
2,1004260,1004260,0,-0.07086,60,0,-12.4291,4.23242,-1.28228,5.20862,...,-1.08125,-2.58157,-2.45652,-1.67039,-1.17261,,0,,557.0,
3,1005693,1005693,1,-2.02879,64,1,-12.5637,5.01062,0.385386,-1.83014,...,5.26375,1.38544,1.09697,4.44081,-3.69251,,0,,462.0,
4,1007247,1007247,0,-5.1054,66,0,-14.4005,0.258867,1.37215,6.62336,...,0.418678,3.77214,1.64619,-1.10726,-7.64322,,0,,456.0,
5,1011109,1011109,0,-2.34627,49,0,-12.5958,5.90764,-1.72736,1.9688,...,-9.54361,6.11407,-0.540703,-0.427796,-7.29792,,0,,507.0,
6,1012364,1012364,1,-1.99924,65,1,-12.6569,3.93515,-1.15924,1.0294,...,-5.11869,-3.34524,-4.81801,-7.0687,-0.125541,,0,,627.0,
7,1014412,1014412,0,-2.22032,52,0,-12.9918,4.05698,-1.68971,5.66593,...,2.70361,0.344521,2.67441,-0.873471,2.2038,,0,,676.0,
8,1015078,1015078,0,-4.266,59,0,-11.1765,2.33954,-0.370975,0.844696,...,-2.30561,3.44098,2.28155,-1.36966,4.05663,,0,,224.0,
9,1015486,1015486,0,-4.26072,65,0,-12.7825,3.66302,0.257334,2.70395,...,0.422122,1.32738,-1.76446,-2.87155,-0.511488,,0,,478.0,


In [8]:
# Shorten the genetic principal-component columns: p22009_a<k> becomes pc<k>.
for first_id in list(pdf):
    pdf[first_id] = pdf[first_id].rename(
        columns=lambda col: re.sub('p22009_a', 'pc', col)
    )

pdf[25022].head(10)

Unnamed: 0,FID,IID,sex,deprivation_index,age,genetic_sex,pc1,pc2,pc3,pc4,...,pc36,pc37,pc38,pc39,pc40,sex_chromosome_aneuploidy,kinship_to_other_participants,outliers_for_heterozygosity_or_missing,p25022_i2,p25022_i3
0,1000107,1000107,0,-2.19084,64,0,-10.3224,3.43553,-4.80312,3.12216,...,-5.14209,2.66901,-4.07529,-0.22906,2.18121,,0,,1344.0,
1,1000199,1000199,1,-3.01883,63,1,-9.38779,3.69347,-1.44656,-0.770351,...,1.36982,-1.96747,-7.26091,-0.539832,3.08557,,0,,1158.0,
2,1004260,1004260,0,-0.07086,60,0,-12.4291,4.23242,-1.28228,5.20862,...,-1.08125,-2.58157,-2.45652,-1.67039,-1.17261,,0,,1515.0,
3,1005693,1005693,1,-2.02879,64,1,-12.5637,5.01062,0.385386,-1.83014,...,5.26375,1.38544,1.09697,4.44081,-3.69251,,0,,1148.0,
4,1007247,1007247,0,-5.1054,66,0,-14.4005,0.258867,1.37215,6.62336,...,0.418678,3.77214,1.64619,-1.10726,-7.64322,,0,,1583.0,
5,1011109,1011109,0,-2.34627,49,0,-12.5958,5.90764,-1.72736,1.9688,...,-9.54361,6.11407,-0.540703,-0.427796,-7.29792,,0,,1213.0,
6,1012364,1012364,1,-1.99924,65,1,-12.6569,3.93515,-1.15924,1.0294,...,-5.11869,-3.34524,-4.81801,-7.0687,-0.125541,,0,,1206.0,
7,1014412,1014412,0,-2.22032,52,0,-12.9918,4.05698,-1.68971,5.66593,...,2.70361,0.344521,2.67441,-0.873471,2.2038,,0,,1215.0,
8,1015078,1015078,0,-4.266,59,0,-11.1765,2.33954,-0.370975,0.844696,...,-2.30561,3.44098,2.28155,-1.36966,4.05663,,0,,1312.0,
9,1015486,1015486,0,-4.26072,65,0,-12.7825,3.66302,0.257334,2.70395,...,0.422122,1.32738,-1.76446,-2.87155,-0.511488,,0,,1489.0,


In [9]:
# Rename the instance-2 column of each phenotype to 'pheno<field id>' and
# discard its instance-3 column when one is present.
for first_id in list(pdf):
    instance2_col = 'p{}_i2'.format(first_id)
    instance3_col = 'p{}_i3'.format(first_id)
    pdf[first_id] = (
        pdf[first_id]
        .rename(columns={instance2_col: 'pheno{}'.format(first_id)})
        .drop(columns=[instance3_col], errors='ignore')
    )

pdf[25011].head(10)

Unnamed: 0,FID,IID,sex,deprivation_index,age,genetic_sex,pc1,pc2,pc3,pc4,...,pc35,pc36,pc37,pc38,pc39,pc40,sex_chromosome_aneuploidy,kinship_to_other_participants,outliers_for_heterozygosity_or_missing,pheno25011
0,1000107,1000107,0,-2.19084,64,0,-10.3224,3.43553,-4.80312,3.12216,...,-0.032479,-5.14209,2.66901,-4.07529,-0.22906,2.18121,,0,,6834.0
1,1000199,1000199,1,-3.01883,63,1,-9.38779,3.69347,-1.44656,-0.770351,...,0.491396,1.36982,-1.96747,-7.26091,-0.539832,3.08557,,0,,8367.0
2,1004260,1004260,0,-0.07086,60,0,-12.4291,4.23242,-1.28228,5.20862,...,-3.61305,-1.08125,-2.58157,-2.45652,-1.67039,-1.17261,,0,,7584.0
3,1005693,1005693,1,-2.02879,64,1,-12.5637,5.01062,0.385386,-1.83014,...,-1.62101,5.26375,1.38544,1.09697,4.44081,-3.69251,,0,,8189.0
4,1007247,1007247,0,-5.1054,66,0,-14.4005,0.258867,1.37215,6.62336,...,2.86727,0.418678,3.77214,1.64619,-1.10726,-7.64322,,0,,7137.0
5,1011109,1011109,0,-2.34627,49,0,-12.5958,5.90764,-1.72736,1.9688,...,4.4269,-9.54361,6.11407,-0.540703,-0.427796,-7.29792,,0,,7912.0
6,1012364,1012364,1,-1.99924,65,1,-12.6569,3.93515,-1.15924,1.0294,...,-5.34841,-5.11869,-3.34524,-4.81801,-7.0687,-0.125541,,0,,7097.0
7,1014412,1014412,0,-2.22032,52,0,-12.9918,4.05698,-1.68971,5.66593,...,-0.513259,2.70361,0.344521,2.67441,-0.873471,2.2038,,0,,8849.0
8,1015078,1015078,0,-4.266,59,0,-11.1765,2.33954,-0.370975,0.844696,...,-2.5329,-2.30561,3.44098,2.28155,-1.36966,4.05663,,0,,7705.0
9,1015486,1015486,0,-4.26072,65,0,-12.7825,3.66302,0.257334,2.70395,...,0.131047,0.422122,1.32738,-1.76446,-2.87155,-0.511488,,0,,7418.0


In [10]:
# get WES
path_to_family_file = '/mnt/project/Bulk/Exome sequences/Population level exome OQFE variants, PLINK format - final release/ukb23158_c1_b0_v1.fam'
# r'\s+' is a raw string (the bare '\s' literal is an invalid escape sequence)
# and treats any run of spaces/tabs as a single delimiter.
plink_fam_df = pd.read_csv(path_to_family_file, delimiter=r'\s+', dtype='object',
                           names=['FID', 'IID', 'Father ID', 'Mother ID', 'sex', 'Pheno'], engine='python')

# Restrict every phenotype table to the participants listed in the 480k WES
# .fam file. Only the sample ids are needed, so a membership test is used
# rather than a join: no .fam columns are pulled in (they were all dropped
# again anyway) and a repeated IID in the .fam file cannot duplicate rows.
wes_sample_ids = set(plink_fam_df['IID'])

# QC-only columns that are not needed downstream.
qc_only_columns = ['genetic_sex', 'sex_chromosome_aneuploidy',
                   'outliers_for_heterozygosity_or_missing', 'kinship_to_other_participants']

for name, df in pdf.items():
    # .fam ids are read as strings; compare as strings whatever dtype IID has.
    in_wes = df['IID'].astype(str).isin(wes_sample_ids)
    pdf[name] = df[in_wes].drop(columns=qc_only_columns, errors='ignore')

pdf[25023].head(10)

Unnamed: 0,FID,IID,sex,deprivation_index,age,pc1,pc2,pc3,pc4,pc5,...,pc32,pc33,pc34,pc35,pc36,pc37,pc38,pc39,pc40,pheno25023
0,1000107,1000107,0,-2.19084,64,-10.3224,3.43553,-4.80312,3.12216,7.66955,...,0.058821,-1.26867,0.102561,-0.032479,-5.14209,2.66901,-4.07529,-0.22906,2.18121,464.0
1,1000199,1000199,1,-3.01883,63,-9.38779,3.69347,-1.44656,-0.770351,0.518569,...,-2.23428,2.84579,4.02455,0.491396,1.36982,-1.96747,-7.26091,-0.539832,3.08557,367.0
2,1004260,1004260,0,-0.07086,60,-12.4291,4.23242,-1.28228,5.20862,6.06866,...,-1.73334,-5.06496,-4.72318,-3.61305,-1.08125,-2.58157,-2.45652,-1.67039,-1.17261,557.0
3,1005693,1005693,1,-2.02879,64,-12.5637,5.01062,0.385386,-1.83014,-0.142879,...,1.73379,2.13079,4.87249,-1.62101,5.26375,1.38544,1.09697,4.44081,-3.69251,462.0
4,1007247,1007247,0,-5.1054,66,-14.4005,0.258867,1.37215,6.62336,-4.23639,...,3.00333,-5.39546,-1.63352,2.86727,0.418678,3.77214,1.64619,-1.10726,-7.64322,456.0
5,1011109,1011109,0,-2.34627,49,-12.5958,5.90764,-1.72736,1.9688,-5.6551,...,-3.48166,2.13947,-0.201932,4.4269,-9.54361,6.11407,-0.540703,-0.427796,-7.29792,507.0
6,1012364,1012364,1,-1.99924,65,-12.6569,3.93515,-1.15924,1.0294,-7.63154,...,-7.82758,2.24489,0.037451,-5.34841,-5.11869,-3.34524,-4.81801,-7.0687,-0.125541,627.0
7,1014412,1014412,0,-2.22032,52,-12.9918,4.05698,-1.68971,5.66593,9.80612,...,1.39593,-4.74284,0.773784,-0.513259,2.70361,0.344521,2.67441,-0.873471,2.2038,676.0
8,1015078,1015078,0,-4.266,59,-11.1765,2.33954,-0.370975,0.844696,2.38532,...,5.22898,3.93259,-1.97791,-2.5329,-2.30561,3.44098,2.28155,-1.36966,4.05663,224.0
9,1015486,1015486,0,-4.26072,65,-12.7825,3.66302,0.257334,2.70395,-2.50511,...,0.58889,-3.71645,3.89807,0.131047,0.422122,1.32738,-1.76446,-2.87155,-0.511488,478.0


In [11]:
# (rows, columns) after restricting to the WES samples and dropping the
# QC-only columns.
pdf[25023].shape

(23585, 46)

In [12]:
# Write one '<field id>.csv' per phenotype table into the working directory:
# missing values as NA, no index column, quoting=3 (i.e. nothing is quoted).
for first_id in pdf:
    csv_name = str(first_id) + '.csv'
    pdf[first_id].to_csv(csv_name, index=False, na_rep='NA', quoting=3)

In [13]:
%%bash

# Show the first lines of one of the per-phenotype CSV files.
head 25011.csv 

FID,IID,sex,deprivation_index,age,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,pc11,pc12,pc13,pc14,pc15,pc16,pc17,pc18,pc19,pc20,pc21,pc22,pc23,pc24,pc25,pc26,pc27,pc28,pc29,pc30,pc31,pc32,pc33,pc34,pc35,pc36,pc37,pc38,pc39,pc40,pheno25011
1000107,1000107,0,-2.19084,64,-10.3224,3.43553,-4.80312,3.12216,7.66955,1.73731,-3.35441,0.667715,4.2336,-2.81616,-5.98459,2.44161,-1.38907,6.17117,-0.881941,1.35898,2.33466,3.02209,0.150301,0.312959,-2.69258,-2.57436,-0.382614,-1.51296,2.09896,-2.7798,2.00963,-2.73047,-4.52226,-0.176631,-5.72564,0.0588211,-1.26867,0.102561,-0.032479,-5.14209,2.66901,-4.07529,-0.22906,2.18121,6834.0
1000199,1000199,1,-3.01883,63,-9.38779,3.69347,-1.44656,-0.770351,0.518569,0.909801,0.754956,1.82701,4.20984,-3.61143,-4.01316,1.33684,-3.33941,1.74009,0.121887,0.675944,2.22411,3.77217,1.99826,5.33941,-0.395441,5.96525,-4.03808,3.12201,-5.02048,0.157766,3.07114,0.604188,-3.63791,-0.591159,4.86021,-2.23428,2.84579,4.02455,0.491396,1.36982,-1.96747,-7.26091,-0.539832,3.08557,8

In [14]:
# put all csv files in a list
import glob

# Accept only the files this notebook wrote ('<field id>.csv' for each key of
# pdf): a bare "*.csv" glob would also pick up unrelated CSVs in the working
# directory. glob returns names in arbitrary order, so sort them to make the
# positions in pheno_list reproducible.
expected_files = {str(first_id) + '.csv' for first_id in pdf}
all_files = sorted(filename for filename in glob.glob("*.csv") if filename in expected_files)

pheno_list = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

In [15]:
# Spot-check one of the tables that were read back from disk.
pheno_list[3]

Unnamed: 0,FID,IID,sex,deprivation_index,age,pc1,pc2,pc3,pc4,pc5,...,pc32,pc33,pc34,pc35,pc36,pc37,pc38,pc39,pc40,pheno25014
0,1000107,1000107,0,-2.19084,64,-10.32240,3.435530,-4.803120,3.122160,7.669550,...,0.058821,-1.268670,0.102561,-0.032479,-5.142090,2.669010,-4.07529,-0.229060,2.18121,3191.0
1,1000199,1000199,1,-3.01883,63,-9.38779,3.693470,-1.446560,-0.770351,0.518569,...,-2.234280,2.845790,4.024550,0.491396,1.369820,-1.967470,-7.26091,-0.539832,3.08557,3938.0
2,1004260,1004260,0,-0.07086,60,-12.42910,4.232420,-1.282280,5.208620,6.068660,...,-1.733340,-5.064960,-4.723180,-3.613050,-1.081250,-2.581570,-2.45652,-1.670390,-1.17261,3142.0
3,1005693,1005693,1,-2.02879,64,-12.56370,5.010620,0.385386,-1.830140,-0.142879,...,1.733790,2.130790,4.872490,-1.621010,5.263750,1.385440,1.09697,4.440810,-3.69251,3605.0
4,1007247,1007247,0,-5.10540,66,-14.40050,0.258867,1.372150,6.623360,-4.236390,...,3.003330,-5.395460,-1.633520,2.867270,0.418678,3.772140,1.64619,-1.107260,-7.64322,3666.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23580,6017278,6017278,1,5.61726,49,-12.89360,3.863100,1.362490,6.045840,1.934850,...,-0.153978,-2.722730,4.539870,-3.439560,-4.912080,5.023050,2.79718,-1.957980,-4.95931,3686.0
23581,6019418,6019418,1,0.61056,46,-14.35820,4.593890,-3.586570,5.142670,7.384340,...,-0.004897,0.643796,4.351150,0.410045,-3.597580,3.635660,-2.25041,-1.406710,-1.61883,3737.0
23582,6020273,6020273,1,-2.29283,62,-10.74110,1.652630,-3.641710,-1.671120,-5.966020,...,0.500061,-1.368080,-0.748386,-4.305680,3.525770,-4.163110,1.21038,-0.012954,5.38395,3613.0
23583,6020987,6020987,0,-2.35246,44,-11.33090,5.622410,-0.733846,-2.241800,-10.378600,...,-1.702530,1.929710,-1.872950,1.641710,-1.034200,-4.598040,1.93335,-0.707340,2.28782,3858.0


In [18]:
# merge all phenos into a phenotable
from functools import reduce

if not pheno_list:
    raise ValueError("pheno_list is empty - no phenotype CSV files were loaded")

# Order the tables by their phenotype column name so that the column order of
# the merged table does not depend on the order in which the files were read.
tables = sorted(
    pheno_list,
    key=lambda frame: [col for col in frame.columns if col.startswith('pheno')],
)

# Every non-phenotype column (FID, IID, sex, deprivation_index, age, pc1..pc40)
# is shared by all tables and serves as the merge key. Deriving the list from
# the data replaces the 45 hand-typed column names.
merge_keys = [col for col in tables[0].columns if not col.startswith('pheno')]

pheno_merged = reduce(
    lambda left, right: pd.merge(left, right, on=merge_keys, how='outer'),
    tables,
)
pheno_merged.head(10)

Unnamed: 0,FID,IID,sex,deprivation_index,age,pc1,pc2,pc3,pc4,pc5,...,pheno25015,pheno25016,pheno25017,pheno25018,pheno25019,pheno25020,pheno25021,pheno25022,pheno25023,pheno25024
0,1000107,1000107,0,-2.19084,64,-10.3224,3.43553,-4.80312,3.12216,7.66955,...,3831.0,3712.0,1428.0,1588.0,3454.0,3771.0,1360.0,1344.0,464.0,316.0
1,1000199,1000199,1,-3.01883,63,-9.38779,3.69347,-1.44656,-0.770351,0.518569,...,5528.0,4815.0,1749.0,2092.0,2251.0,2890.0,625.0,1158.0,367.0,235.0
2,1004260,1004260,0,-0.07086,60,-12.4291,4.23242,-1.28228,5.20862,6.06866,...,4692.0,4587.0,2020.0,1815.0,3621.0,3990.0,1546.0,1515.0,557.0,459.0
3,1005693,1005693,1,-2.02879,64,-12.5637,5.01062,0.385386,-1.83014,-0.142879,...,4913.0,4678.0,1798.0,2013.0,4009.0,3420.0,1508.0,1148.0,462.0,452.0
4,1007247,1007247,0,-5.1054,66,-14.4005,0.258867,1.37215,6.62336,-4.23639,...,4330.0,4282.0,1621.0,1584.0,3559.0,3626.0,1563.0,1583.0,456.0,277.0
5,1011109,1011109,0,-2.34627,49,-12.5958,5.90764,-1.72736,1.9688,-5.6551,...,5271.0,5511.0,1550.0,1816.0,3506.0,4533.0,1351.0,1213.0,507.0,494.0
6,1012364,1012364,1,-1.99924,65,-12.6569,3.93515,-1.15924,1.0294,-7.63154,...,4232.0,4222.0,1893.0,1869.0,3672.0,3598.0,1242.0,1206.0,627.0,418.0
7,1014412,1014412,0,-2.22032,52,-12.9918,4.05698,-1.68971,5.66593,9.80612,...,4999.0,5379.0,1976.0,2141.0,4170.0,4160.0,1687.0,1215.0,676.0,553.0
8,1015078,1015078,0,-4.266,59,-11.1765,2.33954,-0.370975,0.844696,2.38532,...,4141.0,4251.0,1486.0,1750.0,3620.0,3699.0,1087.0,1312.0,224.0,356.0
9,1015486,1015486,0,-4.26072,65,-12.7825,3.66302,0.257334,2.70395,-2.50511,...,4235.0,4433.0,1607.0,1742.0,3855.0,3823.0,1410.0,1489.0,478.0,442.0


In [20]:
# Write the merged table as a tab-separated file: missing values as NA, no
# index column, and quoting=3 (csv.QUOTE_NONE) so that no field is quoted.
pheno_merged.to_csv("FIRSTtable.phe", na_rep='NA', sep='\t', index=False, quoting=3)

In [None]:
%%bash

# Upload the .phe table and the CSV files from the working directory to the
# whitelist_strict/PartA/ folder of the project.
# NOTE(review): the globs match every *phe / *csv file in the working
# directory, not only the ones produced by this notebook.
dx upload *phe --path "project-GFv8ZJQJGgfVjGjpJ7vB2X1p:whitelist_strict/PartA/"
dx upload *csv --path "project-GFv8ZJQJGgfVjGjpJ7vB2X1p:whitelist_strict/PartA/"