# Import packages

In [None]:
import pyspark
import dxpy
import dxdata

In [None]:
# Spark initialization.
# getOrCreate() reuses the SparkContext already running in this kernel, so re-running this cell
# does not fail with "Cannot run multiple SparkContexts at once"; a new context is only created
# on the first run (or after Kernel -> Restart kernel).
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

In [None]:
# Discover the dispensed database automatically: search folder '/' for a 'database'
# object whose name matches the glob 'app*', and keep its describe output so the
# database name can be read from it.
dispensed_database = dxpy.find_one_data_object(
    classname='database',
    name='app*',
    name_mode='glob',
    folder='/',
    describe=True,
)
dispensed_database_name = dispensed_database['describe']['name']

# Discover the dataset the same way: an object of type 'Dataset' in folder '/'
# whose name matches the glob 'app*.dataset'. Only its id is needed later.
dispensed_dataset = dxpy.find_one_data_object(
    typename='Dataset',
    name='app*.dataset',
    name_mode='glob',
    folder='/',
)
dispensed_dataset_id = dispensed_dataset['id']

Access dataset

In [None]:
# Load the dataset whose id was discovered with dxpy (dispensed_dataset_id).
dataset = dxdata.load_dataset(id=dispensed_dataset_id)

Dataset 'entities' are virtual tables linked to one another.

In [None]:
# Show the entities of the loaded dataset (the bare expression is rendered as the cell output).
dataset.entities

In [None]:
# Pick the 'participant' entity; the field lookups and retrieval in later cells go through it.
participant = dataset['participant']

In [None]:
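# Alternative (disabled): list the field names explicitly instead of looking them up
# by UKB showcase field id as done in the cells below.
#field_names = ['eid', 'p31', 'p21022', 'p40005_i0', 'p93_i0_a0']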

Looking up fields by UKB Showcase field ID

In [None]:
# Returns all field objects for a given UKB showcase field id

def fields_for_id(field_id):
    """Return all field objects for a given UKB showcase field id, naturally sorted by name.

    Args:
        field_id: UKB showcase field id; it is converted with ``str()``, so both
            ``21003`` and ``'21003'`` are accepted.

    Returns:
        A list of the field objects returned by ``participant.find_fields`` whose
        names match ``p<field_id>`` with optional ``_i<N>`` and ``_a<N>`` suffixes.
        The list is ordered so that numeric parts compare as numbers
        (e.g. ``p102_i2`` sorts before ``p102_i10``).
    """
    # `re` replaces distutils.version.LooseVersion: distutils is deprecated
    # (PEP 632) and no longer ships with Python 3.12+.
    import re

    def natural_key(name):
        # re.split with a capturing group yields text at even positions and digit
        # runs at odd positions, so keys always compare str with str and int with int.
        tokens = re.split(r'(\d+)', name)
        return [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]

    field_id = str(field_id)
    fields = participant.find_fields(name_regex=r'^p{}(_i\d+)?(_a\d+)?$'.format(field_id))
    return sorted(fields, key=lambda f: natural_key(f.name))

# Returns all field names for a given UKB showcase field id

def field_names_for_id(field_id):
    """Return the names of all fields found for a UKB showcase field id.

    The names come from ``fields_for_id(field_id)`` and keep its ordering.
    """
    names = []
    for field in fields_for_id(field_id):
        names.append(field.name)
    return names

In [None]:
# UKB showcase field ids to retrieve.
field_ids = ['21003', '102']

# Start with 'eid', then append every field name found for each id, in the order
# of field_ids (the nested comprehension flattens the per-id lists).
field_names = ['eid'] + [
    name
    for field_id in field_ids
    for name in field_names_for_id(field_id)
]

Grabbing fields into a Spark DataFrame

In [None]:
# Retrieve the columns listed in field_names from the participant entity, using a
# connection opened by dxdata.connect() as the engine. The result is kept in df.
df = participant.retrieve_fields(names=field_names, engine=dxdata.connect())

In [None]:
# Preview the first five entries as a Pandas DataFrame; limit(5) is applied before
# toPandas(), so only those rows are converted.
df.limit(5).toPandas()

Saving results

In [None]:
# Save as a CSV file: convert the whole result to a Pandas DataFrame and write it to
# 'participants.csv' (relative path) without the index column.
# NOTE(review): toPandas() presumably collects every row into local memory — confirm
# this is acceptable before selecting many more fields.
df.toPandas().to_csv('participants.csv', index=False)

Writing results back to the project

In [None]:
%%bash
# Upload the participants.csv file written by the previous cell to the destination path '/'.
dx upload participants.csv --dest /