In [1]:
from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.functions as F

In [2]:
import os
# Ask PySpark to fetch the MongoDB Spark connector package when it launches.
# The variable is placed in the environment before the SparkSession is built.
pyspark_submit_args = "--packages org.mongodb.spark:mongo-spark-connector_2.11:2.4.0 pyspark-shell"
os.environ.update({"PYSPARK_SUBMIT_ARGS": pyspark_submit_args})

In [3]:
# Sanity check: display the submit arguments currently in the environment
# (evaluates to None when the variable is not set).
os.environ.get("PYSPARK_SUBMIT_ARGS")

'--packages org.mongodb.spark:mongo-spark-connector_2.11:2.4.0 pyspark-shell'

In [20]:
# Build (or reuse) the Spark session for this notebook.
# Note: `sc` holds a SparkSession, not a SparkContext.
# The default MongoDB input URI points at the `eeg.eeg_metadata` namespace.
sc = (
    SparkSession.builder
    .appName("myEEGSession")
    .config(
        "spark.mongodb.input.uri",
        "mongodb://52.40.36.24:27017/eeg.eeg_metadata?readPreference=primaryPreferred",
    )
    .getOrCreate()
)

In [21]:
# Quiet Spark's logging so that only FATAL messages are emitted.
# SparkContext.setLogLevel is the public API for overriding the log level, so
# there is no need to reach into the private py4j gateway (`sc._jvm`) and
# drive log4j by hand.
sc.sparkContext.setLogLevel("FATAL")

# Export collection directly

In [39]:
# Read from the session's default MongoDB input URI, keeping only the
# documents of a single participant.
# `sc` is already a SparkSession, so its own reader is used here rather than
# wrapping it in a SQLContext (whose constructor expects a SparkContext).
dfr = sc.read.format("com.mongodb.spark.sql.DefaultSource")

# Aggregation pipeline handed to the connector as the "pipeline" read option.
pipeline = [{'$match':{'participant_id': 'B9-1-2'}}]
df = dfr.option("pipeline", pipeline).load()

In [40]:
# Print the column names, types and nullability of `df`.
df.printSchema()

root
 |-- Delivery_type: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Gestational_Age: double (nullable = true)
 |-- Maternal_age: double (nullable = true)
 |-- Multiple_births: double (nullable = true)
 |-- Prematurity_Level: double (nullable = true)
 |-- Relative_size: string (nullable = true)
 |-- Weight_gms: double (nullable = true)
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- num_recording: integer (nullable = true)
 |-- participant_group: string (nullable = true)
 |-- participant_id: string (nullable = true)



In [44]:
# Preview the rows as a text table. Long cell values are cut off, as the
# `_id` column in the output below shows.
# NOTE(review): the original note here said columns whose values are arrays
# are not shown — confirm; this schema has no array column to check against.
df.show()

+-------------+------+---------------+------------+---------------+-----------------+-------------+----------+--------------------+-------------+-----------------+--------------+
|Delivery_type|Gender|Gestational_Age|Maternal_age|Multiple_births|Prematurity_Level|Relative_size|Weight_gms|                 _id|num_recording|participant_group|participant_id|
+-------------+------+---------------+------------+---------------+-----------------+-------------+----------+--------------------+-------------+-----------------+--------------+
|      Vaginal|     F|           31.3|        29.0|            2.0|              2.0|          AGA|    1515.0|[5c3ecaaa788b2eea...|            2|     24_month_EEG|        B9-1-2|
|      Vaginal|     F|           31.3|        29.0|            2.0|              2.0|          AGA|    1515.0|[5c3ecaaa788b2eea...|            3|     12_month_EEG|        B9-1-2|
+-------------+------+---------------+------------+---------------+-----------------+-------------+------

In [42]:
# Evaluating the bare name displays the DataFrame's repr: column names and
# types only, no rows.
df

DataFrame[Delivery_type: string, Gender: string, Gestational_Age: double, Maternal_age: double, Multiple_births: double, Prematurity_Level: double, Relative_size: string, Weight_gms: double, _id: struct<oid:string>, num_recording: int, participant_group: string, participant_id: string]

In [32]:
# Bring the rows to the driver as a list of Row objects. DataFrame.collect()
# does this directly, so the detour through `.rdd` is not needed.
# Every row is pulled into driver memory, so keep `df` filtered before
# running this cell.
df.collect()

[Row(Delivery_type='Vaginal', Gender='F', Gestational_Age=27.2, Maternal_age=26.0, Multiple_births=1.0, Prematurity_Level=1.0, Relative_size='SGA', Weight_gms=1070.0, _id=Row(oid='5c3eca9d788b2eea015d1c95'), num_recording=2, participant_group='06_month_EEG', participant_id='A1-1-1'),
 Row(Delivery_type='Urgent cesarean section', Gender='F', Gestational_Age=32.4, Maternal_age=21.0, Multiple_births=1.0, Prematurity_Level=3.0, Relative_size='AGA', Weight_gms=1845.0, _id=Row(oid='5c3eca9e788b2eea015d1c9a'), num_recording=1, participant_group='06_month_EEG', participant_id='A10-1-1'),
 Row(Delivery_type='Emergent cesarean section', Gender='F', Gestational_Age=24.0, Maternal_age=39.0, Multiple_births=1.0, Prematurity_Level=1.0, Relative_size='AGA', Weight_gms=700.0, _id=Row(oid='5c3eca9e788b2eea015d1c9f'), num_recording=1, participant_group='06_month_EEG', participant_id='A11-1-1'),
 Row(Delivery_type='Urgent cesarean section', Gender='F', Gestational_Age=32.0, Maternal_age=35.0, Multiple_bi

In [27]:
# Show only the two identifying columns.
# NOTE(review): the saved output lists several participant_ids, so it appears
# to come from a run (In [27]) made before the single-participant filter
# (In [39]) — re-run to refresh.
df.select("participant_id", "participant_group").show()

+--------------+-----------------+
|participant_id|participant_group|
+--------------+-----------------+
|       B14-1-1|     24_month_EEG|
|        B9-1-2|     24_month_EEG|
|        B9-2-2|     24_month_EEG|
|       A14-1-1|     12_month_EEG|
|        B1-1-3|     12_month_EEG|
|        B1-2-3|     12_month_EEG|
|        B1-3-3|     12_month_EEG|
|       B12-1-1|     12_month_EEG|
|       B13-1-1|     12_month_EEG|
|       B16-1-2|     12_month_EEG|
+--------------+-----------------+



In [43]:
# Number of rows in `df`.
df.count()

2

In [36]:
# Second reader: the "uri" option is set on this read so that it targets the
# `eeg.eeg_raw` namespace.
dfr_2 = (
    sc.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option(
        "uri",
        "mongodb://52.40.36.24:27017/eeg.eeg_raw?readPreference=primaryPreferred",
    )
)
# Keep only one participant's documents (this rebinds `pipeline`).
pipeline = [{'$match':{'participant_id': 'B14-1-1'}}]
df_raw = dfr_2.option("pipeline", pipeline).load()

In [37]:
# Print the column names, types and nullability of `df_raw`.
df_raw.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- admincode: string (nullable = true)
 |-- annotations_in_file: integer (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- datarecord_duration: double (nullable = true)
 |-- datarecords_in_file: integer (nullable = true)
 |-- digital_max: integer (nullable = true)
 |-- digital_min: integer (nullable = true)
 |-- dimension: string (nullable = true)
 |-- equipment: string (nullable = true)
 |-- file_duration: integer (nullable = true)
 |-- file_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- handle: integer (nullable = true)
 |-- label: string (nullable = true)
 |-- participant_group: string (nullable = true)
 |-- participant_id: string (nullable = true)
 |-- patient: string (nullable = true)
 |-- patient_additional: string (nullable = true)
 |-- patientcode: string (nullable = true)
 |-- patientname: string (nullable = true)
 |-- physical_max: double (nullable = true

In [38]:
# Number of rows in `df_raw`.
df_raw.count()

16

In [13]:
# Fetch at most one row of the `raw` column, as a list of Row objects.
# DataFrame.take() is called directly; converting to an RDD first adds nothing.
x = df_raw.select('raw').take(1)

In [19]:
# Stop the Spark session now that the notebook is done with it.
sc.stop()