#PySpark Practice Notebook
Experimenting further with Spark Dataframes

In [None]:
#Create CSV files for the following Datasets

patients
patientId,firstName,lastName,age
101,Alice,Smith,30
102,Bob,Johnson,45
103,Charlie,Williams,50
104,John,Smith,78

visits
visitId,patientId,visitDuration
1,101,15
2,101,30
3,102,45
4,102,30
5,104,20
6,103,60
7,103,50
8,104,45
9,,45
10,101,30
11,103,40
12,,30
13,104,25
15,102,15

In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488490 sha256=e7f4c1ca6be5a32d225840dfd817536f7178af9fc7079fb468f0e6d4913b1048
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [3]:
#Import necessary spark libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
lti_spark = SparkSession.builder.appName("sound_lti").getOrCreate()



In [37]:
%%writefile visits.csv
visitId,patientId,visitDuration
1,101,15
2,101,30
3,102,45
4,102,30
5,104,20
6,103,60
7,103,50
8,104,45
9,,45
10,101,30
11,103,40
12,,30
13,104,25
15,102,15

Overwriting visits.csv


In [31]:
#Create dataframes from CSV files using enforced Schema
#Schems Directive [Names as string, everything else as integers]
%%writefile patients.csv
patientId,firstName,lastName,age
101,Alice,Smith,30
102,Bob,Johnson,45
103,Charlie,Williams,50
104,John,Smith,78



Overwriting patients.csv


In [32]:
pdf=lti_spark.read.csv("patients.csv",header=True)
vdf=lti_spark.read.csv("visits.csv",header=True)

In [27]:
#Display all the valid visits
vdf.filter(vdf.patientId!=" ").show()



+-------+---------+-------------+
|visitId|patientId|visitDuration|
+-------+---------+-------------+
|      1|      101|           15|
|      2|      101|           30|
|      3|      102|           45|
|      4|      102|           30|
|      5|      104|           20|
|      6|      103|           60|
|      7|      103|           50|
|      8|      104|           45|
|     10|      101|           30|
|     11|      103|           40|
|     13|      104|           25|
|     15|      102|           15|
+-------+---------+-------------+



In [28]:
#Fetch the total number of valid visits
vdf.filter(vdf.patientId!=" ").count()

12

In [29]:
#Get the total hours of patient visit
a=vdf.agg({"visitDuration":"sum"})
b=a.collect()[0][0]
vdf.withColumn("totalhour",vdf.visitDuration/60).show()

+-------+---------+-------------+------------------+
|visitId|patientId|visitDuration|         totalhour|
+-------+---------+-------------+------------------+
|      1|      101|           15|              0.25|
|      2|      101|           30|               0.5|
|      3|      102|           45|              0.75|
|      4|      102|           30|               0.5|
|      5|      104|           20|0.3333333333333333|
|      6|      103|           60|               1.0|
|      7|      103|           50|0.8333333333333334|
|      8|      104|           45|              0.75|
|      9|     NULL|           45|              0.75|
|     10|      101|           30|               0.5|
|     11|      103|           40|0.6666666666666666|
|     12|     NULL|           30|               0.5|
|     13|      104|           25|0.4166666666666667|
|     15|      102|           15|              0.25|
+-------+---------+-------------+------------------+



In [23]:
#Fetch the top two most visited patients on the basis of occurances
vdf.groupby("patientId").count().orderBy("count",ascending=False).show(2)

+---------+-----+
|patientId|count|
+---------+-----+
|      101|    3|
|      104|    3|
+---------+-----+
only showing top 2 rows



In [33]:
#Add a new column patientFullName & display complete dataframe
pdf.withColumn("full_name",concat_ws(" ","firstName","lastName")).show()

+---------+---------+--------+---+----------------+
|patientId|firstName|lastName|age|       full_name|
+---------+---------+--------+---+----------------+
|      101|    Alice|   Smith| 30|     Alice Smith|
|      102|      Bob| Johnson| 45|     Bob Johnson|
|      103|  Charlie|Williams| 50|Charlie Williams|
|      104|     John|   Smith| 78|      John Smith|
+---------+---------+--------+---+----------------+



In [40]:
vdf=vdf.withColumn("visitDuration",vdf["visitDuration"].cast("integer"))

In [48]:
#Display the name of patients with maximum visiting hours
pdf.join(vdf,"patientId","full").select(["firstName","visitDuration","patientId"]).groupby("patientId").sum("visitDuration").orderBy("sum(visitDuration)",ascending=False).show()

+---------+------------------+
|patientId|sum(visitDuration)|
+---------+------------------+
|      103|               150|
|      104|                90|
|      102|                90|
|      101|                75|
|     NULL|                75|
+---------+------------------+



In [55]:
#Find the patient with maximum visiting hours in a single visit
#vdf.printSchema()
pdf.join(vdf,"patientId","full").select("firstName","visitDuration").groupby("firstName").max("visitDuration").show()

+---------+------------------+
|firstName|max(visitDuration)|
+---------+------------------+
|     NULL|                45|
|  Charlie|                60|
|      Bob|                45|
|     John|                45|
|    Alice|                30|
+---------+------------------+



In [57]:
#Show the patient info according to age-seniority
pdf.sort("age",ascending=False).show()

+---------+---------+--------+---+
|patientId|firstName|lastName|age|
+---------+---------+--------+---+
|      104|     John|   Smith| 78|
|      103|  Charlie|Williams| 50|
|      102|      Bob| Johnson| 45|
|      101|    Alice|   Smith| 30|
+---------+---------+--------+---+

