In [1]:
import pyspark
# Build the Spark configuration and hand it to the session builder so that any
# settings added to `myConf` actually take effect (previously it was created
# but never used).
myConf = pyspark.SparkConf()
spark = pyspark.sql.SparkSession.builder.config(conf=myConf).getOrCreate()

# Read the Case dataset

In [17]:
# Load Case.csv as a DataFrame: comma-separated, first line is the header,
# and column types are inferred from the data.
cases = (
    spark.read
    .format("csv")
    .options(sep=",", inferSchema="true", header="true")
    .load("coronavirusdataset/Case.csv")
)

In [5]:
# Preview the first five rows of the loaded table.
cases.show(n=5)

+-------+--------+-------------+-----+--------------------+---------+---------+----------+
|case_id|province|         city|group|      infection_case|confirmed| latitude| longitude|
+-------+--------+-------------+-----+--------------------+---------+---------+----------+
|1000001|   Seoul|   Yongsan-gu| true|       Itaewon Clubs|       72|37.538621|126.992652|
|1000002|   Seoul|      Guro-gu| true| Guro-gu Call Center|       98|37.508163|126.884387|
|1000003|   Seoul|Dongdaemun-gu| true|       Dongan Church|       20|37.592888|127.056766|
|1000004|   Seoul|      Guro-gu| true|Manmin Central Ch...|       41|37.481059|126.894343|
|1000005|   Seoul| Eunpyeong-gu| true|Eunpyeong St. Mar...|       14| 37.63369|  126.9165|
+-------+--------+-------------+-----+--------------------+---------+---------+----------+
only showing top 5 rows



In [6]:
# List the DataFrame's column names.
cases.columns

['case_id',
 'province',
 'city',
 'group',
 'infection_case',
 'confirmed',
 'latitude',
 'longitude']

# Spark DataFrame to pandas

In [7]:
# Take only the first ten rows on the Spark side, then convert that small
# slice to a pandas DataFrame for display.
first_ten = cases.limit(10)
first_ten.toPandas()

Unnamed: 0,case_id,province,city,group,infection_case,confirmed,latitude,longitude
0,1000001,Seoul,Yongsan-gu,True,Itaewon Clubs,72,37.538621,126.992652
1,1000002,Seoul,Guro-gu,True,Guro-gu Call Center,98,37.508163,126.884387
2,1000003,Seoul,Dongdaemun-gu,True,Dongan Church,20,37.592888,127.056766
3,1000004,Seoul,Guro-gu,True,Manmin Central Church,41,37.481059,126.894343
4,1000005,Seoul,Eunpyeong-gu,True,Eunpyeong St. Mary's Hospital,14,37.63369,126.9165
5,1000006,Seoul,Seongdong-gu,True,Seongdong-gu APT,13,37.55713,127.0403
6,1000007,Seoul,Jongno-gu,True,Jongno Community Center,10,37.57681,127.006
7,1000008,Seoul,Jung-gu,True,Jung-gu Fashion Company,7,37.562405,126.984377
8,1000009,Seoul,from other city,True,Shincheonji Church,8,-,-
9,1000010,Seoul,-,False,overseas inflow,321,-,-


## Renaming columns

- Rename a single column of a DataFrame: `df.withColumnRenamed("A", "B")`

In [8]:
# Rename one column; every other column is left as it is.
old_name = "infection_case"
new_name = 'infection_source'
cases = cases.withColumnRenamed(old_name, new_name)

In [9]:
# Check the header after renaming infection_case to infection_source.
cases.show(n=5)

+-------+--------+-------------+-----+--------------------+---------+---------+----------+
|case_id|province|         city|group|    infection_source|confirmed| latitude| longitude|
+-------+--------+-------------+-----+--------------------+---------+---------+----------+
|1000001|   Seoul|   Yongsan-gu| true|       Itaewon Clubs|       72|37.538621|126.992652|
|1000002|   Seoul|      Guro-gu| true| Guro-gu Call Center|       98|37.508163|126.884387|
|1000003|   Seoul|Dongdaemun-gu| true|       Dongan Church|       20|37.592888|127.056766|
|1000004|   Seoul|      Guro-gu| true|Manmin Central Ch...|       41|37.481059|126.894343|
|1000005|   Seoul| Eunpyeong-gu| true|Eunpyeong St. Mar...|       14| 37.63369|  126.9165|
+-------+--------+-------------+-----+--------------------+---------+---------+----------+
only showing top 5 rows



In [11]:
# Assign all column names at once; toDF takes one name per existing column,
# in order.
cases = cases.toDF(
    'case_id',
    'province',
    'city',
    'group',
    'infection_source',
    'confirmed',
    'latitude',
    'longitude',
)

In [12]:
# Preview the first five rows with the column names set via toDF.
cases.show(n=5)

+-------+--------+-------------+-----+--------------------+---------+---------+----------+
|case_id|province|         city|group|    infection_source|confirmed| latitude| longitude|
+-------+--------+-------------+-----+--------------------+---------+---------+----------+
|1000001|   Seoul|   Yongsan-gu| true|       Itaewon Clubs|       72|37.538621|126.992652|
|1000002|   Seoul|      Guro-gu| true| Guro-gu Call Center|       98|37.508163|126.884387|
|1000003|   Seoul|Dongdaemun-gu| true|       Dongan Church|       20|37.592888|127.056766|
|1000004|   Seoul|      Guro-gu| true|Manmin Central Ch...|       41|37.481059|126.894343|
|1000005|   Seoul| Eunpyeong-gu| true|Eunpyeong St. Mar...|       14| 37.63369|  126.9165|
+-------+--------+-------------+-----+--------------------+---------+---------+----------+
only showing top 5 rows



- Selecting columns: `df.select("col1", "col2", ...)`

In [13]:
# Keep only the three columns of interest; `cases` is rebound to the
# narrowed DataFrame.
selected_columns = ['province', 'city', 'infection_source']
cases = cases.select(*selected_columns)

In [14]:
# Preview the first six rows of the current `cases` DataFrame.
cases.show(n=6)

+--------+-------------+--------------------+
|province|         city|    infection_source|
+--------+-------------+--------------------+
|   Seoul|   Yongsan-gu|       Itaewon Clubs|
|   Seoul|      Guro-gu| Guro-gu Call Center|
|   Seoul|Dongdaemun-gu|       Dongan Church|
|   Seoul|      Guro-gu|Manmin Central Ch...|
|   Seoul| Eunpyeong-gu|Eunpyeong St. Mar...|
|   Seoul| Seongdong-gu|    Seongdong-gu APT|
+--------+-------------+--------------------+
only showing top 6 rows



- Sorting rows by column values

- `df.sort("col1", ascending=True).show()` (pass `ascending=False` for descending order)

In [19]:
# Re-read the full Case table before sorting: `cases` was narrowed to three
# columns above and no longer has a `confirmed` column, so this cell would
# fail on a fresh Restart & Run All without the reload.
cases = spark.read.load("coronavirusdataset/Case.csv", format="csv", sep=",", inferSchema="true", header="true")

# Show the rows with the highest confirmed counts first.
cases.sort("confirmed", ascending=False).show()

+-------+-----------------+---------------+-----+--------------------+---------+---------+----------+
|case_id|         province|           city|group|      infection_case|confirmed| latitude| longitude|
+-------+-----------------+---------------+-----+--------------------+---------+---------+----------+
|1200001|            Daegu|         Nam-gu| true|  Shincheonji Church|     4510| 35.84008|  128.5667|
|1200008|            Daegu|              -|false|contact with patient|      929|        -|         -|
|1200009|            Daegu|              -|false|                 etc|      724|        -|         -|
|6000001| Gyeongsangbuk-do|from other city| true|  Shincheonji Church|      566|        -|         -|
|1000010|            Seoul|              -|false|     overseas inflow|      321|        -|         -|
|2000007|      Gyeonggi-do|              -|false|     overseas inflow|      225|        -|         -|
|1200002|            Daegu|   Dalseong-gun| true|Second Mi-Ju Hosp...|      196|35