In [1]:
#Definicao de Schema
from pyspark.sql.session import SparkSession

#Definindo os tipos de dados a serem trabalhados
from pyspark.sql.types import (ArrayType, BooleanType, FloatType, IntegerType, StringType, 
                               StructField, StructType, TimestampType)

import pyspark.sql.functions as F

# Create (or reuse) the local Spark session used by every cell below.
# The application name is a runtime string and is kept exactly as written.
spark = (
    SparkSession.builder.appName('fisrtSession')
    # Run locally with 4 worker threads.
    .config('spark.master', 'local[4]')
    # NOTE(review): with a local master the executor lives inside the driver
    # JVM, so this setting likely has no effect here - confirm.
    .config('spark.executor.memory', '1gb')
    # The shuffle-partition count for DataFrame operations is controlled by
    # `spark.sql.shuffle.partitions`; the previous key `spark.shuffle.partitions`
    # is not a Spark property and was silently ignored.
    .config('spark.sql.shuffle.partitions', 1)
    .getOrCreate()
)

# Explicit schema for the cases CSV; every field is nullable (the default).
# latitude/longitude are read as strings: the displayed data contains "-"
# placeholders in those columns.
schema = (
    StructType()
    .add('case_id', IntegerType())
    .add('province', StringType())
    .add('city', StringType())
    .add('group', BooleanType())
    .add('infection_case', StringType())
    .add('confirmed', IntegerType())
    .add('latitude', StringType())
    .add('longitude', StringType())
)

# Location of the input CSV, relative to the notebook's working directory.
path = "covid_cases.csv"

# Load the CSV with the explicit schema defined above (no schema inference).
df = spark.read.load(path, format='csv', schema=schema)

# Print the resulting column names and types as a sanity check.
df.printSchema()

root
 |-- case_id: integer (nullable = true)
 |-- province: string (nullable = true)
 |-- city: string (nullable = true)
 |-- group: boolean (nullable = true)
 |-- infection_case: string (nullable = true)
 |-- confirmed: integer (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)



In [2]:
# Give the `infection_case` column a Portuguese display name; `df` itself is
# left untouched because withColumnRenamed returns a new DataFrame.
cases1 = df.withColumnRenamed(
    existing='infection_case',
    new='Casos de Infecção',
)
# Show the first 20 rows with long values truncated (the defaults of show()).
cases1.show(n=20, truncate=True)

+-------+--------+---------------+-----+--------------------+---------+---------+----------+
|case_id|province|           city|group|   Casos de Infecção|confirmed| latitude| longitude|
+-------+--------+---------------+-----+--------------------+---------+---------+----------+
|1000001|   Seoul|     Yongsan-gu| true|       Itaewon Clubs|      139|37.538621|126.992652|
|1000002|   Seoul|      Gwanak-gu| true|             Richway|      119| 37.48208|126.901384|
|1000003|   Seoul|        Guro-gu| true| Guro-gu Call Center|       95|37.508163|126.884387|
|1000004|   Seoul|   Yangcheon-gu| true|Yangcheon Table T...|       43|37.546061|126.874209|
|1000005|   Seoul|      Dobong-gu| true|     Day Care Center|       43|37.679422|127.044374|
|1000006|   Seoul|        Guro-gu| true|Manmin Central Ch...|       41|37.481059|126.894343|
|1000007|   Seoul|from other city| true|SMR Newly Planted...|       36|        -|         -|
|1000008|   Seoul|  Dongdaemun-gu| true|       Dongan Church|       17

In [3]:
# Rename `infection_case` to its Portuguese label in a new DataFrame.
# NOTE(review): this performs the same rename on `df` as the cell that builds
# `cases1`; consider dropping one of the two if the duplication is unintended.
cases2 = df.withColumnRenamed(
    'infection_case',
    'Casos de Infecção',
)
cases2.show(20)

+-------+--------+---------------+-----+--------------------+---------+---------+----------+
|case_id|province|           city|group|   Casos de Infecção|confirmed| latitude| longitude|
+-------+--------+---------------+-----+--------------------+---------+---------+----------+
|1000001|   Seoul|     Yongsan-gu| true|       Itaewon Clubs|      139|37.538621|126.992652|
|1000002|   Seoul|      Gwanak-gu| true|             Richway|      119| 37.48208|126.901384|
|1000003|   Seoul|        Guro-gu| true| Guro-gu Call Center|       95|37.508163|126.884387|
|1000004|   Seoul|   Yangcheon-gu| true|Yangcheon Table T...|       43|37.546061|126.874209|
|1000005|   Seoul|      Dobong-gu| true|     Day Care Center|       43|37.679422|127.044374|
|1000006|   Seoul|        Guro-gu| true|Manmin Central Ch...|       41|37.481059|126.894343|
|1000007|   Seoul|from other city| true|SMR Newly Planted...|       36|        -|         -|
|1000008|   Seoul|  Dongdaemun-gu| true|       Dongan Church|       17

In [4]:
# Keep only the location columns and the confirmed-case count.
df2 = df.select(['province', 'city', 'confirmed'])
df2.show(n=20)

+--------+---------------+---------+
|province|           city|confirmed|
+--------+---------------+---------+
|   Seoul|     Yongsan-gu|      139|
|   Seoul|      Gwanak-gu|      119|
|   Seoul|        Guro-gu|       95|
|   Seoul|   Yangcheon-gu|       43|
|   Seoul|      Dobong-gu|       43|
|   Seoul|        Guro-gu|       41|
|   Seoul|from other city|       36|
|   Seoul|  Dongdaemun-gu|       17|
|   Seoul|from other city|       25|
|   Seoul|      Gwanak-gu|       30|
|   Seoul|   Eunpyeong-gu|       14|
|   Seoul|   Seongdong-gu|       13|
|   Seoul|      Jongno-gu|       10|
|   Seoul|     Gangnam-gu|        7|
|   Seoul|        Jung-gu|        7|
|   Seoul|   Seodaemun-gu|        5|
|   Seoul|      Jongno-gu|        7|
|   Seoul|     Gangnam-gu|        6|
|   Seoul|from other city|        1|
|   Seoul|   Geumcheon-gu|        6|
+--------+---------------+---------+
only showing top 20 rows



In [5]:
# Order all rows by confirmed-case count, largest first.
df3 = df.orderBy(F.col('confirmed').desc())
df3.show(n=20)

+-------+-----------------+---------------+-----+--------------------+---------+---------+----------+
|case_id|         province|           city|group|      infection_case|confirmed| latitude| longitude|
+-------+-----------------+---------------+-----+--------------------+---------+---------+----------+
|1200001|            Daegu|         Nam-gu| true|  Shincheonji Church|     4511| 35.84008|  128.5667|
|1200009|            Daegu|              -|false|contact with patient|      917|        -|         -|
|1200010|            Daegu|              -|false|                 etc|      747|        -|         -|
|6000001| Gyeongsangbuk-do|from other city| true|  Shincheonji Church|      566|        -|         -|
|2000020|      Gyeonggi-do|              -|false|     overseas inflow|      305|        -|         -|
|1000036|            Seoul|              -|false|     overseas inflow|      298|        -|         -|
|1200002|            Daegu|   Dalseong-gun| true|Second Mi-Ju Hosp...|      196|35