In [1]:
#INSTALANDO O PYSPARK
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 38 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 45.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=d1c70fbe826190632a2d6866357c4acc21c0b01ecc42a082dde329801a65e84a
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


In [54]:
#IMPORTANDO BIBLIOTECAS
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [4]:
#CRIANDO UMA CONEXAO COM O SPARK
spark = (
    SparkSession.builder
      .master('local')
      .appName('dataframe_withcolumn')
      .config('spark.ui.port', '4050')
      .getOrCreate()
)

In [6]:
#TESTANDO A CONEXAO COM O SPARK
spark

In [161]:
#iMPORTANDO O DATAFRAME
df = (spark
       .read
       .format("csv")
       .option("header", "false")
       .option("inferschema", "false")
       .option("delimiter", ",")
       .load('/content/drive/MyDrive/covid_north_america.csv')
)

In [162]:
#TROCANDO O CABEÇALHO DO DATAFRAME PORQUE VEIO UMA PORCARIA!
df1 = df.select(F.col("_c0").alias("pais"),
               F.col("_c1").alias("casosTotais"),
               F.col("_c2").alias("obitosTotais"),
               F.col("_c3").alias("recuperTotais"),
               F.col("_c4").alias("casosAtivos"),
               F.col("_c5").alias("casosTotaisPop"),
               F.col("_c6").alias("mortesPop"),
               F.col("_c4").alias("testesTotais"),
               F.col("_c5").alias("testesPop"),
               F.col("_c6").alias("populacao"),
)

In [164]:
#CONVERTENDO O TIPO DE DADOS PARA UMA FORMA CONSISTENTE, EM UM NOVO
df1 = (df1.select(F.col("pais").cast("string"),
                  F.col("casosTotais").cast("int"),
                  F.col("obitosTotais").cast("int"),
                  F.col("recuperTotais").cast("int"),
                  F.col("casosAtivos").cast("int"),
                  F.col("casosTotaisPop").cast("int"),
                  F.col("mortesPop").cast("int"),
                  F.col("testesTotais").cast("int"),
                  F.col("testesPop").cast("int"),
                  F.col("populacao").cast("int")
                  )
)
         

In [165]:
#CONFERINDO A CONVERSAO
df1.printSchema()

root
 |-- pais: string (nullable = true)
 |-- casosTotais: integer (nullable = true)
 |-- obitosTotais: integer (nullable = true)
 |-- recuperTotais: integer (nullable = true)
 |-- casosAtivos: integer (nullable = true)
 |-- casosTotaisPop: integer (nullable = true)
 |-- mortesPop: integer (nullable = true)
 |-- testesTotais: integer (nullable = true)
 |-- testesPop: integer (nullable = true)
 |-- populacao: integer (nullable = true)



In [174]:
#CRIANDO UMA VIEW DO DATAFRAME
df2 = df1.createOrReplaceTempView("covid_america")

In [167]:
#VISUALIZANDO A VIEW
spark.sql("""SELECT * FROM covid_america""").show()

+--------------------+-----------+------------+-------------+-----------+--------------+---------+------------+---------+---------+
|                pais|casosTotais|obitosTotais|recuperTotais|casosAtivos|casosTotaisPop|mortesPop|testesTotais|testesPop|populacao|
+--------------------+-----------+------------+-------------+-----------+--------------+---------+------------+---------+---------+
|       Country/Other|       null|        null|         null|       null|          null|     null|        null|     null|     null|
|            Anguilla|        944|           1|          832|        111|         62183|       66|         111|    62183|       66|
| Antigua and Barbuda|       4058|         102|         3726|        230|         40987|     1030|         230|    40987|     1030|
|               Aruba|      15925|         171|        15577|        177|        148320|     1593|         177|   148320|     1593|
|             Bahamas|      22351|         643|        21079|        629|   

In [168]:
#PESQUISANDO PAISES QUE HOUVERAM MENOS DE 50000 RECUPERACOES TOTAIS, ORDENANDO A PARTIR DOS QUE TIVERAM MAIS RECUPERACOES
spark.sql("""
  SELECT pais, casosTotais, recuperTotais 
  FROM covid_america 
  WHERE recuperTotais < 50000 
  ORDER BY recuperTotais DESC;
""").show()

+--------------------+-----------+-------------+
|                pais|casosTotais|recuperTotais|
+--------------------+-----------+-------------+
|              Belize|      26798|        23706|
|             Bahamas|      22351|        21079|
|               Haiti|      23960|        20345|
|             Curaçao|      17084|        16798|
|               Aruba|      15925|        15577|
|         Saint Lucia|      12559|        11959|
|            Barbados|      17763|        11506|
|             Grenada|       5840|         5519|
|             Bermuda|       5647|         5462|
|        Sint Maarten|       4494|         4389|
|            Dominica|       4823|         4337|
|           Nicaragua|      16422|         4225|
| Antigua and Barbuda|       4058|         3726|
|St. Vincent Grena...|       4995|         3108|
|    Turks and Caicos|       2980|         2902|
|British Virgin Is...|       2725|         2649|
|Saint Kitts and N...|       2669|         2511|
|Caribbean Netherl..