In [4]:
import findspark
import requests
from pathlib import Path
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [5]:
# Iniciar findspark
findspark.init()

# Crear una sesión de Spark
spark = SparkSession.builder \
    .appName("Prueba") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "True") \
    .getOrCreate()
spark

24/09/17 21:36:13 WARN Utils: Your hostname, andres-b460mds3h resolves to a loopback address: 127.0.1.1; using 192.168.1.76 instead (on interface enp3s0)
24/09/17 21:36:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/17 21:36:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Definir URL y nombre del archivo CSV
url = ("https://raw.githubusercontent.com/owid/covid-19-data/master/"
       "public/data/owid-covid-data.csv")
csv_file_name = 'owid-covid-data.csv'

# Descargar y guardar el archivo CSV
response = requests.get(url)
response.raise_for_status()
Path(csv_file_name).write_bytes(response.content)

# Leer el archivo CSV en un DataFrame de Spark
df = spark.read.csv(csv_file_name, header=True, inferSchema=True)

# Convertir la columna de fecha
df = df.withColumn('date', F.to_date(F.col('date')))

# Mostrar el esquema del DataFrame y estadísticas
df.printSchema()
df.describe().show()

24/09/17 21:36:25 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

root
 |-- iso_code: string (nullable = true)
 |-- continent: string (nullable = true)
 |-- location: string (nullable = true)
 |-- date: date (nullable = true)
 |-- total_cases: integer (nullable = true)
 |-- new_cases: integer (nullable = true)
 |-- new_cases_smoothed: double (nullable = true)
 |-- total_deaths: integer (nullable = true)
 |-- new_deaths: integer (nullable = true)
 |-- new_deaths_smoothed: double (nullable = true)
 |-- total_cases_per_million: double (nullable = true)
 |-- new_cases_per_million: double (nullable = true)
 |-- new_cases_smoothed_per_million: double (nullable = true)
 |-- total_deaths_per_million: double (nullable = true)
 |-- new_deaths_per_million: double (nullable = true)
 |-- new_deaths_smoothed_per_million: double (nullable = true)
 |-- reproduction_rate: double (nullable = true)
 |-- icu_patients: integer (nullable = true)
 |-- icu_patients_per_million: double (nullable = true)
 |-- hosp_patients: integer (nullable = true)
 |-- hosp_patients_per_mil

24/09/17 21:36:36 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 4:>                                                          (0 + 1) / 1]

+-------+--------+-------------+-----------+--------------------+------------------+------------------+-----------------+------------------+-------------------+-----------------------+---------------------+------------------------------+------------------------+----------------------+-------------------------------+------------------+------------------+------------------------+------------------+-------------------------+---------------------+---------------------------------+----------------------+----------------------------------+-------------------+-----------------+------------------------+----------------------+------------------+-------------------------------+-------------------+-----------------+-------------+-------------------+--------------------+-----------------------+--------------------+------------------+-------------------------+------------------------------+-----------------------------+-----------------------------------+--------------------------+-----------------

                                                                                

In [7]:
# Filtrar y mostrar datos para los Estados Unidos y México,
# ordenados por fecha descendente
df.filter(F.col('location').isin("United States", "Mexico")) \
    .orderBy(F.desc("date")) \
    .show()

+--------+-------------+-------------+----------+-----------+---------+------------------+------------+----------+-------------------+-----------------------+---------------------+------------------------------+------------------------+----------------------+-------------------------------+-----------------+------------+------------------------+-------------+-------------------------+---------------------+---------------------------------+----------------------+----------------------------------+-----------+---------+------------------------+----------------------+------------------+-------------------------------+-------------+--------------+-----------+------------------+-----------------+-----------------------+--------------+----------------+-------------------------+------------------------------+-----------------------------+-----------------------------------+--------------------------+-------------------------------------+------------------------------+-------------------------

In [8]:
# Agrupar por 'location', sumar 'new_cases' y ordenar de acuerdo con las suma
df.groupBy("location") \
    .agg(F.sum("new_cases").alias("total_new_cases")) \
    .orderBy(F.desc("total_new_cases")) \
    .show(truncate=False)

+-----------------------------+---------------+
|location                     |total_new_cases|
+-----------------------------+---------------+
|World                        |775935057      |
|High-income countries        |429044052      |
|Asia                         |301564180      |
|Europe                       |252916868      |
|Upper-middle-income countries|251756125      |
|European Union (27)          |185822587      |
|North America                |124492698      |
|United States                |103436829      |
|China                        |99373219       |
|Lower-middle-income countries|92019711       |
|South America                |68811012       |
|India                        |45041748       |
|France                       |38997490       |
|Germany                      |38437756       |
|Brazil                       |37511921       |
|South Korea                  |34571873       |
|Japan                        |33803572       |
|Italy                        |26781078 

In [None]:
# Detener la sesión de Spark
spark.stop()