In [1]:
# Task 3: Death Percentage Analysis
# Using full_grouped.csv:
# 1. Compute daily death percentage per country: Deaths / Confirmed * 100
# 2. Compute global daily death percentage.
# 3. Compute continent-wise death percentage (join with worldometer_data).
# Identify:
# 1. Country with highest death percentage
# 2. Top 10 countries by deaths per capita
# All results must be written to HDFS under /data/covid/analytics.


In [2]:
## Import Statements

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [3]:
spark = SparkSession.builder.appName("Death Percentage Analysis").master("yarn").getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/02/19 20:24:57 WARN Utils: Your hostname, Parashus-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 10.118.16.230 instead (on interface en0)
26/02/19 20:24:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/19 20:24:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/02/19 20:24:59 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [4]:
spark

In [5]:
full_grouped = spark.read.parquet("hdfs:///data/covid/staging/full_grouped")

                                                                                

## 1. Compute daily death percentage per country: Deaths / Confirmed * 100

In [6]:
full_grouped = full_grouped.withColumn("Death Percentage",
                                       when(col("Confirmed") != 0,
                                        round((col("Deaths") / col("Confirmed")) * 100,2)
                                        ).otherwise(0)
                                      ).orderBy(col("Death Percentage"))


In [7]:
daily_death_percentage_country = full_grouped.select("Date","Country/Region","Death Percentage") 

In [8]:
daily_death_percentage_country.show()

[Stage 1:>                                                          (0 + 1) / 1]

+----------+-------------------+----------------+
|      Date|     Country/Region|Death Percentage|
+----------+-------------------+----------------+
|2020-01-22|        Afghanistan|             0.0|
|2020-01-22|            Albania|             0.0|
|2020-01-22|            Algeria|             0.0|
|2020-01-22|            Andorra|             0.0|
|2020-01-22|             Angola|             0.0|
|2020-01-22|Antigua and Barbuda|             0.0|
|2020-01-22|          Argentina|             0.0|
|2020-01-22|            Armenia|             0.0|
|2020-01-22|          Australia|             0.0|
|2020-01-22|            Austria|             0.0|
|2020-01-22|         Azerbaijan|             0.0|
|2020-01-22|            Bahamas|             0.0|
|2020-01-22|            Bahrain|             0.0|
|2020-01-22|         Bangladesh|             0.0|
|2020-01-22|           Barbados|             0.0|
|2020-01-22|            Belarus|             0.0|
|2020-01-22|            Belgium|             0.0|


                                                                                

In [9]:
daily_death_percentage_country.write.mode("overwrite").parquet("hdfs:///data/covid/analytics/daily_death_percentage_country")

                                                                                

## 2. Compute global daily death percentage.

In [10]:
global_deaths = full_grouped.groupBy("Date").agg(
    sum("Confirmed").alias("Global Confirmed"),
    sum("Deaths").alias("Global Deaths")
)

In [11]:
global_deaths = global_deaths.withColumn(
    "daily death percentage",
    round(
        when(col("Global Confirmed") != 0,
             (col("Global Deaths") / col("Global Confirmed")) * 100
        ).otherwise(0),
        2
    )
)

In [12]:
global_deaths.show()

+----------+----------------+-------------+----------------------+
|      Date|Global Confirmed|Global Deaths|daily death percentage|
+----------+----------------+-------------+----------------------+
|2020-07-24|        15791645|       639650|                  4.05|
|2020-04-30|         3268876|       234704|                  7.18|
|2020-03-07|          105312|         3553|                  3.37|
|2020-03-13|          146008|         5406|                   3.7|
|2020-02-04|           23898|          492|                  2.06|
|2020-02-15|           68765|         1666|                  2.42|
|2020-05-23|         5322253|       343385|                  6.45|
|2020-02-12|           46561|         1118|                   2.4|
|2020-05-08|         3941935|       276304|                  7.01|
|2020-05-24|         5417579|       346525|                   6.4|
|2020-06-04|         6647861|       392218|                   5.9|
|2020-04-29|         3185195|       228742|                  7

In [13]:
global_deaths.write.mode("overwrite").parquet("hdfs:///data/covid/analytics/global_deaths")

## 3. Compute continent-wise death percentage (join with worldometer_data).

In [14]:
worldometer_data = spark.read.parquet("hdfs:///data/covid/staging/worldometer_data")

In [15]:
full_worldometer = full_grouped.join(worldometer_data,
                  "Country/Region","inner")

In [16]:
continent_wise = full_worldometer.groupBy("Continent").agg(
    sum("Confirmed").alias("Continent Confirmed"),
    sum("Deaths").alias("Continent Deaths")
)

In [17]:
continent_wise = continent_wise.withColumn("Death Percentage",
                                           when(col("Continent Confirmed") != 0,
                                                round((col("Continent Deaths")/col("Continent Confirmed"))*100,2)
                                               ).otherwise(0)
                                          )

In [18]:
continent_wise.show()

+-----------------+-------------------+----------------+----------------+
|        Continent|Continent Confirmed|Continent Deaths|Death Percentage|
+-----------------+-------------------+----------------+----------------+
|           Europe|          194318876|        14689301|            7.56|
|           Africa|           26916623|          682067|            2.53|
|Australia/Oceania|            1139677|           13570|            1.19|
|    North America|           33282136|         2666043|            8.01|
|    South America|          144631846|         5681838|            3.93|
|             Asia|          155784969|         3882965|            2.49|
+-----------------+-------------------+----------------+----------------+



In [19]:
continent_wise.write.mode("overwrite").parquet("hdfs:///data/covid/analytics/continent_wise")

### 1. Country with highest death percentage

In [20]:
country_wise = full_grouped.groupBy("Country/Region").agg(
    sum("Confirmed").alias("Country Confirmed"),
    sum("Deaths").alias("Country Deaths")
)

In [21]:
country_wise = country_wise.withColumn("Death Percentage",
                                       when(col("Country Confirmed") != 0,
                                            round((col("Country Deaths")/col("Country Confirmed"))*100,2)
                                           ).otherwise(0)
                                      )

In [22]:
country_wise = country_wise.orderBy(col("Death Percentage").desc()).limit(1)

In [23]:
country_wise.show()

+--------------+-----------------+--------------+----------------+
|Country/Region|Country Confirmed|Country Deaths|Death Percentage|
+--------------+-----------------+--------------+----------------+
|         Yemen|            67180|         17707|           26.36|
+--------------+-----------------+--------------+----------------+



In [24]:
country_wise.write.mode("overwrite").parquet("hdfs:///data/covid/analytics/country_wise")

### 2. Top 10 countries by deaths per capita

In [25]:
country_data = full_worldometer.groupBy("Country/Region").agg(sum("Deaths").alias("Total Deaths"))

In [28]:
death_per_capita = country_data.join(worldometer_data.select("Country/Region", "Population"),
    "Country/Region",
    "inner")

In [30]:
death_per_capita = death_per_capita.withColumn("Death per capita(1000)",
                                               when(col("Population") != 0,
                                                   round(col("Total Deaths")/col("Population")*1000,2)
                                               ).otherwise(0))

In [32]:
death_per_capita = death_per_capita.orderBy(col("Death per capita(1000)").desc()).limit(10)

In [33]:
death_per_capita.show()

+--------------+------------+----------+----------------------+
|Country/Region|Total Deaths|Population|Death per capita(1000)|
+--------------+------------+----------+----------------------+
|    San Marino|        5086|     33938|                149.86|
|       Belgium|      963679|  11594739|                 83.11|
|       Andorra|        5423|     77278|                 70.18|
|         Spain|     3033030|  46756648|                 64.87|
|         Italy|     3707717|  60452568|                 61.33|
|        France|     3048524|  65288306|                 46.69|
|        Sweden|      448913|  10105596|                 44.42|
|   Netherlands|      622314|  17138756|                 36.31|
|       Ireland|      161948|   4943200|                 32.76|
|   Switzerland|      207858|   8660952|                  24.0|
+--------------+------------+----------+----------------------+



In [34]:
death_per_capita.write.mode("overwrite").parquet("hdfs:///data/covid/analytics/death_per_capita")

                                                                                

In [35]:
spark.stop()