In [1]:
# Task 4: Infection Rate Analysis
# Using worldometer_data:
# 1. Confirmed cases per 1000 population.
# 2. Active cases per 1000 population.
# 3. Top 10 countries by infection rate.
# 4. WHO region infection ranking.

In [42]:
## Import Statements

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [32]:
spark = SparkSession.builder.appName("Infection Rate Analysis").master("yarn").getOrCreate()

26/02/19 16:30:51 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [33]:
spark

In [34]:
worldometer_data = spark.read.parquet("hdfs:///data/covid/staging/worldometer_data")

                                                                                

### 1. Confirmed cases per 1000 population.

In [41]:
cases_per_population = worldometer_data.select("Country/Region","Population","TotalCases")

In [53]:
cases_per_population = cases_per_population.withColumn("Confirmed Per 1000",
                                                      round((col("TotalCases")/col("Population")) * 1000, 2)
                                                      )

In [54]:
cases_per_population.show()

+--------------+----------+----------+------------------+
|Country/Region|Population|TotalCases|Confirmed Per 1000|
+--------------+----------+----------+------------------+
|           USA| 331198130|   5032179|             15.19|
|        Brazil| 212710692|   2917562|             13.72|
|         India|1381344997|   2025409|              1.47|
|        Russia| 145940924|    871894|              5.97|
|  South Africa|  59381566|    538184|              9.06|
|        Mexico| 129066160|    462690|              3.58|
|          Peru|  33016319|    455409|             13.79|
|         Chile|  19132514|    366671|             19.16|
|      Colombia|  50936262|    357710|              7.02|
|         Spain|  46756648|    354530|              7.58|
|          Iran|  84097623|    320117|              3.81|
|            UK|  67922029|    308134|              4.54|
|  Saudi Arabia|  34865919|    284226|              8.15|
|      Pakistan| 221295851|    281863|              1.27|
|    Banglades

In [75]:
cases_per_population.write.mode("overwrite").parquet("hdfs:///data/covid/analytics/cases_per_population")

                                                                                

### 2. Active cases per 1000 population.

In [44]:
active_cases_per_population = worldometer_data.select("Country/Region","Population","ActiveCases")

In [55]:
active_cases_per_population = active_cases_per_population.withColumn("Confirmed Per 1000",
                                                      round((col("ActiveCases")/col("Population")) * 1000,2)
                                                      )

In [56]:
active_cases_per_population.show()

+--------------+----------+-----------+------------------+
|Country/Region|Population|ActiveCases|Confirmed Per 1000|
+--------------+----------+-----------+------------------+
|           USA| 331198130|    2292707|              6.92|
|        Brazil| 212710692|     771258|              3.63|
|         India|1381344997|     606387|              0.44|
|        Russia| 145940924|     180931|              1.24|
|  South Africa|  59381566|     141264|              2.38|
|        Mexico| 129066160|     103325|               0.8|
|          Peru|  33016319|     124648|              3.78|
|         Chile|  19132514|      16614|              0.87|
|      Colombia|  50936262|     153416|              3.01|
|         Spain|  46756648|          0|               0.0|
|          Iran|  84097623|      24678|              0.29|
|            UK|  67922029|          0|               0.0|
|  Saudi Arabia|  34865919|      34082|              0.98|
|      Pakistan| 221295851|      19770|              0.0

In [76]:
active_cases_per_population.write.mode("overwrite").parquet("hdfs:///data/covid/analytics/active_cases_per_population")

### 3. Top 10 countries by infection rate.

In [None]:
top10_countries_infection_rate = cases_per_population.orderBy(col("confirmed Per 1000").desc()).limit(10)

In [None]:
top10_countries_infection_rate.show()

+--------------+----------+----------+------------------+
|Country/Region|Population|TotalCases|Confirmed Per 1000|
+--------------+----------+----------+------------------+
|         Qatar|   2807805|    112092|             39.92|
| French Guiana|    299385|      8127|             27.15|
|       Bahrain|   1706669|     42889|             25.13|
|    San Marino|     33938|       699|              20.6|
|         Chile|  19132514|    366671|             19.16|
|        Panama|   4321282|     71418|             16.53|
|        Kuwait|   4276658|     70045|             16.38|
|          Oman|   5118446|     80713|             15.77|
|           USA| 331198130|   5032179|             15.19|
|  Vatican City|       801|        12|             14.98|
+--------------+----------+----------+------------------+



In [None]:
top10_countries_infection_rate.write.mode("overwrite").parquet("hdfs:///data/covid/analytics/top10_countries_infection_rate")

                                                                                

### 4. WHO region infection ranking.

In [35]:
full_grouped = spark.read.parquet("hdfs:///data/covid/staging/full_grouped")

                                                                                

In [36]:
data = worldometer_data.join(full_grouped,'Country/Region','inner')

In [37]:
who_region_infection_rate = data.groupBy("WHO region").agg(
    sum("Population").alias("Total Population"),
    sum("TotalCases").alias("Total Cases")
)

In [38]:
who_region_infection_rate = who_region_infection_rate.withColumn(
    "Region Infection Rate Per 1000", 
    round((col("Total Cases")/col("Total Population"))*1000,2)
).orderBy(col("Region Infection Rate Per 1000").desc())

In [53]:
who_region_infection_rate = who_region_infection_rate.withColumn(
    "Rank",
    dense_rank().over(
        Window.orderBy(col("Region Infection Rate Per 1000").desc())
    )
)

In [54]:
who_region_infection_rate.show()

26/02/19 16:39:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/19 16:39:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/19 16:39:24 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/19 16:39:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/19 16:39:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/19 16:39:25 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/19 1

+--------------------+----------------+-----------+------------------------------+----+
|          WHO region|Total Population|Total Cases|Region Infection Rate Per 1000|Rank|
+--------------------+----------------+-----------+------------------------------+----+
|            Americas|    129272394560| 1019057008|                          7.88|   1|
|              Europe|    161644476648|  608356720|                          3.76|   2|
|Eastern Mediterra...|    134794734948|  287757124|                          2.13|   3|
|     South-East Asia|    365296447104|  456189896|                          1.25|   4|
|              Africa|    187377861752|  154672112|                          0.83|   5|
|     Western Pacific|     82634815132|   46674384|                          0.56|   6|
+--------------------+----------------+-----------+------------------------------+----+



In [55]:
who_region_infection_rate.write.mode("overwrite").parquet("hdfs:///data/covid/analytics/who_region_infection_rate")

26/02/19 16:39:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/19 16:39:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/19 16:39:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/19 16:39:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/19 16:39:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/19 16:39:46 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/19 1

In [56]:
spark.stop()