In [170]:
# Task 5: Recovery Efficiency
# 1. Recovered percentage per country.
# 2. 7-day rolling recovery average (Window function).
# 3. Country with fastest recovery growth.
# 4. Peak recovery day per country.
# Use Spark Window functions.

In [171]:
## Import Statements

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [172]:
# Create (or reuse) the Spark session named "Recovery Efficiency" on the YARN master.
spark = (
    SparkSession.builder
    .appName("Recovery Efficiency")
    .master("yarn")
    .getOrCreate()
)

26/02/20 15:10:56 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [173]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

### 1. Recovered percentage per country.

In [174]:
# Load the staged worldometer_data Parquet dataset from HDFS.
worldometer_data = spark.read.format("parquet").load(
    "hdfs:///data/covid/staging/worldometer_data"
)

                                                                                

In [175]:
# Keep only the columns needed for the recovered-percentage calculation.
recovered_percentage_country_wise = worldometer_data.select(
    "Country/Region",
    "TotalCases",
    "TotalRecovered",
)

In [176]:
# Share of total cases that have recovered, as a percentage rounded to 2 decimals.
# When TotalCases is 0 (or the comparison is null) the otherwise(0) branch is used,
# which avoids dividing by zero.
recovered_percentage_country_wise = recovered_percentage_country_wise.withColumn(
    "Recovered Percentage",
    when(
        col("TotalCases") != 0,
        round(col("TotalRecovered") / col("TotalCases") * 100, 2),
    ).otherwise(0),
)

In [177]:
# Preview the first 20 rows (show()'s defaults, spelled out).
recovered_percentage_country_wise.show(n=20, truncate=True)

[Stage 1:>                                                          (0 + 1) / 1]

+--------------+----------+--------------+--------------------+
|Country/Region|TotalCases|TotalRecovered|Recovered Percentage|
+--------------+----------+--------------+--------------------+
|           USA|   5032179|       2576668|                51.2|
|        Brazil|   2917562|       2047660|               70.18|
|         India|   2025409|       1377384|               68.01|
|        Russia|    871894|        676357|               77.57|
|  South Africa|    538184|        387316|               71.97|
|        Mexico|    462690|        308848|               66.75|
|          Peru|    455409|        310337|               68.14|
|         Chile|    366671|        340168|               92.77|
|      Colombia|    357710|        192355|               53.77|
|         Spain|    354530|             0|                 0.0|
|          Iran|    320117|        277463|               86.68|
|            UK|    308134|             0|                 0.0|
|  Saudi Arabia|    284226|        24708

                                                                                

In [178]:
# Persist the per-country recovered percentage, replacing any previous output.
recovered_percentage_country_wise.write.parquet(
    "hdfs:///data/covid/analytics/recovered_percentage_country_wise",
    mode="overwrite",
)

                                                                                

### 2. 7-day rolling recovery average (Window function).

In [179]:
# Load the staged day_wise Parquet dataset from HDFS.
day_wise = spark.read.format("parquet").load(
    "hdfs:///data/covid/staging/day_wise"
)

In [180]:
# Only the date and the recovered count are needed for the rolling average.
recovery_average_rolling_7days = day_wise.select(
    "Date",
    "Recovered",
)

In [181]:
# Trailing window over rows ordered by Date: the current row plus up to six rows before it.
# The earliest rows therefore average over fewer than seven values.
# No partitionBy is given, so Spark evaluates this window over a single partition.
trailing_7_rows = Window.orderBy("Date").rowsBetween(-6, Window.currentRow)
recovery_average_rolling_7days = recovery_average_rolling_7days.withColumn(
    "7 Day Rolling Recovery Average",
    round(avg("Recovered").over(trailing_7_rows), 2),
)

In [182]:
# Preview the first 20 rows of the rolling average.
recovery_average_rolling_7days.show(n=20, truncate=True)

26/02/20 15:11:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:11:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:11:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:11:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:11:16 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
[Stage 6:>                                                          (0 + 1) / 1]

+----------+---------+------------------------------+
|      Date|Recovered|7 Day Rolling Recovery Average|
+----------+---------+------------------------------+
|2020-01-22|       28|                          28.0|
|2020-01-23|       30|                          29.0|
|2020-01-24|       36|                         31.33|
|2020-01-25|       39|                         33.25|
|2020-01-26|       52|                          37.0|
|2020-01-27|       61|                          41.0|
|2020-01-28|      107|                         50.43|
|2020-01-29|      125|                         64.29|
|2020-01-30|      141|                         80.14|
|2020-01-31|      219|                        106.29|
|2020-02-01|      281|                        140.86|
|2020-02-02|      459|                         199.0|
|2020-02-03|      604|                        276.57|
|2020-02-04|      821|                        378.57|
|2020-02-05|     1071|                        513.71|
|2020-02-06|     1418|      

                                                                                

In [183]:
# Persist the rolling recovery average, replacing any previous output.
recovery_average_rolling_7days.write.parquet(
    "hdfs:///data/covid/analytics/recovery_average_rolling_7days",
    mode="overwrite",
)

26/02/20 15:11:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:11:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:11:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:11:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:11:17 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
                                                                                

### 3. Country with fastest recovery growth.

In [184]:
# Load the staged full_grouped Parquet dataset from HDFS and preview it.
full_grouped = spark.read.format("parquet").load(
    "hdfs:///data/covid/staging/full_grouped"
)
full_grouped.show(n=20, truncate=True)

+----------+-------------------+---------+------+---------+------+---------+----------+-------------+--------------------+
|      Date|     Country/Region|Confirmed|Deaths|Recovered|Active|New cases|New deaths|New recovered|          WHO Region|
+----------+-------------------+---------+------+---------+------+---------+----------+-------------+--------------------+
|2020-01-22|        Afghanistan|        0|     0|        0|     0|        0|         0|            0|Eastern Mediterra...|
|2020-01-22|            Albania|        0|     0|        0|     0|        0|         0|            0|              Europe|
|2020-01-22|            Algeria|        0|     0|        0|     0|        0|         0|            0|              Africa|
|2020-01-22|            Andorra|        0|     0|        0|     0|        0|         0|            0|              Europe|
|2020-01-22|             Angola|        0|     0|        0|     0|        0|         0|            0|              Africa|
|2020-01-22|Anti

In [185]:
# Narrow full_grouped to the columns carried into the recovery-growth calculation.
fastest_recovery_growth = full_grouped.select(
    "Date",
    "Country/Region",
    "Recovered",
    "New recovered",
)

In [186]:
# Day-over-day percentage change in "Recovered" for each country.
# The first row of each country has no previous day (lag is null), and a previous
# value of 0 would divide by zero; both cases take the otherwise(0) branch.
fastest_recovery_growth = (
    fastest_recovery_growth
    .withColumn(
        "Previous_Day_Recovery",
        lag("Recovered", 1).over(
            Window.partitionBy("Country/Region").orderBy("Date")
        ),
    )
    .withColumn(
        "Daily_Recovery_Growth",
        when(
            col("Previous_Day_Recovery") != 0,
            (col("Recovered") - col("Previous_Day_Recovery"))
            / col("Previous_Day_Recovery")
            * 100,
        ).otherwise(0),
    )
    # Replace remaining nulls in numeric columns (e.g. the first-day lag) with 0.
    .fillna(0)
)

In [187]:
# Average the daily growth per country and sort the highest average first.
fastest_recovery_growth = (
    fastest_recovery_growth
    .groupBy("Country/Region")
    .agg(avg("Daily_Recovery_Growth").alias("avg_growth"))
    .orderBy(desc("avg_growth"))
)

In [188]:
# Keep only the first row of the sorted averages, then print it.
fastest_recovery_growth = fastest_recovery_growth.limit(1)
fastest_recovery_growth.show(n=20, truncate=True)

+--------------+------------------+
|Country/Region|        avg_growth|
+--------------+------------------+
|       Belgium|157.27021708900037|
+--------------+------------------+



In [197]:
# Persist the fastest-recovery-growth result, replacing any previous output.
fastest_recovery_growth.write.parquet(
    "hdfs:///data/covid/analytics/fastest_recovery_growth",
    mode="overwrite",
)

                                                                                

### 4. Peak recovery day per country.

In [204]:
# Columns needed to find each country's peak "Recovered" value and its date.
peak_recovery_day = full_grouped.select(
    "Country/Region",
    "Date",
    "Recovered",
)

In [205]:
# Rank each country's rows from highest to lowest "Recovered".
# row_number() picks an arbitrary row when several dates share the same value,
# so "Date" is added as a tie-breaker: the earliest date at the maximum gets
# rank 1 and the result is reproducible across runs.
# NOTE(review): "Recovered" looks cumulative in this dataset, so the peak is
# usually the latest date; confirm whether "New recovered" was intended instead.
peak_recovery_day = peak_recovery_day.withColumn(
    "rank",
    row_number().over(
        Window.partitionBy("Country/Region").orderBy(
            col("Recovered").desc(),
            col("Date").asc(),
        )
    ),
)

In [206]:
# Keep the top-ranked row per country, then drop the helper "rank" column.
# The filter runs before the select so that "rank" is still part of the frame
# when it is referenced; filtering after the projection would depend on Spark's
# analyzer resolving a column the select has already removed.
peak_recovery_day = (
    peak_recovery_day
    .filter(col("rank") == 1)
    .select("Country/Region", "Date", "Recovered")
)

In [207]:
# Preview the first 20 per-country peak rows.
peak_recovery_day.show(n=20, truncate=True)

+-------------------+----------+---------+
|     Country/Region|      Date|Recovered|
+-------------------+----------+---------+
|        Afghanistan|2020-07-27|    25198|
|            Albania|2020-07-27|     2745|
|            Algeria|2020-07-27|    18837|
|            Andorra|2020-07-10|      803|
|             Angola|2020-07-25|      242|
|Antigua and Barbuda|2020-07-27|       65|
|          Argentina|2020-07-27|    72575|
|            Armenia|2020-07-27|    26665|
|          Australia|2020-07-27|     9311|
|            Austria|2020-07-27|    18246|
|         Azerbaijan|2020-07-27|    23242|
|            Bahamas|2020-07-13|       91|
|            Bahrain|2020-07-27|    36110|
|         Bangladesh|2020-07-27|   125683|
|           Barbados|2020-07-22|       94|
|            Belarus|2020-07-27|    60492|
|            Belgium|2020-07-27|    17452|
|             Belize|2020-07-24|       26|
|              Benin|2020-07-26|     1036|
|             Bhutan|2020-07-27|       86|
+----------

In [208]:
# Persist the per-country peak recovery day, replacing any previous output.
peak_recovery_day.write.parquet(
    "hdfs:///data/covid/analytics/peak_recovery_day",
    mode="overwrite",
)

                                                                                

In [209]:
# Stop the Spark session at the end of the notebook.
spark.stop()