In [1]:
# Task 6: Global Time-Series Analysis
# Using day_wise.csv:
# 1. Global daily average new cases.
# 2. Detect spike days using Z-score.
# 3. Identify peak death date globally.
# 4. Month-over-Month death growth rate.

In [82]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [83]:
spark = SparkSession.builder.appName("Global Time Series").master("yarn").getOrCreate()
spark

26/02/20 15:27:50 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [84]:
day_wise = spark.read.parquet("hdfs:///data/covid/staging/day_wise")

                                                                                

### 1. Global daily average new cases.

In [85]:
daily_average_new_cases = day_wise.select("Date","New cases")

In [86]:
daily_average_new_cases = daily_average_new_cases.withColumn("Avg daily new cases",\
                                                             avg("New cases").over(Window.orderBy("Date")\
                                                                                   .rowsBetween(Window.unboundedPreceding, Window.currentRow)
                                                             ))

In [87]:
daily_average_new_cases.show()

26/02/20 15:28:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:28:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:28:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:28:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:28:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
[Stage 3:>                                                          (0 + 1) / 1]

+----------+---------+-------------------+
|      Date|New cases|Avg daily new cases|
+----------+---------+-------------------+
|2020-01-22|        0|                0.0|
|2020-01-23|       99|               49.5|
|2020-01-24|      287| 128.66666666666666|
|2020-01-25|      493|             219.75|
|2020-01-26|      684|              312.6|
|2020-01-27|      809|  395.3333333333333|
|2020-01-28|     2651|  717.5714285714286|
|2020-01-29|      588|            701.375|
|2020-01-30|     2068|  853.2222222222222|
|2020-01-31|     1693|              937.2|
|2020-02-01|     2111|  1043.909090909091|
|2020-02-02|     4749| 1352.6666666666667|
|2020-02-03|     3100|  1487.076923076923|
|2020-02-04|     4011|  1667.357142857143|
|2020-02-05|     3745| 1805.8666666666666|
|2020-02-06|     3159|          1890.4375|
|2020-02-07|     3532|             1987.0|
|2020-02-08|     2734|             2028.5|
|2020-02-09|     3027| 2081.0526315789475|
|2020-02-10|     2538|             2103.9|
+----------

                                                                                

In [94]:
daily_average_new_cases.write.mode("overwrite").parquet("hdfs:///data/covid/analytics/daily_average_new_cases")

26/02/20 15:29:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:29:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:29:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:29:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:29:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


### 2. Detect spike days using Z-score.

In [97]:
window_spec = Window.rowsBetween(Window.unboundedPreceding,Window.unboundedFollowing)
zscore_spike_days = day_wise.select(
    "Date","New cases"
).withColumn("mean", round(avg("New cases").over(window_spec), 2)) \
       .withColumn("stddev", round(stddev("New cases").over(window_spec),2)) \
       .withColumn("Z_score",
                   round((col("New cases") - col("mean")) / col("stddev"), 2))

In [98]:
zscore_spike_days = zscore_spike_days.filter(col("Z_score") > 2)

In [99]:
zscore_spike_days.show()

26/02/20 15:29:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:29:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:29:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:29:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:29:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:29:36 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 1

+----------+---------+--------+--------+-------+
|      Date|New cases|    mean|  stddev|Z_score|
+----------+---------+--------+--------+-------+
|2020-07-16|   252544|87771.02|75295.29|   2.19|
|2020-07-17|   242038|87771.02|75295.29|   2.05|
|2020-07-22|   280647|87771.02|75295.29|   2.56|
|2020-07-23|   282756|87771.02|75295.29|   2.59|
|2020-07-24|   281164|87771.02|75295.29|   2.57|
|2020-07-25|   255545|87771.02|75295.29|   2.23|
+----------+---------+--------+--------+-------+



In [100]:
zscore_spike_days.write.mode("overwrite").parquet("hdfs:///data/covid/analytics/zscore_spike_days")

26/02/20 15:29:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:29:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:29:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:29:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:29:51 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 15:29:52 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 1

### 3. Identify peak death date globally.

In [101]:
peak_death_day_globally = day_wise.select("Date","Deaths").orderBy(col("Deaths").desc()).limit(1)
peak_death_day_globally.show()

+----------+------+
|      Date|Deaths|
+----------+------+
|2020-07-27|654036|
+----------+------+



In [102]:
peak_death_day_globally.write.mode("overwrite").parquet("hdfs:///data/covid/analytics/peak_death_day_globally")

### 4. Month-over-Month death growth rate.

In [127]:
monthly_death_growth_rate = day_wise.select("Date","Deaths","New deaths")

In [128]:
monthly_death_growth_rate = monthly_death_growth_rate.withColumn("Month",month(col("Date")))\
.withColumn("Year",year(col("Date")))

In [129]:
monthly_death_growth_rate = monthly_death_growth_rate.groupBy(
    "Year","Month").agg(sum("New deaths")\
    .alias("Monthly Deaths")).orderBy("Year", "Month")

In [130]:
monthly_death_growth_rate = monthly_death_growth_rate.withColumn(
    "Previous_Month_Deaths",
    lag("Monthly Deaths").over(Window.orderBy("Year", "Month"))
).withColumn("Monthly death growth rate",
                round((col("Monthly Deaths") - col("Previous_Month_Deaths")) / col("Previous_Month_Deaths") * 100,2)
               )

In [131]:
monthly_death_growth_rate.show()

26/02/20 16:11:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 16:11:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 16:11:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 16:11:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 16:11:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----+-----+--------------+---------------------+-------------------------+
|Year|Month|Monthly Deaths|Previous_Month_Deaths|Monthly death growth rate|
+----+-----+--------------+---------------------+-------------------------+
|2020|    1|           196|                 NULL|                     NULL|
|2020|    2|          2723|                  196|                  1289.29|
|2020|    3|         41542|                 2723|                   1425.6|
|2020|    4|        190226|                41542|                   357.91|
|2020|    5|        138902|               190226|                   -26.98|
|2020|    6|        137604|               138902|                    -0.93|
|2020|    7|        142826|               137604|                     3.79|
+----+-----+--------------+---------------------+-------------------------+



26/02/20 16:11:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 16:11:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 16:11:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/02/20 16:11:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [None]:
monthly_death_growth_rate

In [80]:
spark.stop()