In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, lag, round, to_date, when
from pyspark.sql.window import Window

# Create Spark session

In [2]:
# Reuse an already-running session if one exists, otherwise start a new one
# registered under this application name.
spark = (
    SparkSession.builder
    .appName("COVID19 Data Analysis")
    .getOrCreate()
)


# Load dataset

In [3]:
# Read the CSV with its first row as column names and let Spark infer the
# column types from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("owid-covid-data.csv")
)

# Filter for relevant columns

In [4]:
# Keep only the four columns used downstream, restricted to the three
# selected locations and to rows that carry a date value.
df_filtered = (
    df.select("location", "date", "new_cases", "total_cases")
    .where(
        col("location").isin("India", "United States", "Brazil")
        & col("date").isNotNull()
    )
)

# Convert date column to proper type if needed

In [5]:
# Normalize `date` to a date column using the yyyy-MM-dd pattern.
# (`to_date` is imported with the other pyspark functions in the first cell.)
df_filtered = df_filtered.withColumn("date", to_date(col("date"), "yyyy-MM-dd"))

# Window specification

In [6]:
# One window per location, rows ordered by ascending date.
window_spec = Window.partitionBy(col("location")).orderBy(col("date").asc())

# Calculate daily growth rate in percent (new_cases / previous row's total_cases * 100)

In [7]:
# `prev_total_cases` is the previous row's cumulative total within each
# location (NULL on the first row of every location).
# The growth rate (in percent, rounded to 2 decimals) is only computed when
# that total is non-zero. A zero or NULL denominator yields NULL explicitly,
# instead of relying on Spark's non-ANSI divide-by-zero behaviour, which
# raises an error when ANSI mode is enabled.
df_with_growth = (
    df_filtered
    .withColumn("prev_total_cases", lag("total_cases", 1).over(window_spec))
    .withColumn(
        "daily_growth_rate",
        when(
            col("prev_total_cases") != 0,
            round((col("new_cases") / col("prev_total_cases")) * 100, 2),
        ),
    )
)

# Calculate 7-day moving average of new cases

In [8]:
# Frame of up to seven rows: the current row plus the six rows before it in
# the same location (fewer at the start of each partition).
moving_avg_window = window_spec.rowsBetween(-6, Window.currentRow)
df_with_moving_avg = df_with_growth.withColumn(
    "7_day_avg_new_cases",
    round(avg(col("new_cases")).over(moving_avg_window), 2),
)

# Show final results

In [9]:
# Preview the first 20 rows of the computed metrics.
(
    df_with_moving_avg
    .select(
        "location",
        "date",
        "new_cases",
        "daily_growth_rate",
        "7_day_avg_new_cases",
    )
    .show(n=20, truncate=True)
)

+--------+----------+---------+-----------------+-------------------+
|location|      date|new_cases|daily_growth_rate|7_day_avg_new_cases|
+--------+----------+---------+-----------------+-------------------+
|  Brazil|2020-01-05|        0|             NULL|                0.0|
|  Brazil|2020-01-06|        0|             NULL|                0.0|
|  Brazil|2020-01-07|        0|             NULL|                0.0|
|  Brazil|2020-01-08|        0|             NULL|                0.0|
|  Brazil|2020-01-09|        0|             NULL|                0.0|
|  Brazil|2020-01-10|        0|             NULL|                0.0|
|  Brazil|2020-01-11|        0|             NULL|                0.0|
|  Brazil|2020-01-12|        0|             NULL|                0.0|
|  Brazil|2020-01-13|        0|             NULL|                0.0|
|  Brazil|2020-01-14|        0|             NULL|                0.0|
|  Brazil|2020-01-15|        0|             NULL|                0.0|
|  Brazil|2020-01-16