In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, max as spark_max, min as spark_min, month, date_format, desc




In [5]:
spark = SparkSession.builder.master("local[*]").appName("CovidDataAnalysis").getOrCreate()


In [23]:
df = spark.read.csv("D:/Dataset/covid.csv", header=True, inferSchema=True)
df.show()

+----------+------------------+--------+---------+---------------------+-----+-------------------------+---------+----------+-------------+
|      Date|Name of State / UT|Latitude|Longitude|Total Confirmed cases|Death|Cured/Discharged/Migrated|New cases|New deaths|New recovered|
+----------+------------------+--------+---------+---------------------+-----+-------------------------+---------+----------+-------------+
|2020-01-30|            Kerala| 10.8505|  76.2711|                  1.0|    0|                      0.0|        0|         0|            0|
|2020-01-31|            Kerala| 10.8505|  76.2711|                  1.0|    0|                      0.0|        0|         0|            0|
|2020-02-01|            Kerala| 10.8505|  76.2711|                  2.0|    0|                      0.0|        1|         0|            0|
|2020-02-02|            Kerala| 10.8505|  76.2711|                  3.0|    0|                      0.0|        1|         0|            0|
|2020-02-03|        

In [7]:
df = df.withColumn("Name of State / UT", lower(col("Name of State / UT")))

In [25]:
df.show()

+----------+------------------+--------+---------+---------------------+-----+-------------------------+---------+----------+-------------+
|      Date|Name of State / UT|Latitude|Longitude|Total Confirmed cases|Death|Cured/Discharged/Migrated|New cases|New deaths|New recovered|
+----------+------------------+--------+---------+---------------------+-----+-------------------------+---------+----------+-------------+
|2020-01-30|            Kerala| 10.8505|  76.2711|                  1.0|    0|                      0.0|        0|         0|            0|
|2020-01-31|            Kerala| 10.8505|  76.2711|                  1.0|    0|                      0.0|        0|         0|            0|
|2020-02-01|            Kerala| 10.8505|  76.2711|                  2.0|    0|                      0.0|        1|         0|            0|
|2020-02-02|            Kerala| 10.8505|  76.2711|                  3.0|    0|                      0.0|        1|         0|            0|
|2020-02-03|        

Day with the Greatest Number of COVID Cases

In [9]:

day_greatest_cases = df.orderBy(desc("Total Confirmed cases")).select("Date", "Total Confirmed cases").first()

print(f"Day with the greatest number of COVID cases: {day_greatest_cases['Date']} with {day_greatest_cases['Total Confirmed cases']} cases")


Day with the greatest number of COVID cases: 2020-08-06 with 468265.0 cases


In [15]:

state_cases = df.groupBy("Name of State / UT").agg(spark_max("Total Confirmed cases").alias("TotalCases"))
second_largest_state = state_cases.orderBy(desc("TotalCases")).take(2)[1]

print(f"State with the second largest number of COVID cases: {second_largest_state['Name of State / UT']} with {second_largest_state['TotalCases']} cases")


State with the second largest number of COVID cases: tamil nadu with 273460.0 cases


the state/UT with the lowest ratio

In [17]:

df = df.withColumn("DeathToCaseRatio", col("Death") / col("Total Confirmed cases"))
state_lowest_ratio = df.orderBy("DeathToCaseRatio").select("Name of State / UT", "DeathToCaseRatio").first()

print(f"State/UT with the lowest Death to Confirmed Cases ratio: {state_lowest_ratio['Name of State / UT']} with a ratio of {state_lowest_ratio['DeathToCaseRatio']}")


State/UT with the lowest Death to Confirmed Cases ratio: puducherry with a ratio of None


Month with the Most Newer Recovered Cases

In [21]:
df = df.withColumn("Month", date_format(col("Date"), "MM"))
month_recovered = df.groupBy("Month").agg(spark_max("New recovered").alias("Total Recovered"))

# Find the month with the maximum recovered cases
max_recovered_month = month_recovered.orderBy(desc("Total Recovered")).first()

# Convert the month number to the month name
import calendar
month_name = calendar.month_name[int(max_recovered_month['Month'])]

print(f"Month with the most newer recovered cases: {month_name} with {max_recovered_month['Total Recovered']} recovered cases")

Month with the most newer recovered cases: July with 13401 recovered cases
