# In [None]:
# Analysis and Visualization

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import matplotlib.pyplot as plt

# Initialize Spark session
spark = SparkSession.builder.appName("Analysis and Visualization").getOrCreate()

# Everything after session creation runs under try/finally so the session is
# stopped even when loading, aggregation, or plotting raises.
try:
    # Load the dataset. Only the "header" option is set (no schema inference),
    # so the reader yields every column as a string.
    df = spark.read.option("header", "true").csv(
        "hdfs:///user/hadoop/datasets/transformed_global_confirmed_cases.csv"
    )

    # Aggregation and analysis: total cases per country, largest first.
    # `sum`, `col` and `desc` are the Spark column functions brought in by the
    # star import above (`sum` is NOT the Python builtin here). "cases" is cast
    # explicitly so the numeric conversion is visible rather than relying on
    # Spark's implicit string-to-number coercion.
    aggregated_df = (
        df.groupBy("country")
        .agg(sum(col("cases").cast("double")).alias("total_cases"))
        .orderBy(desc("total_cases"))
    )

    # Convert to Pandas for visualization (one row per country is collected
    # to the driver).
    pandas_df = aggregated_df.toPandas()

    # Plot the data
    plt.figure(figsize=(10, 6))
    plt.barh(pandas_df["country"], pandas_df["total_cases"])
    # barh places the first row at the bottom; invert the axis so the country
    # with the most cases (first row after the descending sort) is on top.
    plt.gca().invert_yaxis()
    plt.xlabel("Total Cases")
    plt.ylabel("Country")
    plt.title("Total COVID-19 Cases by Country")
    plt.show()
finally:
    spark.stop()
