In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean, stddev, skewness, kurtosis, sum

# Print per-column summary statistics (sum, mean, standard deviation, skewness,
# kurtosis) for every numerical column of a CSV dataset, using Spark.

# Step 1: Create a SparkSession
spark = SparkSession.builder \
    .appName("Summary Statistics") \
    .getOrCreate()

# The try/finally guarantees that the session is stopped (Step 5) even when
# loading the file or running an aggregation raises.
try:
    # Step 2: Load the dataset
    # NOTE(review): this is a relative path, so it presumably only resolves
    # when the notebook is started from the expected directory -- confirm.
    file_path = "Desktop/big data visualization/dataset/cleaned_transformed_dataset3.csv"
    data = spark.read.csv(file_path, header=True, inferSchema=True)

    # Step 3: Select numerical columns for analysis
    # DataFrame.dtypes reports Spark SQL type names. Checking only 'int' and
    # 'double' would silently skip columns typed as bigint, float, smallint,
    # tinyint or decimal(p,s), so accept every numeric type name.
    numeric_dtypes = ('tinyint', 'smallint', 'int', 'bigint', 'float', 'double')
    numerical_columns = [
        column
        for column, dtype in data.dtypes
        if dtype in numeric_dtypes or dtype.startswith('decimal')
    ]

    if not numerical_columns:
        print("No numerical columns found in the dataset.")
    else:
        # Step 4: Compute summary statistics for each numerical column
        for column in numerical_columns:
            print(f"Summary statistics for column: {column}")
            # `sum` is pyspark.sql.functions.sum (imported at the top of the
            # file, where it shadows the builtin); it builds a Column
            # aggregate expression rather than adding Python values.
            stats = data.select(
                sum(col(column)).alias("Sum"),
                mean(col(column)).alias("Mean"),
                stddev(col(column)).alias("Standard Deviation"),
                skewness(col(column)).alias("Skewness"),
                kurtosis(col(column)).alias("Kurtosis")
            )
            stats.show()
finally:
    # Step 5: Stop the Spark session
    spark.stop()


Summary statistics for column: temp
+--------------------+------------------+------------------+------------------+-----------------+
|                 Sum|              Mean|Standard Deviation|          Skewness|         Kurtosis|
+--------------------+------------------+------------------+------------------+-----------------+
|1.3550425069999697E7|281.20499450058514|13.338737965774865|-2.247409766876874|39.91830115846082|
+--------------------+------------------+------------------+------------------+-----------------+

Summary statistics for column: rain_1h
+----------------+------------------+------------------+------------------+-----------------+
|             Sum|              Mean|Standard Deviation|          Skewness|         Kurtosis|
+----------------+------------------+------------------+------------------+-----------------+
|16112.8600000002|0.3343818872310001| 44.79703269348381|219.34351945261943|48133.64919028831|
+----------------+------------------+------------------+--