In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Start the Spark application for this notebook; getOrCreate() reuses an
# already-running session instead of creating a second one.
session_builder = SparkSession.builder.appName('Exercise 1')
spark = session_builder.getOrCreate()

In [3]:
# Load household_power_consumption.txt into a dataframe with an explicit schema:
# the two leading columns stay as strings, every measurement column is a float.
# All fields are nullable.
text_fields = ["Date", "Time"]
measurement_fields = [
    "Global_active_power",
    "Global_reactive_power",
    "Voltage",
    "Global_intensity",
    "Sub_metering_1",
    "Sub_metering_2",
    "Sub_metering_3",
]
schema = StructType(
    [StructField(name, StringType(), True) for name in text_fields]
    + [StructField(name, FloatType(), True) for name in measurement_fields]
)

# The file is read as CSV with a header row and ';' as the field separator.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("delimiter", ";")
    .schema(schema)
    .load("household_power_consumption.txt")
)

In [5]:
# Calculate and print the min, max and count for each of the selected columns.
# All aggregates are requested in a single agg() call so the data is scanned
# by one Spark job instead of one job per column.
columns = ['Global_active_power', 'Global_reactive_power', 'Voltage', 'Global_intensity']

aggregations = []
for column in columns:
    aggregations.extend([
        F.min(column).alias(f'{column}__min'),
        F.max(column).alias(f'{column}__max'),
        # count(col) only counts rows where the column is not null
        F.count(column).alias(f'{column}__count'),
    ])

# A global aggregation (no groupBy) yields exactly one row holding every statistic.
stats = df.agg(*aggregations).first()

for column in columns:
    min_value = stats[f'{column}__min']
    max_value = stats[f'{column}__max']
    count_value = stats[f'{column}__count']
    print(f'-- {column} --')
    print(f"min: {min_value}")
    print(f"max: {max_value}")
    print(f"count: {count_value}\n")

-- Global_active_power --
min: 0.07599999755620956
max: 11.121999740600586
count: 2049280

-- Global_reactive_power --
min: 0.0
max: 1.3899999856948853
count: 2049280

-- Voltage --
min: 223.1999969482422
max: 254.14999389648438
count: 2049280

-- Global_intensity --
min: 0.20000000298023224
max: 48.400001525878906
count: 2049280

