In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Start (or reuse) the Spark session that backs this notebook
spark = (
    SparkSession.builder
    .appName("HumidAnalysis")
    .getOrCreate()
)

# Read the preprocessed CSV data from S3: the first row of each file is the
# header, and column types are inferred from the data

df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("s3://brazil-climate-analysis/preprocess_final/")
)

# Preview the first rows, then print the inferred schema
df.show()
df.printSchema()

+-------------------+------------+-----+----+-------------+--------------------+------------------------+------------------------+------------------+-----------+-------------+--------+--------+------------------+-----------------+------------+------------+--------+--------------+----------+------+-----------------+--------+------------+------+
|               date|station_code| time|  id|precipitation|atmospheric_pressure|max_atmospheric_pressure|min_atmospheric_pressure|         radiation|temperature|dewpoint_temp|max_temp|min_temp|max_dewpoint_temp_|min_dewpoint_temp|max_humidity|min_humidity|humidity|wind_direction|wind_speed|region|          station|latitude|   longitude|height|
+-------------------+------------+-----+----+-------------+--------------------+------------------------+------------------------+------------------+-----------+-------------+--------+--------+------------------+-----------------+------------+------------+--------+--------------+----------+------+----------

In [4]:
from pyspark.sql.functions import *

# Keep only the columns the humidity analysis needs. The date column is
# converted with to_date, and precipitation is given a descriptive name.

df_humid_analysis = df.select(
    to_date(col("date"), "yyyy-MM-dd").alias("date"),
    "time",
    "station_code",
    "humidity",
    "temperature",
    col("precipitation").alias("amount of precipitation"),
)

df_humid_analysis.show(20)

+----------+-----+------------+--------+-----------+-----------------------+
|      date| time|station_code|humidity|temperature|amount of precipitation|
+----------+-----+------------+--------+-----------+-----------------------+
|2001-04-08|00:00|        A601|    93.0|       24.4|                    0.0|
|2001-04-08|01:00|        A601|    93.0|       24.4|                    0.0|
|2001-04-08|02:00|        A601|    95.0|       23.5|                    0.0|
|2001-04-08|03:00|        A601|    95.0|       23.3|                    0.0|
|2001-04-08|04:00|        A601|    95.0|       23.6|                    0.0|
|2001-04-08|05:00|        A601|    94.0|       23.7|                    0.0|
|2001-04-08|06:00|        A601|    92.0|       23.8|                    0.0|
|2001-04-08|07:00|        A601|    94.0|       23.1|                    0.0|
|2001-04-08|08:00|        A601|    94.0|       23.3|                    0.0|
|2001-04-08|09:00|        A601|    93.0|       23.3|                    0.0|

In [7]:
# Collapse the readings to one row per date: mean humidity, mean temperature
# and TOTAL precipitation (a sum, not a mean), each rounded to 2 decimals.
# The Spark functions are called through the `F` namespace so this cell does
# not depend on a wildcard import having replaced the Python builtins
# `round` and `sum`.

daily_humid_analysis = df_humid_analysis.groupBy("Date").agg(
    F.round(F.mean("humidity"), 2).alias("average daily humidity"),
    F.round(F.mean("temperature"), 2).alias("average daily temperature"),
    F.round(F.sum("amount of precipitation"), 2).alias("daily precipitation"),
)

# Order the daily rows chronologically

daily_humid_analysis_sorted = daily_humid_analysis.orderBy("Date")

daily_humid_analysis_sorted.show(10)

+----------+----------------------+-------------------------+-------------------+
|      Date|average daily humidity|average daily temperature|daily precipitation|
+----------+----------------------+-------------------------+-------------------+
|2000-05-07|                 74.25|                    22.23|                0.0|
|2000-05-08|                 79.08|                    22.15|                0.0|
|2000-05-09|                 84.61|                    21.88|                1.2|
|2000-05-10|                 73.67|                    20.73|                0.0|
|2000-05-11|                 79.64|                    20.56|               12.4|
|2000-05-12|                 80.58|                    20.12|                1.2|
|2000-05-13|                 75.21|                    20.25|                0.0|
|2000-05-14|                 76.13|                    20.68|                0.0|
|2000-05-15|                 79.33|                    20.91|                0.0|
|2000-05-24|    

In [10]:
# Lowest min_humidity reading recorded on each date.
# Only the two columns the aggregation uses are projected, and the projection
# gets its own name so `min_daily_humidity` always refers to the aggregated
# frame.

min_humidity_readings = df.select(
    F.to_date("Date", "yyyy-MM-dd").alias("Date"),
    df["min_humidity"].alias("minimum humidity"),
)

# F.min is Spark's aggregate; calling it through `F` avoids relying on the
# Python builtin `min` being shadowed by a wildcard import.
min_daily_humidity = min_humidity_readings.groupBy("Date").agg(
    F.min("minimum humidity").alias("minimum daily humidity")
)

# Sort the result by Date
min_daily_humidity_sorted = min_daily_humidity.sort("Date")

# Show the sorted result
min_daily_humidity_sorted.show(25)

+----------+----------------------+
|      Date|minimum daily humidity|
+----------+----------------------+
|2000-05-07|                  67.0|
|2000-05-08|                  58.0|
|2000-05-09|                  68.0|
|2000-05-10|                  41.0|
|2000-05-11|                  49.0|
|2000-05-12|                  42.0|
|2000-05-13|                  44.0|
|2000-05-14|                  40.0|
|2000-05-15|                  40.0|
|2000-05-24|                  65.0|
|2000-05-25|                  34.0|
|2000-05-29|                  47.0|
|2000-05-30|                  44.0|
|2000-05-31|                  34.0|
|2000-06-01|                  67.0|
|2000-06-02|                  63.0|
|2000-06-03|                  58.0|
|2000-06-04|                  43.0|
|2000-06-05|                  39.0|
|2000-06-06|                  36.0|
|2000-06-07|                  38.0|
|2000-06-08|                  30.0|
|2000-06-09|                  33.0|
|2000-06-10|                  53.0|
|2000-06-11|                

In [11]:
# Highest max_humidity reading recorded on each date.
# Only the two columns the aggregation uses are projected, and the projection
# gets its own name so `max_daily_humidity` always refers to the aggregated
# frame.

max_humidity_readings = df.select(
    F.to_date("Date", "yyyy-MM-dd").alias("Date"),
    df["max_humidity"].alias("maximum humidity"),
)

# F.max is Spark's aggregate; calling it through `F` avoids relying on the
# Python builtin `max` being shadowed by a wildcard import.
max_daily_humidity = max_humidity_readings.groupBy("Date").agg(
    F.max("maximum humidity").alias("maximum daily humidity")
)

# Sort the result by Date
max_daily_humidity_sorted = max_daily_humidity.sort("Date")

# Show the sorted result
max_daily_humidity_sorted.show(25)

+----------+----------------------+
|      Date|maximum daily humidity|
+----------+----------------------+
|2000-05-07|                  87.0|
|2000-05-08|                  93.0|
|2000-05-09|                  97.0|
|2000-05-10|                  97.0|
|2000-05-11|                  96.0|
|2000-05-12|                  98.0|
|2000-05-13|                  96.0|
|2000-05-14|                  97.0|
|2000-05-15|                  98.0|
|2000-05-24|                  88.0|
|2000-05-25|                  97.0|
|2000-05-29|                  89.0|
|2000-05-30|                  96.0|
|2000-05-31|                  98.0|
|2000-06-01|                  96.0|
|2000-06-02|                  95.0|
|2000-06-03|                  97.0|
|2000-06-04|                  95.0|
|2000-06-05|                  97.0|
|2000-06-06|                  96.0|
|2000-06-07|                  78.0|
|2000-06-08|                  80.0|
|2000-06-09|                  88.0|
|2000-06-10|                  97.0|
|2000-06-11|                

In [14]:
# Put the daily maximum and minimum humidity side by side, matched on Date,
# and order the combined rows chronologically
minmax_humidity = (
    max_daily_humidity_sorted
    .join(min_daily_humidity_sorted, on="Date")
    .sort("Date")
)
minmax_humidity.show(25)

+----------+----------------------+----------------------+
|      Date|maximum daily humidity|minimum daily humidity|
+----------+----------------------+----------------------+
|2000-05-07|                  87.0|                  67.0|
|2000-05-08|                  93.0|                  58.0|
|2000-05-09|                  97.0|                  68.0|
|2000-05-10|                  97.0|                  41.0|
|2000-05-11|                  96.0|                  49.0|
|2000-05-12|                  98.0|                  42.0|
|2000-05-13|                  96.0|                  44.0|
|2000-05-14|                  97.0|                  40.0|
|2000-05-15|                  98.0|                  40.0|
|2000-05-24|                  88.0|                  65.0|
|2000-05-25|                  97.0|                  34.0|
|2000-05-29|                  89.0|                  47.0|
|2000-05-30|                  96.0|                  44.0|
|2000-05-31|                  98.0|                  34.

In [15]:
# Create the database that holds the result tables, stored at the given S3
# location. IF NOT EXISTS keeps this cell re-runnable: a plain CREATE DATABASE
# fails once humidity_db has already been created by an earlier run.
spark.sql("CREATE DATABASE IF NOT EXISTS humidity_db LOCATION 's3://brazil-climate-analysis/humidity_db'").show()

++
||
++
++

In [16]:
# Register the result DataFrames as temporary views so Spark SQL can read them
daily_humid_analysis_sorted.createOrReplaceTempView('daily_humidity_analysis')
minmax_humidity.createOrReplaceTempView('minmax_humidity_analysis')


# Persist both views as tables in humidity_db.
# Each table is dropped first (if present) so the cell can be re-run: a bare
# CREATE TABLE ... AS SELECT fails when the table already exists.
# NOTE: this replaces the contents written by any previous run.
spark.sql("use humidity_db")
spark.sql("drop table if exists minmax_humidity_tb")
spark.sql("create table minmax_humidity_tb as select * from minmax_humidity_analysis")
spark.sql("drop table if exists daily_humid_tb")
spark.sql("create table daily_humid_tb as select * from daily_humidity_analysis")

DataFrame[]