In [1]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Functions used below

def username():
    """Get username with any domain information removed.
    """

    return re.sub('@.*', '', getpass.getuser())


def dict_to_html(d):
    """Convert a Python dictionary into a two column table for display.
    """

    html = []

    html.append(f'<table width="100%" style="width:100%; font-family: monospace;">')
    for k, v in d.items():
        html.append(f'<tr><td style="text-align:left;">{k}</td><td>{v}</td></tr>')
    html.append(f'</table>')

    return ''.join(html)


def show_as_html(df, n=20):
    """Leverage existing pandas jupyter integration to show a spark dataframe as html.
    
    Args:
        n (int): number of rows to show (default: 20)
    """

    display(df.limit(n).toPandas())

    
def display_spark():
    """Display the status of the active Spark session if one is currently running.
    """
    
    if 'spark' in globals() and 'sc' in globals():

        name = sc.getConf().get("spark.app.name")
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'<li><a href="{sc.uiWebUrl}" target="_blank">Spark Application UI</a></li>',
            f'</ul>',
            f'<p><b>Config</b></p>',
            dict_to_html(dict(sc.getConf().getAll())),
            f'<p><b>Notes</b></p>',
            f'<ul>',
            f'<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
            f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))
        
    else:
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username() + " (jupyter)"}</code> is under the completed applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).
    
    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): worker memory (default: 1)
        master_memory (float): master memory (default: 1)
    """

    global spark
    global sc

    user = username()
    
    cores = executor_instances * executor_cores
    partitions = cores * 4
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .master("spark://masternode2:7077")
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{user}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.executor.memory", f"{worker_memory}g")
        .config("spark.driver.memory", f"{master_memory}g")
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.ui.port", str(port))
        .appName(user + " (jupyter)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()
    
    display_spark()

    
def stop_spark():
    """Stop the active Spark session and delete globals for SparkSession (spark) and SparkContext (sc).
    """

    global spark
    global sc

    if 'spark' in globals() and 'sc' in globals():

        spark.stop()

        del spark
        del sc

    display_spark()


# Make css changes to improve spark output readability

html = [
    '<style>',
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
    '</style>',
]
display(HTML(''.join(html)))

In [2]:
# Run this cell to start a spark session in this notebook

start_spark(executor_instances=4, executor_cores=2, worker_memory=4, master_memory=4)

0,1
spark.dynamicAllocation.enabled,false
spark.executor.instances,4
spark.ui.port,4567
spark.driver.memory,4g
spark.executor.memory,4g
spark.master,spark://masternode2:7077
spark.executor.id,driver
spark.app.id,app-20240501233317-0578
spark.executor.cores,2
spark.driver.host,mathmadslinux2p.canterbury.ac.nz


In [3]:
# Write your imports and code here or insert cells below

from pyspark.sql import Row, DataFrame, Window, functions as F
from pyspark.sql.types import *

# Question 3

### (a)Recall the hdfs commands that you used to explore the data in Processing Q1. You would have used
###    hdfs dfs -ls [path]
###    hdfs dfs -du [path]
### to determine the size of files under a specific path in HDFS.

In [4]:
#a)
! hdfs dfs -ls /data/ghcnd/daily

Found 263 items
-rw-r--r--   8 jsw93 supergroup     517706 2024-03-18 23:56 /data/ghcnd/daily/1750.csv.gz
-rw-r--r--   8 jsw93 supergroup       3358 2024-03-18 23:57 /data/ghcnd/daily/1763.csv.gz
-rw-r--r--   8 jsw93 supergroup       3327 2024-03-18 23:54 /data/ghcnd/daily/1764.csv.gz
-rw-r--r--   8 jsw93 supergroup       3335 2024-03-18 23:54 /data/ghcnd/daily/1765.csv.gz
-rw-r--r--   8 jsw93 supergroup       3344 2024-03-18 23:49 /data/ghcnd/daily/1766.csv.gz
-rw-r--r--   8 jsw93 supergroup       3356 2024-03-18 23:56 /data/ghcnd/daily/1767.csv.gz
-rw-r--r--   8 jsw93 supergroup       3325 2024-03-18 23:53 /data/ghcnd/daily/1768.csv.gz
-rw-r--r--   8 jsw93 supergroup       3418 2024-03-18 23:54 /data/ghcnd/daily/1769.csv.gz
-rw-r--r--   8 jsw93 supergroup       3357 2024-03-18 23:56 /data/ghcnd/daily/1770.csv.gz
-rw-r--r--   8 jsw93 supergroup       3373 2024-03-18 23:56 /data/ghcnd/daily/1771.csv.gz
-rw-r--r--   8 jsw93 supergroup       3419 2024-03-18 23:55 /data/ghcnd/d

Use the following command
hdfs getconf -confKey "dfs.blocksize"
to determine the default blocksize of HDFS

In [5]:
! hdfs getconf -confKey "dfs.blocksize"

134217728


### How many blocks are required for the daily
### climate summaries for the year 2024? What about the year 2023? What are the individual
### block sizes for the year 2023?

In [6]:
!hdfs dfs -du -h /data/ghcnd/daily/2023.csv.gz
!hdfs dfs -du -h /data/ghcnd/daily/2024.csv.gz

158.7 M  1.2 G  /data/ghcnd/daily/2023.csv.gz
26.2 M  209.8 M  /data/ghcnd/daily/2024.csv.gz


In [7]:
!hdfs fsck /data/ghcnd/daily/2024.csv.gz -files -blocks -locations

Connecting to namenode via http://masternode2:9870/fsck?ugi=uwi14&files=1&blocks=1&locations=1&path=%2Fdata%2Fghcnd%2Fdaily%2F2024.csv.gz
FSCK started by uwi14 (auth:SIMPLE) from /192.168.40.11 for path /data/ghcnd/daily/2024.csv.gz at Mon Apr 22 20:39:57 NZST 2024

/data/ghcnd/daily/2024.csv.gz 27492832 bytes, replicated: replication=8, 1 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1074057694_316881 len=27492832 Live_repl=8  [DatanodeInfoWithStorage[192.168.40.133:9866,DS-1395ddcb-8542-431a-8b4e-48509cab5de9,DISK], DatanodeInfoWithStorage[192.168.40.183:9866,DS-d547a5f3-b56f-45ab-81b8-e9d492a4e1d3,DISK], DatanodeInfoWithStorage[192.168.40.173:9866,DS-81ea4712-59d5-45a7-8d68-848697c8fac8,DISK], DatanodeInfoWithStorage[192.168.40.106:9866,DS-39db1648-d1af-450f-999c-277feea9beeb,DISK], DatanodeInfoWithStorage[192.168.40.159:9866,DS-93ea3ab4-4367-4d25-8560-df00f71281a3,DISK], DatanodeInfoWithStorage[192.168.40.102:9866,DS-33686d75-da76-41d8-97de-5c93688abb7a,DISK], 

In [10]:
!hdfs fsck /data/ghcnd/daily/2023.csv.gz -files -blocks -locations

Connecting to namenode via http://masternode2:9870/fsck?ugi=uwi14&files=1&blocks=1&locations=1&path=%2Fdata%2Fghcnd%2Fdaily%2F2023.csv.gz
FSCK started by uwi14 (auth:SIMPLE) from /192.168.40.11 for path /data/ghcnd/daily/2023.csv.gz at Sun Mar 31 23:39:15 NZDT 2024

/data/ghcnd/daily/2023.csv.gz 166367488 bytes, replicated: replication=8, 2 block(s):  OK
0. BP-700027894-132.181.129.68-1626517177804:blk_1074057666_316853 len=134217728 Live_repl=8  [DatanodeInfoWithStorage[192.168.40.126:9866,DS-6f3fc9d3-f21f-4098-9664-a781e15f486c,DISK], DatanodeInfoWithStorage[192.168.40.142:9866,DS-18323328-25b7-46b9-beea-a2d0a1ab6423,DISK], DatanodeInfoWithStorage[192.168.40.180:9866,DS-8012b6d5-226a-489a-bfa7-a2e221b8d1ff,DISK], DatanodeInfoWithStorage[192.168.40.158:9866,DS-60dfaa6a-f7a4-4d6c-8e44-70a79274b7af,DISK], DatanodeInfoWithStorage[192.168.40.116:9866,DS-49765458-974a-4728-a1a5-09ee775a1b0f,DISK], DatanodeInfoWithStorage[192.168.40.105:9866,DS-995c6d40-6fd7-4f6e-8812-148b6aac3e9f,DISK]

A HDFS block's default size is 134,217,728 bytes, or 128 megabytes. The compressed file size of the daily climate summary 2024 is 27,492,832 bytes (26.2 MB), which is smaller than the size of one HDFS block. So, 1 HDFS block is adequate for the daily climate summary in 2024.
The file size of the daily climate summary 2023 is 166,367,488 bytes, which exceeds the HDFS block capacity. So, the daily climate summary 2023 will require two blocks, with the first block being 134,217,728 bytes (128 MB) and the second block with the size of 32,149,760bytes (30.99MB).


### (b) Load and count the number of observations in 2023 and then separately in 2024.
### How many tasks were executed by each stage of each job?

In [15]:
#b)
# Load the data for 2023
data_2023 = spark.read.csv("/data/ghcnd/daily/2023.csv.gz")

# Count the number of observations for each year
count_2023 = data_2023.count()

print(f"Number of observations in 2023: {count_2023}")



Number of observations in 2023: 37395852


In [4]:
data_2024 = spark.read.csv("/data/ghcnd/daily/2024.csv.gz")
count_2024 = data_2024.count()
print(f"Number of observations in 2024: {count_2024}")

Number of observations in 2024: 6061827


In [12]:
num_partitions_2023 = data_2023.rdd.getNumPartitions()
num_partitions_2024 = data_2024.rdd.getNumPartitions()

print(f"Number of partitions in 2023 data: {num_partitions_2023}")
print(f"Number of partitions in 2024 data: {num_partitions_2024}")

Number of partitions in 2023 data: 1
Number of partitions in 2024 data: 1


### Did the number of tasks executed correspond to the number of blocks in each input?

The number of observations for daily climate summary in 2023 is 37,395,852 and in 2024 it is 6,061,827. When counting observations in 2023 and 2024, each job had two stages, each with a single task. Also, each year's files had one partition. This implies that, although having two blocks for the Daily climate summary 2023, there was only one task, and the Daily climate summary 2024 which has one block also has one task in each stage. 

### (c) Load and count the total number of observations in the years from 2014 to 2023 (inclusive). Note that you can use glob patterns in the path argument of the read command. Now how many tasks were executed by each stage, and how does this number correspond to your input?

In [19]:
years = list(range(2014, 2024)) 
paths = [f"/data/ghcnd/daily/{year}.csv.gz" for year in years]

data_2014_2023 = spark.read.option("header", "true").csv(paths)
count = data_2014_2023 .count()
print(f"Total number of observations from 2014 to 2023: {count}")

Total number of observations from 2014 to 2023: 369419055


In [23]:
print(f"Number of partitions: {data_2014_2023.rdd.getNumPartitions()}")

Number of partitions: 10


From the year 2014 to 2023, there are total 10 files and 369,419,055 observations in total. The number of partitions for these input data is 10 which is equal to the number of distinct files. So, in each stage there could be 10 tasks in total. 

### Explain how Spark partitions input files that are compressed.


When Spark reads a compressed input file, the way it partitions the data depends on the type of compression used. For compression formats like bzip2 that allow partitioning, Spark can create a task for each compressed block, allowing for parallel processing. This means that even if the data is compressed, Spark can use all available resources in the cluster to read and process the data simultaneously.

On the other hand, for compression formats that don't allow partitioning, like gzip, Spark treats the entire compressed file as a single block, resulting in a single task processing it. This limits parallelism because even if your cluster has significant resources, Spark can only use one task to process the entire compressed file.

Thus, the choice of compression format can greatly affect the performance of the Spark application. If the priority is to maximize parallelism, a partitionable compression format might be the best option. But the focus is on conserving storage space or reducing network bandwidth, a non-partitionable compression format could be more beneficial.

### (d) Based on parts (b) and (c), how many tasks do you think would run in parallel when loading and applying transformations to all of daily? Can you think of any practical way you could increase this either in Spark or changing how the data is stored in HDFS?

The daily climate summaries for 2023 and 2024 should ideally require 2 and 1 HDFS blocks, suggesting that Spark would execute 2 and 1 tasks when reading these files. However, since the files are compressed in .gz format, each file is treated as a single block. This results in Spark processing each file with just one task, regardless of its original block count. 

To boost parallelism when processing large datasets, we can increase the number of partitions at each stage. This leads to more tasks running simultaneously, enhancing processing speed. By explicitly setting a higher number of partitions when reading data, Spark creates additional blocks, allowing more tasks to run in parallel. Using partitionable compression formats, like bzip2, supports this approach, as Spark can assign a task to each block within these compressed files, maximizing parallelism. Adjusting Spark's configuration can also increase parallelism. This can be done by increasing the number of tasks per core or adding more cores to the Spark application, allowing more tasks to run concurrently.


In [17]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI

stop_spark()