In [27]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Functions used below

def username():
    """Return the current login name with any '@domain' suffix stripped.
    """

    login = getpass.getuser()

    # Drop everything from the first '@' onwards, e.g. 'abc12@example.org' -> 'abc12'.
    return re.sub('@.*', '', login)


def dict_to_html(d):
    """Convert a Python dictionary into a two column HTML table for display.

    Keys and values are converted with str() and HTML-escaped, so entries
    containing characters such as '<', '>' or '&' are shown literally instead
    of being interpreted as markup.

    Args:
        d (dict): mapping to render, one table row per item in iteration order

    Returns:
        str: the HTML source of the table
    """

    # Imported locally because this notebook rebinds the global name `html` to a list.
    from html import escape

    rows = ['<table width="100%" style="width:100%; font-family: monospace;">']
    for k, v in d.items():
        rows.append(
            f'<tr><td style="text-align:left;">{escape(str(k))}</td>'
            f'<td>{escape(str(v))}</td></tr>'
        )
    rows.append('</table>')

    return ''.join(rows)


def show_as_html(df, n=20):
    """Show the first rows of a spark dataframe using the pandas jupyter html rendering.

    Args:
        df: spark dataframe to preview
        n (int): number of rows to show (default: 20)
    """

    # Only the first n rows are converted to pandas and displayed.
    preview = df.limit(n)
    display(preview.toPandas())

    
def display_spark():
    """Render an HTML panel describing whether a Spark session is currently active.

    The session counts as active when both the `spark` and `sc` globals exist;
    in that case the panel also lists the application config and UI links.
    """

    session_active = 'spark' in globals() and 'sc' in globals()

    if not session_active:
        # No session: point the user at the Spark UI to confirm the application has completed.
        stopped_parts = [
            '<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username() + " (jupyter)"}</code> is under the completed applications section in the Spark UI.</p>',
            '<ul>',
            '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            '</ul>',
        ]
        display(HTML(''.join(stopped_parts)))
        return

    name = sc.getConf().get("spark.app.name")

    active_parts = [
        '<p><b>Spark</b></p>',
        f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
        '<ul>',
        '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
        f'<li><a href="{sc.uiWebUrl}" target="_blank">Spark Application UI</a></li>',
        '</ul>',
        '<p><b>Config</b></p>',
        dict_to_html(dict(sc.getConf().getAll())),
        '<p><b>Notes</b></p>',
        '<ul>',
        '<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
        f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
        '</ul>',
    ]
    display(HTML(''.join(active_parts)))


# Functions to start and stop spark

def _memory_setting(gigabytes):
    """Format a memory size given in gigabytes as a Spark size string.

    Spark size strings must be a whole number followed by a unit, so whole
    gigabyte amounts become e.g. '4g' and fractional amounts are converted to
    whole megabytes, e.g. 1.5 -> '1536m'.
    """

    megabytes = int(round(float(gigabytes) * 1024))
    if megabytes % 1024 == 0:
        return f"{megabytes // 1024}g"
    return f"{megabytes}m"


def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).

    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): executor memory in gigabytes (default: 1)
        master_memory (float): driver memory in gigabytes (default: 1)
    """

    global spark
    global sc

    user = username()

    cores = executor_instances * executor_cores
    partitions = cores * 4  # four shuffle partitions per available core
    port = 4000 + random.randint(1, 999)  # random UI port in the range 4001-4999

    spark = (
        SparkSession.builder
        .master("spark://masternode2:7077")
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{user}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        # Fractional sizes such as 1.5 are converted to whole megabytes.
        .config("spark.executor.memory", _memory_setting(worker_memory))
        .config("spark.driver.memory", _memory_setting(master_memory))
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.ui.port", str(port))
        .appName(user + " (jupyter)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()

    display_spark()

    
def stop_spark():
    """Stop the active Spark session, if any, and remove the `spark` and `sc` globals.

    The session status panel is displayed afterwards in either case.
    """

    global spark, sc

    session_defined = 'spark' in globals() and 'sc' in globals()
    if session_defined:
        spark.stop()

        # Remove the globals so display_spark() reports the session as stopped.
        del spark, sc

    display_spark()


# Inject notebook-wide CSS so that spark output stays readable:
# preformatted text and dataframe cells are not wrapped, and the
# leading index column of rendered dataframes is hidden.

html = ['<style>']
html += [
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
]
html.append('</style>')
display(HTML(''.join(html)))

In [28]:
# Run this cell to start a spark session in this notebook
# (4 executors x 2 cores each, 4 GB for each executor and for the driver)

start_spark(
    executor_instances=4,
    executor_cores=2,
    worker_memory=4,
    master_memory=4,
)

0,1
spark.dynamicAllocation.enabled,false
spark.executor.instances,4
spark.driver.memory,4g
spark.ui.port,4336
spark.executor.memory,4g
spark.master,spark://masternode2:7077
spark.app.id,app-20240417095621-0908
spark.executor.id,driver
spark.app.startTime,1713304581271
spark.executor.cores,2


In [29]:
# Write your imports and code here or insert cells below

from pyspark.sql import Row, DataFrame, Window, functions as F
from pyspark.sql.types import *

In [30]:
# Show the size of each item saved in the HDFS outputs directory
!hdfs dfs -du -h /user/uwi14/outputs/ghcnd/

5.0 M    20.1 M  /user/uwi14/outputs/ghcnd/Final_stations_df.parquet
1.5 M    6.0 M   /user/uwi14/outputs/ghcnd/Modified_Inventory_df.parquet
256.1 K  1.0 M   /user/uwi14/outputs/ghcnd/avg_rainfall.csv
1.9 K    7.5 K   /user/uwi14/outputs/ghcnd/countries_df.csv
2.3 M    9.2 M   /user/uwi14/outputs/ghcnd/joined_countries_df.csv
2.3 M    9.2 M   /user/uwi14/outputs/ghcnd/joined_countries_state_df.csv
2.3 K    9.0 K   /user/uwi14/outputs/ghcnd/modified_countries_df.csv
865      3.4 K   /user/uwi14/outputs/ghcnd/modified_states_df.csv
2.1 M    8.3 M   /user/uwi14/outputs/ghcnd/nz_tmin_tmax.parquet
2.8 M    11.3 M  /user/uwi14/outputs/ghcnd/nz_tmin_tmax2.csv
7.5 K    29.8 K  /user/uwi14/outputs/ghcnd/sorted_stations_nz_df.csv
623      2.4 K   /user/uwi14/outputs/ghcnd/states_df.csv



<h3>(a)</h3> <h4>How many stations are there in total? How many stations were active so far in 2024?
How many stations are in each of the GCOS Surface Network (GSN), the US Historical
Climatology Network (HCN), and the US Climate Reference Network (CRN)? Are there
any stations that are in more than one of these networks?</h4>

In [31]:
# Load the stations table saved earlier, then inspect its schema and first rows
station_data = (
    spark.read
    .parquet("/user/uwi14/outputs/ghcnd/Final_stations_df.parquet")
)
station_data.printSchema()
show_as_html(station_data, n=10)

root
 |-- ID: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- CODE: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- ELEVATION: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- GSN_FLAG: string (nullable = true)
 |-- HCN_CRN_FLAG: string (nullable = true)
 |-- WMO_ID: string (nullable = true)
 |-- Country_Name: string (nullable = true)
 |-- State_Name: string (nullable = true)
 |-- FIRST_ACTIVE_YEAR: string (nullable = true)
 |-- LAST_ACTIVE_YEAR: string (nullable = true)
 |-- DIFFERENT_ELEMENTS_COUNT: long (nullable = true)
 |-- CORE_ELEMENTS_COUNT: long (nullable = true)
 |-- OTHER_ELEMENTS_COUNT: long (nullable = true)
 |-- COLLECTED_ELEMENTS: array (nullable = true)
 |    |-- element: string (containsNull = true)



Unnamed: 0,ID,STATE,CODE,LATITUDE,LONGITUDE,ELEVATION,NAME,GSN_FLAG,HCN_CRN_FLAG,WMO_ID,Country_Name,State_Name,FIRST_ACTIVE_YEAR,LAST_ACTIVE_YEAR,DIFFERENT_ELEMENTS_COUNT,CORE_ELEMENTS_COUNT,OTHER_ELEMENTS_COUNT,COLLECTED_ELEMENTS
0,AE000041196,,AE,25.333,55.517,34.0,SHARJAH INTER. AIRP,GSN,,41196.0,United Arab Emirates,,1944,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
1,AEM00041218,,AE,24.262,55.609,264.9,AL AIN INTL,,,41218.0,United Arab Emirates,,1994,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
2,AGE00147715,,AG,35.42,8.1197,863.0,TEBESSA,,,,Algeria,,1879,1938,3,3,,"[TMAX, TMIN, PRCP]"
3,AGE00147794,,AG,36.78,5.1,225.0,BEJAIA-CAP CARBON,,,,Algeria,,1926,1938,2,2,,"[TMAX, TMIN]"
4,AGM00060402,,AG,36.712,5.07,6.1,SOUMMAM,,,60402.0,Algeria,,����,2024,5,4,1.0,"[TMAX, TMIN, ����, PRCP, SNWD]"
5,AGM00060430,,AG,36.3,2.233,721.0,MILIANA,,,60430.0,Algeria,,1957,2024,5,4,1.0,"[TMAX, TMIN, PRCP, SNWD, TAVG]"
6,AGM00060461,,AG,35.7,-0.65,22.0,ORAN-PORT,,,60461.0,Algeria,,1995,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
7,AGM00060514,,AG,35.167,2.317,801.0,KSAR CHELLALA,,,60514.0,Algeria,,1995,2024,5,4,1.0,"[TMAX, TMIN, PRCP, SNWD, TAVG]"
8,AGM00060515,,AG,35.333,4.206,459.0,BOU SAADA,,,60515.0,Algeria,,1984,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
9,AGM00060550,,AG,33.667,1.0,1347.0,EL-BAYADH,,,60550.0,Algeria,,1973,2024,5,4,1.0,"[TMAX, TMIN, PRCP, SNWD, TAVG]"


In [6]:
# 1) Total number of stations: count the rows of the ID column
total_stations = station_data.select('ID').count()
print(f"Total Number of Stations: {total_stations}")

Total Number of Stations: 125983


In [7]:
# Stations active so far in 2024, i.e. whose LAST_ACTIVE_YEAR is 2024
active_stations_2024 = (
    station_data
    .where(station_data['LAST_ACTIVE_YEAR'] == 2024)
    .select('ID')
    .count()
)
print(f"Total Number of active stations in 2024: {active_stations_2024}")

Total Number of active stations in 2024: 31837


In [8]:
# How many stations are in each of the GCOS Surface Network (GSN), the US Historical
# Climatology Network (HCN), and the US Climate Reference Network (CRN)?
# GSN membership is read from GSN_FLAG; HCN and CRN share the HCN_CRN_FLAG column.

gsn_stations = (
    station_data
    .where(station_data['GSN_FLAG'] == 'GSN')
    .select('ID')
    .count()
)
hcn_stations = (
    station_data
    .where(station_data['HCN_CRN_FLAG'] == 'HCN')
    .select('ID')
    .count()
)
crn_stations = (
    station_data
    .where(station_data['HCN_CRN_FLAG'] == 'CRN')
    .select('ID')
    .count()
)
print(f"Total Number of stations in GSN: {gsn_stations}")
print(f"Total Number of stations in HCN: {hcn_stations}")
print(f"Total Number of stations in CRN: {crn_stations}")


Total Number of stations in GSN: 991
Total Number of stations in HCN: 1218
Total Number of stations in CRN: 234


In [9]:
# Are there any stations that are in more than one of these networks?
#
# HCN and CRN share the single HCN_CRN_FLAG column, so one station can never
# carry both flags; the only possible overlap is a GSN station that is also
# flagged as HCN or CRN.
more_than_one_stations = (
    station_data
    .filter(
        (station_data['GSN_FLAG'] == 'GSN')
        & station_data['HCN_CRN_FLAG'].isin('HCN', 'CRN')
    )
    .select('ID')
    .count()
)

print(f"Stations that are in more than one networks: {more_than_one_stations}")

Stations that are in more than one networks: 15


<h3>(b)</h3>
<h4>Count the total number of stations in each country, and join these counts onto countries
so that we can use these counts later if desired.
Do the same for states and save a copy of each table to your output directory.</h4>

In [10]:
# (b) Number of distinct station IDs per country code
country_station_counts_df = (
    station_data
    .groupBy('CODE')
    .agg(F.countDistinct('ID').alias('Station_Count'))
)
show_as_html(country_station_counts_df, n=10)

Unnamed: 0,CODE,Station_Count
0,TI,62
1,BB,1
2,CA,9188
3,MX,5249
4,SW,1721
5,NI,10
6,BG,10
7,MZ,19
8,UG,8
9,WQ,1


In [11]:
# Read the countries table saved earlier, using Spark's built-in csv data source
# (the old "com.databricks.spark.csv" name is only a legacy alias for it).
countries_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/user/uwi14/outputs/ghcnd/countries_df.csv")
)
countries_df.show(10)

+------------+--------------------+
|Country_Code|        Country_Name|
+------------+--------------------+
|          AC| Antigua and Barbuda|
|          AE|United Arab Emirates|
|          AF|         Afghanistan|
|          AG|             Algeria|
|          AJ|          Azerbaijan|
|          AL|             Albania|
|          AM|             Armenia|
|          AO|              Angola|
|          AQ|American Samoa [U...|
|          AR|           Argentina|
+------------+--------------------+
only showing top 10 rows



In [12]:
# Attach the station count to every country. A left join keeps countries that
# have no stations (their Station_Count is null).
#
# Station_Count is dropped first so that re-running this cell does not add a
# second, ambiguous Station_Count column; drop() is a no-op on the first run.
countries_df = (
    countries_df
    .drop("Station_Count")
    .join(
        country_station_counts_df.withColumnRenamed("CODE", "Country_Code"),
        on="Country_Code",
        how="left",
    )
)
show_as_html(countries_df, 5)

Unnamed: 0,Country_Code,Country_Name,Station_Count
0,AC,Antigua and Barbuda,2
1,AE,United Arab Emirates,4
2,AF,Afghanistan,4
3,AG,Algeria,82
4,AJ,Azerbaijan,66


In [14]:
# NOTE: later cells also rely on `col` being imported here.
from pyspark.sql.functions import col

# Sort by station count, largest first, and take the leading row
countries_by_count = countries_df.orderBy(col("Station_Count").desc())
top_country = countries_by_count.first()
print(f"The country with the highest station count is: {top_country['Country_Name']} with {top_country['Station_Count']} stations.")

The country with the highest station count is: United States with 72289 stations.


In [14]:
# Save the countries table (with station counts) as gzip-compressed CSV, replacing any previous copy
output_path_countries = "hdfs:///user/uwi14/outputs/ghcnd/modified_countries_df.csv"
(
    countries_df.write
    .mode("overwrite")
    .option("header", "true")
    .option("compression", "gzip")
    .csv(output_path_countries)
)

In [15]:
# Number of distinct station IDs per state code
states_station_counts_df = (
    station_data
    .groupBy('STATE')
    .agg(F.countDistinct('ID').alias('Station_Count'))
)
show_as_html(states_station_counts_df, n=10)

Unnamed: 0,STATE,Station_Count
0,NT,137
1,CA,3080
2,OK,1081
3,MN,2199
4,ND,574
5,OH,1397
6,WI,1412
7,NH,471
8,MB,731
9,AZ,1655


In [16]:
# Read the states table saved earlier, using Spark's built-in csv data source
# (the old "com.databricks.spark.csv" name is only a legacy alias for it).
states_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/user/uwi14/outputs/ghcnd/states_df.csv")
)
states_df.show(10)

+----------+----------------+
|State_Code|      State_Name|
+----------+----------------+
|        AB|         ALBERTA|
|        AK|          ALASKA|
|        AL|         ALABAMA|
|        AR|        ARKANSAS|
|        AS|  AMERICAN SAMOA|
|        AZ|         ARIZONA|
|        BC|BRITISH COLUMBIA|
|        CA|      CALIFORNIA|
|        CO|        COLORADO|
|        CT|     CONNECTICUT|
+----------+----------------+
only showing top 10 rows



In [17]:
# Attach the station count to every state. A left join keeps states that have
# no stations (their Station_Count is null).
#
# Station_Count is dropped first so that re-running this cell does not add a
# second, ambiguous Station_Count column; drop() is a no-op on the first run.
states_df = (
    states_df
    .drop("Station_Count")
    .join(
        states_station_counts_df.withColumnRenamed("STATE", "State_Code"),
        on="State_Code",
        how="left",
    )
)
show_as_html(states_df, 5)

Unnamed: 0,State_Code,State_Name,Station_Count
0,AB,ALBERTA,1445
1,AK,ALASKA,1040
2,AL,ALABAMA,1101
3,AR,ARKANSAS,937
4,AS,AMERICAN SAMOA,21


In [19]:
# Sort by station count, largest first, and take the leading row.
# F.col comes from the notebook's imports cell, so this cell does not depend on
# the `col` import made inside the country-count cell.
top_state = states_df.orderBy(F.col("Station_Count").desc()).first()
print(f"The state with the highest station count is: {top_state['State_Name']} with {top_state['Station_Count']} stations.")

The state with the highest station count is: TEXAS with 6154 stations.


In [18]:
# Save the states table (with station counts) as gzip-compressed CSV, replacing any previous copy
output_path_states = "hdfs:///user/uwi14/outputs/ghcnd/modified_states_df.csv"
(
    states_df.write
    .mode("overwrite")
    .option("header", "true")
    .option("compression", "gzip")
    .csv(output_path_states)
)

<h3>(c)</h3>
<h4>How many stations are there in the Southern Hemisphere?
Some of the countries in the database are territories of the United States as indicated by
the name of the country. How many stations are there in total in the territories of the United
States around the world, excluding the United States itself?</h4>


In [20]:
# Show the size of each item in the HDFS outputs directory again, after the tables above were saved
!hdfs dfs -du -h /user/uwi14/outputs/ghcnd/

5.0 M    20.1 M  /user/uwi14/outputs/ghcnd/Final_stations_df.parquet
1.5 M    6.0 M   /user/uwi14/outputs/ghcnd/Modified_Inventory_df.parquet
256.0 K  1.0 M   /user/uwi14/outputs/ghcnd/avg_rainfall.csv
1.9 K    7.5 K   /user/uwi14/outputs/ghcnd/countries_df.csv
2.3 M    9.2 M   /user/uwi14/outputs/ghcnd/joined_countries_df.csv
2.3 M    9.2 M   /user/uwi14/outputs/ghcnd/joined_countries_state_df.csv
2.3 K    9.0 K   /user/uwi14/outputs/ghcnd/modified_countries_df.csv
865      3.4 K   /user/uwi14/outputs/ghcnd/modified_states_df.csv
2.1 M    8.3 M   /user/uwi14/outputs/ghcnd/nz_tmin_tmax.parquet
2.8 M    11.3 M  /user/uwi14/outputs/ghcnd/nz_tmin_tmax2.csv
7.4 K    29.7 K  /user/uwi14/outputs/ghcnd/sorted_stations_nz_df.csv
623      2.4 K   /user/uwi14/outputs/ghcnd/states_df.csv


In [64]:
# Change the data types of LATITUDE and LONGITUDE from string to double so that
# they can be compared numerically. F.col and the "double" type name are used so
# this cell only depends on the `F` alias from the notebook's imports cell.
station_data = (
    station_data
    .withColumn("LATITUDE", F.col("LATITUDE").cast("double"))
    .withColumn("LONGITUDE", F.col("LONGITUDE").cast("double"))
)
show_as_html(station_data, 5)

Unnamed: 0,ID,STATE,CODE,LATITUDE,LONGITUDE,ELEVATION,NAME,GSN_FLAG,HCN_CRN_FLAG,WMO_ID,Country_Name,State_Name,FIRST_ACTIVE_YEAR,LAST_ACTIVE_YEAR,DIFFERENT_ELEMENTS_COUNT,CORE_ELEMENTS_COUNT,OTHER_ELEMENTS_COUNT,COLLECTED_ELEMENTS
0,AE000041196,,AE,25.333,55.517,34.0,SHARJAH INTER. AIRP,GSN,,41196.0,United Arab Emirates,,1944,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
1,AEM00041218,,AE,24.262,55.609,264.9,AL AIN INTL,,,41218.0,United Arab Emirates,,1994,2024,4,3,1.0,"[TMAX, TMIN, PRCP, TAVG]"
2,AGE00147715,,AG,35.42,8.1197,863.0,TEBESSA,,,,Algeria,,1879,1938,3,3,,"[TMAX, TMIN, PRCP]"
3,AGE00147794,,AG,36.78,5.1,225.0,BEJAIA-CAP CARBON,,,,Algeria,,1926,1938,2,2,,"[TMAX, TMIN]"
4,AGM00060402,,AG,36.712,5.07,6.1,SOUMMAM,,,60402.0,Algeria,,����,2024,5,4,1.0,"[TMAX, TMIN, ����, PRCP, SNWD]"


In [65]:
# Stations in the Southern Hemisphere have a negative latitude
southern_hemisphere_stations = station_data.where(station_data['LATITUDE'] < 0)

# Count the rows of the ID column for those stations
southern_hemisphere_station_count = (
    southern_hemisphere_stations
    .select('ID')
    .count()
)

print(f"Number of stations in the Southern Hemisphere: {southern_hemisphere_station_count}")

Number of stations in the Southern Hemisphere: 25357


In [66]:
# Stations in the Northern Hemisphere have a positive latitude
northern_hemisphere_stations = station_data.where(station_data['LATITUDE'] > 0)
northern_hemisphere_stations_count = (
    northern_hemisphere_stations
    .select('ID')
    .count()
)

print(f"Number of stations in the Northern Hemisphere: {northern_hemisphere_stations_count}")

Number of stations in the Northern Hemisphere: 100626


In [67]:
# Stations with a latitude of exactly zero are in neither hemisphere count
equator_stations = station_data.where(station_data['LATITUDE'] == 0)
equator_stations_count = (
    equator_stations
    .select('ID')
    .count()
)

print(f"Number of stations in the Equator: {equator_stations_count}")

Number of stations in the Equator: 0


In [68]:
# Total US stations: every station whose country name mentions "United States".
# This covers the United States itself and any territory named "... [United States]".

is_us_related = station_data['Country_Name'].contains("United States")
us_stations_df = station_data.where(is_us_related)
show_as_html(us_stations_df, n=10)
us = us_stations_df.select('ID').count()
print(f"Total Number of US Stations: {us}")


Unnamed: 0,ID,STATE,CODE,LATITUDE,LONGITUDE,ELEVATION,NAME,GSN_FLAG,HCN_CRN_FLAG,WMO_ID,Country_Name,State_Name,FIRST_ACTIVE_YEAR,LAST_ACTIVE_YEAR,DIFFERENT_ELEMENTS_COUNT,CORE_ELEMENTS_COUNT,OTHER_ELEMENTS_COUNT,COLLECTED_ELEMENTS
0,AQC00914869,AS,AQ,-14.3333,-170.7167,3.0,TAFUNA AP TUTUILA,,,,American Samoa [United States],AMERICAN SAMOA,1956,1966,10,5,5.0,"[TMAX, TMIN, PRCP, WT05, SNWD, MDPR, DAPR, SNO..."
1,CQC00914855,MP,CQ,15.1167,145.7167,65.5,SAIPAN INTL AP,,,91232.0,Northern Mariana Islands [United States],NORTHERN MARIANA ISLANDS,1988,2024,19,5,14.0,"[WSF5, WDF5, TMAX, WT10, TMIN, PSUN, PRCP, AWN..."
2,CQC00914860,MP,CQ,15.1333,145.7,3.0,SAIPAN LORAN,,,,Northern Mariana Islands [United States],NORTHERN MARIANA ISLANDS,1954,1978,9,5,4.0,"[TMAX, TMIN, PRCP, SNWD, MDPR, DAPR, SNOW, TOB..."
3,GQC00914120,GU,GQ,13.2333,144.65,3.0,COCOS ISLAND GUAM,,,,Guam [United States],GUAM,1956,1966,9,5,4.0,"[TMAX, TMIN, PRCP, SNWD, MDPR, DAPR, SNOW, TOB..."
4,GQC00914828,GU,GQ,13.35,144.75,91.1,TALOFOFO VILLAGE,,,,Guam [United States],GUAM,1965,1969,5,3,2.0,"[PRCP, SNWD, MDPR, DAPR, SNOW]"
5,GQC00914950,GU,GQ,13.5478,144.8928,160.0,YIGO,,,,Guam [United States],GUAM,1978,2012,5,3,2.0,"[PRCP, SNWD, MDPR, DAPR, SNOW]"
6,MQC00914490,UM,MQ,28.2167,-177.35,3.0,MIDWAY SAND ISLAND,,,,Midway Islands [United States},U.S. MINOR OUTLYING ISLANDS,1953,1991,11,5,6.0,"[WT03, TMAX, TMIN, PRCP, WT05, SNWD, SNOW, TOB..."
7,RQ1PRAL0003,PR,RQ,18.5083,-67.1124,39.9,AGUADILLA 5.5 NNE,,,,Puerto Rico [United States],PUERTO RICO,1998,2024,4,2,2.0,"[PRCP, MDPR, DAPR, SNOW]"
8,RQ1PRCG0001,PR,RQ,18.2273,-66.0543,90.5,CAGUAS 1.1 WSW,,,,Puerto Rico [United States],PUERTO RICO,2014,2018,2,2,,"[PRCP, SNOW]"
9,RQ1PRLM0001,PR,RQ,18.2563,-67.0035,209.4,LAS MARIAS 9.6 ESE,,,,Puerto Rico [United States],PUERTO RICO,2016,2024,4,2,2.0,"[PRCP, MDPR, DAPR, SNOW]"


Total Number of US Stations: 72675


In US territories outside the United States, the country code (`CODE`) is not `US`. So, keeping only the rows whose `CODE` differs from `US` gives the total number of stations in US territories, excluding the United States itself.

In [69]:
# Keep only the territories by removing rows whose country code is "US"
not_us_itself = us_stations_df['CODE'] != "US"
us_tm2_stations_df = us_stations_df.where(not_us_itself)
show_as_html(us_tm2_stations_df, n=10)
us_tm2 = us_tm2_stations_df.select('ID').count()
print(f"Total Number of US Stations exluding itself: {us_tm2}")

Unnamed: 0,ID,STATE,CODE,LATITUDE,LONGITUDE,ELEVATION,NAME,GSN_FLAG,HCN_CRN_FLAG,WMO_ID,Country_Name,State_Name,FIRST_ACTIVE_YEAR,LAST_ACTIVE_YEAR,DIFFERENT_ELEMENTS_COUNT,CORE_ELEMENTS_COUNT,OTHER_ELEMENTS_COUNT,COLLECTED_ELEMENTS
0,AQC00914869,AS,AQ,-14.3333,-170.7167,3.0,TAFUNA AP TUTUILA,,,,American Samoa [United States],AMERICAN SAMOA,1956,1966,10,5,5.0,"[TMAX, TMIN, PRCP, WT05, SNWD, MDPR, DAPR, SNO..."
1,CQC00914855,MP,CQ,15.1167,145.7167,65.5,SAIPAN INTL AP,,,91232.0,Northern Mariana Islands [United States],NORTHERN MARIANA ISLANDS,1988,2024,19,5,14.0,"[WSF5, WDF5, TMAX, WT10, TMIN, PSUN, PRCP, AWN..."
2,CQC00914860,MP,CQ,15.1333,145.7,3.0,SAIPAN LORAN,,,,Northern Mariana Islands [United States],NORTHERN MARIANA ISLANDS,1954,1978,9,5,4.0,"[TMAX, TMIN, PRCP, SNWD, MDPR, DAPR, SNOW, TOB..."
3,GQC00914120,GU,GQ,13.2333,144.65,3.0,COCOS ISLAND GUAM,,,,Guam [United States],GUAM,1956,1966,9,5,4.0,"[TMAX, TMIN, PRCP, SNWD, MDPR, DAPR, SNOW, TOB..."
4,GQC00914828,GU,GQ,13.35,144.75,91.1,TALOFOFO VILLAGE,,,,Guam [United States],GUAM,1965,1969,5,3,2.0,"[PRCP, SNWD, MDPR, DAPR, SNOW]"
5,GQC00914950,GU,GQ,13.5478,144.8928,160.0,YIGO,,,,Guam [United States],GUAM,1978,2012,5,3,2.0,"[PRCP, SNWD, MDPR, DAPR, SNOW]"
6,MQC00914490,UM,MQ,28.2167,-177.35,3.0,MIDWAY SAND ISLAND,,,,Midway Islands [United States},U.S. MINOR OUTLYING ISLANDS,1953,1991,11,5,6.0,"[WT03, TMAX, TMIN, PRCP, WT05, SNWD, SNOW, TOB..."
7,RQ1PRAL0003,PR,RQ,18.5083,-67.1124,39.9,AGUADILLA 5.5 NNE,,,,Puerto Rico [United States],PUERTO RICO,1998,2024,4,2,2.0,"[PRCP, MDPR, DAPR, SNOW]"
8,RQ1PRCG0001,PR,RQ,18.2273,-66.0543,90.5,CAGUAS 1.1 WSW,,,,Puerto Rico [United States],PUERTO RICO,2014,2018,2,2,,"[PRCP, SNOW]"
9,RQ1PRLM0001,PR,RQ,18.2563,-67.0035,209.4,LAS MARIAS 9.6 ESE,,,,Puerto Rico [United States],PUERTO RICO,2016,2024,4,2,2.0,"[PRCP, MDPR, DAPR, SNOW]"


Total Number of US Stations exluding itself: 386


In [72]:
# Break the territory stations down by country name
count_by_country_name_df = (
    us_tm2_stations_df
    .groupBy("Country_Name")
    .count()
)

count_by_country_name_df.show()

+--------------------+-----+
|        Country_Name|count|
+--------------------+-----+
|Northern Mariana ...|   11|
|Puerto Rico [Unit...|  243|
|Guam [United Stat...|   29|
|Johnston Atoll [U...|    4|
|Midway Islands [U...|    3|
|Palmyra Atoll [Un...|    3|
|American Samoa [U...|   21|
|Virgin Islands [U...|   71|
|Wake Island [Unit...|    1|
+--------------------+-----+



In [73]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI.
# stop_spark() stops the session, removes the `spark` and `sc` globals and displays the session status.

stop_spark()