# Part 1 - Data Ingestion

### Fetch the data from the API. Have to split it up since the api only allows for 10000 records at a time.
#### Create the url for the API call
SN19710 is the station ID for the weather station at Asker. It was chosen because it is a weather station with a long history. The station ID was retrieved from https://seklima.met.no/.

The data is fetched from the frost.met.no API using the following parameters:
- Sources: SN19710
- From date: 2000-01-01
- To date: 2024-10-18
- Elements: air_temperature

In [2]:
import requests

# Load the API credentials from a .env file: the first line is the username
# and the second line is the password.
with open('.env') as f:
    lines = f.readlines()
    username = lines[0].strip()
    password = lines[1].strip()

stationNr = "SN19710"
elements = "air_temperature"

# The API limits how many records a single request may return, so the full
# period is fetched as several consecutive date ranges.
# NOTE(review): confirm whether the end date of a Frost `referencetime`
# interval is inclusive; if it is exclusive, observations on 2010-12-31 and
# 2020-12-31 fall between two ranges and are not fetched.
date_ranges = [
    ("2000-01-01", "2010-12-31"),
    ("2011-01-01", "2020-12-31"),
    ("2021-01-01", "2024-10-18"),
]

data_entries = []

for fromDate, toDate in date_ranges:
    url = (
        "https://frost.met.no/observations/v0.jsonld"
        f"?sources={stationNr}"
        f"&referencetime={fromDate}/{toDate}"
        f"&elements={elements}"
    )

    result = requests.get(url, auth=(username, password))
    # Surface HTTP errors (bad credentials, empty range, ...) directly instead
    # of failing later with an unrelated KeyError on the "data" key.
    result.raise_for_status()

    data = result.json()
    data_entries.extend(data["data"])

print("Total number of data entries: ", len(data_entries))


Total number of data entries:  121414


### Load it into an RDD in Spark

In [None]:
from pyspark.sql import SparkSession
 
# Session settings, kept identical to the ones used in the data cleaning task:
# YARN as master, 2g for the driver and the YARN application master, and
# three statically allocated executors with 4g of memory and 4 cores each.
spark_settings = {
    "spark.master": "yarn",
    "spark.driver.memory": "2g",
    "spark.yarn.am.memory": "2g",
    "spark.executor.memory": "4g",
    "spark.executor.cores": "4",
    "spark.executor.instances": "3",
    "spark.task.cpus": "1",
    "spark.dynamicAllocation.enabled": "false",
}

session_builder = SparkSession.builder.appName("data_ingestion")
for setting, value in spark_settings.items():
    session_builder = session_builder.config(setting, value)
spark = session_builder.getOrCreate()

# Distribute the entries fetched from the API as an RDD.
rdd = spark.sparkContext.parallelize(data_entries)

### Since the data is semi-structured, we need to convert it to a raw format and save it in a txt file.

To simulate a real world scenario and expand our dataset we duplicate each entry 150 times.

In [None]:
def to_raw_format(entry, copies=150):
    """Flatten one API entry into a raw log line, repeated ``copies`` times.

    Only the first element of ``entry["observations"]`` is used. The line
    has the space-separated layout::

        <sourceId> <referenceTime> air_temperature:<value><unit>
        height_above_ground:<level value><level unit> <timeOffset>
        <timeResolution> <timeSeriesId> <performanceCategory>
        <exposureCategory> <qualityCode>

    Args:
        entry: Dict with the keys ``sourceId``, ``referenceTime`` and
            ``observations`` (a non-empty list of observation dicts).
        copies: How many times the line is repeated in the result. Defaults
            to 150, the factor used to expand the dataset.

    Returns:
        A list containing ``copies`` identical log-line strings.

    Raises:
        KeyError: If a field read here is missing from the entry.
        IndexError: If ``entry["observations"]`` is empty.
    """
    obs = entry["observations"][0]
    level = obs["level"]

    log_entry = (
        f"{entry['sourceId']} {entry['referenceTime']} "
        f"air_temperature:{obs['value']}{obs['unit']} "
        f"height_above_ground:{level['value']}{level['unit']} "
        f"{obs['timeOffset']} {obs['timeResolution']} {obs['timeSeriesId']} "
        f"{obs['performanceCategory']} {obs['exposureCategory']} "
        f"{obs['qualityCode']}"
    )

    return [log_entry] * copies
    
raw_rdd = rdd.flatMap(to_raw_format)

try:
    raw_rdd.saveAsTextFile("hdfs:///project/raw_temperature_data")
except Exception as e:
    print("File already exists")
    #Delete the file and save it again
    !hdfs dfs -rm -r /project/raw_temperature_data
    raw_rdd.saveAsTextFile("hdfs:///project/raw_temperature_data")
    



File already exists
Deleted /project/raw_temperature_data


                                                                                

### Print the first 5 lines of the raw data

In [19]:
# Show a small sample of the raw-format lines.
preview = raw_rdd.take(5)
for record in preview:
    print(record)

# Shut down the Spark session once the preview has been printed.
spark.stop()

SN19710:0 2000-01-01T06:00:00.000Z air_temperature:-6.3degC height_above_ground:2m PT0H PT6H 0 C 2 2 SN16040SN17280SN3190SN17000SN26580SN28380SN20280SN1070SN3290SN3290SN3810SN3720SN3290SN3370SN17050SN17850SN17870SN17850SN17850SN18810SN19710SN19710SN2650SN4260SN3810SN18210SN4260SN4460SN18270SN4735SN4920SN4735SN4780SN20540SN20280SN24710SN24890SN24890SN25110SN25630SN29720SN26350SN26350SN26500SN26630SN19940SN28750SN28922SN28922SN20301SN20540SN18315SN5590SN12290SN12648SN12550SN12550SN12180SN12180SN5350SN5590SN5590SN5660SN6020SN6020SN12180SN180SN7950SN8140SN8880SN700SN9580SN9580SN8880SN9160SN10380SN16400SN61630SN15890SN15430SN14200SN16040SN13150SN16040SN13150SN12648SN13030SN11500SN11500SN20540SN12550SN12680SN24890SN13030SN23420SN23420SN23500SN23800SN27160SN27120SN27270SN27470SN27780SN30330SN30330SN30650SN27435SN30330SN34130SN34130SN35210SN32060SN32240SN30810SN30650SN32240SN32240SN37230SN37230SN32240SN32890SN35210SN38140SN36330SN39150SN41770SN41825SN43010SN35210SN36560SN35860SN38730SN39040SN3

### Find the size of the raw data

In [4]:
# Report the size of the saved text output on HDFS.
import os
import subprocess
import sys

# Run the command from an argument list (no shell involved) and fail loudly
# if `hdfs` exits with an error, instead of silently printing an empty string.
du_result = subprocess.run(
    ["hdfs", "dfs", "-du", "-s", "-h", "/project/raw_temperature_data"],
    capture_output=True,
    text=True,
    check=True,
)
size = du_result.stdout
print(size)

1.1 G  3.4 G  /project/raw_temperature_data

