# Part 2. Data Cleaning


In [1]:
import requests
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("data_cleaning").getOrCreate()

# Load the saved data from the directory
loaded_rdd = spark.sparkContext.textFile("hdfs:///project/raw_temperature_data")

# Print the first 5 entries to verify it was saved and loaded correctly
for i in loaded_rdd.take(5):
    print(i)

[Stage 0:>                                                          (0 + 1) / 1]

SN19710:0  2000-01-01T06:00:00.000Z  air_temperature:-6.3degC  height_above_ground:2m  PT6H
SN19710:0  2000-01-01T12:00:00.000Z  air_temperature:-4degC  height_above_ground:2m  PT6H
SN19710:0  2000-01-01T18:00:00.000Z  air_temperature:-7degC  height_above_ground:2m  PT6H
SN19710:0  2000-01-02T06:00:00.000Z  air_temperature:-4.4degC  height_above_ground:2m  PT6H
SN19710:0  2000-01-02T12:00:00.000Z  air_temperature:2.1degC  height_above_ground:2m  PT6H


                                                                                

### Process the raw data line by line and convert it into a structured format containing date as the ky and temperature and day of the year as values. Since the data is hourly we convert the datetime to date

In [2]:
from datetime import datetime
import math

# Define a function to process each line of the log file and extract date as a key and temperature and day of the year as values
def process_line(line):
    fields = line.split("  ")
    if len(fields) != 5:
        return None  # Invalid format, skip this line

    source_id, ref_time, temperature, height, time_res = [f.strip() for f in fields]

    # Extract date and temperature
    date = ref_time.split("T")[0]  # Assumes ISO format like '2024-10-01T00:00:00.000Z'
    temp_value = float(temperature.split(":")[1].rstrip("degC"))

    # Convert date string to datetime object
    date_obj = datetime.strptime(date, "%Y-%m-%d")

    # Calculate day of the year
    day_of_year = date_obj.timetuple().tm_yday

    # Apply sine/cosine transformation to day of the year to capture seasonality
    day_of_year_sin = round(math.sin(2 * math.pi * day_of_year / 365), 2)
    day_of_year_cos = round(math.cos(2 * math.pi * day_of_year / 365), 2)
    
    return (date, (temp_value, day_of_year_sin, day_of_year_cos))

    

    # Process the loaded RDD to extract the required information
    

processed_rdd = loaded_rdd.map(process_line).filter(lambda x: x is not None)

# Print the first 10 entries to verify the processing
for i in processed_rdd.take(10):
    print(i)

[Stage 1:>                                                          (0 + 1) / 1]

('2000-01-01', (-6.3, 0.02, 1.0))
('2000-01-01', (-4.0, 0.02, 1.0))
('2000-01-01', (-7.0, 0.02, 1.0))
('2000-01-02', (-4.4, 0.03, 1.0))
('2000-01-02', (2.1, 0.03, 1.0))
('2000-01-02', (-0.8, 0.03, 1.0))
('2000-01-03', (0.4, 0.05, 1.0))
('2000-01-03', (3.2, 0.05, 1.0))
('2000-01-03', (2.5, 0.05, 1.0))
('2000-01-04', (2.2, 0.07, 1.0))


                                                                                

### Since the data is in hourly format, and we above converted it to daily format, we need to aggregate the data to get the average temperature for each day.

### Profile the performance of the MapReduce implementation (e.g., Spark job execution time, memory usage).

In [3]:
import time
import statistics

# Measure the execution time of the MapReduce job
start_time = time.time()

import statistics

# Perform the MapReduce operation with rounding to whole numbers
daily_avg_rdd = processed_rdd.groupByKey().mapValues(lambda temps: list(temps)) \
    .mapValues(lambda temps: (round(statistics.mean([t[0] for t in temps])), temps[0][1], temps[0][2]))


# # Trigger an action to force execution
# daily_avg_rdd.count()

end_time = time.time()
execution_time = end_time - start_time

print(f"MapReduce job execution time: {execution_time} seconds")

# To monitor memory usage, you can use the Spark UI. 
# The Spark UI is usually available at http://<driver-node>:4040
# You can access it by navigating to the URL in your web browser.

# Print the first 10 entries to verify the processing
for i in daily_avg_rdd.take(10):
    print(i)

MapReduce job execution time: 0.030730724334716797 seconds




('2000-01-20', (4, 0.34, 0.94))
('2000-02-17', (-4, 0.74, 0.68))
('2000-02-18', (-5, 0.75, 0.66))
('2000-02-24', (-1, 0.81, 0.58))
('2000-02-27', (3, 0.84, 0.54))
('2000-03-01', (2, 0.87, 0.5))
('2000-03-11', (0, 0.94, 0.34))
('2000-03-12', (-2, 0.95, 0.33))
('2000-03-15', (1, 0.96, 0.28))
('2000-03-18', (4, 0.97, 0.23))


                                                                                

### If there is gaps in the data, fill the missing values with the average of the previous and next value.

In [4]:
# min_max_dates = processed_rdd.map(lambda x: x[1][1]).min(), processed_rdd.map(lambda x: x[1][1]).max()
# start_day, end_day = min_max_dates
# complete_days = spark.sparkContext.parallelize(range(start_day, end_day + 1))

### Transformthe data to a data frame

In [5]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType


formatted_rdd = daily_avg_rdd.map(lambda x: (x[0], x[1][0], x[1][1], x[1][2]))

# Define the schema for your data
schema = StructType([
    StructField("date", StringType(), True),
    StructField("temperature", IntegerType(), True),
    StructField("feature1", FloatType(), True),
    StructField("feature2", FloatType(), True)
])

## Convert the RDD to a DataFrame
df = spark.createDataFrame(formatted_rdd, schema)

# Show the first few rows to verify
df.show()

df.write.mode("overwrite").parquet("hdfs:///project/cleaned_data_parquet")

spark.stop()


                                                                                

+----------+-----------+--------+--------+
|      date|temperature|feature1|feature2|
+----------+-----------+--------+--------+
|2013-03-09|         -6|    0.92|    0.39|
|2013-03-13|         -7|    0.95|    0.33|
|2013-03-14|         -8|    0.95|    0.31|
|2013-04-14|          2|    0.98|   -0.22|
|2013-04-15|          5|    0.97|   -0.23|
|2013-04-26|          6|    0.91|   -0.41|
|2013-05-24|         12|    0.62|   -0.79|
|2013-06-13|         13|    0.31|   -0.95|
|2013-06-17|         15|    0.25|   -0.97|
|2013-07-21|         21|   -0.33|   -0.94|
|2013-09-03|         15|   -0.89|   -0.46|
|2013-09-05|         15|    -0.9|   -0.43|
|2013-09-11|         14|   -0.94|   -0.33|
|2013-10-18|          4|   -0.96|    0.29|
|2013-10-30|          5|   -0.88|    0.48|
|2013-11-19|          2|   -0.66|    0.75|
|2013-12-03|          4|   -0.46|    0.89|
|2013-12-26|          3|   -0.09|     1.0|
|2014-01-14|         -8|    0.24|    0.97|
|2014-02-02|          1|    0.54|    0.84|
+----------

                                                                                