# Mt. Everest Data Cleansing 
Silver Layer Processing

In [71]:
from pyspark.sql import functions as F, types as T, DataFrame as Frame, Column as C 

StatementMeta(, 4bc1890a-70c6-4f59-8462-f3eaeee41cea, 73, Finished, Available, Finished)

I'll start with the expedition data: each of the source CSV files is read into its own dataframe.

In [ ]:
# Load the four semicolon-delimited CSV sources; the first line of each file is the header.
# No schema is supplied or inferred, so every column arrives as a string.
df_expeditions = spark.read.load("Files/expeditions.csv", format="csv", header="true", sep=";")
df_peaks = spark.read.load("Files/peaks.csv", format="csv", header="true", sep=";")
df_members = spark.read.load("Files/members.csv", format="csv", header="true", sep=";")
df_refer = spark.read.load("Files/refer.csv", format="csv", header="true", sep=";")

I only want to import data about Everest, so I'll start by removing all other rows from Peaks.

In [ ]:
# Narrow Peaks down to the row(s) whose PKNAME is exactly 'Everest'.
is_everest = F.col("PKNAME") == "Everest"
df_peaks = df_peaks.where(is_everest)
display(df_peaks)

Next I'm doing the same concerning the Expeditions, but for this I need to do three steps:
1. Rename the PEAKID column in the Expeditions dataframe
2. Join the Expeditions with the Peaks using the PEAKID column
3. Clearing from the resulting dataframe all the columns that were imported from the Peaks dataframe 

I started by renaming the column PEAKID from the Expeditions to PEAKID_EXP so that when dropping the extra columns there would not be a duplicate PEAKID column.

In [ ]:
# Keep only the expeditions whose peak is present in df_peaks.
# The PEAKID -> PEAKID_EXP rename is retained so the resulting schema is the
# same as before (Expeditions columns only, with the key named PEAKID_EXP).
df_expeditions = df_expeditions.withColumnRenamed('PEAKID', 'PEAKID_EXP')

# A left-semi join is a pure row filter: it returns only the left-hand
# (Expeditions) columns, so nothing from Peaks has to be dropped afterwards.
# Compared with an inner join followed by drop(), this
#   * cannot multiply Expeditions rows if a key occurs more than once in Peaks,
#   * cannot accidentally drop an Expeditions column that shares a name with a
#     Peaks column, and
#   * avoids passing several Column objects to drop(), which PySpark releases
#     before 3.4 reject with a TypeError.
df_expeditions = df_expeditions.join(
    df_peaks,
    df_expeditions.PEAKID_EXP == df_peaks.PEAKID,
    how='left_semi',
)

# limit() keeps the preview a DataFrame; head(10) would pass display() a plain
# Python list of Row objects instead.
display(df_expeditions.limit(10))

Similarly, I'm now going to do the same three steps for the Refer dataframe:
1. Rename the EXPID column in the Refer dataframe
2. Join the Refer with the Expeditions using the EXPID column
3. Clearing from the resulting dataframe all the columns that were imported from the Expeditions dataframe 

In [ ]:
# Keep only the references that belong to an expedition present in df_expeditions.
# The EXPID -> EXPID_REF rename is retained so the resulting schema is the same
# as before (Refer columns only, with the key named EXPID_REF).
df_refer = df_refer.withColumnRenamed('EXPID', 'EXPID_REF')

# A left-semi join is a pure row filter: it returns only the left-hand (Refer)
# columns, so nothing from Expeditions has to be dropped afterwards.
# Compared with an inner join followed by drop(), this
#   * cannot multiply Refer rows if an EXPID occurs more than once in Expeditions,
#   * cannot accidentally drop a Refer column that shares a name with an
#     Expeditions column, and
#   * avoids passing several Column objects to drop(), which PySpark releases
#     before 3.4 reject with a TypeError.
df_refer = df_refer.join(
    df_expeditions,
    df_refer.EXPID_REF == df_expeditions.EXPID,
    how='left_semi',
)

# limit() keeps the preview a DataFrame; head(10) would pass display() a plain
# Python list of Row objects instead.
display(df_refer.limit(10))

And finally I'll filter the Members dataframe as well using the Peaks dataframe. So again:
1. Rename the PEAKID column in the Members dataframe
2. Join the Members with the Peaks using the PEAKID column
3. Clearing from the resulting dataframe all the columns that were imported from the Peaks dataframe 

In [ ]:
# Keep only the members whose peak is present in df_peaks.
# The PEAKID -> PEAKID_MEM rename is retained so the resulting schema is the
# same as before (Members columns only, with the key named PEAKID_MEM).
df_members = df_members.withColumnRenamed('PEAKID', 'PEAKID_MEM')

# A left-semi join is a pure row filter: it returns only the left-hand
# (Members) columns, so nothing from Peaks has to be dropped afterwards.
# Compared with an inner join followed by drop(), this
#   * cannot multiply Members rows if a key occurs more than once in Peaks,
#   * cannot accidentally drop a Members column that shares a name with a
#     Peaks column, and
#   * avoids passing several Column objects to drop(), which PySpark releases
#     before 3.4 reject with a TypeError.
df_members = df_members.join(
    df_peaks,
    df_members.PEAKID_MEM == df_peaks.PEAKID,
    how='left_semi',
)

# limit() keeps the preview a DataFrame; head(10) would pass display() a plain
# Python list of Row objects instead.
display(df_members.limit(10))

In [75]:
# Read the current-weather JSON file; the multiline option lets a single JSON
# document span several lines of the file.
df = (
    spark.read
    .options(multiline="true")
    .json("Files/mteverest_weather_data.json")
)
display(df)

StatementMeta(, 4bc1890a-70c6-4f59-8462-f3eaeee41cea, 77, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 54d69e8e-ab3f-47ef-961a-68a8777b4b71)

In [78]:
# Preview the nested `time` field of the `current` struct.
display(df.select(F.col("current.time")))

StatementMeta(, 4bc1890a-70c6-4f59-8462-f3eaeee41cea, 80, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a9323c10-4c4d-4da6-bdaf-46489920befd)

In [84]:
# Flatten the nested weather document into one wide row.
# Each entry maps a field name shared by the `current` and `current_units`
# structs to the output column label; the unit column is "<label>_unit".
reading_labels = {
    "precipitation": "Precipitation",
    "temperature_2m": "Temperature",
    "wind_speed_10m": "Wind_speed",
    "wind_gusts_10m": "Wind_gusts",
    "relative_humidity_2m": "Relative_humidity",
    "snowfall": "Snowfall",
}

# Observation time plus the first element of the daily sunrise/sunset arrays.
leading_columns = [
    F.col("current.time").alias("Time"),
    F.col("daily.sunrise")[0].alias("Sunrise"),
    F.col("daily.sunset")[0].alias("Sunset"),
]

# Value column immediately followed by its unit column, in dictionary order.
reading_columns = [
    column
    for field, label in reading_labels.items()
    for column in (
        F.col(f"current.{field}").alias(label),
        F.col(f"current_units.{field}").alias(f"{label}_unit"),
    )
]

df_current_weather = df.select(*leading_columns, *reading_columns)
display(df_current_weather)

StatementMeta(, 4bc1890a-70c6-4f59-8462-f3eaeee41cea, 86, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 557e3f21-7860-45f1-b551-e75b75ade134)

In [ ]:
# Read the historical-weather JSON file (multiline document) into `df`,
# replacing whatever `df` referred to before, and print the first rows.
df = (
    spark.read
    .options(multiline="true")
    .json("Files/mteverest_hist_weather_data.json")
)
df.show()