# Exercise - Get the rock sample data into Visual Studio Code

In [100]:
# Import PySpark
from pyspark.sql import SparkSession

In [101]:
#Create SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

In [102]:
#Get data file

df = spark.read.options(inferSchema='True', header= 'True').csv("data/rocksamples.csv")

In [103]:
display("Schema:")

df.printSchema

'Schema:'

<bound method DataFrame.printSchema of DataFrame[ID: int, Mission: string, Type: string, Subtype: string, Weight (g): double, Pristine (%): double]>

In [104]:
# Print DataFrame
df.show(10)

+-----+--------+-------+--------+----------+------------+
|   ID| Mission|   Type| Subtype|Weight (g)|Pristine (%)|
+-----+--------+-------+--------+----------+------------+
|10001|Apollo11|   Soil|Unsieved|     125.8|       88.36|
|10002|Apollo11|   Soil|Unsieved|    5629.0|       93.73|
|10003|Apollo11| Basalt|Ilmenite|     213.0|       65.56|
|10004|Apollo11|   Core|Unsieved|      44.8|       71.76|
|10005|Apollo11|   Core|Unsieved|      53.4|       40.31|
|10008|Apollo11|   Soil|Unsieved|      89.0|        5.75|
|10009|Apollo11|Breccia|Regolith|     112.0|       97.27|
|10010|Apollo11|   Soil|Unsieved|     491.0|       91.03|
|10011|Apollo11|   Soil|Unsieved|      82.6|       62.01|
|10014|Apollo11|   Soil|Unsieved|      50.0|         0.0|
+-----+--------+-------+--------+----------+------------+
only showing top 10 rows



# Exercise - Determine the question to ask to inform data cleansing

In [105]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col,lag


In [106]:
# Convert the sample weight


# Rename Column to remove space
df = df.withColumnRenamed("Weight (g)","Weight(g)")

df = df.withColumn("Weight(kg)", col("Weight(g)") * 0.001)


df.head(10)

[Row(ID=10001, Mission='Apollo11', Type='Soil', Subtype='Unsieved', Weight(g)=125.8, Pristine (%)=88.36, Weight(kg)=0.1258),
 Row(ID=10002, Mission='Apollo11', Type='Soil', Subtype='Unsieved', Weight(g)=5629.0, Pristine (%)=93.73, Weight(kg)=5.6290000000000004),
 Row(ID=10003, Mission='Apollo11', Type='Basalt', Subtype='Ilmenite', Weight(g)=213.0, Pristine (%)=65.56, Weight(kg)=0.213),
 Row(ID=10004, Mission='Apollo11', Type='Core', Subtype='Unsieved', Weight(g)=44.8, Pristine (%)=71.76, Weight(kg)=0.0448),
 Row(ID=10005, Mission='Apollo11', Type='Core', Subtype='Unsieved', Weight(g)=53.4, Pristine (%)=40.31, Weight(kg)=0.0534),
 Row(ID=10008, Mission='Apollo11', Type='Soil', Subtype='Unsieved', Weight(g)=89.0, Pristine (%)=5.75, Weight(kg)=0.089),
 Row(ID=10009, Mission='Apollo11', Type='Breccia', Subtype='Regolith', Weight(g)=112.0, Pristine (%)=97.27, Weight(kg)=0.112),
 Row(ID=10010, Mission='Apollo11', Type='Soil', Subtype='Unsieved', Weight(g)=491.0, Pristine (%)=91.03, Weight(kg

In [107]:
# Create a new DataFrame called missions that will be a summary of data for each of the six Apollo missions that brought samples back. 
# Create a column in this DataFrame called Mission that has one row for each mission.

missions = df.dropDuplicates(["Mission"]).select("Mission")
missions.head(10)


[Row(Mission='Apollo15'),
 Row(Mission='Apollo11'),
 Row(Mission='Apollo14'),
 Row(Mission='Apollo12'),
 Row(Mission='Apollo17'),
 Row(Mission='Apollo16')]

In [108]:
#missions.toPandas().info()

type(missions)

pyspark.sql.dataframe.DataFrame

In [109]:
# Sum total sample weight by mission

sample_total_weight = df.groupby('Mission').sum('Weight(kg)')


# Using Join expression and remove duplicate columns
missions = missions.join(sample_total_weight,missions["Mission"] == sample_total_weight["Mission"]) \
    .select(missions["Mission"], sample_total_weight["sum(Weight(kg))"]) \
    .orderBy(missions["Mission"])

# Rename Column
missions = missions.withColumnRenamed("sum(Weight(kg))","Sample_weight(kg)")
missions.show()


+--------+------------------+
| Mission| Sample_weight(kg)|
+--------+------------------+
|Apollo11|          21.55424|
|Apollo12|          34.34238|
|Apollo14|          41.83363|
|Apollo15| 75.39910000000005|
|Apollo16| 92.46262000000006|
|Apollo17|109.44402000000001|
+--------+------------------+



In [110]:
# Get the difference in weights across missions

# Create window
windowSpec  = Window.orderBy("Mission")

#Simulate Pandas diff() API on PySpark usinl lag function (with above windowSpec)
missions = missions.withColumn("lag",lag("Sample_weight(kg)",1).over(windowSpec)) \
      .withColumn("Weight_diff", col("Sample_weight(kg)") - col("lag")) \
      .select("Mission","Sample_weight(kg)","Weight_diff")


In [111]:
# Replace Null values

missions = missions.na.fill(value=0,subset=["Weight_diff"])
missions.show()


+--------+------------------+------------------+
| Mission| Sample_weight(kg)|       Weight_diff|
+--------+------------------+------------------+
|Apollo11|          21.55424|               0.0|
|Apollo12|          34.34238|12.788139999999999|
|Apollo14|          41.83363| 7.491250000000001|
|Apollo15| 75.39910000000005| 33.56547000000005|
|Apollo16| 92.46262000000006| 17.06352000000001|
|Apollo17|109.44402000000001| 16.98139999999995|
+--------+------------------+------------------+

