# Exercise - Get the rock sample data into Visual Studio Code

In [2]:
# Import PySpark
from pyspark.sql import SparkSession

In [3]:
#Create SparkSession
spark = SparkSession.builder.appName("TestSpark") \
    .config("spark.sql.warehouse.dir", "/user/hive/warehouse") \
    .enableHiveSupport() \
    .getOrCreate()


#    .config("spark.sql.warehouse.dir", "/hive/warehouse/dir") \
#    .config("hive.metastore.uris", "thrift://localhost:9083") \



In [4]:
#Get data file

df = spark.read.options(inferSchema='True', header= 'True').csv("file:///C:/Users/manso/LocalDocuments/10-TechProjects/over-the-moon/sample-return/data/rocksamples.csv")

In [5]:
display("Schema:")

df.printSchema

'Schema:'

<bound method DataFrame.printSchema of DataFrame[ID: int, Mission: string, Type: string, Subtype: string, Weight (g): double, Pristine (%): double]>

In [6]:
# Print DataFrame
df.show(10)

+-----+--------+-------+--------+----------+------------+
|   ID| Mission|   Type| Subtype|Weight (g)|Pristine (%)|
+-----+--------+-------+--------+----------+------------+
|10001|Apollo11|   Soil|Unsieved|     125.8|       88.36|
|10002|Apollo11|   Soil|Unsieved|    5629.0|       93.73|
|10003|Apollo11| Basalt|Ilmenite|     213.0|       65.56|
|10004|Apollo11|   Core|Unsieved|      44.8|       71.76|
|10005|Apollo11|   Core|Unsieved|      53.4|       40.31|
|10008|Apollo11|   Soil|Unsieved|      89.0|        5.75|
|10009|Apollo11|Breccia|Regolith|     112.0|       97.27|
|10010|Apollo11|   Soil|Unsieved|     491.0|       91.03|
|10011|Apollo11|   Soil|Unsieved|      82.6|       62.01|
|10014|Apollo11|   Soil|Unsieved|      50.0|         0.0|
+-----+--------+-------+--------+----------+------------+
only showing top 10 rows



# Exercise - Determine the question to ask to inform data cleansing

In [7]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col,lag, lit, round, mean as _mean


In [8]:
# Convert the sample weight


# Rename Column to remove space
df = df.withColumnRenamed("Weight (g)","Weight(g)")

df = df.withColumn("Weight(kg)", col("Weight(g)") * 0.001)


df.head(10)

[Row(ID=10001, Mission='Apollo11', Type='Soil', Subtype='Unsieved', Weight(g)=125.8, Pristine (%)=88.36, Weight(kg)=0.1258),
 Row(ID=10002, Mission='Apollo11', Type='Soil', Subtype='Unsieved', Weight(g)=5629.0, Pristine (%)=93.73, Weight(kg)=5.6290000000000004),
 Row(ID=10003, Mission='Apollo11', Type='Basalt', Subtype='Ilmenite', Weight(g)=213.0, Pristine (%)=65.56, Weight(kg)=0.213),
 Row(ID=10004, Mission='Apollo11', Type='Core', Subtype='Unsieved', Weight(g)=44.8, Pristine (%)=71.76, Weight(kg)=0.0448),
 Row(ID=10005, Mission='Apollo11', Type='Core', Subtype='Unsieved', Weight(g)=53.4, Pristine (%)=40.31, Weight(kg)=0.0534),
 Row(ID=10008, Mission='Apollo11', Type='Soil', Subtype='Unsieved', Weight(g)=89.0, Pristine (%)=5.75, Weight(kg)=0.089),
 Row(ID=10009, Mission='Apollo11', Type='Breccia', Subtype='Regolith', Weight(g)=112.0, Pristine (%)=97.27, Weight(kg)=0.112),
 Row(ID=10010, Mission='Apollo11', Type='Soil', Subtype='Unsieved', Weight(g)=491.0, Pristine (%)=91.03, Weight(kg

In [9]:
# Create a new DataFrame called missions that will be a summary of data for each of the six Apollo missions that brought samples back. 
# Create a column in this DataFrame called Mission that has one row for each mission.

missions = df.dropDuplicates(["Mission"]).select("Mission")
missions.head(10)


[Row(Mission='Apollo15'),
 Row(Mission='Apollo11'),
 Row(Mission='Apollo14'),
 Row(Mission='Apollo12'),
 Row(Mission='Apollo17'),
 Row(Mission='Apollo16')]

In [10]:
#missions.toPandas().info()

type(missions)

pyspark.sql.dataframe.DataFrame

In [11]:
# Sum total sample weight by mission

sample_total_weight = df.groupby('Mission').sum('Weight(kg)')


# Using Join expression and remove duplicate columns
missions = missions.join(sample_total_weight,missions["Mission"] == sample_total_weight["Mission"]) \
    .select(missions["Mission"], sample_total_weight["sum(Weight(kg))"]) \
    .orderBy(missions["Mission"])

# Rename Column
missions = missions.withColumnRenamed("sum(Weight(kg))","Sample_weight(kg)")
missions.show()


+--------+------------------+
| Mission| Sample_weight(kg)|
+--------+------------------+
|Apollo11|          21.55424|
|Apollo12|          34.34238|
|Apollo14|          41.83363|
|Apollo15| 75.39910000000005|
|Apollo16| 92.46262000000006|
|Apollo17|109.44402000000001|
+--------+------------------+



In [12]:
# Get the difference in weights across missions

# Create window
windowSpec  = Window.orderBy("Mission")

#Simulate Pandas diff() API on PySpark usinl lag function (with above windowSpec)
missions = missions.withColumn("lag",lag("Sample_weight(kg)",1).over(windowSpec)) \
      .withColumn("Weight_diff", col("Sample_weight(kg)") - col("lag")) \
      .select("Mission","Sample_weight(kg)","Weight_diff")


In [13]:
# Replace Null values

missions = missions.na.fill(value=0,subset=["Weight_diff"])
missions.show()


+--------+------------------+------------------+
| Mission| Sample_weight(kg)|       Weight_diff|
+--------+------------------+------------------+
|Apollo11|          21.55424|               0.0|
|Apollo12|          34.34238|12.788139999999999|
|Apollo14|          41.83363| 7.491250000000001|
|Apollo15| 75.39910000000005| 33.56547000000005|
|Apollo16| 92.46262000000006| 17.06352000000001|
|Apollo17|109.44402000000001| 16.98139999999995|
+--------+------------------+------------------+



In [14]:
# Create train database 
spark.sql("CREATE DATABASE IF NOT EXISTS train")

# Write to (managed) tables on train database

df.write.mode("Overwrite").saveAsTable("train.rockSamples")
missions.write.mode("Overwrite").saveAsTable("train.Missions")

Py4JJavaError: An error occurred while calling o99.saveAsTable.
: java.util.concurrent.ExecutionException: org.apache.hadoop.ipc.RpcException: RPC response has invalid length
	at org.sparkproject.guava.util.concurrent.AbstractFuture$Sync.getValue(AbstractFuture.java:306)
	at org.sparkproject.guava.util.concurrent.AbstractFuture$Sync.get(AbstractFuture.java:293)
	at org.sparkproject.guava.util.concurrent.AbstractFuture.get(AbstractFuture.java:116)
	at org.sparkproject.guava.util.concurrent.Uninterruptibles.getUninterruptibly(Uninterruptibles.java:135)
	at org.sparkproject.guava.cache.LocalCache$Segment.getAndRecordStats(LocalCache.java:2410)
	at org.sparkproject.guava.cache.LocalCache$Segment.loadSync(LocalCache.java:2380)
	at org.sparkproject.guava.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2342)
	at org.sparkproject.guava.cache.LocalCache$Segment.get(LocalCache.java:2257)
	at org.sparkproject.guava.cache.LocalCache.get(LocalCache.java:4000)
	at org.sparkproject.guava.cache.LocalCache$LocalManualCache.get(LocalCache.java:4789)
	at org.apache.spark.sql.catalyst.catalog.SessionCatalog.getCachedPlan(SessionCatalog.scala:209)
	at org.apache.spark.sql.execution.datasources.FindDataSourceTable.org$apache$spark$sql$execution$datasources$FindDataSourceTable$$readDataSourceTable(DataSourceStrategy.scala:248)
	at org.apache.spark.sql.execution.datasources.FindDataSourceTable$$anonfun$apply$2.applyOrElse(DataSourceStrategy.scala:289)
	at org.apache.spark.sql.execution.datasources.FindDataSourceTable$$anonfun$apply$2.applyOrElse(DataSourceStrategy.scala:278)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$2(AnalysisHelper.scala:170)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$1(AnalysisHelper.scala:170)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:323)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning(AnalysisHelper.scala:168)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning$(AnalysisHelper.scala:164)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$4(AnalysisHelper.scala:175)
	at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren(TreeNode.scala:1215)
	at org.apache.spark.sql.catalyst.trees.UnaryLike.mapChildren$(TreeNode.scala:1214)
	at org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias.mapChildren(basicLogicalOperators.scala:1659)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.$anonfun$resolveOperatorsDownWithPruning$1(AnalysisHelper.scala:175)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:323)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning(AnalysisHelper.scala:168)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsDownWithPruning$(AnalysisHelper.scala:164)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsWithPruning(AnalysisHelper.scala:99)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperatorsWithPruning$(AnalysisHelper.scala:96)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperatorsWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperators(AnalysisHelper.scala:76)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.resolveOperators$(AnalysisHelper.scala:75)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperators(LogicalPlan.scala:32)
	at org.apache.spark.sql.execution.datasources.FindDataSourceTable.apply(DataSourceStrategy.scala:278)
	at org.apache.spark.sql.execution.datasources.FindDataSourceTable.apply(DataSourceStrategy.scala:242)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$2(RuleExecutor.scala:222)
	at scala.collection.LinearSeqOptimized.foldLeft(LinearSeqOptimized.scala:126)
	at scala.collection.LinearSeqOptimized.foldLeft$(LinearSeqOptimized.scala:122)
	at scala.collection.immutable.List.foldLeft(List.scala:91)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1(RuleExecutor.scala:219)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1$adapted(RuleExecutor.scala:211)
	at scala.collection.immutable.List.foreach(List.scala:431)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:211)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.org$apache$spark$sql$catalyst$analysis$Analyzer$$executeSameContext(Analyzer.scala:226)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$execute$1(Analyzer.scala:222)
	at org.apache.spark.sql.catalyst.analysis.AnalysisContext$.withNewAnalysisContext(Analyzer.scala:173)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:222)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.execute(Analyzer.scala:188)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$executeAndTrack$1(RuleExecutor.scala:182)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:89)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:182)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$executeAndCheck$1(Analyzer.scala:209)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:330)
	at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:208)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$analyzed$1(QueryExecution.scala:77)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:138)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$2(QueryExecution.scala:219)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:219)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:218)
	at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:77)
	at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:74)
	at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:66)
	at org.apache.spark.sql.Dataset$.$anonfun$ofRows$1(Dataset.scala:91)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:89)
	at org.apache.spark.sql.SparkSession.table(SparkSession.scala:606)
	at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:658)
	at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:571)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.hadoop.ipc.RpcException: RPC response has invalid length
	at org.apache.hadoop.ipc.Client$IpcStreams.readResponse(Client.java:1933)
	at org.apache.hadoop.ipc.Client$Connection.receiveRpcResponse(Client.java:1238)
	at org.apache.hadoop.ipc.Client$Connection.run(Client.java:1134)


In [None]:
# List existing databases
spark.sql('show databases').show()

# List train database tables
spark.catalog.listTables('train')

+---------+
|namespace|
+---------+
|  default|
|    train|
+---------+



[Table(name='missions', catalog='spark_catalog', namespace=['train'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='rocksamples', catalog='spark_catalog', namespace=['train'], description=None, tableType='MANAGED', isTemporary=False),
 Table(name='a', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='AuxCM', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='AuxLM', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='AuxMissions', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

In [None]:
# Test previous saved tables

spark.sql("SELECT * FROM train.Missions WHERE Mission='Apollo14'").show()

+--------+-----------------+-----------------+
| Mission|Sample_weight(kg)|      Weight_diff|
+--------+-----------------+-----------------+
|Apollo14|         41.83363|7.491250000000001|
+--------+-----------------+-----------------+



# Exercise - Add rocket weight data to the mission analysis

In [203]:
# Add in command and lunar module data

lunarModuleData = [("Eagle (LM-5)",15103), 
        ("Intrepid (LM-6)",15235), 
        ("Antares (LM-8)",15264), 
        ("Falcon (LM-10)",16430),
        ("Orion (LM-11)",16445),
        ("Challenger (LM-12)",16456)
      ]

commandModuleData = [("Columbia (CSM-107)",5560), 
        ("Yankee Clipper (CM-108)",5609), 
        ("Kitty Hawk (CM-110)",5758), 
        ("Endeavor (CM-112)",5875),
        ("Casper (CM-113)",5840),
        ("America (CM-114)",5960)
      ]


# Generate ID Column on Dataframes to be able to join them after
dfLM = spark.createDataFrame(lunarModuleData,["Lunar_module(LM)", "LM_mass(kg)"])
dfLM.createOrReplaceTempView('AuxLM')
dfLM = spark.sql('select ROW_NUMBER() OVER(ORDER BY (Select 0)) AS Row_Num, * from AuxLM')

#dfLM.show()

dfCM = spark.createDataFrame(commandModuleData, ["Command_module(CM)", "CM_mass(kg)"])
dfCM.createOrReplaceTempView('AuxCM')
dfCM = spark.sql('select ROW_NUMBER() OVER(ORDER BY (Select 0)) AS Row_Num, * from AuxCM')

#dfCM.show()

dfMissions = missions.select("Mission")
dfMissions.createOrReplaceTempView('AuxMissions')
dfMissions = spark.sql('select ROW_NUMBER() OVER(ORDER BY (Select 0)) AS Row_Num, * from AuxMissions')

#dfMissions.show()

# Join Dataframes using ID column
dfAll = dfMissions.join(dfLM,dfMissions["Row_Num"] == dfLM["Row_Num"]) \
        .drop(dfMissions["Row_Num"])
      
dfAll = dfAll.join(dfCM,dfAll["Row_Num"] == dfCM["Row_Num"]) \
        .drop(dfAll["Row_Num"], dfCM["Row_Num"])


#dfAll.printSchema()
#dfAll.show()

# Update missions Dataframe with the new fields
missions = missions.join(dfAll,missions["Mission"] == dfAll["Mission"]) \
        .drop(dfAll["Mission"])


missions.printSchema()
missions.show()

root
 |-- Sample_weight(kg): double (nullable = true)
 |-- Weight_diff: double (nullable = true)
 |-- Lunar_module(LM): string (nullable = true)
 |-- LM_mass(kg): long (nullable = true)
 |-- Command_module(CM): string (nullable = true)
 |-- CM_mass(kg): long (nullable = true)
 |-- LM_mass_diff: long (nullable = true)
 |-- CM_mass_diff: long (nullable = true)
 |-- Total_weight(kg): long (nullable = true)
 |-- Total_weight_diff: long (nullable = true)
 |-- Crewed_area_Payload: double (nullable = true)
 |-- Sample_Crewed_area: double (nullable = true)
 |-- Sample_Payload: double (nullable = true)
 |-- Mission: string (nullable = true)
 |-- Lunar_module(LM): string (nullable = true)
 |-- LM_mass(kg): long (nullable = true)
 |-- Command_module(CM): string (nullable = true)
 |-- CM_mass(kg): long (nullable = true)

+-----------------+-----------+------------------+-----------+--------------------+-----------+------------+------------+----------------+-----------------+-------------------+---

In [17]:
# Get the difference in weights across missions

# Create window
windowSpec  = Window.orderBy("Mission")

#Simulate Pandas diff() API on PySpark usinl lag function (with above windowSpec)
missions = missions.withColumn("lag",lag("LM_mass(kg)",1).over(windowSpec)) \
      .withColumn("LM_mass_diff", col("LM_mass(kg)") - col("lag")) \
      .drop("lag")


missions = missions.withColumn("lag",lag("CM_mass(kg)",1).over(windowSpec)) \
      .withColumn("CM_mass_diff", col("CM_mass(kg)") - col("lag")) \
      .drop("lag")

# Replace Null values
missions = missions.na.fill(value=0,subset=["LM_mass_diff"])
missions = missions.na.fill(value=0,subset=["CM_mass_diff"])


missions.printSchema()
missions.show()

root
 |-- Sample_weight(kg): double (nullable = true)
 |-- Weight_diff: double (nullable = false)
 |-- Mission: string (nullable = true)
 |-- Lunar_module(LM): string (nullable = true)
 |-- LM_mass(kg): long (nullable = true)
 |-- Command_module(CM): string (nullable = true)
 |-- CM_mass(kg): long (nullable = true)
 |-- LM_mass_diff: long (nullable = true)
 |-- CM_mass_diff: long (nullable = true)

+------------------+------------------+--------+------------------+-----------+--------------------+-----------+------------+------------+
| Sample_weight(kg)|       Weight_diff| Mission|  Lunar_module(LM)|LM_mass(kg)|  Command_module(CM)|CM_mass(kg)|LM_mass_diff|CM_mass_diff|
+------------------+------------------+--------+------------------+-----------+--------------------+-----------+------------+------------+
|          21.55424|               0.0|Apollo11|      Eagle (LM-5)|      15103|  Columbia (CSM-107)|       5560|           0|           0|
|          34.34238|12.788139999999999|Apo

In [18]:
# Add some totals for each mission across both the lunar and command modules

missions = missions.withColumn("Total_weight(kg)", col("LM_mass(kg)") + col("CM_mass(kg)")) \
      .withColumn("Total_weight_diff", col("LM_mass_diff") + col("CM_mass_diff")) \


missions.printSchema()
missions.show()


root
 |-- Sample_weight(kg): double (nullable = true)
 |-- Weight_diff: double (nullable = false)
 |-- Mission: string (nullable = true)
 |-- Lunar_module(LM): string (nullable = true)
 |-- LM_mass(kg): long (nullable = true)
 |-- Command_module(CM): string (nullable = true)
 |-- CM_mass(kg): long (nullable = true)
 |-- LM_mass_diff: long (nullable = true)
 |-- CM_mass_diff: long (nullable = true)
 |-- Total_weight(kg): long (nullable = true)
 |-- Total_weight_diff: long (nullable = true)

+------------------+------------------+--------+------------------+-----------+--------------------+-----------+------------+------------+----------------+-----------------+
| Sample_weight(kg)|       Weight_diff| Mission|  Lunar_module(LM)|LM_mass(kg)|  Command_module(CM)|CM_mass(kg)|LM_mass_diff|CM_mass_diff|Total_weight(kg)|Total_weight_diff|
+------------------+------------------+--------+------------------+-----------+--------------------+-----------+------------+------------+----------------+--

# Exercise - Understand the data in the missions DataFrame

In [19]:
# Sample-to-weight ratio

saturnVPayload = 43500

missions = missions.withColumn("Crewed_area_Payload", col("Total_weight(kg)") / saturnVPayload) \
    .withColumn("Sample_Crewed_area", col("Sample_weight(kg)") / col("Total_weight(kg)")) \
    .withColumn("Sample_Payload", round(col("Sample_weight(kg)") / saturnVPayload,6))

# Round all numeric columns to 6 decimal places
for c_name, c_type in missions.dtypes:
    if c_type in ('long','double', 'float'):
        missions = missions.withColumn(c_name, round(c_name, 6))

missions.printSchema()
missions.show()


root
 |-- Sample_weight(kg): double (nullable = true)
 |-- Weight_diff: double (nullable = true)
 |-- Mission: string (nullable = true)
 |-- Lunar_module(LM): string (nullable = true)
 |-- LM_mass(kg): long (nullable = true)
 |-- Command_module(CM): string (nullable = true)
 |-- CM_mass(kg): long (nullable = true)
 |-- LM_mass_diff: long (nullable = true)
 |-- CM_mass_diff: long (nullable = true)
 |-- Total_weight(kg): long (nullable = true)
 |-- Total_weight_diff: long (nullable = true)
 |-- Crewed_area_Payload: double (nullable = true)
 |-- Sample_Crewed_area: double (nullable = true)
 |-- Sample_Payload: double (nullable = true)

+-----------------+-----------+--------+------------------+-----------+--------------------+-----------+------------+------------+----------------+-----------------+-------------------+------------------+--------------+
|Sample_weight(kg)|Weight_diff| Mission|  Lunar_module(LM)|LM_mass(kg)|  Command_module(CM)|CM_mass(kg)|LM_mass_diff|CM_mass_diff|Total_wei

In [20]:
# Update (managed) Missions table on train database

missions.write.mode("Overwrite").saveAsTable("train.Missions")

spark.sql("Select * From train.Missions;").show()
spark.sql("DESCRIBE TABLE train.Missions;").show()

Py4JJavaError: An error occurred while calling o247.saveAsTable.
: org.apache.hadoop.ipc.RpcException: RPC response has invalid length
	at org.apache.hadoop.ipc.Client$IpcStreams.readResponse(Client.java:1933)
	at org.apache.hadoop.ipc.Client$Connection.receiveRpcResponse(Client.java:1238)
	at org.apache.hadoop.ipc.Client$Connection.run(Client.java:1134)


In [195]:
# Take the average of all those ratios across all the missions

# Note: (...).collect()[0][0] was used to allow a single value in variable instead of dataframe
crewedArea_payload_ratio = missions.select(round(_mean("Crewed_area_Payload"),6)).collect()[0][0]
sample_crewedArea_ratio = missions.select(round(_mean("Sample_Crewed_area"),6)).collect()[0][0]
sample_payload_ratio = missions.select(round(_mean(col("Sample_Payload")),6)).collect()[0][0]

# Exercise - Predict Artemis sample capacity

In [196]:
# Create an Artemis mission DataFrame
artemis_crewedArea = 26520

missionData = ["artemis1","artemis1b","artemis2"]
weightData = [artemis_crewedArea,artemis_crewedArea,artemis_crewedArea]
payloadData = [26988, 37965, 42955]

# joins / Zip the 3 configuration lists
artemisAllData = zip(missionData,weightData,payloadData)

# Convert zip object to a list
artemisAllData = list(artemisAllData)

#print(artemisAllData)
print(*artemisAllData)

# Schema configuration
schemaArtemis = ["Mission", "Total_weight(kg)", "Payload(kg)"]

artemis_mission = spark.createDataFrame(data = artemisAllData, schema = schemaArtemis)

artemis_mission.printSchema()
artemis_mission.show()


('artemis1', 26520, 26988) ('artemis1b', 26520, 37965) ('artemis2', 26520, 42955)
root
 |-- Mission: string (nullable = true)
 |-- Total_weight(kg): long (nullable = true)
 |-- Payload(kg): long (nullable = true)

+---------+----------------+-----------+
|  Mission|Total_weight(kg)|Payload(kg)|
+---------+----------------+-----------+
| artemis1|           26520|      26988|
|artemis1b|           26520|      37965|
| artemis2|           26520|      42955|
+---------+----------------+-----------+



In [198]:
# Estimate the weight of samples based on the ratios we determined from the Apollo missions

artemis_mission = artemis_mission.withColumn("Sample_weight_from_total(kg)",round(col("Total_weight(kg)") * sample_crewedArea_ratio,6))
artemis_mission = artemis_mission.withColumn("Sample_weight_from_payload(kg)",round(col("Payload(kg)") * sample_payload_ratio,6))


artemis_mission.printSchema()
artemis_mission.show()

root
 |-- Mission: string (nullable = true)
 |-- Total_weight(kg): long (nullable = true)
 |-- Payload(kg): long (nullable = true)
 |-- Sample_weight_from_total(kg): double (nullable = true)
 |-- Sample_weight_from_payload(kg): double (nullable = true)

+---------+----------------+-----------+----------------------------+------------------------------+
|  Mission|Total_weight(kg)|Payload(kg)|Sample_weight_from_total(kg)|Sample_weight_from_payload(kg)|
+---------+----------------+-----------+----------------------------+------------------------------+
| artemis1|           26520|      26988|                    75.55548|                     38.781756|
|artemis1b|           26520|      37965|                    75.55548|                     54.555705|
| artemis2|           26520|      42955|                    75.55548|                     61.726335|
+---------+----------------+-----------+----------------------------+------------------------------+



In [205]:
# Get the average of the two predictions:
artemis_mission = artemis_mission.withColumn("Estimated_sample_weight(kg)", round((col("Sample_weight_from_payload(kg)") + col("Sample_weight_from_total(kg)"))/2,6))

artemis_mission.describe().show()
artemis_mission.printSchema()
artemis_mission.show()


+-------+--------+----------------+------------------+----------------------------+------------------------------+---------------------------+
|summary| Mission|Total_weight(kg)|       Payload(kg)|Sample_weight_from_total(kg)|Sample_weight_from_payload(kg)|Estimated_sample_weight(kg)|
+-------+--------+----------------+------------------+----------------------------+------------------------------+---------------------------+
|  count|       3|               3|                 3|                           3|                             3|                          3|
|   mean|    NULL|         26520.0|35969.333333333336|                    75.55548|            51.687931999999996|          63.62170633333333|
| stddev|    NULL|             0.0| 8168.432305732437|                         0.0|             11.73803722333751|          5.869018886548102|
|    min|artemis1|           26520|             26988|                    75.55548|                     38.781756|                  57.168618|

# Exercise - Prioritize Moon rock sample gathering based on data