# Iceberg Classification Step 2: Create Feature Groups and Train/Eval datasets
This notebook will perform the following operations:
- Read the pre-processed data from a HopsFS dataset into a PySpark dataframe 
- Create a Feature Group "iceberg" (with validation expectations) and save the data to it
- Create a training and test dataset with the Feature Store API

This notebook is tested with the following Jupyter ``configuration`` from Hopsworks.
<div>
<img src="fig/step2_jupyter_config.png" width="900" align="center"/>
</div>

In [1]:
import os
import pandas as pd
import numpy as np
from hops import hdfs
from hops import pandas_helper as pd
import hsfs
from hsfs.rule import Rule

# `spark` is not created in this notebook: the kernel provides the
# SparkSession under that name.
banner = (
    "-----------------------------------------------\n"
    "This notebook is tested with:\n"
    "  - Hopsworks 2.2.0\n"
    f"  - Spark {spark.version}.\n"
)
print(banner)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
240,application_1619040920875_0273,pyspark,idle,Link,Link


SparkSession available as 'spark'.


An error was encountered:
No module named 'hsfs.rule'
Traceback (most recent call last):
ModuleNotFoundError: No module named 'hsfs.rule'



## Define relevant paths

In [2]:
# Every file used by this step lives in the DATA_FOLDER dataset of the project.
DATA_FOLDER = 'eodata'
data_dir = os.path.join(hdfs.project_path(), DATA_FOLDER)

train_ds_path = os.path.join(data_dir, 'train.json')
train_preprocessed_all_ds_path = os.path.join(data_dir, 'train_preprocessed_all.json')
train_preprocessed_ds_path = os.path.join(data_dir, 'train_preprocessed.json')
test_preprocessed_ds_path = os.path.join(data_dir, 'test_preprocessed.json')

# Echo the resolved locations so a wrong project or folder is easy to spot.
for label, ds_path in (
    ("train_ds_path:", train_ds_path),
    ("train_preprocessed_all_ds_path:", train_preprocessed_all_ds_path),
    ("train_preprocessed_ds_path:", train_preprocessed_ds_path),
    ("test_preprocessed_ds_path:", test_preprocessed_ds_path),
):
    print(label, ds_path)

train_ds_path: hdfs://rpc.namenode.service.consul:8020/Projects/ExtremeEarth/eodata/train.json
train_preprocessed_all_ds_path: hdfs://rpc.namenode.service.consul:8020/Projects/ExtremeEarth/eodata/train_preprocessed_all.json
train_preprocessed_ds_path: hdfs://rpc.namenode.service.consul:8020/Projects/ExtremeEarth/eodata/train_preprocessed.json
test_preprocessed_ds_path: hdfs://rpc.namenode.service.consul:8020/Projects/ExtremeEarth/eodata/test_preprocessed.json

In [3]:
# Load the pre-processed training data (JSON) into a Spark DataFrame.
# Nothing is written to the feature store in this cell; that happens further down.
train_preprocessed_all_df = (
    spark.read
    .format('json')
    .load(train_preprocessed_all_ds_path)
)

In [4]:
# Print the schema of the loaded DataFrame to check column names and inferred types.
train_preprocessed_all_df.printSchema()

root
 |-- band_1: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- band_2: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- band_avg: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- id: string (nullable = true)
 |-- inc_angle: string (nullable = true)
 |-- is_iceberg: long (nullable = true)

In [5]:
# Preview the first 5 rows of the loaded DataFrame.
train_preprocessed_all_df.show(5)

+--------------------+--------------------+--------------------+--------+---------+----------+
|              band_1|              band_2|            band_avg|      id|inc_angle|is_iceberg|
+--------------------+--------------------+--------------------+--------+---------+----------+
|[-27.878361, -27....|[-27.154118, -29....|[-27.5162395, -28...|dfd5f913|  43.9239|         0|
|[-12.242375, -14....|[-31.506321, -27....|[-21.874348, -21....|e25388fd|  38.1562|         0|
|[-24.603676, -24....|[-24.870956, -24....|[-24.737316, -24....|58b2aaa0|  45.2859|         1|
|[-22.454607, -23....|[-27.889421, -27....|[-25.172014, -25....|4cfc3a18|  43.8306|         0|
|[-26.006956, -23....|[-27.206915, -30....|[-26.6069355, -26...|271f93f4|  35.6256|         0|
+--------------------+--------------------+--------------------+--------+---------+----------+
only showing top 5 rows

In [6]:
# Open an hsfs connection and get a feature store handle from it.
# Both calls take no arguments, so connection settings and the choice of
# feature store are left to the library defaults.
conn = hsfs.connection()
fs = conn.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

# Create feature expectations with validation rules

In [7]:
# Expectation attached to the `id` feature.
# NOTE(review): HAS_DATATYPE with accepted_type="Null" and max=0 presumably means
# "no value may be Null", i.e. every row must carry an id — confirm against the
# hsfs rule definitions.
expectation_id = fs.create_expectation("icebergs_id",
                                       description="validate id feature values",
                                       features=["id"],
                                       rules=[Rule(name="HAS_DATATYPE", level="ERROR", accepted_type="Null", max=0)])
expectation_id.save()

# Expectation attached to the `is_iceberg` label. Three rules are combined:
#   - HAS_DATATYPE: values must be of Integral type (min=1)
#   - HAS_MAX:      bounded by min=1, max=1
#   - HAS_MIN:      bounded by min=0, max=0
# NOTE(review): taken together these presumably enforce a 0/1 label that contains
# both classes — confirm the exact min/max semantics in the hsfs documentation.
expectation_label = fs.create_expectation("is_iceberg",
                                          description="validate is_iceberg label values",
                                          features=["is_iceberg"],
                                          rules=[Rule(name="HAS_DATATYPE", level="ERROR", accepted_type="Integral", min=1),
                                                 Rule(name="HAS_MAX", level="ERROR", min=1, max=1),
                                                 Rule(name="HAS_MIN", level="ERROR", min=0, max=0)])
expectation_label.save()

An error was encountered:
'FeatureStore' object has no attribute 'create_expectation'
Traceback (most recent call last):
AttributeError: 'FeatureStore' object has no attribute 'create_expectation'



# Create and save features to the Feature Store

In [8]:
# Statistics are disabled for this feature group.
fg_statistics_config = hsfs.statistics_config.StatisticsConfig(
    enabled=False, correlations=False, histograms=False, columns=[]
)

# Define the "iceberg" feature group (metadata only at this point) and attach
# both expectations with STRICT validation.
icebergs_fg = fs.create_feature_group(
    "iceberg",
    description="Training dataset in Feature Store for iceberg classification",
    time_travel_format=None,
    statistics_config=fg_statistics_config,
    validation_type="STRICT",
    expectations=[expectation_id, expectation_label],
)

An error was encountered:
name 'expectation_id' is not defined
Traceback (most recent call last):
NameError: name 'expectation_id' is not defined



In [9]:
# Write the pre-processed DataFrame into the "iceberg" feature group.
# NOTE(review): the group was created with validation_type="STRICT", so the
# attached expectations are presumably evaluated during this save — confirm.
icebergs_fg.save(train_preprocessed_all_df)

An error was encountered:
name 'icebergs_fg' is not defined
Traceback (most recent call last):
NameError: name 'icebergs_fg' is not defined



# Retrieving validation results


In [10]:
import json

# Pretty-print each validation result of the feature group as indented JSON.
# A plain loop is used instead of a list comprehension so the cell does not
# build (and echo) a throwaway list of None values.
for validation in icebergs_fg.get_validations():
    print(json.dumps(validation.to_dict(), indent=2))

An error was encountered:
name 'icebergs_fg' is not defined
Traceback (most recent call last):
NameError: name 'icebergs_fg' is not defined



# Train test split
Now that preprocessing is done, let's split the feature data into training and testing sets.

In [11]:
# Seed passed to randomSplit so the train/test split is repeatable.
RAND_SEED = 42
# Weight of the training split; the test split gets the remainder (1 - TRAIN_SIZE).
TRAIN_SIZE = 0.8

In [12]:
# Read the feature group back and count its rows (sanity check before splitting).
icebergs_fg.read().count()

An error was encountered:
name 'icebergs_fg' is not defined
Traceback (most recent call last):
NameError: name 'icebergs_fg' is not defined



In [13]:
# Read the feature group back and split it randomly into a train and a test DataFrame.
# The TFRecord export is done later, when the training datasets are created.
split_weights = [TRAIN_SIZE, 1-TRAIN_SIZE]
icebergs_train_df, icebergs_test_df = icebergs_fg.read().randomSplit(split_weights, seed=RAND_SEED)

An error was encountered:
name 'icebergs_fg' is not defined
Traceback (most recent call last):
NameError: name 'icebergs_fg' is not defined



In [14]:
# Report how many rows ended up in the training split.
train_record_count = icebergs_train_df.count()
print("Training dataset contains {} records".format(train_record_count))

An error was encountered:
name 'icebergs_train_df' is not defined
Traceback (most recent call last):
NameError: name 'icebergs_train_df' is not defined



In [15]:
# Report how many rows ended up in the test split.
test_record_count = icebergs_test_df.count()
print("Testing dataset contains {} records".format(test_record_count))

An error was encountered:
name 'icebergs_test_df' is not defined
Traceback (most recent call last):
NameError: name 'icebergs_test_df' is not defined



In [16]:
# Create a training dataset in TFRecord format from the training split.
# Statistics are disabled for this dataset.
train_td_statistics_config = hsfs.statistics_config.StatisticsConfig(
    enabled=False, correlations=False, histograms=False, columns=[]
)
train_td = fs.create_training_dataset(
    "train_tfrecords_iceberg_classification_dataset",
    data_format="tfrecords",
    statistics_config=train_td_statistics_config,
)
# Keep the return value of save() under icebergs_train_td.
icebergs_train_td = train_td.save(icebergs_train_df)

An error was encountered:
name 'icebergs_train_df' is not defined
Traceback (most recent call last):
NameError: name 'icebergs_train_df' is not defined



In [17]:
# Create a dataset in TFRecord format from the test split (stored through the
# same create_training_dataset API). Statistics are disabled for this dataset.
test_td_statistics_config = hsfs.statistics_config.StatisticsConfig(
    enabled=False, correlations=False, histograms=False, columns=[]
)
test_td = fs.create_training_dataset(
    "test_tfrecords_iceberg_classification_dataset",
    data_format="tfrecords",
    statistics_config=test_td_statistics_config,
)
# Keep the return value of save() under icebergs_test_td.
icebergs_test_td = test_td.save(icebergs_test_df)

An error was encountered:
name 'icebergs_test_df' is not defined
Traceback (most recent call last):
NameError: name 'icebergs_test_df' is not defined



# End of Step 2