In [1]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pipeasy_spark as ppz
import pyspark
from pyspark.ml.feature import (
    OneHotEncoder, StringIndexer, StandardScaler, OneHotEncoder, OneHotEncoderEstimator,
    VectorAssembler
)

In [3]:
# Start (or reuse) the Spark session that backs this notebook.
session = (
    pyspark.sql.SparkSession.builder
    .appName('titanic')
    .getOrCreate()
)

# Read the Titanic dataset with a tab separator, taking column names from the
# header row and letting Spark infer the column types.
titanic = session.read.csv(
    './datasets/titanic.csv',
    sep='\t',
    header=True,
    inferSchema=True,
)

In [4]:
# Preview the first 10 rows of the raw dataset.
titanic.show(n=10)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [5]:
# Keep only the three columns used below and discard rows with missing values.
df = (
    titanic
    .select('Survived', 'Sex', 'Age')
    .dropna()
)
df.show(n=2)

+--------+------+----+
|Survived|   Sex| Age|
+--------+------+----+
|       0|  male|22.0|
|       1|female|38.0|
+--------+------+----+
only showing top 2 rows



In [6]:
# Transformer stages to apply to each column, keyed by column name.
column_stages = {
    # Not modified: an empty stage list is equivalent to omitting the column
    # from the dict altogether.
    'Survived': [],
    # Index the string values, then one-hot encode the resulting indices
    # (dropLast=False keeps one vector slot per category).
    'Sex': [StringIndexer(), OneHotEncoderEstimator(dropLast=False)],
    # StandardScaler only accepts vectors as input, so a VectorAssembler
    # must be applied to the column first.
    'Age': [VectorAssembler(), StandardScaler()],
}
pipeline = ppz.build_pipeline(column_stages)

In [9]:
# Fit the pipeline on the selected data, apply the fitted pipeline to that
# same DataFrame, and preview the transformed columns.
trained_pipeline = pipeline.fit(df)
df_transformed = trained_pipeline.transform(df)
df_transformed.show(n=2)

+--------+-------------+--------------------+
|Survived|          Sex|                 Age|
+--------+-------------+--------------------+
|       0|(2,[0],[1.0])|[1.5054181442954726]|
|       1|(2,[1],[1.0])| [2.600267703783089]|
+--------+-------------+--------------------+
only showing top 2 rows

