In [9]:
# Optional one-time download of planes.csv (left disabled; uncomment to fetch the file).
# NOTE(review): urlretrieve returns a (filename, headers) tuple, not a DataFrame,
# so the `planes` name assigned here is only a placeholder.
# from urllib.request import urlretrieve
# url = "https://assets.datacamp.com/production/repositories/1237/datasets/231480a2696c55fde829ce76d936596123f12c0c/planes.csv"
# planes = urlretrieve(url,"planes.csv")

In [38]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Start (or reuse) a local Spark context. SparkContext.getOrCreate() hands back
# the already-running context when this cell is executed again, whereas calling
# the SparkContext(...) constructor a second time raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("pyspark-shell")
)

# The session attaches to the context obtained above.
spark = SparkSession.builder.getOrCreate()

# Load both CSV files, using the first row as column names, and register each
# DataFrame as a temporary view so it is visible in the session catalog.
# No schema is supplied or inferred, so the columns are read as strings and are
# cast to numeric types later in the notebook.
flights = spark.read.csv("flights.csv", header=True)
flights.createOrReplaceTempView("flights")

planes = spark.read.csv("planes.csv", header=True)
planes.createOrReplaceTempView("planes")

In [39]:
# List what is registered in the session catalog; the "flights" and "planes"
# temporary views should both appear.
spark.catalog.listTables()

[Table(name='flights', database=None, description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='planes', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

# Getting started with machine learning pipelines

## Machine Learning Pipelines

At the core of the pyspark.ml module are the Transformer and Estimator classes. Almost every other class in the module behaves similarly to these two basic classes.

Transformer classes have a **.transform()** method that takes a **DataFrame** and returns a new **DataFrame**; usually the original one with a new column appended. For example, you might use the class Bucketizer to create discrete bins from a continuous feature or the class PCA to reduce the dimensionality of your dataset using principal component analysis.

Estimator classes all implement a **.fit()** method. These methods also take a **DataFrame**, but instead of returning another DataFrame they return a **model object**. This can be something like a StringIndexerModel for including categorical data saved as strings in your models, or a RandomForestModel that uses the random forest algorithm for classification or regression.

### Join the DataFrames

In [40]:
# Preview the first rows of each input table before joining them.
planes.show()
flights.show()

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N108UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N109UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N110UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA

In [41]:
# Both tables carry a "year" column; rename the one in `planes` so that the
# joined result keeps the two apart.
planes = planes.withColumnRenamed(existing="year", new="plane_year")

# Left outer join on the tail number: every flight row is kept, with the plane
# columns attached where a matching tailnum exists.
model_data = flights.join(other=planes, on="tailnum", how="leftouter")

In [42]:
# Preview the joined data as a pandas frame; limit(5) caps the rows before
# toPandas() converts the result.
model_data.limit(5).toPandas().head()

Unnamed: 0,tailnum,year,month,day,dep_time,dep_delay,arr_time,arr_delay,carrier,flight,...,hour,minute,plane_year,type,manufacturer,model,engines,seats,speed,engine
0,N846VA,2014,12,8,658,-7,935,-5,VX,1780,...,6,58,2011,Fixed wing multi engine,AIRBUS,A320-214,2,182,,Turbo-fan
1,N559AS,2014,1,22,1040,5,1505,5,AS,851,...,10,40,2006,Fixed wing multi engine,BOEING,737-890,2,149,,Turbo-fan
2,N847VA,2014,3,9,1443,-2,1652,2,VX,755,...,14,43,2011,Fixed wing multi engine,AIRBUS,A320-214,2,182,,Turbo-fan
3,N360SW,2014,4,9,1705,45,1839,34,WN,344,...,17,5,1992,Fixed wing multi engine,BOEING,737-3H4,2,149,,Turbo-fan
4,N612AS,2014,3,9,754,-1,1015,1,AS,522,...,7,54,1999,Fixed wing multi engine,BOEING,737-790,2,151,,Turbo-jet


## Data types


Use the **.cast()** method in combination with the **.withColumn()** method to change data types of columns. It's important to note that .cast() works on columns, while .withColumn() works on DataFrames.

The only argument you need to pass to .cast() is the kind of value you want to create, in string form. For example, to create integers, you'll pass the argument "integer" and for decimal numbers you'll use "double".

### String to integer

In [43]:
# Cast each of these columns to integer, replacing the column in place
# (withColumn with an existing name overwrites that column).
for column_name in ("arr_delay", "air_time", "month", "plane_year"):
    model_data = model_data.withColumn(
        column_name, model_data[column_name].cast("integer")
    )

### Create a new column

In [44]:
# Age of the aircraft at flight time: flight year minus year of manufacture.
# `year` is not among the columns cast to integer, so the result column comes
# out as a double (note the 3.0-style values in the output).
model_data = model_data.withColumn(
    "plane_age",
    model_data["year"] - model_data["plane_year"],
)
model_data.select("year", "plane_year", "plane_age").show()

+----+----------+---------+
|year|plane_year|plane_age|
+----+----------+---------+
|2014|      2011|      3.0|
|2014|      2006|      8.0|
|2014|      2011|      3.0|
|2014|      1992|     22.0|
|2014|      1999|     15.0|
|2014|      1997|     17.0|
|2014|      2002|     12.0|
|2014|      2013|      1.0|
|2014|      2001|     13.0|
|2014|      2006|      8.0|
|2014|      2004|     10.0|
|2014|      2001|     13.0|
|2014|      2013|      1.0|
|2014|      1992|     22.0|
|2014|      1999|     15.0|
|2014|      2000|     14.0|
|2014|      2001|     13.0|
|2014|      2014|      0.0|
|2014|      2008|      6.0|
|2014|      2001|     13.0|
+----+----------+---------+
only showing top 20 rows



### Making a Boolean

In [45]:
# Boolean flag: did the flight arrive after its scheduled time?
model_data = model_data.withColumn("is_late", model_data["arr_delay"] > 0)
model_data.select("is_late").show()

# Same flag as 0/1, stored under the column name "label".
model_data = model_data.withColumn("label", model_data["is_late"].cast("integer"))
model_data.select("is_late", "label").show()

# Keep only the rows where all four of these columns are non-null.
model_data = model_data.filter(
    "arr_delay is not NULL AND dep_delay is not NULL AND air_time is not NULL AND plane_year is not NULL"
)

+-------+
|is_late|
+-------+
|  false|
|   true|
|   true|
|   true|
|   true|
|   true|
|   true|
|  false|
|  false|
|  false|
|  false|
|   true|
|   true|
|  false|
|   true|
|   true|
|   true|
|  false|
|  false|
|  false|
+-------+
only showing top 20 rows

+-------+-----+
|is_late|label|
+-------+-----+
|  false|    0|
|   true|    1|
|   true|    1|
|   true|    1|
|   true|    1|
|   true|    1|
|   true|    1|
|  false|    0|
|  false|    0|
|  false|    0|
|  false|    0|
|   true|    1|
|   true|    1|
|  false|    0|
|   true|    1|
|   true|    1|
|   true|    1|
|  false|    0|
|  false|    0|
|  false|    0|
+-------+-----+
only showing top 20 rows



## Strings and factors

With the pyspark.ml.feature submodule you can create what are called 'one-hot vectors', which are a way of representing a categorical feature where every observation has a vector in which all elements are zero except for at most one element, which has a value of one (1).

The first step to encoding your categorical feature is to create a StringIndexer. Members of this class are Estimators that take a DataFrame with a column of strings and map each unique string to a number. Then, the Estimator returns a Transformer that takes a DataFrame, attaches the mapping to it as metadata, and returns a new DataFrame with a numeric column corresponding to the string column.

The second step is to encode this numeric column as a one-hot vector using a OneHotEncoder. This works exactly the same way as the StringIndexer by creating an Estimator and then a Transformer. The end result is a column that encodes your categorical feature as a vector.

### Carrier

The inputCol is the name of the column you want to index or encode, and the outputCol is the name of the new column that the Transformer should create.



In [48]:
# Look at the raw carrier codes (strings) that will be indexed and one-hot encoded.
model_data.select("carrier").show()

+-------+
|carrier|
+-------+
|     VX|
|     AS|
|     VX|
|     WN|
|     AS|
|     WN|
|     WN|
|     VX|
|     AS|
|     AS|
|     AS|
|     AS|
|     AS|
|     AS|
|     AS|
|     UA|
|     AS|
|     WN|
|     AS|
|     OO|
+-------+
only showing top 20 rows



In [49]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Stage 1: map each distinct carrier string to a numeric index.
carr_indexer = StringIndexer(
    inputCol="carrier",
    outputCol="carrier_index",
)

# Stage 2: turn that numeric index into a one-hot vector column.
carr_encoder = OneHotEncoder(
    inputCol="carrier_index",
    outputCol="carrier_fact",
)

### Destination

In [50]:
# Look at the raw destination airport codes (strings) that will be indexed and one-hot encoded.
model_data.select("dest").show()

+----+
|dest|
+----+
| LAX|
| HNL|
| SFO|
| SJC|
| BUR|
| DEN|
| OAK|
| SFO|
| SAN|
| ORD|
| LAX|
| PHX|
| LAS|
| ANC|
| SFO|
| SFO|
| SMF|
| MDW|
| BOS|
| BUR|
+----+
only showing top 20 rows



In [51]:
# Stage 1: map each distinct destination string to a numeric index.
dest_indexer = StringIndexer(
    inputCol="dest",
    outputCol="dest_index",
)

# Stage 2: turn that numeric index into a one-hot vector column.
dest_encoder = OneHotEncoder(
    inputCol="dest_index",
    outputCol="dest_fact",
)

### Assemble a vector

The last step in the Pipeline is to combine all of the columns containing our features into a single column. This has to be done before modeling can take place because every Spark modeling routine expects the data to be in this form. You can do this by storing each of the values from a column as an entry in a vector. Then, from the model's point of view, every observation is a vector that contains all of the information about it and a label that tells the modeler what value that observation corresponds to.

The VectorAssembler transformer takes all of the columns you specify and combines them into a new vector column.

In [54]:
from pyspark.ml.feature import VectorAssembler

# Combine the numeric columns and the two one-hot vectors into a single
# "features" vector column, in the order listed.
vec_assembler = VectorAssembler(
    inputCols=[
        "month",
        "air_time",
        "carrier_fact",
        "dest_fact",
        "plane_age",
    ],
    outputCol="features",
)

### Create the pipeline

Pipeline is a class in the pyspark.ml module that combines all the Estimators and Transformers that you've already created. This lets you reuse the same modeling process over and over again by wrapping it up in one simple object. 

In [55]:
from pyspark.ml import Pipeline

# Stages run in list order: each indexer precedes the encoder that consumes its
# output, and the assembler comes last because it reads both *_fact columns.
flights_pipe = Pipeline(
    stages=[
        dest_indexer,
        dest_encoder,
        carr_indexer,
        carr_encoder,
        vec_assembler,
    ]
)

## Test vs Train


One of the most important steps is to split the data into a test set and a train set.

### Transform the data

In [56]:
# Fit the pipeline on the full dataset, then apply the fitted pipeline to that
# same dataset to append the index, one-hot and "features" columns.
piped_data = (
    flights_pipe
    .fit(model_data)
    .transform(model_data)
)

In [63]:
# Compare each original string column with its numeric index and its one-hot vector.
piped_data.select("carrier", "carrier_index","carrier_fact", "dest", "dest_index", "dest_fact").show()

+-------+-------------+--------------+----+----------+---------------+
|carrier|carrier_index|  carrier_fact|dest|dest_index|      dest_fact|
+-------+-------------+--------------+----+----------+---------------+
|     VX|          7.0|(10,[7],[1.0])| LAX|       1.0| (68,[1],[1.0])|
|     AS|          0.0|(10,[0],[1.0])| HNL|      19.0|(68,[19],[1.0])|
|     VX|          7.0|(10,[7],[1.0])| SFO|       0.0| (68,[0],[1.0])|
|     WN|          1.0|(10,[1],[1.0])| SJC|       7.0| (68,[7],[1.0])|
|     AS|          0.0|(10,[0],[1.0])| BUR|      22.0|(68,[22],[1.0])|
|     WN|          1.0|(10,[1],[1.0])| DEN|       2.0| (68,[2],[1.0])|
|     WN|          1.0|(10,[1],[1.0])| OAK|       8.0| (68,[8],[1.0])|
|     VX|          7.0|(10,[7],[1.0])| SFO|       0.0| (68,[0],[1.0])|
|     AS|          0.0|(10,[0],[1.0])| SAN|      10.0|(68,[10],[1.0])|
|     AS|          0.0|(10,[0],[1.0])| ORD|      11.0|(68,[11],[1.0])|
|     AS|          0.0|(10,[0],[1.0])| LAX|       1.0| (68,[1],[1.0])|
|     

### Split the data

In [69]:
# Split roughly 60% / 40% into training and test sets. A fixed seed makes the
# split reproducible; without one, every run draws a different random
# partition and the row counts change between runs.
training, test = piped_data.randomSplit([0.6, 0.4], seed=42)

In [75]:
# (row count, column count) of the training split.
print((training.count(), len(training.columns)))

(5586, 32)


In [76]:
# (row count, column count) of the test split.
print((test.count(), len(test.columns)))

(3717, 32)
