# Lesson 28 - Pipelines

## Prepare Environment

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr

from pyspark.ml.feature import VectorAssembler, OneHotEncoder, StringIndexer
from pyspark.ml.classification import LogisticRegression 
from pyspark.ml import Pipeline

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = SparkSession.builder.getOrCreate()

## Introduction

In this lesson, we will explore how to using the Spark `Pipeline` class to combine several preprocessing and modeling objects, or **stages**, into a single object.

## Load and Explore Data

To illustrate the use of Pipelines, we will use the [Census Income Dataset](http://archive.ics.uci.edu/ml/datasets/Census+Income) from the UCI Machine Learning Repository.

In [0]:
census_schema = (
    'age BYTE, workclass STRING, fnlwgt INTEGER, education STRING, educ_num BYTE, '
    'marital_status STRING, occupation STRING, relationship STRING, race STRING, '
    'sex STRING, capital_gain INTEGER, capital_loss SHORT, hrs_per_week BYTE, '
    'native_country STRING, salary STRING'
)

census = (
    spark.read
    .option('delimiter', '\t')
    .option('header', True)
    .schema(census_schema)
    .csv('/FileStore/tables/census.txt')
)

census.printSchema()

In [0]:
census.sample(withReplacement=False, fraction=0.0004, seed=1).toPandas()

Unnamed: 0,age,workclass,fnlwgt,education,educ_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hrs_per_week,native_country,salary
0,43,Private,154538,Assoc-acdm,12,Divorced,Craft-repair,Not-in-family,White,Male,0,0,40,United-States,<=50K
1,38,Private,65466,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,50,United-States,<=50K
2,35,Self-emp-not-inc,111095,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States,<=50K
3,43,Self-emp-not-inc,33521,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Male,0,0,70,United-States,>50K
4,38,Private,87556,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,7688,0,55,United-States,>50K
5,69,Private,277588,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,10,United-States,<=50K
6,58,Self-emp-not-inc,140729,Bachelors,13,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,35,United-States,<=50K
7,62,,378239,Masters,14,Married-civ-spouse,,Husband,White,Male,0,0,2,United-States,>50K
8,45,Private,55720,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,>50K
9,33,Local-gov,194901,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,>50K


In [0]:
N = census.count()
print(N)

### Distribution of Label Values

To serve as a baseline against which we can compare our model, we will check the distribution of the label values.

In [0]:
(
    census
    .select('salary')
    .groupby('salary')
    .agg(
        expr('COUNT(*) as count'), 
        expr(f'ROUND(COUNT(*)/{N},4) as prop')
    )
    .show()
)

### Missing Values

We saw in a previous lesson that this dataset contains missing values. We will now determine the number of missing values in each column.

In [0]:
census.select([expr(f'{N} - COUNT({c}) AS {c}') for c in census.columns]).toPandas()

Unnamed: 0,age,workclass,fnlwgt,education,educ_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hrs_per_week,native_country,salary
0,0,1836,0,0,0,0,1843,0,0,0,0,0,0,583,0


Before providing this data to an ML model, we need to either remove the records with missing values, or we need to fill in (or impute) the missing values. We will adopt the second approach.

In [0]:
census = census.fillna('missing')

### Identify Numerical and Categorical Features

We need to separate the categorical and numerical features so that we can perform one-hot encoding on the categorical features.

** - another way is to loop over the columns, store any numerical type column into num_features, and string columns into cat_features

In [0]:
num_features = ['age', 'fnlwgt', 'educ_num', 'capital_gain', 'capital_loss', 'hrs_per_week']
cat_features = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

In [0]:
census.columns

In [0]:
census.dtypes

#### Another way of avoiding hard coding to identify the numerical dtype

In [0]:
cat_features = [ c for c, d in census.dtypes[:-1] if d == 'string']  # slice out the last column, which is the target
num_features = [c for c in census.columns[:-1] if c not in cat_features]
# num_features = [c for c in census.dtypes in d != 'string'] this is another solution for identifying numerical features

print(cat_features)
print(num_features)

## Create and Apply the Pipeline

### Define Pipeline Stages

We will now define several stages to be included within our pipeline object. Each of these stages represents either a preprocessing task to be performed on the data, or the logistic regression model itself.

In [0]:
ix_features = [c + '_ix' for c in cat_features]
vec_features = [c + '_vec' for c in cat_features]

label_indexer = StringIndexer(inputCol='salary', outputCol='label')

feature_indexer = StringIndexer(inputCols=cat_features, outputCols=ix_features)

encoder = OneHotEncoder(inputCols=ix_features, outputCols=vec_features, dropLast=False)

assembler = VectorAssembler(inputCols=num_features + vec_features, outputCol='features')

logreg = LogisticRegression(featuresCol='features', labelCol='label')

### Create and Fit the Pipeline

We will now use the stages to create a Pipeline object, which we will then fit to the census data. We will use the fitted pipeline to generate predictions for the training set.

In [0]:
model = Pipeline(stages=[label_indexer, feature_indexer, encoder, assembler, logreg]).fit(census)
train_pred = model.transform(census)

train_pred.select(['probability', 'prediction', 'label']).show(10, truncate=False)

### Score the Model

We will calculate the model's accuracy on the training set.

In [0]:
accuracy_eval = MulticlassClassificationEvaluator(
    predictionCol='prediction', labelCol='label', metricName='accuracy')

acc = accuracy_eval.evaluate(train_pred)
print(acc)

### Generate Predictions for New Data

Next we will apply our model to new observations. Notice that since the pipeline with have created contains all of the preprocessing stages, we don't need to apply these separately to the new dataset.

In [0]:
new_schema = (
    'age BYTE, workclass STRING, fnlwgt INTEGER, education STRING, educ_num BYTE, '
    'marital_status STRING, occupation STRING, relationship STRING, race STRING, '
    'sex STRING, capital_gain INTEGER, capital_loss SHORT, hrs_per_week BYTE, '
    'native_country STRING'
)

new_data = spark.createDataFrame(
    data = [[53, 'Private', 94081, 'Bachelors', 13, 'Married-civ-spouse', 'Exec-managerial', 'Husband', 'White', 'Male', 0, 0, 44, 'United-States'], 
            [21, 'Private', 202214, 'Assoc-acdm', 12, 'Never-married', 'Adm-clerical', 'Not-in-family', 'White', 'Female', 0, 0, 10, 'United-States'], 
            [40, 'Local-gov', 33155, 'HS-grad', 9, 'Never-married', 'Other-service', 'Own-child', 'Black', 'Male', 0, 0, 40, 'United-States'], 
            [44, 'Private', 147206, 'Some-college', 10, 'Married-civ-spouse', 'Exec-managerial', 'Husband', 'White', 'Female', 0, 0, 40, 'United-States'], 
            [48, 'Local-gov', 493862, 'HS-grad', 9, 'Married-civ-spouse', 'Exec-managerial', 'Husband', 'Black', 'Male', 7298, 0, 38, 'United-States']],
    schema = new_schema
)

new_pred = model.transform(new_data)

new_pred.select(['probability', 'prediction']).show(truncate=False)

## Stages

In [0]:
for stage in model.stages:
    print(stage)

In [0]:
# I want to know the encoding for my label

print(model.stages[0].labels)

In [0]:
# I want to know the string indexer
for c, labels in zip(cat_features, model.stages[1].labelsArray):
    print(c)
    print(labels)
    print()

In [0]:
print(model.stages[-1].coefficients) # coefficients for all the features

### Get all the features coefficient

In [0]:
final_features = num_features.copy()

for labarray in model.stages[1].labelsArray:
    final_features += list(labarray)

In [0]:
pd.set_option('max_rows', None)

In [0]:
pd.DataFrame({
    'feature': final_features,
    'coefficient': model.stages[-1].coefficients
}
)

Unnamed: 0,feature,coefficient
0,age,0.02552095
1,fnlwgt,7.076674e-07
2,educ_num,-0.01927364
3,capital_gain,0.0003191556
4,capital_loss,0.0006473533
5,hrs_per_week,0.02971352
6,Private,-0.3083316
7,Self-emp-not-inc,-0.7990293
8,Local-gov,-0.4909073
9,missing,-0.6229211
