### Appending new column to spark dataframe

In [None]:
from pyspark import Row

def label_mapper(row):
    """Return a new Row equal to ``row`` plus an ``indexed_label`` field.

    The value stored under ``label_col`` in the incoming row is looked up in
    ``label_map_dict``; both names are resolved from the enclosing scope
    when the function is called.
    """
    fields = row.asDict()
    fields.update(indexed_label=label_map_dict[fields[label_col]])
    return Row(**fields)

# Apply label_mapper to every row of the DataFrame's underlying RDD, then
# convert the mapped RDD back into a DataFrame.
sparkDf.rdd.map(label_mapper).toDF()

#### Discussion 
Prior to Spark 2.0, `spark_df.map` was an alias for `spark_df.rdd.map()`. From Spark 2.0 onward you must go through `.rdd` explicitly, as in the snippet above.

### Notes 
#### Categorical vs continuous features
* `maxCategories` in `VectorIndexer` is a threshold: any feature with more than `maxCategories` unique values is treated as continuous rather than categorical.

## Spark ML vs Spark MLlib
MLlib is being deprecated, so we won't have to worry about it in the future, but for now there is an awkward transition period. Here we note some traps to avoid.

#### Spark ML vector is not equal to Spark MLlib vector

In [1]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml import linalg as ml_linalg
from pyspark.mllib.linalg import Vectors as MLLibVectors
from pyspark.sql.functions import col

def label_point_this(inputDf, label_col="indexedLabel", feat_col="features"):
    """Build an RDD of MLlib ``LabeledPoint`` objects from a DataFrame.

    Spark ML (``pyspark.ml.linalg``) vectors are not interchangeable with
    Spark MLlib (``pyspark.mllib.linalg``) vectors, so every feature vector
    is converted to its MLlib counterpart before being wrapped.

    Args:
        inputDf: DataFrame containing the label and feature columns.
        label_col: Name of the column holding the label.
        feat_col: Name of the column holding the ML feature vector.

    Returns:
        The RDD produced by mapping each selected row to
        ``LabeledPoint(label, mllib_features)``.

    Raises:
        TypeError: Raised when the RDD is evaluated if a feature value is
            neither an ML ``SparseVector`` nor an ML ``DenseVector``.
    """

    def as_mllib(v):
        # Convert a single ML vector into the equivalent MLlib vector.
        if isinstance(v, ml_linalg.SparseVector):
            return MLLibVectors.sparse(v.size, v.indices, v.values)
        if isinstance(v, ml_linalg.DenseVector):
            return MLLibVectors.dense(v.toArray())
        raise TypeError("Unsupported type: {0}".format(type(v)))

    # Alias BOTH columns to fixed names. Previously only the label was
    # aliased, so ``row.features`` failed whenever ``feat_col`` was anything
    # other than the default "features".
    labeledPoints = (
        inputDf.select(
            col(label_col).alias("label"),
            col(feat_col).alias("features"),
        )
        .rdd.map(lambda row: LabeledPoint(row.label, as_mllib(row.features)))
    )
    return labeledPoints