To look at pySpark's GLM capabilities, we will use the simplest example of a gausian error distribution with an identity link function, which is equivalent to OLS.  In our model we'll try to answer a simple question: all else equal, how does a change in size (carat) affect a dimaond's price?  We will test the equation:

    price = a + B*carat + cut_dummy_vars + color_dummy_vars + clarity_dummy_vars + e
    
Don't worry about the accuracy of this model specification, or the assumptions behind it; our goal is just to compare pySpark's GLM function to something most users will be more familiar with.  Note that `cut`, `color` and `clarity` had to be converted to numeric because Stata can't work with strings.  This was done using, for example, `encode cut, gen(cut_)`.

In [None]:
from IPython.display import Image
Image(filename='stata_regress.PNG')

In [93]:
def build_indep_vars(df, independent_vars, categorical_vars=None, keep_intermediate=False):

    """
    Data verification
    df               : DataFrame
    independent_vars : List of column names
    categorical_vars : None or list of column names, e.g. ['col1', 'col2']
    """
    assert(type(df) is pyspark.sql.dataframe.DataFrame), 'pypark_glm: A pySpark dataframe is required as the first argument.'
    assert(type(dependent_var) is str), 'pyspark_glm: Dependent variable column name must be the second argument.'
    assert(type(independent_vars) is list), 'pyspark_glm: List of independent variable column names must be the third argument.'
    for iv in independent_vars:
        assert(type(iv) is str), 'pyspark_glm: Independent variables must be column name strings.'
        assert(iv in df.columns), 'pyspark_glm: Independent variable name is not a dataframe column.'
    if categorical_vars:
        for cv in categorical_vars:
            assert(type(cv) is str), 'pyspark_glm: Categorical variables must be column name strings.'
            assert(cv in df.columns), 'pyspark_glm: Categorical variable name is not a dataframe column.'
            assert(cv in independent_vars), 'pyspark_glm: Categorical variables must be independent variables.'

    """
    Code
    """
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
    from pyspark.ml.regression import GeneralizedLinearRegression

    if categorical_vars:
        string_indexer = [StringIndexer(inputCol=x, 
                                        outputCol='{}_index'.format(x))
                          for x in categorical_vars]

        encoder        = [OneHotEncoder(dropLast=False, 
                                        inputCol ='{}_index' .format(x), 
                                        outputCol='{}_vector'.format(x))
                          for x in categorical_vars]

        independent_vars = ['{}_vector'.format(x) if x in categorical_vars else x for x in independent_vars]
    else:
        string_indexer, encoder = [], []

    assembler = VectorAssembler(inputCols=independent_vars, 
                                outputCol='indep_vars')
    pipeline  = Pipeline(stages=string_indexer+encoder+[assembler])
    model = pipeline.fit(df)
    final = model.transform(df)

    if not keep_intermediate:
        fcols = [c for c in final.columns if '_index' not in c[-6:] and '_vector' not in c[-7:]]
        final = final[fcols]

    return final

In [91]:
df = spark.read.csv('s3://ui-spark-data/diamonds.csv', inferSchema=True, header=True, sep=',')

In [94]:
final = build_indep_vars(df, ['carat', 'clarity'], categorical_vars=['clarity'])

NameError: global name 'dependent_var' is not defined

In [87]:
final.show()

+-----+---------+-----+-------+-----+-----+-----+----+----+----+--------------------+
|carat|      cut|color|clarity|depth|table|price|   x|   y|   z|          indep_vars|
+-----+---------+-----+-------+-----+-----+-----+----+----+----+--------------------+
| 0.23|    Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|(6,[0,1],[0.23,1.0])|
| 0.21|  Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|(6,[0,2],[0.21,1.0])|
| 0.23|     Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|(6,[0,4],[0.23,1.0])|
| 0.29|  Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|(6,[0,2],[0.29,1.0])|
| 0.31|     Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|(6,[0,4],[0.31,1.0])|
| 0.24|Very Good|    J|   VVS2| 62.8| 57.0|  336|3.94|3.96|2.48|(6,[0,3],[0.24,1.0])|
| 0.24|Very Good|    I|   VVS1| 62.3| 57.0|  336|3.95|3.98|2.47|(6,[0,3],[0.24,1.0])|
| 0.26|Very Good|    H|    SI1| 61.9| 55.0|  337|4.07|4.11|2.53|(6,[0,3],[0.26,1.0])|
| 0.22|     Fair|    E|    VS2| 65.1| 61.0|  337|3.87|

In [41]:
from pyspark.ml.regression import GeneralizedLinearRegression, LinearRegression

In [88]:
glr = GeneralizedLinearRegression(family='gaussian', link='identity', featuresCol='indep_vars', labelCol='price')

In [89]:
model = glr.fit(final)

Py4JJavaError: An error occurred while calling o1446.fit.
: java.lang.AssertionError: assertion failed: lapack.dppsv returned 7.
	at scala.Predef$.assert(Predef.scala:170)
	at org.apache.spark.mllib.linalg.CholeskyDecomposition$.solve(CholeskyDecomposition.scala:40)
	at org.apache.spark.ml.optim.WeightedLeastSquares.fit(WeightedLeastSquares.scala:140)
	at org.apache.spark.ml.regression.GeneralizedLinearRegression.train(GeneralizedLinearRegression.scala:265)
	at org.apache.spark.ml.regression.GeneralizedLinearRegression.train(GeneralizedLinearRegression.scala:139)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:90)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:71)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:211)
	at java.lang.Thread.run(Thread.java:745)


In [85]:
transformed = model.transform(final)

In [44]:
lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", featuresCol='indep_vars', labelCol='price')

In [45]:
model = lr.fit(final)

Py4JJavaError: An error occurred while calling o617.fit.
: java.lang.AssertionError: assertion failed: lapack.dppsv returned 21.
	at scala.Predef$.assert(Predef.scala:170)
	at org.apache.spark.mllib.linalg.CholeskyDecomposition$.solve(CholeskyDecomposition.scala:40)
	at org.apache.spark.ml.optim.WeightedLeastSquares.fit(WeightedLeastSquares.scala:140)
	at org.apache.spark.ml.regression.LinearRegression.train(LinearRegression.scala:180)
	at org.apache.spark.ml.regression.LinearRegression.train(LinearRegression.scala:70)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:90)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:71)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:211)
	at java.lang.Thread.run(Thread.java:745)
