In [1]:
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params

In [3]:
# Toy training set: each row is a (label, features) tuple, where the features
# are a dense 3-element vector.
training = sqlContext.createDataFrame(
    [
        (1.0, Vectors.dense([0.0, 1.1, 0.1])),
        (0.0, Vectors.dense([2.0, 1.0, -1.0])),
        (0.0, Vectors.dense([2.0, 1.3, 1.0])),
        (1.0, Vectors.dense([0.0, 1.2, -0.5])),
    ],
    schema=["label", "features"])

In [4]:
# Create the estimator, overriding maxIter and regParam at construction time.
lr = LogisticRegression(maxIter=10, regParam=0.01)

# explainParams() returns a text description of every param (doc, default and
# current value). The single-argument print(...) form prints the same text on
# Python 2 and is also valid syntax on Python 3.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

LogisticRegression parameters:
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
featuresCol: features column name (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name (default: label)
maxIter: max number of iterations (>= 0) (default: 100, current: 10)
predictionCol: prediction column name (default: prediction)
probabilityCol: Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities. (default: probability)
regParam: regularization parameter (>= 0) (default: 0.1, current: 0.01)
threshold: threshold in binary classification prediction, in range [0, 1]. (default: 0.5)
tol: the convergence tolerance for iterative algorithms (default: 1e-06)



In [5]:
# Fit on the training DataFrame using the parameters currently stored in lr.
model1 = lr.fit(training)

# Show the param map reported by the fitted model. print(...) with a single
# argument behaves identically on Python 2 and also parses on Python 3.
print("Model 1 was fit using parameters: ")
print(model1.extractParamMap())

Model 1 was fit using parameters: 
{Param(parent='LogisticRegression_4df7b108bfeea36a579c', name='labelCol', doc='label column name'): 'label', Param(parent='LogisticRegression_4df7b108bfeea36a579c', name='threshold', doc='threshold in binary classification prediction, in range [0, 1].'): 0.5, Param(parent='LogisticRegression_4df7b108bfeea36a579c', name='featuresCol', doc='features column name'): 'features', Param(parent='LogisticRegression_4df7b108bfeea36a579c', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.'): 'probability', Param(parent='LogisticRegression_4df7b108bfeea36a579c', name='regParam', doc='regularization parameter (>= 0)'): 0.01, Param(parent='LogisticRegression_4df7b108bfeea36a579c', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalt

In [6]:
# A param map is a plain dict keyed by the estimator's Param objects.
# Start with a single entry ...
paramMap = {lr.maxIter: 20}
# ... overwrite that entry with a new value ...
paramMap.update({lr.maxIter: 30})
# ... and add two further params.
paramMap[lr.regParam] = 0.1
paramMap[lr.threshold] = 0.55

# A second map that renames the probability output column.
paramMap2 = {lr.probabilityCol: "myProbability"}

# Merge both maps into a new dict so that paramMap itself is left unchanged.
paramMapCombined = dict(paramMap)
paramMapCombined.update(paramMap2)

In [7]:
# Fit again, this time passing paramMapCombined as the second argument to
# fit() so that its entries are supplied alongside the params stored in lr.
model2 = lr.fit(training, paramMapCombined)

# print(...) with a single argument behaves identically on Python 2 and also
# parses on Python 3.
# NOTE(review): in the output captured with this notebook the printed map shows
# threshold 0.5, regParam 0.01 and probabilityCol 'probability', i.e. not the
# overrides from paramMapCombined -- confirm what extractParamMap() reports on
# a fitted model in the Spark version in use.
print("Model 2 was fit using parameter: ")
print(model2.extractParamMap())

Model 2 was fit using parameter: 
{Param(parent='LogisticRegression_4df7b108bfeea36a579c', name='labelCol', doc='label column name'): 'label', Param(parent='LogisticRegression_4df7b108bfeea36a579c', name='threshold', doc='threshold in binary classification prediction, in range [0, 1].'): 0.5, Param(parent='LogisticRegression_4df7b108bfeea36a579c', name='featuresCol', doc='features column name'): 'features', Param(parent='LogisticRegression_4df7b108bfeea36a579c', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.'): 'probability', Param(parent='LogisticRegression_4df7b108bfeea36a579c', name='regParam', doc='regularization parameter (>= 0)'): 0.01, Param(parent='LogisticRegression_4df7b108bfeea36a579c', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty

In [9]:
# Toy test set with the same (label, features) layout as the training data.
test = sqlContext.createDataFrame(
    [
        (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
        (0.0, Vectors.dense([3.0, 2.0, -0.1])),
        (1.0, Vectors.dense([0.0, 2.2, -1.5])),
    ],
    schema=["label", "features"])

In [10]:
# Apply the fitted model to the test DataFrame.
prediction = model2.transform(test)

# Keep the input columns plus the model outputs. The probability column is
# named "myProbability" because paramMapCombined set lr.probabilityCol to it.
selected = prediction.select("features", "label", "myProbability", "prediction")

# Fetch the selected rows and print them one per line. print(row) prints the
# same text as the Python 2 statement form and is also valid on Python 3.
for row in selected.collect():
    print(row)

Row(features=DenseVector([-1.0, 1.5, 1.3]), label=1.0, myProbability=DenseVector([0.0571, 0.9429]), prediction=1.0)
Row(features=DenseVector([3.0, 2.0, -0.1]), label=0.0, myProbability=DenseVector([0.9239, 0.0761]), prediction=0.0)
Row(features=DenseVector([0.0, 2.2, -1.5]), label=1.0, myProbability=DenseVector([0.1097, 0.8903]), prediction=1.0)
