In [2]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
conf = SparkConf()
conf.set("spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true")
conf.set("spark.executor.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true")
spark = SparkSession.builder.config(conf=conf).getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [6]:
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

### 导入数据

#### 图像数据

In [3]:
df = spark.read.format("image").option("dropInvalid", True).load(r"D:\spark\data\mllib\images\origin\kittens")
df.select("image.origin", "image.width", "image.height").show(truncate=False)

+-------------------------------------------------------------------------+-----+------+
|origin                                                                   |width|height|
+-------------------------------------------------------------------------+-----+------+
|file:///D:/spark/data/mllib/images/origin/kittens/54893.jpg              |300  |311   |
|file:///D:/spark/data/mllib/images/origin/kittens/DP802813.jpg           |199  |313   |
|file:///D:/spark/data/mllib/images/origin/kittens/29.5.a_b_EGDP022204.jpg|300  |200   |
|file:///D:/spark/data/mllib/images/origin/kittens/DP153539.jpg           |300  |296   |
+-------------------------------------------------------------------------+-----+------+



#### 文本数据

In [4]:
df2 = spark.read.format("libsvm").option("numFeatures", "780").load(r"D:\spark\data\mllib\sample_libsvm_data.txt")

In [33]:
spark.conf.set('spark.sql.repl.eagerEval.maxNumCols', 100)

In [7]:
df2

label,features
0.0,"(780,[127,128,129..."
1.0,"(780,[158,159,160..."
1.0,"(780,[124,125,126..."
1.0,"(780,[152,153,154..."
1.0,"(780,[151,152,153..."
0.0,"(780,[129,130,131..."
1.0,"(780,[158,159,160..."
1.0,"(780,[99,100,101,..."
0.0,"(780,[154,155,156..."
0.0,"(780,[127,128,129..."


### 逻辑回归模型

#### 创建数据集

In [8]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

# Prepare training data from a list of (label, features) tuples.
training = spark.createDataFrame([
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

In [9]:
training

label,features
1.0,"[0.0,1.1,0.1]"
0.0,"[2.0,1.0,-1.0]"
0.0,"[2.0,1.3,1.0]"
1.0,"[0.0,1.2,-0.5]"


#### 类实例化+模型训练

In [10]:
# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)

In [11]:
# Print out the parameters, documentation, and any default values.
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)

LogisticRegression parameters:
aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The bou

#### 查看模型1的参数

In [13]:
# Since model1 is a Model (i.e., a transformer produced by an Estimator),
# we can view the parameters it used during fit().
# This prints the parameter (name: value) pairs, where names are unique IDs for this
# LogisticRegression instance.
print("Model 1 was fit using parameters: ")
print(model1.extractParamMap())

Model 1 was fit using parameters: 
{Param(parent='LogisticRegression_ac81e243c798', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2, Param(parent='LogisticRegression_ac81e243c798', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0, Param(parent='LogisticRegression_ac81e243c798', name='featuresCol', doc='features column name.'): 'features', Param(parent='LogisticRegression_ac81e243c798', name='fitIntercept', doc='whether to fit an intercept term.'): True, Param(parent='LogisticRegression_ac81e243c798', name='labelCol', doc='label column name.'): 'label', Param(parent='LogisticRegression_ac81e243c798', name='predictionCol', doc='prediction column name.'): 'prediction', Param(parent='LogisticRegression_ac81e243c798', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated

#### 指定某些参数

In [14]:
# We may alternatively specify parameters using a Python dictionary as a paramMap
paramMap = {lr.maxIter: 20}
paramMap[lr.maxIter] = 30  # Specify 1 Param, overwriting the original maxIter.

In [15]:
# Specify multiple Params.
paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55})  # type: ignore

#### 更新参数

In [16]:
# You can combine paramMaps, which are python dictionaries.
# Change output column name
paramMap2 = {lr.probabilityCol: "myProbability"}  # type: ignore
paramMapCombined = paramMap.copy()
paramMapCombined.update(paramMap2)  # type: ignore

In [17]:
paramMapCombined

{Param(parent='LogisticRegression_ac81e243c798', name='maxIter', doc='max number of iterations (>= 0).'): 30,
 Param(parent='LogisticRegression_ac81e243c798', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
 Param(parent='LogisticRegression_ac81e243c798', name='threshold', doc='Threshold in binary classification prediction, in range [0, 1]. If threshold and thresholds are both set, they must match.e.g. if threshold is p, then thresholds must be equal to [1-p, p].'): 0.55,
 Param(parent='LogisticRegression_ac81e243c798', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated probability estimates! These probabilities should be treated as confidences, not precise probabilities.'): 'myProbability'}

#### 再次训练

In [19]:
# Now learn a new model using the paramMapCombined parameters.
# paramMapCombined overrides all parameters set earlier via lr.set* methods.
model2 = lr.fit(training, paramMapCombined)
print("Model 2 was fit using parameters: ")
print(model2.extractParamMap())

Model 2 was fit using parameters: 
{Param(parent='LogisticRegression_ac81e243c798', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2, Param(parent='LogisticRegression_ac81e243c798', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0, Param(parent='LogisticRegression_ac81e243c798', name='featuresCol', doc='features column name.'): 'features', Param(parent='LogisticRegression_ac81e243c798', name='fitIntercept', doc='whether to fit an intercept term.'): True, Param(parent='LogisticRegression_ac81e243c798', name='labelCol', doc='label column name.'): 'label', Param(parent='LogisticRegression_ac81e243c798', name='predictionCol', doc='prediction column name.'): 'prediction', Param(parent='LogisticRegression_ac81e243c798', name='probabilityCol', doc='Column name for predicted class conditional probabilities. Note: Not all models output well-calibrated

#### 导入测试集

In [20]:
# Prepare test data
test = spark.createDataFrame([
    (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
    (0.0, Vectors.dense([3.0, 2.0, -0.1])),
    (1.0, Vectors.dense([0.0, 2.2, -1.5]))], ["label", "features"])

#### 对测试集做预测

In [22]:
# Make predictions on test data using the Transformer.transform() method.
# LogisticRegression.transform will only use the 'features' column.
# Note that model2.transform() outputs a "myProbability" column instead of the usual
# 'probability' column since we renamed the lr.probabilityCol parameter previously.
prediction = model2.transform(test)
result = prediction.select("features", "label", "myProbability", "prediction") \
    .collect()

for row in result:
    print("features=%s, label=%s , prob=%s, prediction=%s"
          % (row.features, row.label, row.myProbability, row.prediction))

features=[-1.0,1.5,1.3], label=1.0 , prob=[0.05707304171034058,0.9429269582896594], prediction=1.0
features=[3.0,2.0,-0.1], label=0.0 , prob=[0.9238522311704118,0.07614776882958818], prediction=0.0
features=[0.0,2.2,-1.5], label=1.0 , prob=[0.10972776114779739,0.8902722388522026], prediction=1.0


### 文本分类

In [23]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

# Prepare training documents from a list of (id, text, label) tuples.
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0)
], ["id", "text", "label"])

In [24]:
training

id,text,label
0,a b c d e spark,1.0
1,b d,0.0
2,spark f g h,1.0
3,hadoop mapreduce,0.0


#### 实例化

In [25]:
# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

In [28]:
lr = LogisticRegression(maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

#### 模型训练

In [29]:
# Fit the pipeline to training documents.
model = pipeline.fit(training)

#### 对测试集做标注

In [30]:
# Prepare test documents, which are unlabeled (id, text) tuples.
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "spark hadoop spark"),
    (7, "apache hadoop")
], ["id", "text"])

# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    rid, text, prob, prediction = row  # type: ignore
    print(
        "(%d, %s) --> prob=%s, prediction=%f" % (
            rid, text, str(prob), prediction   # type: ignore
        )
    )

(4, spark i j k) --> prob=[0.15964077387874118,0.8403592261212589], prediction=1.000000
(5, l m n) --> prob=[0.8378325685476612,0.16216743145233875], prediction=0.000000
(6, spark hadoop spark) --> prob=[0.06926633132976273,0.9307336686702373], prediction=1.000000
(7, apache hadoop) --> prob=[0.9821575333444208,0.01784246665557917], prediction=0.000000
