### 特征工程

#### 对连续值处理

In [14]:
# 1. Binarization: map a continuous column to 0.0 / 1.0 around a threshold

from pyspark.ml.feature import Binarizer
from pyspark.sql import SparkSession

# SparkSession is the unified entry point to Spark's functionality.
spark = (
    SparkSession.builder
    .appName("BinarizerExample")
    .getOrCreate()
)

data = [
    (0, 1.1),
    (1, 8.5),
    (2, 5.2),
]
continuous_dataframe = spark.createDataFrame(data, schema=["id", "feature"])

# Materialize the rows on the driver. The returned list is neither stored
# nor printed by this statement.
continuous_dataframe.collect()

# Build the binarizer: it reads "feature" and writes the thresholded value
# to a new "binarized_feature" column (values above 4.0 become 1.0).
binarizer = Binarizer(
    inputCol="feature",
    outputCol="binarized_feature",
    threshold=4.0,
)

binarized_dataframe = binarizer.transform(continuous_dataframe)

# Read back the configured threshold; the value is not stored or printed here.
binarizer.getThreshold()

# Print the transformed table.
binarized_dataframe.show()

+---+-------+-----------------+
| id|feature|binarized_feature|
+---+-------+-----------------+
|  0|    1.1|              0.0|
|  1|    8.5|              1.0|
|  2|    5.2|              1.0|
+---+-------+-----------------+



In [12]:
# 2. Discretize by explicit boundaries (binning / bucketing)

from pyspark.ml.feature import Bucketizer
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("BucketizerExample")
    .getOrCreate()
)

# Five boundaries define four buckets (indices 0.0 to 3.0); the infinite end
# points make the outer buckets open-ended so every value falls somewhere.
splits = [float("-inf"), -0.5, 0.0, 0.5, float("inf")]

# Single-column rows; each one-element tuple becomes one DataFrame row.
data = [
    (-99.2,),
    (-0.5,),
    (-1.5,),
    (1.0,),
    (100.0,),
]
data_frame = spark.createDataFrame(data, schema=["features"])

# Map each "features" value to the index of the bucket it falls into and
# store that index in "bucketed_features".
bucketizer = Bucketizer(
    inputCol="features",
    outputCol="bucketed_features",
    splits=splits,
)
bucketed_data = bucketizer.transform(data_frame)

# Print the bucketed table.
bucketed_data.show()

+--------+-----------------+
|features|bucketed_features|
+--------+-----------------+
|   -99.2|              0.0|
|    -0.5|              1.0|
|    -1.5|              0.0|
|     1.0|              3.0|
|   100.0|              3.0|
+--------+-----------------+



In [13]:
# 3. Discretize by quantiles
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("QuantileDiscretizerExample")
    .getOrCreate()
)

data = [
    (0, 18.0),
    (1, 19.0),
    (2, 8.0),
    (3, 5.0),
    (4, 2.2),
    (5, 9.2),
    (6, 14.4),
]
df = spark.createDataFrame(data, schema=["id", "hour"])

# Unlike a fixed-boundary bucketizer, the split points are not given up
# front: fit() derives them from the "hour" column for the requested number
# of buckets, and the fitted model then writes the bucket index to "result".
discretizer = QuantileDiscretizer(
    inputCol="hour",
    outputCol="result",
    numBuckets=3,
)
result = (
    discretizer
    .fit(df)
    .transform(df)
)

# Print the discretized table.
result.show()

+---+----+------+
| id|hour|result|
+---+----+------+
|  0|18.0|   2.0|
|  1|19.0|   2.0|
|  2| 8.0|   1.0|
|  3| 5.0|   0.0|
|  4| 2.2|   0.0|
|  5| 9.2|   1.0|
|  6|14.4|   2.0|
+---+----+------+

