In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
# The same Netty JVM flag is handed to both the driver and the executors.
netty_reflection_flag = "-Dio.netty.tryReflectionSetAccessible=true"
conf = SparkConf().setAll([
    ("spark.driver.extraJavaOptions", netty_reflection_flag),
    ("spark.executor.extraJavaOptions", netty_reflection_flag),
])

# Reuse an already-running session if one exists, otherwise start a new one.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Enable eager evaluation of DataFrames in the REPL/notebook.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [2]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

### K-means 聚类

#### 导入数据

In [3]:
# Read the sample k-means file (LIBSVM format) into a DataFrame.
kmeans_data_path = r"D:\spark\data\mllib\sample_kmeans_data.txt"
dataset = spark.read.load(kmeans_data_path, format="libsvm")

In [4]:
# Print up to ten rows, without truncating long column values.
dataset.show(n=10, truncate=False)

+-----+-------------------------+
|label|features                 |
+-----+-------------------------+
|0.0  |(3,[],[])                |
|1.0  |(3,[0,1,2],[0.1,0.1,0.1])|
|2.0  |(3,[0,1,2],[0.2,0.2,0.2])|
|3.0  |(3,[0,1,2],[9.0,9.0,9.0])|
|4.0  |(3,[0,1,2],[9.1,9.1,9.1])|
|5.0  |(3,[0,1,2],[9.2,9.2,9.2])|
+-----+-------------------------+



#### 实例化

In [5]:
# Configure the k-means estimator (two clusters, fixed seed).
# Nothing is trained here; fitting happens in the next cell.
kmeans = KMeans(k=2, seed=1)

#### 模型训练

In [6]:
# Fit the configured k-means estimator on the loaded dataset;
# the fitted result is bound to `model`.
model = kmeans.fit(dataset)

#### 模型预测

In [7]:
# Apply the fitted model to the same dataset it was trained on and keep
# the transformed DataFrame in `predictions`.
predictions = model.transform(dataset)

#### 模型评估

In [8]:
# Create the evaluator used to score the clustering (Silhouette score).
# No arguments are passed, so the library defaults apply.
evaluator = ClusteringEvaluator()

In [9]:
# Score the k-means predictions and report the value.
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette with squared euclidean distance = {silhouette}")

Silhouette with squared euclidean distance = 0.9997530305375207


#### 给出质心

In [10]:
# Print a header, then one fitted cluster center per line.
print("Cluster Centers: ")
centers = model.clusterCenters()
for center in centers:
    print(center)

Cluster Centers: 
[9.1 9.1 9.1]
[0.1 0.1 0.1]


### Latent Dirichlet allocation

#### 导入数据

In [11]:
from pyspark.ml.clustering import LDA

# Read the LIBSVM-formatted input for the LDA section; this rebinds `dataset`.
# NOTE(review): this is the k-means sample file, reused here as LDA input.
# Confirm that is intended rather than an LDA-specific corpus file.
lda_input_path = r"D:\spark\data\mllib\sample_kmeans_data.txt"
dataset = spark.read.load(lda_input_path, format="libsvm")

#### 模型实例化

In [12]:
# Configure the LDA estimator: 10 topics, at most 10 iterations.
# Nothing is trained here; fitting happens in the next cell.
# A fixed seed makes the fit repeatable across runs, matching the seed (1)
# used by the KMeans and BisectingKMeans cells in this notebook.
lda = LDA(k=10, maxIter=10, seed=1)

#### 训练模型

In [13]:
# Fit the LDA estimator on the dataset. This rebinds `model`, replacing the
# k-means model from the previous section.
model = lda.fit(dataset)

#### 查看上下界

In [14]:
# Compute both quality figures on the training dataset, then report them.
ll = model.logLikelihood(dataset)
lp = model.logPerplexity(dataset)
print(f"The lower bound on the log likelihood of the entire corpus: {ll}")
print(f"The upper bound on perplexity: {lp}")

The lower bound on the log likelihood of the entire corpus: -143.8636360060363
The upper bound on perplexity: 1.7374835266429505


#### 描述主题

In [15]:
# Summarise every topic by at most three terms and print the table in full.
topics = model.describeTopics(maxTermsPerTopic=3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

The topics described by their top-weighted terms:
+-----+-----------+---------------------------------------------------------------+
|topic|termIndices|termWeights                                                    |
+-----+-----------+---------------------------------------------------------------+
|0    |[0, 2, 1]  |[0.3876737934874535, 0.3457439632751837, 0.2665822432373629]   |
|1    |[0, 2, 1]  |[0.3451267812287708, 0.3309268210735292, 0.3239463976977]      |
|2    |[1, 0, 2]  |[0.34222755827442175, 0.33782140944426237, 0.3199510322813159] |
|3    |[2, 1, 0]  |[0.36637747830593803, 0.32068128127364803, 0.3129412404204139] |
|4    |[2, 0, 1]  |[0.379165527019625, 0.3319006641246994, 0.2889338088556756]    |
|5    |[1, 2, 0]  |[0.3737976450777951, 0.33666961134379736, 0.2895327435784075]  |
|6    |[1, 0, 2]  |[0.34691917361588237, 0.3340254468251462, 0.3190553795589714]  |
|7    |[2, 1, 0]  |[0.34417501608466866, 0.33481295056866767, 0.32101203334666356]|
|8    |[0, 1, 2]  |[0.3894

#### 查看结果

In [16]:

# Apply the fitted LDA model to the dataset and print the resulting rows
# without truncating wide columns.
transformed = model.transform(dataset)
transformed.show(truncate=False)

+-----+-------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                 |topicDistribution                                                                                                                                                                                                       |
+-----+-------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(3,[],[])                |[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]                                                                                                                                                                             

### BisectingKMeans 聚类法

#### 导入数据

In [18]:
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Read the sample k-means file (LIBSVM format) for the bisecting k-means
# section; this rebinds `dataset`.
bkm_input_path = r"D:\spark\data\mllib\sample_kmeans_data.txt"
dataset = spark.read.load(bkm_input_path, format="libsvm")

#### 模型训练

In [19]:
# Configure a bisecting k-means estimator (two clusters, fixed seed) ...
bkm = BisectingKMeans(k=2, seed=1)
# ... and fit it on the dataset; this rebinds `model`.
model = bkm.fit(dataset)

#### 预测数据

In [None]:
# Apply the fitted bisecting k-means model to the dataset it was trained on;
# this rebinds `predictions`.
predictions = model.transform(dataset)

#### 指标评估

In [21]:
# Build a default evaluator, score the bisecting k-means predictions with it,
# and report the Silhouette value.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette with squared euclidean distance = {silhouette}")

Silhouette with squared euclidean distance = 0.9997530305375207


#### 显示质心

In [22]:
# Fetch the fitted cluster centers, then print a header followed by one
# center per line.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[0.1 0.1 0.1]
[9.1 9.1 9.1]
