In [1]:
import findspark
findspark.init()
findspark.find()
from pyspark.sql import SparkSession


spark = SparkSession.builder\
        .master("local")\
        .appName("Colab")\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

In [2]:
from pyspark.sql.functions import col
from pyspark.sql import functions as f
from pyspark.sql.types import IntegerType, StringType, ArrayType
from pyspark.sql.functions import monotonically_increasing_id

In [4]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import IndexToString,StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

from sklearn.metrics import confusion_matrix, precision_score, recall_score

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import StandardScaler

In [3]:
df = spark.read.load('data/iris.csv', format="csv", header=True, delimiter=",", inferSchema=True)
df.show(5)

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
+---+-------------+------------+-------------+------------+-----------+
only showing top 5 rows



In [5]:
(train, test) = df.randomSplit([0.8, 0.2])

Tạo features vector

In [32]:
numericCols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

assembler = VectorAssembler(inputCols=numericCols, outputCol="features")
scaler = StandardScaler(inputCol='features', outputCol='standardized')
labelIndexer = StringIndexer(inputCol="Species", outputCol="indexedLabel")

pipeline = Pipeline(stages=[assembler, scaler, labelIndexer])

data_transform = pipeline.fit(df)
data_scale_output = data_transform.transform(df)
data_scale_output.select(['Species', 'features', 'standardized', 'indexedLabel']).show(5, truncate=False)

+-----------+-----------------+-------------------------------------------------------------------------+------------+
|Species    |features         |standardized                                                             |indexedLabel|
+-----------+-----------------+-------------------------------------------------------------------------+------------+
|Iris-setosa|[5.1,3.5,1.4,0.2]|[6.158928408838787,8.072061621390857,0.7934616853039358,0.26206798787142]|0.0         |
|Iris-setosa|[4.9,3.0,1.4,0.2]|[5.9174018045706,6.9189099611921625,0.7934616853039358,0.26206798787142] |0.0         |
|Iris-setosa|[4.7,3.2,1.3,0.2]|[5.675875200302412,7.38017062527164,0.7367858506393691,0.26206798787142] |0.0         |
|Iris-setosa|[4.6,3.1,1.5,0.2]|[5.555111898168318,7.149540293231902,0.8501375199685027,0.26206798787142]|0.0         |
|Iris-setosa|[5.0,3.6,1.4,0.2]|[6.038165106704694,8.302691953430596,0.7934616853039358,0.26206798787142]|0.0         |
+-----------+-----------------+-----------------

Thử nghiệm với nhiều giá trị *k* khác nhau

In [34]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

silhouette_score=[]
outputs = []
evaluator = ClusteringEvaluator(predictionCol='prediction', featuresCol='standardized', \
                                metricName='silhouette', distanceMeasure='squaredEuclidean')

for i in range(2,10):    
    KMeans_algo = KMeans(featuresCol='standardized', k=i)
    KMeans_fit = KMeans_algo.fit(data_scale_output)
    output = KMeans_fit.transform(data_scale_output)
    outputs.append(output)
    
    score = evaluator.evaluate(output)
    silhouette_score.append(score)
    
    print("Silhouette Score k = {}: {}".format(i, score))

Silhouette Score k = 2: 0.7714149126311811
Silhouette Score k = 3: 0.6435633372614079
Silhouette Score k = 4: 0.5906029920631519
Silhouette Score k = 5: 0.5280198304819759
Silhouette Score k = 6: 0.4860366336265978
Silhouette Score k = 7: 0.4641212493294926
Silhouette Score k = 8: 0.5114460441707614
Silhouette Score k = 9: 0.4743608243424115


Chọn k=2 và thực hiện train, dự đoán kết quả

In [51]:
KMeans_algo = KMeans(featuresCol='standardized', k=2)
KMeans_fit = KMeans_algo.fit(data_scale_output)
output = KMeans_fit.transform(data_scale_output)
output.show()

+---+-------------+------------+-------------+------------+-----------+-----------------+--------------------+------------+----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|         features|        standardized|indexedLabel|prediction|
+---+-------------+------------+-------------+------------+-----------+-----------------+--------------------+------------+----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|[6.15892840883878...|         0.0|         0|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|[5.9174018045706,...|         0.0|         0|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|[5.67587520030241...|         0.0|         0|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|[5.55511189816831...|         0.0|         0|
|  5|          5.0|         3.6|          1.4|         

In [52]:
output.groupby(['prediction', 'Species']).count().show()

+----------+---------------+-----+
|prediction|        Species|count|
+----------+---------------+-----+
|         1| Iris-virginica|   50|
|         0|    Iris-setosa|   50|
|         1|Iris-versicolor|   50|
+----------+---------------+-----+



In [53]:
print(output.filter('prediction == 0 AND Species == "Iris-versicolor"').count())
print(output.filter('prediction == 0 AND Species == "Iris-setosa"').count())
print(output.filter('prediction == 0 AND Species == "Iris-virginica"').count())

0
50
0


In [54]:
print(output.filter('prediction == 1 AND Species == "Iris-versicolor"').count())
print(output.filter('prediction == 1 AND Species == "Iris-setosa"').count())
print(output.filter('prediction == 1 AND Species == "Iris-virginica"').count())

50
0
50


Thử với k=3

In [55]:
KMeans_algo = KMeans(featuresCol='standardized', k=3)
KMeans_fit = KMeans_algo.fit(data_scale_output)
output = KMeans_fit.transform(data_scale_output)
output.show()

+---+-------------+------------+-------------+------------+-----------+-----------------+--------------------+------------+----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|         features|        standardized|indexedLabel|prediction|
+---+-------------+------------+-------------+------------+-----------+-----------------+--------------------+------------+----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|[6.15892840883878...|         0.0|         0|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|[5.9174018045706,...|         0.0|         0|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|[5.67587520030241...|         0.0|         0|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|[5.55511189816831...|         0.0|         0|
|  5|          5.0|         3.6|          1.4|         

In [56]:
output.groupby(['prediction', 'Species']).count().show()

+----------+---------------+-----+
|prediction|        Species|count|
+----------+---------------+-----+
|         2|Iris-versicolor|   13|
|         1| Iris-virginica|    8|
|         0|    Iris-setosa|   49|
|         1|Iris-versicolor|   37|
|         1|    Iris-setosa|    1|
|         2| Iris-virginica|   42|
+----------+---------------+-----+

