In [1]:
from pyspark.sql import SparkSession
# Create the Spark session (or reuse a running one) that all later cells use.
spark = (
    SparkSession.builder
    .appName('clusterOnSeedData')
    .getOrCreate()
)


# First way (the smartest): let Spark's CSV reader infer the schema

In [2]:
# First way: hand the tab-separated file to the CSV reader and let it infer
# the column types. No header is supplied, so columns are named _c0 ... _c7.
SeeedData = spark.read.csv(
    'seed_data.txt',
    sep='\t',
    inferSchema=True,
)
SeeedData.show()

+-----+-----+------+-----+-----+-----+-----+---+
|  _c0|  _c1|   _c2|  _c3|  _c4|  _c5|  _c6|_c7|
+-----+-----+------+-----+-----+-----+-----+---+
|15.26|14.84| 0.871|5.763|3.312|2.221| 5.22|1.0|
|14.88|14.57|0.8811|5.554|3.333|1.018|4.956|1.0|
|14.29|14.09| 0.905|5.291|3.337|2.699|4.825|1.0|
|13.84|13.94|0.8955|5.324|3.379|2.259|4.805|1.0|
|16.14|14.99|0.9034|5.658|3.562|1.355|5.175|1.0|
|14.38|14.21|0.8951|5.386|3.312|2.462|4.956|1.0|
|14.69|14.49|0.8799|5.563|3.259|3.586|5.219|1.0|
|14.11| 14.1|0.8911| 5.42|3.302|  2.7| null|5.0|
|16.63|15.46|0.8747|6.053|3.465| 2.04|5.877|1.0|
|16.44|15.25| 0.888|5.884|3.505|1.969|5.533|1.0|
|15.26|14.85|0.8696|5.714|3.242|4.543|5.314|1.0|
|14.03|14.16|0.8796|5.438|3.201|1.717|5.001|1.0|
|13.89|14.02| 0.888|5.439|3.199|3.986|4.738|1.0|
|13.78|14.06|0.8759|5.479|3.156|3.136|4.872|1.0|
|13.74|14.05|0.8744|5.482|3.114|2.932|4.825|1.0|
|14.59|14.28|0.8993|5.351|3.333|4.185|4.781|1.0|
|13.99|13.83|0.9183|5.119|3.383|5.234|4.781|1.0|
|15.69|14.75|0.9058|

## [Reading text files as DataFrame(DF)](https://spark.apache.org/docs/2.1.0/sql-programming-guide.html#inferring-the-schema-using-reflection)

In [3]:
# Second way: read the file as plain text through the RDD API and attach
# column names by hand.
from pyspark.sql import Row

sc = spark.sparkContext

# Field names in the order the tab-separated values appear on each line.
SEED_FIELDS = (
    "area",
    "perimeter",
    "compactness",
    "length_of_kernel",
    "width_of_kernel",
    "asymmetry_coefficient",
    "length_of_kernel_groove",
    "label",
)


def _fields_to_row(fields):
    """Name the first eight fields of one split line; values stay strings."""
    # Index explicitly so a line with fewer than eight fields raises IndexError.
    return Row(**{name: fields[pos] for pos, name in enumerate(SEED_FIELDS)})


# One RDD element per line -> list of raw string fields -> named Row.
lines = sc.textFile("seed_data.txt")
parts = lines.map(lambda raw_line: raw_line.split("\t"))
rows = parts.map(_fields_to_row)

# Build the DataFrame; the schema is inferred from the Rows, so every column
# is a string at this point.
finalDF = spark.createDataFrame(rows)
# Optional, currently disabled: finalDF.createOrReplaceTempView("rows")


In [4]:
# Inspect the inferred schema: every field was read as text, so all columns
# are strings and still need casting before any numeric work.
finalDF.printSchema()

root
 |-- area: string (nullable = true)
 |-- asymmetry_coefficient: string (nullable = true)
 |-- compactness: string (nullable = true)
 |-- label: string (nullable = true)
 |-- length_of_kernel: string (nullable = true)
 |-- length_of_kernel_groove: string (nullable = true)
 |-- perimeter: string (nullable = true)
 |-- width_of_kernel: string (nullable = true)



In [5]:
# Peek at the first Row to confirm the field names line up with the values.
finalDF.head(1)
# Alternative tabular preview (disabled):
#finalDF.show()

[Row(area='15.26', asymmetry_coefficient='2.221', compactness='0.871', label='1', length_of_kernel='5.763', length_of_kernel_groove='5.22', perimeter='14.84', width_of_kernel='3.312')]

## For unsupervised learning we should drop the label column

In [6]:
# Clustering is unsupervised, so remove the class label from the feature set.
finalDF = finalDF.drop('label') 

In [7]:
# Confirm that 'label' is gone and only the seven measurement columns remain.
finalDF.columns

['area',
 'asymmetry_coefficient',
 'compactness',
 'length_of_kernel',
 'length_of_kernel_groove',
 'perimeter',
 'width_of_kernel']

## Convert the string columns to double

In [8]:

from pyspark.sql.types import DoubleType

# Every remaining column holds a number stored as text; cast each one to
# double in place (withColumn on an existing name replaces that column).
for column_name in (
    "area",
    "length_of_kernel",
    "asymmetry_coefficient",
    "compactness",
    "length_of_kernel_groove",
    "width_of_kernel",
    "perimeter",
):
    finalDF = finalDF.withColumn(
        column_name, finalDF[column_name].cast(DoubleType())
    )

finalDF.printSchema()


root
 |-- area: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- length_of_kernel_groove: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)



In [9]:
# Summary statistics for one column; `count` only includes non-null values,
# so it is a quick way to spot missing entries.
finalDF.describe("length_of_kernel_groove").show()

+-------+-----------------------+
|summary|length_of_kernel_groove|
+-------+-----------------------+
|  count|                    206|
|   mean|     5.4075291262135945|
| stddev|     0.5323300430722293|
|    min|                  3.485|
|    max|                  6.735|
+-------+-----------------------+



## There are missing values here, so fill them before clustering

In [10]:
# Replace every null in the numeric columns with 0 so VectorAssembler and
# KMeans receive complete rows.
# NOTE(review): 0 is not a plausible seed measurement and may pull affected
# rows into a cluster of their own; consider finalDF.na.drop() instead. Confirm
# against the raw file before changing.
finalDF = finalDF.fillna(0)

In [11]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

In [12]:
# Pack all measurement columns into the single vector column that Spark ML
# estimators expect as input.
feature_columns = list(finalDF.columns)
assmbler = VectorAssembler(outputCol='features', inputCols=feature_columns)
final_data = assmbler.transform(finalDF)

In [13]:
# Check that the assembled `features` vector column was appended.
final_data.printSchema()

root
 |-- area: double (nullable = false)
 |-- asymmetry_coefficient: double (nullable = false)
 |-- compactness: double (nullable = false)
 |-- length_of_kernel: double (nullable = false)
 |-- length_of_kernel_groove: double (nullable = false)
 |-- perimeter: double (nullable = false)
 |-- width_of_kernel: double (nullable = false)
 |-- features: vector (nullable = true)



In [14]:
# Preview the rows together with their assembled feature vectors.
final_data.show()

+-----+---------------------+-----------+----------------+-----------------------+---------+---------------+--------------------+
| area|asymmetry_coefficient|compactness|length_of_kernel|length_of_kernel_groove|perimeter|width_of_kernel|            features|
+-----+---------------------+-----------+----------------+-----------------------+---------+---------------+--------------------+
|15.26|                2.221|      0.871|           5.763|                   5.22|    14.84|          3.312|[15.26,2.221,0.87...|
|14.88|                1.018|     0.8811|           5.554|                  4.956|    14.57|          3.333|[14.88,1.018,0.88...|
|14.29|                2.699|      0.905|           5.291|                  4.825|    14.09|          3.337|[14.29,2.699,0.90...|
|13.84|                2.259|     0.8955|           5.324|                  4.805|    13.94|          3.379|[13.84,2.259,0.89...|
|16.14|                1.355|     0.9034|           5.658|                  5.175|    14.9

In [15]:
from pyspark.ml.feature import StandardScaler

# Scaler that reads the assembled `features` vector and writes the rescaled
# vector to `scaledFeatures`, so that no single measurement dominates the
# distance computation in KMeans.
scaler = StandardScaler(
    outputCol='scaledFeatures',
    inputCol='features',
)

In [16]:
# Learn the scaling statistics from the assembled features, then append the
# `scaledFeatures` column.
scaler_model = scaler.fit(final_data)
# This cell overwrites `final_data`, so on a re-run the frame already carries
# `scaledFeatures` and transform() would reject the existing output column.
# drop() is a no-op when the column is absent, which keeps the cell re-runnable.
final_data = scaler_model.transform(final_data.drop('scaledFeatures'))

In [17]:
# Fit KMeans with three clusters on the scaled feature vectors.
# The seed is pinned so that the initial centres, and therefore the cluster
# ids and the cost printed below, are reproducible from run to run.
kmeans = KMeans(featuresCol='scaledFeatures', k=3, seed=42)
model = kmeans.fit(final_data)

In [18]:
# Within-set sum of squared errors (WSSSE) of the fitted model.
# KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in 3.0, so
# fall back to the training summary when it is unavailable. Both give the cost
# on the same data here, because `final_data` is what the model was fitted on.
print('WSSSE')
if hasattr(model, 'computeCost'):
    print(model.computeCost(final_data))
else:
    print(model.summary.trainingCost)

WSSSE
604.166223200647


In [19]:
# Cluster centres as a list of arrays, one per cluster. The model was fitted
# on `scaledFeatures`, so the coordinates are in scaled units, not raw ones.
centers = model.clusterCenters()

In [20]:
# Display the centres (one array of seven scaled coordinates per cluster).
centers

[array([  6.18310449,   2.28105979,   8.32394523,   8.47151597,
          6.50917526,  12.23444708,   7.65650551]),
 array([  4.41101231,   2.59437444,   8.12685015,   7.40409018,
          5.40896517,  10.45227519,   6.32424087]),
 array([  4.39678862,   2.48030443,   0.        ,   1.18344575,
          5.03687634,  10.46485344,   7.29055634])]

In [21]:
# Assign every row to its nearest centre and preview the added `prediction`
# column alongside the features.
model.transform(final_data).show()

+-----+---------------------+-----------+----------------+-----------------------+---------+---------------+--------------------+--------------------+----------+
| area|asymmetry_coefficient|compactness|length_of_kernel|length_of_kernel_groove|perimeter|width_of_kernel|            features|      scaledFeatures|prediction|
+-----+---------------------+-----------+----------------+-----------------------+---------+---------------+--------------------+--------------------+----------+
|15.26|                2.221|      0.871|           5.763|                   5.22|    14.84|          3.312|[15.26,2.221,0.87...|[5.24452795332029...|         0|
|14.88|                1.018|     0.8811|           5.554|                  4.956|    14.57|          3.333|[14.88,1.018,0.88...|[5.11393027165176...|         1|
|14.29|                2.699|      0.905|           5.291|                  4.825|    14.09|          3.337|[14.29,2.699,0.90...|[4.91116018695589...|         1|
|13.84|                2.259

In [22]:
# Keep the clustered frame for further inspection and confirm that the integer
# `prediction` column was appended.
result = model.transform(final_data)
result.printSchema()

root
 |-- area: double (nullable = false)
 |-- asymmetry_coefficient: double (nullable = false)
 |-- compactness: double (nullable = false)
 |-- length_of_kernel: double (nullable = false)
 |-- length_of_kernel_groove: double (nullable = false)
 |-- perimeter: double (nullable = false)
 |-- width_of_kernel: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- scaledFeatures: vector (nullable = true)
 |-- prediction: integer (nullable = true)



In [23]:
# Show only the assigned cluster id for each row.
result.select('prediction').show()

+----------+
|prediction|
+----------+
|         0|
|         1|
|         1|
|         1|
|         0|
|         1|
|         1|
|         1|
|         0|
|         0|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         0|
|         1|
|         1|
+----------+
only showing top 20 rows

