In [2]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer

from pyspark.sql import SparkSession

# Path to the App Store dataset, relative to the notebook's working directory.
APPLE_STORE_CSV = 'AppleStore.csv'

# Create (or reuse) the Spark session unconditionally. The session is needed by
# the read below, so it must not be hidden behind an `if __name__ == "__main__"`
# guard: outside of `__main__` that guard would leave `spark` undefined.
spark = (
    SparkSession.builder
    .appName("KMeansExample")
    .getOrCreate()
)

# header=True takes the column names from the first CSV row; inferSchema=True
# asks Spark to infer column types instead of reading every column as string.
df = spark.read.csv(APPLE_STORE_CSV, inferSchema=True, header=True)

In [3]:
# NOTE(review): `pd` and `np` are not referenced by name in the cells visible
# here, and these imports would normally live with the other imports in the
# first cell of the notebook — confirm before moving or removing them.
import pandas as pd
import numpy as np
# Print the column names and inferred types of the freshly loaded DataFrame.
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- id: integer (nullable = true)
 |-- track_name: string (nullable = true)
 |-- size_bytes: long (nullable = true)
 |-- currency: string (nullable = true)
 |-- price: double (nullable = true)
 |-- rating_count_tot: integer (nullable = true)
 |-- rating_count_ver: integer (nullable = true)
 |-- user_rating: double (nullable = true)
 |-- user_rating_ver: double (nullable = true)
 |-- ver: string (nullable = true)
 |-- cont_rating: string (nullable = true)
 |-- prime_genre: string (nullable = true)
 |-- sup_devices.num: integer (nullable = true)
 |-- ipadSc_urls.num: integer (nullable = true)
 |-- lang.num: integer (nullable = true)
 |-- vpp_lic: integer (nullable = true)



In [4]:
# Encode the categorical `prime_genre` string column as a numeric index column.
GENRE_INDEX_COL = "primeGenreIndexed"
stringIndexer = StringIndexer(inputCol="prime_genre", outputCol=GENRE_INDEX_COL)

# Make the cell safe to re-run: `transform` raises if the output column already
# exists, so remove any copy left by a previous execution first.
# `DataFrame.drop` is a no-op when the column is absent.
df = df.drop(GENRE_INDEX_COL)

# Use a dedicated name for the fitted indexer so it cannot be confused with the
# KMeans model that a later cell stores in `model`.
genre_indexer_model = stringIndexer.fit(df)
df = genre_indexer_model.transform(df)

# Show the schema again to confirm the new index column was appended.
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- id: integer (nullable = true)
 |-- track_name: string (nullable = true)
 |-- size_bytes: long (nullable = true)
 |-- currency: string (nullable = true)
 |-- price: double (nullable = true)
 |-- rating_count_tot: integer (nullable = true)
 |-- rating_count_ver: integer (nullable = true)
 |-- user_rating: double (nullable = true)
 |-- user_rating_ver: double (nullable = true)
 |-- ver: string (nullable = true)
 |-- cont_rating: string (nullable = true)
 |-- prime_genre: string (nullable = true)
 |-- sup_devices.num: integer (nullable = true)
 |-- ipadSc_urls.num: integer (nullable = true)
 |-- lang.num: integer (nullable = true)
 |-- vpp_lic: integer (nullable = true)
 |-- primeGenreIndexed: double (nullable = false)



In [5]:
# Pack the single clustering input (`rating_count_tot`) into a vector column,
# which is the input format Spark ML estimators consume.
FEATURES_COL = "features"
vectorAss = VectorAssembler(inputCols=["rating_count_tot"], outputCol=FEATURES_COL)

# Drop a stale vector column from a previous run before assembling; otherwise
# re-running this cell fails because the output column already exists.
# `DataFrame.drop` is a no-op when the column is absent.
df = vectorAss.transform(df.drop(FEATURES_COL))

In [6]:
# Three clusters, fixed seed so the centroid initialisation is repeatable.
kmeans = KMeans(k=3, seed=1)

# Fit on the assembled DataFrame, then append each row's cluster assignment.
model = kmeans.fit(df)
predictions = model.transform(df)

In [7]:
# Fetch the fitted centroids and print them one per line under a header.
print("Cluster Centers: ")
centers = model.clusterCenters()
for centroid in centers:
    print(centroid)

# Preview the rows together with their assigned cluster.
predictions.show()

Cluster Centers: 
[6845.69734065]
[2247896.25]
[408677.87209302]
+---+---------+--------------------+----------+--------+-----+----------------+----------------+-----------+---------------+-------+-----------+-----------------+---------------+---------------+--------+-------+-----------------+-----------+----------+
|_c0|       id|          track_name|size_bytes|currency|price|rating_count_tot|rating_count_ver|user_rating|user_rating_ver|    ver|cont_rating|      prime_genre|sup_devices.num|ipadSc_urls.num|lang.num|vpp_lic|primeGenreIndexed|   features|prediction|
+---+---------+--------------------+----------+--------+-----+----------------+----------------+-----------+---------------+-------+-----------+-----------------+---------------+---------------+--------+-------+-----------------+-----------+----------+
|  1|281656475|     PAC-MAN Premium| 100788224|     USD| 3.99|           21292|              26|        4.0|            4.5|  6.3.5|         4+|            Games|             3

In [8]:
# Number of apps assigned to each cluster.
numPred = predictions.groupBy('prediction').count()
numPred.show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|    4|
|         2|   86|
|         0| 7107|
+----------+-----+



In [9]:
# Collect the predictions to the driver as a pandas DataFrame.
numPredDF = predictions.toPandas()

# Keep the cluster assignment as a one-column frame ...
numPrediction = numPredDF[['prediction']]

# ... and as a flat 1-D NumPy array of cluster ids.
numPredNP = numPrediction.values.ravel()

In [10]:
# Expose the predictions to Spark SQL under the name `predFile`.
predictions.createOrReplaceTempView("predFile")

# Per-genre average rating count and app count within one cluster.
GENRE_SUMMARY_SQL = (
    "select prime_genre,avg(rating_count_tot),count(id) "
    "from predFile where prediction = {cluster_id} group by prime_genre"
)

# Take the cluster ids from the data itself (ascending) rather than hard-coding
# one query per cluster, so the cell keeps working if k changes.
cluster_rows = spark.sql(
    "select distinct prediction from predFile order by prediction"
).collect()

for cluster_row in cluster_rows:
    # int() guarantees only a plain integer is interpolated into the SQL text.
    cluster_id = int(cluster_row["prediction"])
    # Use a dedicated name: rebinding `df` here would replace the feature
    # DataFrame that the earlier cells depend on when they are re-run.
    genre_summary = spark.sql(GENRE_SUMMARY_SQL.format(cluster_id=cluster_id))
    genre_summary.show()

+-----------------+---------------------+---------+
|      prime_genre|avg(rating_count_tot)|count(id)|
+-----------------+---------------------+---------+
|        Education|   2239.2295805739514|      453|
|       Navigation|    4449.688888888889|       45|
|    Entertainment|    5981.349624060151|      532|
|           Sports|    11575.87610619469|      113|
|     Food & Drink|    5174.639344262295|       61|
|    Photo & Video|     6489.35549132948|      346|
|           Travel|   6005.2531645569625|       79|
|          Finance|    6693.588235294118|      102|
|Social Networking|             12213.65|      160|
|             Book|   2900.6576576576576|      111|
|         Shopping|             13258.75|      120|
|        Reference|    7117.047619047619|       63|
| Health & Fitness|    5072.078651685393|      178|
|        Utilities|    4950.558704453441|      247|
|     Productivity|   8051.3258426966295|      178|
|            Games|    7277.706144957983|     3808|
|           

In [11]:
# Min / max / mean of the total rating count within each cluster, read from the
# `predFile` temp view.
rating_stats_sql = (
    "select prediction,min(rating_count_tot),max(rating_count_tot),avg(rating_count_tot) "
    "from predFile group by prediction"
)
df_rating = spark.sql(rating_stats_sql)
df_rating.show()

+----------+---------------------+---------------------+---------------------+
|prediction|min(rating_count_tot)|max(rating_count_tot)|avg(rating_count_tot)|
+----------+---------------------+---------------------+---------------------+
|         1|              1724546|              2974676|           2247896.25|
|         2|               208648|              1126879|   408677.87209302327|
|         0|                    0|               200047|    6845.697340650063|
+----------+---------------------+---------------------+---------------------+

