In [1]:
from pyspark.sql import SQLContext
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

**Read in the 'buy-clicks.csv' and 'combined-data.csv' files...**

In [2]:
# Build a single SQLContext from the notebook's SparkContext (`sc`) and use it
# for every read below, so the cell does not depend on a separately injected
# `sqlContext` name being present in the kernel.
spark = SQLContext(sc)

# Reader options shared by both CSV files: parse with the spark-csv package,
# treat the first line as a header and infer column types from the data.
csv_options = dict(format='com.databricks.spark.csv',
                   header='true',
                   inferSchema='true')

# Individual purchase events.
buy_clicks = spark.read.load(
    'file:///home/cloudera/Downloads/big_data_capstone_datasets_and_scripts/flamingo-data/buy-clicks.csv',
    **csv_options)

# Combined per-session data (columns are listed further down in the notebook).
combined_data = spark.read.load(
    'file:///home/cloudera/Downloads/big_data_capstone_datasets_and_scripts/combined-data.csv',
    **csv_options)

In [3]:
# Display the column names of the buy-clicks DataFrame.
buy_clicks.columns

['timestamp', 'txId', 'userSessionId', 'team', 'userId', 'buyId', 'price']

In [4]:
# Display the number of rows in the buy-clicks DataFrame.
buy_clicks.count()

2947

In [5]:
# Confirm that the loaded object is a Spark DataFrame.
type(buy_clicks)

pyspark.sql.dataframe.DataFrame

**Aggregate total purchases by userId by summing up the price column in 'buy_clicks'**

In [6]:
# Preview the total spend per user.
# NOTE: .show() only prints the table and returns None, so `total_purchases`
# is bound to None here (see the NoneType result below), not to a DataFrame.
total_purchases = (
    buy_clicks
    .groupBy('userId')
    .sum('price')
    .show()
)
type(total_purchases)

+------+----------+
|userId|sum(price)|
+------+----------+
|   231|      63.0|
|  2032|      20.0|
|   233|      28.0|
|  1234|      53.0|
|  1634|      27.0|
|    34|      95.0|
|  1434|       9.0|
|  1835|      27.0|
|  2035|      40.0|
|  1235|      40.0|
|  1436|      16.0|
|   436|      43.0|
|  1636|      25.0|
|   236|      43.0|
|  2236|      15.0|
|  1837|      67.0|
|    38|      30.0|
|  1639|     155.0|
|   239|      20.0|
|   439|      25.0|
+------+----------+
only showing top 20 rows



NoneType

**`.show()` only prints and returns `None`, so re-run the aggregation without it to keep 'total_purchases' as a DataFrame...**

In [7]:
# Total spend per user, kept as a DataFrame (no .show() at the end of the chain).
total_purchases = (
    buy_clicks
    .groupBy('userId')
    .sum('price')
)
type(total_purchases)

pyspark.sql.dataframe.DataFrame

In [8]:
# Display the column names of the combined-data DataFrame.
combined_data.columns

['userId',
 'userSessionId',
 'teamLevel',
 'platformType',
 'count_gameclicks',
 'count_hits',
 'count_buyId',
 'avg_price']

In [9]:
# Display the number of rows in the combined-data DataFrame.
combined_data.count()

4619

In [10]:
# Confirm that the loaded object is a Spark DataFrame.
type(combined_data)

pyspark.sql.dataframe.DataFrame

**Aggregate counts of game_clicks and flamingo hits by userId by summing up the appropriate columns in 'combined_data'**

In [11]:
# Preview total game clicks and hits per user.
# NOTE: .show() only prints and returns None, so `combined_data_by_userid`
# does not hold a DataFrame after this cell.
combined_data_by_userid = (
    combined_data
    .groupby('userId')
    .sum('count_gameclicks', 'count_hits')
    .show()
)

+------+---------------------+---------------+
|userId|sum(count_gameclicks)|sum(count_hits)|
+------+---------------------+---------------+
|  1631|                  392|             45|
|  2231|                  395|             38|
|  2031|                 2271|            227|
|   231|                  262|             28|
|  2032|                  638|             59|
|   832|                 1469|            141|
|    32|                  281|             25|
|  1632|                  309|             30|
|   432|                   32|              3|
|  1433|                  372|             53|
|   233|                  250|             29|
|  2033|                  575|             80|
|   433|                   34|              2|
|  1034|                  376|             45|
|  1834|                  510|             47|
|  1234|                  590|             73|
|    34|                  665|             79|
|  1634|                 2546|            266|
|  1434|     

**Re-run the aggregation without `.show()` so that 'combined_data_by_userid' is a DataFrame...**

In [12]:
# Total game clicks and hits per user, kept as a DataFrame for the join below.
combined_data_by_userid = (
    combined_data
    .groupby('userId')
    .sum('count_gameclicks', 'count_hits')
)

**Give alias names to the two datasets to be joined...**

In [13]:
# Preview the join of per-user click/hit totals ("a") with per-user purchase
# totals ("b") on userId, keeping one userId column plus the three sums.
# NOTE: .show() only prints and returns None, so `inner_join` does not hold
# a DataFrame after this cell.
inner_join = (
    combined_data_by_userid.alias("a")
    .join(total_purchases.alias("b"),
          combined_data_by_userid['userId'] == total_purchases['userId'])
    .select("a.userId", "a.sum(count_gameclicks)",
            "a.sum(count_hits)", "b.sum(price)")
    .show()
)

+------+---------------------+---------------+----------+
|userId|sum(count_gameclicks)|sum(count_hits)|sum(price)|
+------+---------------------+---------------+----------+
|   231|                  262|             28|      63.0|
|  2032|                  638|             59|      20.0|
|   233|                  250|             29|      28.0|
|    34|                  665|             79|      95.0|
|  1234|                  590|             73|      53.0|
|  1434|                  772|             89|       9.0|
|  1634|                 2546|            266|      27.0|
|  1235|                  367|             39|      40.0|
|  1835|                  734|             94|      27.0|
|   236|                  606|             70|      43.0|
|   436|                 4392|            494|      43.0|
|  1436|                  622|             65|      16.0|
|  1636|                  317|             29|      25.0|
|  2236|                  299|             36|      15.0|
|  1837|      

**Re-run the join without `.show()` so that 'inner_join' is a DataFrame...**

In [15]:
# Join per-user click/hit totals ("a") with per-user purchase totals ("b") on
# userId and keep a single userId column plus the three aggregated sums.
inner_join = (
    combined_data_by_userid.alias("a")
    .join(total_purchases.alias("b"),
          combined_data_by_userid['userId'] == total_purchases['userId'])
    .select("a.userId", "a.sum(count_gameclicks)",
            "a.sum(count_hits)", "b.sum(price)")
)

**We will develop clusters based on total game clicks, total hits, and total purchases by userId. Use
VectorAssembler to gather all of the features for clustering.**

In [16]:
# The three per-user totals from the join that feed the clustering.
featuresUsed = ['sum(count_gameclicks)', 'sum(count_hits)', 'sum(price)']

# Pack those columns into a single vector column named "features_unscaled".
assembler = VectorAssembler(
    inputCols=featuresUsed,
    outputCol="features_unscaled")
assembled = assembler.transform(inner_join)

**Scale the features using StandardScaler:**

In [21]:
# Standardise each feature (centre on the mean, scale by the standard
# deviation) and write the result to the "features" column.
scaler = StandardScaler(
    inputCol="features_unscaled",
    outputCol="features",
    withMean=True,
    withStd=True)
scalerModel = scaler.fit(assembled)
scaledData = scalerModel.transform(assembled)
scaledData.show()

+------+---------------------+---------------+----------+--------------------+--------------------+
|userId|sum(count_gameclicks)|sum(count_hits)|sum(price)|   features_unscaled|            features|
+------+---------------------+---------------+----------+--------------------+--------------------+
|   231|                  262|             28|      63.0|   [262.0,28.0,63.0]|[-0.6344202852895...|
|  2032|                  638|             59|      20.0|   [638.0,59.0,20.0]|[-0.0018534459321...|
|   233|                  250|             29|      28.0|   [250.0,29.0,28.0]|[-0.6546085886733...|
|    34|                  665|             79|      95.0|   [665.0,79.0,95.0]|[0.04357023668131...|
|  1234|                  590|             73|      53.0|   [590.0,73.0,53.0]|[-0.0826066594671...|
|  1434|                  772|             89|       9.0|    [772.0,89.0,9.0]|[0.22358260851973...|
|  1634|                 2546|            266|      27.0| [2546.0,266.0,27.0]|[3.20808679208386...|


**Select the features column and persist the data:**

In [22]:
# Keep only the scaled feature vectors; the other columns are not used for clustering.
scaledData = scaledData.select("features")
# Mark the DataFrame for persistence; as the last expression, its repr is displayed.
scaledData.persist()

DataFrame[features: vector]

**We can now perform K-Means clustering to generate 3 clusters:**

In [25]:
# Fit K-Means with 3 clusters on the scaled "features" column; the fixed seed
# keeps the initialisation reproducible between runs.
kmeans = KMeans(featuresCol="features", k=3, seed=1)
model = kmeans.fit(scaledData)

# Apply the fitted model back to the same scaled data.
transformed = model.transform(scaledData)

**Print the centers of these three clusters...**

In [26]:
# Retrieve the fitted cluster centers (one array per cluster, in scaled feature space).
centers = model.clusterCenters()
# Bare expression so the notebook displays the centers.
centers

[array([-0.31333318, -0.33179717, -0.37859269]),
 array([ 2.35752989,  2.3388565 , -0.03049294]),
 array([-0.04642581,  0.05337151,  1.8170137 ])]