### Importing Basic Spark Libraries

In [2]:
from pyspark import SparkContext, SparkConf
# Build the Spark configuration for a YARN client-mode application.
# The legacy "yarn-client" master string is deprecated since Spark 2.0 (and
# rejected by Spark 3); the supported spelling is master "yarn" plus an
# explicit deploy mode.
# NOTE(review): the app name still says "songs" although this notebook
# clusters credit-card data - presumably a leftover; confirm before renaming.
conf = (
    SparkConf()
    .setAppName("kmeans_songs")
    .setMaster("yarn")
    .set("spark.submit.deployMode", "client")
)

# getOrCreate() reuses an already-running context instead of raising when this
# cell is executed a second time in the same kernel.
sc = SparkContext.getOrCreate(conf=conf)
sc

# EDA & Data Preparation

In [3]:
from pyspark.sql import SparkSession
# Obtain the SparkSession used for all DataFrame work below.
# NOTE(review): getOrCreate() presumably attaches to the SparkContext created
# earlier, in which case the 'kmeans' app name may not take effect - confirm.
spark = (
    SparkSession.builder
    .appName('kmeans')
    .getOrCreate()
)

### Load Data

In [4]:
# Read the credit-card CSV from S3; the first line is the header and column
# types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('s3a://sparkmldatasets/creditcard_clustering.csv')
)

In [5]:
# Preview the first rows of the raw DataFrame to sanity-check the load.
df.show()

+-------+-----------+-----------------+---------+----------------+----------------------+------------+-------------------+--------------------------+--------------------------------+----------------------+----------------+-------------+------------+-----------+----------------+----------------+------+
|CUST_ID|    BALANCE|BALANCE_FREQUENCY|PURCHASES|ONEOFF_PURCHASES|INSTALLMENTS_PURCHASES|CASH_ADVANCE|PURCHASES_FREQUENCY|ONEOFF_PURCHASES_FREQUENCY|PURCHASES_INSTALLMENTS_FREQUENCY|CASH_ADVANCE_FREQUENCY|CASH_ADVANCE_TRX|PURCHASES_TRX|CREDIT_LIMIT|   PAYMENTS|MINIMUM_PAYMENTS|PRC_FULL_PAYMENT|TENURE|
+-------+-----------+-----------------+---------+----------------+----------------------+------------+-------------------+--------------------------+--------------------------------+----------------------+----------------+-------------+------------+-----------+----------------+----------------+------+
| C10001|  40.900749|         0.818182|     95.4|             0.0|                  95.4|  

In [6]:
# Row count of the raw dataset (before any null handling).
df.count()

8950

In [7]:
# List the column names; every column except CUST_ID is used as a feature below.
df.columns

['CUST_ID',
 'BALANCE',
 'BALANCE_FREQUENCY',
 'PURCHASES',
 'ONEOFF_PURCHASES',
 'INSTALLMENTS_PURCHASES',
 'CASH_ADVANCE',
 'PURCHASES_FREQUENCY',
 'ONEOFF_PURCHASES_FREQUENCY',
 'PURCHASES_INSTALLMENTS_FREQUENCY',
 'CASH_ADVANCE_FREQUENCY',
 'CASH_ADVANCE_TRX',
 'PURCHASES_TRX',
 'CREDIT_LIMIT',
 'PAYMENTS',
 'MINIMUM_PAYMENTS',
 'PRC_FULL_PAYMENT',
 'TENURE']

In [14]:
# Keep only rows with no null in any column; `df` itself is left untouched so
# this cell can be re-run safely.
df_filtered = df.dropna(how='any')

In [16]:
# Row count after dropping rows that contain nulls (compare with df.count()).
df_filtered.count()

8636

### VectorAssembler

In [8]:
from pyspark.ml.feature import VectorAssembler

In [9]:
# Numeric columns that make up the clustering feature vector.
# CUST_ID is an identifier and is deliberately left out.
feature_columns = [
    'BALANCE',
    'BALANCE_FREQUENCY',
    'PURCHASES',
    'ONEOFF_PURCHASES',
    'INSTALLMENTS_PURCHASES',
    'CASH_ADVANCE',
    'PURCHASES_FREQUENCY',
    'ONEOFF_PURCHASES_FREQUENCY',
    'PURCHASES_INSTALLMENTS_FREQUENCY',
    'CASH_ADVANCE_FREQUENCY',
    'CASH_ADVANCE_TRX',
    'PURCHASES_TRX',
    'CREDIT_LIMIT',
    'PAYMENTS',
    'MINIMUM_PAYMENTS',
    'PRC_FULL_PAYMENT',
    'TENURE',
]

# Packs the columns above into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')


In [17]:
# Append the assembled 'features' vector column to the null-free rows.
model_data = assembler.transform(df_filtered)

In [18]:
# Preview 10 rows to confirm the new 'features' column is present.
model_data.show(10)

+-------+-----------+-----------------+---------+----------------+----------------------+------------+-------------------+--------------------------+--------------------------------+----------------------+----------------+-------------+------------+-----------+----------------+----------------+------+--------------------+
|CUST_ID|    BALANCE|BALANCE_FREQUENCY|PURCHASES|ONEOFF_PURCHASES|INSTALLMENTS_PURCHASES|CASH_ADVANCE|PURCHASES_FREQUENCY|ONEOFF_PURCHASES_FREQUENCY|PURCHASES_INSTALLMENTS_FREQUENCY|CASH_ADVANCE_FREQUENCY|CASH_ADVANCE_TRX|PURCHASES_TRX|CREDIT_LIMIT|   PAYMENTS|MINIMUM_PAYMENTS|PRC_FULL_PAYMENT|TENURE|            features|
+-------+-----------+-----------------+---------+----------------+----------------------+------------+-------------------+--------------------------+--------------------------------+----------------------+----------------+-------------+------------+-----------+----------------+----------------+------+--------------------+
| C10001|  40.900749|       

# Model Building

### KMeans Clustering

In [19]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator


In [20]:
# KMeans estimator with 10 clusters; the fixed seed makes centroid
# initialisation repeatable between runs.
kmeans = KMeans().setK(10).setSeed(1)

In [21]:
# Fit the k=10 model on the 'features' column only.
# NOTE(review): the features are assembled unscaled, and the cluster centers
# printed further down span several orders of magnitude (~1e-3 to ~1e4), so
# the distance is likely dominated by the monetary columns - consider scaling.
model = kmeans.fit(model_data.select('features'))

In [22]:
# Display the fitted model object.
model

KMeans_4c5e92854bb2a5c86584

In [23]:
# Cost of the fitted model on the training rows (lower is tighter clusters).
# NOTE(review): computeCost is deprecated as of Spark 2.4 and removed in
# Spark 3.0 - confirm the cluster's Spark version before upgrading.
model.computeCost(model_data)

117187383991.44473

In [24]:
# Same cost, computed on the features-only projection; the recorded result is
# identical to the previous cell, so the extra columns do not affect it.
model.computeCost(model_data.select('features'))

117187383991.44473

In [25]:
# Assign every row to a cluster; adds an integer 'prediction' column.
output = model.transform(model_data)


### The `prediction` column is the cluster label assigned to each row

In [26]:
# Print the first 100 rows, including the assigned cluster in 'prediction'.
output.show(n=100)

+-------+-----------+-----------------+---------+----------------+----------------------+------------+-------------------+--------------------------+--------------------------------+----------------------+----------------+-------------+------------+-----------+----------------+----------------+------+--------------------+----------+
|CUST_ID|    BALANCE|BALANCE_FREQUENCY|PURCHASES|ONEOFF_PURCHASES|INSTALLMENTS_PURCHASES|CASH_ADVANCE|PURCHASES_FREQUENCY|ONEOFF_PURCHASES_FREQUENCY|PURCHASES_INSTALLMENTS_FREQUENCY|CASH_ADVANCE_FREQUENCY|CASH_ADVANCE_TRX|PURCHASES_TRX|CREDIT_LIMIT|   PAYMENTS|MINIMUM_PAYMENTS|PRC_FULL_PAYMENT|TENURE|            features|prediction|
+-------+-----------+-----------------+---------+----------------+----------------------+------------+-------------------+--------------------------+--------------------------------+----------------------+----------------+-------------+------------+-----------+----------------+----------------+------+--------------------+-------

In [27]:
# Display the DataFrame's column names and types (no rows are fetched).
output

DataFrame[CUST_ID: string, BALANCE: double, BALANCE_FREQUENCY: double, PURCHASES: double, ONEOFF_PURCHASES: double, INSTALLMENTS_PURCHASES: double, CASH_ADVANCE: double, PURCHASES_FREQUENCY: double, ONEOFF_PURCHASES_FREQUENCY: double, PURCHASES_INSTALLMENTS_FREQUENCY: double, CASH_ADVANCE_FREQUENCY: double, CASH_ADVANCE_TRX: int, PURCHASES_TRX: int, CREDIT_LIMIT: double, PAYMENTS: double, MINIMUM_PAYMENTS: double, PRC_FULL_PAYMENT: double, TENURE: int, features: vector, prediction: int]

### Cluster Centers

In [28]:
# Centroids of the fitted model, one array per cluster, in feature-column order.
centers = model.clusterCenters()

In [29]:
# Dump each centroid on its own line(s); values follow the feature-column
# order given to the VectorAssembler.
print("Cluster Centers: ")
for centroid in centers:
    print(centroid)

Cluster Centers: 
[5.72403875e+03 9.79279805e-01 6.02307089e+02 3.51421472e+02
 2.50991551e+02 3.65415095e+03 3.27802250e-01 1.52412546e-01
 2.31545862e-01 3.80377066e-01 1.02341772e+01 1.00221519e+01
 8.41099684e+03 1.88355776e+03 2.02852697e+03 1.68056804e-03
 1.15601266e+01]
[4.06673126e+03 9.88429764e-01 1.04349473e+03 1.19814727e+02
 9.23680000e+02 9.20037169e+02 4.70798873e-01 3.84297091e-02
 4.41597818e-01 1.03030255e-01 3.01818182e+00 1.88363636e+01
 4.26636364e+03 1.58459625e+03 2.29600155e+04 1.51514545e-03
 1.19090909e+01]
[2.54840512e+03 9.78528242e-01 7.47953770e+03 5.08906016e+03
 2.39289690e+03 4.47409398e+02 9.19159347e-01 7.34866827e-01
 7.04637113e-01 5.64821331e-02 1.46370968e+00 8.28427419e+01
 8.44737903e+03 6.88850102e+03 1.20231299e+03 3.42522851e-01
 1.19354839e+01]
[3.69895394e+03 9.14283657e-01 7.59844545e+02 4.39715496e+02
 3.20129050e+02 6.29329502e+03 3.36183979e-01 1.57149236e-01
 2.47983079e-01 4.41547992e-01 1.51570248e+01 1.22561983e+01
 7.64444027e+03 

# Model Evaluation

### Silhouette Score

The silhouette score ranges from -1 to 1. Values close to 1 indicate compact, well-separated clusters; 1 is the best possible value and -1 the worst. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, because a different cluster is more similar to it.

In [30]:
# Evaluator for the silhouette score. The arguments below are the library
# defaults, written out so the columns it reads are visible.
evaluator = ClusteringEvaluator(
    predictionCol='prediction',
    featuresCol='features',
    metricName='silhouette'
)


In [31]:
# Score the clustered rows in `output` with the evaluator defined above.
silhouette_score = evaluator.evaluate(output)


In [32]:
# Report the score; see the interpretation notes in the section intro.
print("silhouette_score = " + str(silhouette_score))


silhouette_score = 0.437229062724


## Elbow Method

In [40]:
# Elbow method: fit one KMeans model per candidate k and record its cost so
# the costs can be plotted against k.
ks = [5,10,15,20]
costfunction = []

for k_num in ks:
    # build kmeans model with k as no of cluster
    print("K :  ",k_num)
    model_k = KMeans(k=k_num , seed=1)

    # Train into a loop-local name. Assigning to `model` here would silently
    # replace the k=10 model that the transform / cluster-center / silhouette
    # cells above rely on, leaving the k=20 model behind after the loop.
    fitted_k = model_k.fit(model_data.select('features'))

    # Append this k's cost to the list of costs.
    # NOTE(review): computeCost is deprecated as of Spark 2.4 and removed in
    # Spark 3.0 - confirm the cluster's Spark version before upgrading.
    costfunction.append(fitted_k.computeCost(model_data))

('K :  ', 5)
('K :  ', 10)
('K :  ', 15)
('K :  ', 20)


In [41]:
costfunction

[179115102958.93616, 117187383991.44473, 92915233462.28668, 77691304084.55678]

### Plot Elbow Curve

In [None]:
import matplotlib.pyplot as plt


# Clear the current figure so the elbow plot starts from an empty canvas.
plt.clf()

In [None]:
# Elbow curve: cost against number of clusters, one marked point per k tried.
plt.plot(ks, costfunction, linestyle='-', marker='o')
plt.xticks(ks)
plt.xlabel('number of clusters, k')
plt.ylabel('cost function')
plt.show()

In [None]:
%matplot plt