# Import library and start Spark

In [2]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook under the application name
# 'Credit'; getOrCreate() hands back an already-running session if one exists.
spark = (
    SparkSession.builder
    .appName('Credit')
    .getOrCreate()
)

23/08/07 10:38:00 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# Load data and EDA

In [3]:
# Read the credit-card CSV: the first line supplies the column names and
# Spark is asked to infer each column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("./creditcard_clustering.csv")
)

In [4]:
# Preview the first 10 rows of the loaded frame
df.show(n=10)

+-------+-----------+-----------------+---------+----------------+----------------------+------------+-------------------+--------------------------+--------------------------------+----------------------+----------------+-------------+------------+-----------+----------------+----------------+------+
|CUST_ID|    BALANCE|BALANCE_FREQUENCY|PURCHASES|ONEOFF_PURCHASES|INSTALLMENTS_PURCHASES|CASH_ADVANCE|PURCHASES_FREQUENCY|ONEOFF_PURCHASES_FREQUENCY|PURCHASES_INSTALLMENTS_FREQUENCY|CASH_ADVANCE_FREQUENCY|CASH_ADVANCE_TRX|PURCHASES_TRX|CREDIT_LIMIT|   PAYMENTS|MINIMUM_PAYMENTS|PRC_FULL_PAYMENT|TENURE|
+-------+-----------+-----------------+---------+----------------+----------------------+------------+-------------------+--------------------------+--------------------------------+----------------------+----------------+-------------+------------+-----------+----------------+----------------+------+
| C10001|  40.900749|         0.818182|     95.4|             0.0|                  95.4|  

In [5]:
# Print the column names, inferred types and nullability of the loaded frame
df.printSchema()

root
 |-- CUST_ID: string (nullable = true)
 |-- BALANCE: double (nullable = true)
 |-- BALANCE_FREQUENCY: double (nullable = true)
 |-- PURCHASES: double (nullable = true)
 |-- ONEOFF_PURCHASES: double (nullable = true)
 |-- INSTALLMENTS_PURCHASES: double (nullable = true)
 |-- CASH_ADVANCE: double (nullable = true)
 |-- PURCHASES_FREQUENCY: double (nullable = true)
 |-- ONEOFF_PURCHASES_FREQUENCY: double (nullable = true)
 |-- PURCHASES_INSTALLMENTS_FREQUENCY: double (nullable = true)
 |-- CASH_ADVANCE_FREQUENCY: double (nullable = true)
 |-- CASH_ADVANCE_TRX: integer (nullable = true)
 |-- PURCHASES_TRX: integer (nullable = true)
 |-- CREDIT_LIMIT: double (nullable = true)
 |-- PAYMENTS: double (nullable = true)
 |-- MINIMUM_PAYMENTS: double (nullable = true)
 |-- PRC_FULL_PAYMENT: double (nullable = true)
 |-- TENURE: integer (nullable = true)



In [6]:
# List the column names of the loaded frame, in schema order
df.columns

['CUST_ID',
 'BALANCE',
 'BALANCE_FREQUENCY',
 'PURCHASES',
 'ONEOFF_PURCHASES',
 'INSTALLMENTS_PURCHASES',
 'CASH_ADVANCE',
 'PURCHASES_FREQUENCY',
 'ONEOFF_PURCHASES_FREQUENCY',
 'PURCHASES_INSTALLMENTS_FREQUENCY',
 'CASH_ADVANCE_FREQUENCY',
 'CASH_ADVANCE_TRX',
 'PURCHASES_TRX',
 'CREDIT_LIMIT',
 'PAYMENTS',
 'MINIMUM_PAYMENTS',
 'PRC_FULL_PAYMENT',
 'TENURE']

In [15]:
# Discard every row that has a missing value in any column.
# NOTE: this rebinds `df`, so the frame as originally loaded is only
# recoverable by re-running the load cell.
df = df.na.drop(how="any")

In [18]:
# Preview the frame after the rows with missing values were dropped
df.show(n=20)

+-------+-----------+-----------------+---------+----------------+----------------------+------------+-------------------+--------------------------+--------------------------------+----------------------+----------------+-------------+------------+-----------+----------------+----------------+------+
|CUST_ID|    BALANCE|BALANCE_FREQUENCY|PURCHASES|ONEOFF_PURCHASES|INSTALLMENTS_PURCHASES|CASH_ADVANCE|PURCHASES_FREQUENCY|ONEOFF_PURCHASES_FREQUENCY|PURCHASES_INSTALLMENTS_FREQUENCY|CASH_ADVANCE_FREQUENCY|CASH_ADVANCE_TRX|PURCHASES_TRX|CREDIT_LIMIT|   PAYMENTS|MINIMUM_PAYMENTS|PRC_FULL_PAYMENT|TENURE|
+-------+-----------+-----------------+---------+----------------+----------------------+------------+-------------------+--------------------------+--------------------------------+----------------------+----------------+-------------+------------+-----------+----------------+----------------+------+
| C10001|  40.900749|         0.818182|     95.4|             0.0|                  95.4|  

# Data Transform

In [19]:
# Assembler: pack the input columns into one vector column named 'features'
from pyspark.ml.feature import VectorAssembler

# Every column of df except the CUST_ID string identifier, in schema order
feature_columns = [
    'BALANCE',
    'BALANCE_FREQUENCY',
    'PURCHASES',
    'ONEOFF_PURCHASES',
    'INSTALLMENTS_PURCHASES',
    'CASH_ADVANCE',
    'PURCHASES_FREQUENCY',
    'ONEOFF_PURCHASES_FREQUENCY',
    'PURCHASES_INSTALLMENTS_FREQUENCY',
    'CASH_ADVANCE_FREQUENCY',
    'CASH_ADVANCE_TRX',
    'PURCHASES_TRX',
    'CREDIT_LIMIT',
    'PAYMENTS',
    'MINIMUM_PAYMENTS',
    'PRC_FULL_PAYMENT',
    'TENURE',
]

assembler = VectorAssembler(outputCol='features', inputCols=feature_columns)

# Original columns are kept; the assembled vector is appended as 'features'
model_data = assembler.transform(df)

In [20]:
# Inspect the first assembled row, including its 'features' vector
model_data.first()

Row(CUST_ID='C10001', BALANCE=40.900749, BALANCE_FREQUENCY=0.818182, PURCHASES=95.4, ONEOFF_PURCHASES=0.0, INSTALLMENTS_PURCHASES=95.4, CASH_ADVANCE=0.0, PURCHASES_FREQUENCY=0.166667, ONEOFF_PURCHASES_FREQUENCY=0.0, PURCHASES_INSTALLMENTS_FREQUENCY=0.083333, CASH_ADVANCE_FREQUENCY=0.0, CASH_ADVANCE_TRX=0, PURCHASES_TRX=2, CREDIT_LIMIT=1000.0, PAYMENTS=201.802084, MINIMUM_PAYMENTS=139.509787, PRC_FULL_PAYMENT=0.0, TENURE=12, features=DenseVector([40.9007, 0.8182, 95.4, 0.0, 95.4, 0.0, 0.1667, 0.0, 0.0833, 0.0, 0.0, 2.0, 1000.0, 201.8021, 139.5098, 0.0, 12.0]))

# Model building

In [21]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator


In [22]:
# Fit a 7-cluster k-means model on the assembled vectors; the seed is fixed
# so repeated runs start from the same initialisation.
# NOTE(review): the vectors are passed as assembled, with no scaling step
# in between - confirm that is intended.
kmeans = KMeans(seed=1, k=7)
features_only = model_data.select('features')
model = kmeans.fit(features_only)

# Assign each row to a cluster, then score the assignment with the
# evaluator's default settings.
predictions = model.transform(model_data)
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette with squared euclidean distance = {silhouette}")

23/08/07 10:46:29 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


Silhouette with squared euclidean distance = 0.5263055625736056


# Elbow method

In [30]:
# Elbow method: fit k-means for several cluster counts and record each
# model's training cost so the values can be compared across k.
ks = [5,10,15, 20]
costfunction = []

# The training input is identical for every k, so select it once up front.
features_df = model_data.select('features')

for k_num in ks:
    print("K :  ",k_num)

    # Same seed for every k, so the runs differ only in the cluster count.
    model_k = KMeans(k=k_num , seed=1)

    # NOTE: `model` is rebound on every iteration, so after this cell it refers
    # to the last model fitted here, not the k=7 model from the earlier cell.
    model = model_k.fit(features_df)

    # Record the training cost reported by the fitted model's summary.
    costfunction.append(model.summary.trainingCost)

# One cost per entry of `ks`, followed by the largest of them.
print(costfunction)
print(max(costfunction))

K :   5
K :   10
K :   15
K :   20
[178119058463.0186, 116478442018.5889, 91035111526.56789, 78925993918.66504]
178119058463.0186
