In [1]:
# Import the libraries we will need
import pandas as pd
import numpy as np

import findspark
findspark.init()

from pyspark.sql import *
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Start (or reuse) the Spark session that every later cell runs against.
spark = (
    SparkSession.builder
    .appName("Q1")
    .getOrCreate()
)

In [2]:
# Directory holding the k-means input files.
DATA_DIR = "hw2-bundle/hw2-bundle/kmeans/data"


def load_indexed_vectors(path, vector_col, index_col):
    """Load a file of space-separated numbers into an indexed DataFrame.

    Each line of the file becomes one row with two columns:
      * ``vector_col`` - the line split on single spaces and cast to ``array<float>``
      * ``index_col``  - a dense 0-based row index

    Parameters
    ----------
    path : str
        Location of the text file (read with ``spark.read.csv``, so every
        line must parse as a single CSV column).
    vector_col : str
        Name of the array column.
    index_col : str
        Name of the index column.

    Returns
    -------
    pyspark.sql.DataFrame
    """
    vectors = spark.read.csv(path).toDF(vector_col)
    vectors = vectors.withColumn(
        vector_col, F.split(F.col(vector_col), " ").cast("array<float>")
    )
    # monotonically_increasing_id yields increasing but non-consecutive ids;
    # row_number over that ordering turns them into 0, 1, 2, ...
    # NOTE(review): the index follows file order only if the generated ids do -
    # confirm for multi-partition inputs.
    vectors = vectors.withColumn(index_col, F.monotonically_increasing_id())
    by_generated_id = Window.orderBy(F.col(index_col))
    return vectors.withColumn(index_col, F.row_number().over(by_generated_id) - 1)


data = load_indexed_vectors(DATA_DIR + "/data.txt", "observation", "obs_index")
init_C_random = load_indexed_vectors(DATA_DIR + "/c1.txt", "centroid", "c_index")
init_C_far = load_indexed_vectors(DATA_DIR + "/c2.txt", "centroid", "c_index")

In [3]:
# Preview the first few parsed observations together with their row index.
data.show(n=5)

+--------------------+---------+
|         observation|obs_index|
+--------------------+---------+
|[0.0, 0.64, 0.64,...|        0|
|[0.21, 0.28, 0.5,...|        1|
|[0.06, 0.0, 0.71,...|        2|
|[0.0, 0.0, 0.0, 0...|        3|
|[0.0, 0.0, 0.0, 0...|        4|
+--------------------+---------+
only showing top 5 rows



In [4]:
# Preview the first few initial centroids loaded from c1.txt.
init_C_random.show(n=5)

+--------------------+-------+
|            centroid|c_index|
+--------------------+-------+
|[0.0, 0.64, 0.64,...|      0|
|[0.21, 0.28, 0.5,...|      1|
|[0.06, 0.0, 0.71,...|      2|
|[0.0, 0.0, 0.0, 0...|      3|
|[0.0, 0.0, 0.0, 0...|      4|
+--------------------+-------+
only showing top 5 rows



In [5]:
def l1_distance(x, y):
    """Return the order-1 norm of ``x - y`` as a plain Python float.

    For 1-D inputs this is the Manhattan distance (sum of absolute
    coordinate differences).
    """
    difference = np.asarray(x) - np.asarray(y)
    manhattan = np.linalg.norm(difference, ord=1)
    return float(manhattan)
# Spark UDF wrapper around l1_distance; the result column is a 32-bit float.
l1_distance_udf = F.udf(f=l1_distance, returnType=FloatType())

def l2_distance(x, y):
    """Return the order-2 norm of ``x - y`` as a plain Python float.

    For 1-D inputs this is the Euclidean distance (not squared).
    """
    difference = np.asarray(x) - np.asarray(y)
    euclidean = np.linalg.norm(difference, ord=2)
    return float(euclidean)
# Spark UDF wrapper around l2_distance; the result column is a 32-bit float.
l2_distance_udf = F.udf(f=l2_distance, returnType=FloatType())

In [18]:
# Pair every observation with every initial centroid and measure the L2 distance.
data_centroids = data.crossJoin(init_C_random)
data_centroids = data_centroids.withColumn(
    "obs_to_c_dist", l2_distance_udf(F.col("observation"), F.col("centroid"))
)

# Keep exactly one row per observation: its nearest centroid.
# Ranking by (distance, c_index) breaks ties deterministically in favour of the
# lowest centroid index. The previous tolerance filter kept *every* centroid
# within 1e-6 of the minimum, so a tied observation was assigned to several
# clusters and counted more than once downstream.
window = Window.partitionBy("obs_index").orderBy(
    F.col("obs_to_c_dist"), F.col("c_index")
)
data_centroids = (
    data_centroids
    .withColumn("dist_rank", F.row_number().over(window))
    .filter(F.col("dist_rank") == 1)
    .drop("dist_rank")
    .sort("obs_index")
)

In [20]:
# Inspect the first assignments (observation, chosen centroid, distance).
data_centroids.show(n=11)

+--------------------+---------+--------------------+-------+-------------+
|         observation|obs_index|            centroid|c_index|obs_to_c_dist|
+--------------------+---------+--------------------+-------+-------------+
|[0.0, 0.64, 0.64,...|        0|[0.0, 0.64, 0.64,...|      0|          0.0|
|[0.21, 0.28, 0.5,...|        1|[0.21, 0.28, 0.5,...|      1|          0.0|
|[0.06, 0.0, 0.71,...|        2|[0.06, 0.0, 0.71,...|      2|          0.0|
|[0.0, 0.0, 0.0, 0...|        3|[0.0, 0.0, 0.0, 0...|      3|          0.0|
|[0.0, 0.0, 0.0, 0...|        4|[0.0, 0.0, 0.0, 0...|      4|          0.0|
|[0.0, 0.0, 0.0, 0...|        5|[0.0, 0.0, 0.0, 0...|      5|          0.0|
|[0.0, 0.0, 0.0, 0...|        6|[0.0, 0.0, 0.0, 0...|      6|          0.0|
|[0.0, 0.0, 0.0, 0...|        7|[0.0, 0.0, 0.0, 0...|      7|          0.0|
|[0.15, 0.0, 0.46,...|        8|[0.15, 0.0, 0.46,...|      8|          0.0|
|[0.06, 0.12, 0.77...|        9|[0.06, 0.12, 0.77...|      9|          0.0|
|[0.0, 0.0, 

In [25]:
# Total cost of the current assignment: the sum of every observation's distance
# to its assigned centroid. The sum is done inside Spark so only one number
# reaches the driver, instead of collecting the whole column through an RDD.
# coalesce keeps the result at 0.0 when there are no rows.
# NOTE(review): this adds up plain distances; if the intended objective is the
# k-means within-cluster sum of squares, the distances must be squared - confirm.
cost = (
    data_centroids
    .agg(F.coalesce(F.sum("obs_to_c_dist"), F.lit(0.0)).alias("cost"))
    .first()["cost"]
)
cost

397603.9143035412

In [26]:
# Small working sample: only the observations assigned to centroid 0.
toy = data_centroids.where(F.col("c_index") == 0)

In [27]:
# Preview the members of cluster 0.
toy.show(n=10)

+--------------------+---------+--------------------+-------+-------------+
|         observation|obs_index|            centroid|c_index|obs_to_c_dist|
+--------------------+---------+--------------------+-------+-------------+
|[0.0, 0.64, 0.64,...|        0|[0.0, 0.64, 0.64,...|      0|          0.0|
|[0.0, 0.69, 0.34,...|       12|[0.0, 0.64, 0.64,...|      0|     17.00417|
|[0.0, 0.42, 0.42,...|       15|[0.0, 0.64, 0.64,...|      0|    29.811537|
|[0.0, 0.0, 0.0, 0...|       17|[0.0, 0.64, 0.64,...|      0|    188.83965|
|[0.0, 0.68, 0.0, ...|       34|[0.0, 0.64, 0.64,...|      0|    34.173424|
|[0.0, 0.0, 0.48, ...|       37|[0.0, 0.64, 0.64,...|      0|    42.667694|
|[0.0, 0.41, 1.66,...|       39|[0.0, 0.64, 0.64,...|      0|     60.42072|
|[0.0, 0.0, 0.0, 0...|       44|[0.0, 0.64, 0.64,...|      0|     65.28628|
|[0.18, 0.0, 0.18,...|       46|[0.0, 0.64, 0.64,...|      0|     171.2131|
|[0.0, 0.42, 1.68,...|       59|[0.0, 0.64, 0.64,...|      0|     60.31706|
+-----------

In [29]:
# Pull the first rows of cluster 0 to the driver as Row objects (full vectors).
toy.take(num=10)

[Row(observation=[0.0, 0.6399999856948853, 0.6399999856948853, 0.0, 0.3199999928474426, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6399999856948853, 0.0, 0.0, 0.0, 0.3199999928474426, 0.0, 1.2899999618530273, 1.9299999475479126, 0.0, 0.9599999785423279, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.777999997138977, 0.0, 0.0, 3.75600004196167, 61.0, 278.0, 1.0], obs_index=0, centroid=[0.0, 0.6399999856948853, 0.6399999856948853, 0.0, 0.3199999928474426, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6399999856948853, 0.0, 0.0, 0.0, 0.3199999928474426, 0.0, 1.2899999618530273, 1.9299999475479126, 0.0, 0.9599999785423279, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.777999997138977, 0.0, 0.0, 3.75600004196167, 61.0, 278.0, 1.0], c_index=0, obs_to_c_dist=0.0),
 Row(observation=[0.0, 0.6899999976158142, 0.3

In [None]:
# Recompute each centroid as the element-wise mean of the observations assigned
# to it. (groupBy(...).agg() with no expressions raises, so the aggregation has
# to be spelled out.)
# NOTE(review): assumes the k-means update step is what this cell is meant to
# compute - confirm.
(
    data_centroids
    # one row per (cluster, dimension, value)
    .select("c_index", F.posexplode("observation").alias("dim", "value"))
    .groupBy("c_index", "dim")
    .agg(F.avg("value").alias("mean_value"))
    # reassemble the per-dimension means into one array, ordered by dimension
    .groupBy("c_index")
    .agg(F.sort_array(F.collect_list(F.struct("dim", "mean_value"))).alias("ordered"))
    .select(
        "c_index",
        F.col("ordered.mean_value").cast("array<float>").alias("centroid"),
    )
)

In [None]:
def get_partition(data, C):
    """Assign every observation to its nearest centroid by L2 distance.

    Parameters
    ----------
    data : pyspark.sql.DataFrame
        Must contain the columns ``observation`` and ``obs_index``.
    C : pyspark.sql.DataFrame
        Must contain the columns ``centroid`` and ``c_index``.

    Returns
    -------
    pyspark.sql.DataFrame
        One row per observation, sorted by ``obs_index``, with the columns of
        ``data`` and ``C`` plus ``obs_to_c_dist`` (distance to the chosen
        centroid). Ties are resolved in favour of the lowest ``c_index``.
    """
    # Use the centroids that were passed in, not the notebook-level initial set.
    data_centroids = data.crossJoin(C)
    data_centroids = data_centroids.withColumn(
        "obs_to_c_dist", l2_distance_udf(F.col("observation"), F.col("centroid"))
    )

    # Rank the centroids of each observation by distance and keep the closest.
    # Ordering by c_index as well guarantees exactly one row per observation
    # even when two centroids are equally close.
    window = Window.partitionBy("obs_index").orderBy(
        F.col("obs_to_c_dist"), F.col("c_index")
    )
    data_centroids = (
        data_centroids
        .withColumn("dist_rank", F.row_number().over(window))
        .filter(F.col("dist_rank") == 1)
        .drop("dist_rank")
        .sort("obs_index")
    )
    return data_centroids

In [None]:
def cost_l2(data, C, P):
    