# Exercise 1

1. An optical measurement method captures the surface of objects as a set of 3D location vectors $(x, y, z)^T$ and stores them in table S(id, dim, val). Due to interference, not all location vectors are fully detected during the measurement.
2. The values of the dimensions x, y, z are recorded separately for each 
vector id, see example on the right.

In [1]:
import findspark
findspark.init()

from pyspark import SparkContext

# Create the SparkContext used by the cells of this notebook.
sc = SparkContext(appName="SparkIsSoGreat")

# Table S: each row is (id, dim, value).
data = [
    (1, 'x', 2), (2, 'z', 4), (1, 'y', 3),
    (3, 'y', 2), (4, 'y', 4), (3, 'z', 4),
    (4, 'z', 2), (2, 'x', 2), (2, 'y', 4),
    (5, 'y', 3), (3, 'x', 4), (4, 'x', 4),
]

# Turn the rows into an RDD and show its contents.
rdd = sc.parallelize(data)
print(rdd.collect())


23/11/15 08:16:30 WARN Utils: Your hostname, hung.local resolves to a loopback address: 127.0.0.1; using 10.172.68.141 instead (on interface en0)
23/11/15 08:16:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/15 08:16:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


[(1, 'x', 2), (2, 'z', 4), (1, 'y', 3), (3, 'y', 2), (4, 'y', 4), (3, 'z', 4), (4, 'z', 2), (2, 'x', 2), (2, 'y', 4), (5, 'y', 3), (3, 'x', 4), (4, 'x', 4)]


In [31]:
# Concise version: ids for which three distinct dimensions were recorded.
id_dim_pairs = rdd.map(lambda row: (row[0], row[1])).distinct()  # unique (id, dim)
dims_by_id = id_dim_pairs.groupByKey()  # key = id, values = its dims
complete_groups = dims_by_id.filter(lambda kv: len(kv[1]) == 3)  # three dims present
fully_captured_ids = complete_groups.map(lambda kv: kv[0])  # keep the id only
print(fully_captured_ids.collect())

def filter_function(x, rows=None):
    '''
    Tell whether the vector that row ``x`` belongs to is fully captured.

    x: (id, dim, value) for one row of the data.
    rows: optional iterable of (id, dim, value) rows in which the id is
        looked up; when omitted, the module-level ``data`` list is used.

    Returns True when the rows sharing x's id cover all of the dimensions
    'x', 'y' and 'z', otherwise False.
    '''
    if rows is None:
        rows = data
    vector_id = x[0]  # avoid shadowing the builtin ``id``
    # Every call scans ``rows`` once to collect the dims recorded for this id.
    recorded_dims = {row[1] for row in rows if row[0] == vector_id}
    # Require the three named dimensions rather than "any three distinct
    # labels", so an unexpected dim label cannot make a vector look complete.
    return recorded_dims >= {'x', 'y', 'z'}


# Keep only the rows for which filter_function returns True.
filtered_rdd = rdd.filter(filter_function)
# Last expression of the cell: the collected rows are what gets displayed.
filtered_rdd.collect()

[2, 3, 4]


[(2, 'z', 4),
 (3, 'y', 2),
 (4, 'y', 4),
 (3, 'z', 4),
 (4, 'z', 2),
 (2, 'x', 2),
 (2, 'y', 4),
 (3, 'x', 4),
 (4, 'x', 4)]

In [63]:
# Group the filtered rows by id (the first element of each row).
grouped_rdd = filtered_rdd.groupBy(lambda row: row[0])

# 定義計算向量長度的函數
def calculate_length(vectors):
    '''
    Return the Euclidean length of one vector given its (id, dim, val) rows.

    vectors: iterable of (id, dim, val) tuples belonging to a single id.

    Returns (x^2 + y^2 + z^2) ** 0.5 as a float, or None when the value for
    any of the dimensions 'x', 'y', 'z' is absent (or is None).
    '''
    # Single pass: remember the first value seen for each dimension.
    first_val_by_dim = {}
    for _, dim, val in vectors:
        first_val_by_dim.setdefault(dim, val)

    x_val = first_val_by_dim.get('x')
    y_val = first_val_by_dim.get('y')
    z_val = first_val_by_dim.get('z')

    # Trace output; prints val=None instead of raising when no 'x' row exists.
    print(f'vectors={vectors}, val={x_val}')

    if x_val is None or y_val is None or z_val is None:
        return None
    return (x_val ** 2 + y_val ** 2 + z_val ** 2) ** 0.5

# Compute the vector length of every id group with the custom function.
length_rdd = grouped_rdd.map(
    lambda group: (group[0], calculate_length(list(group[1])))
)

# Keep only the ids whose length could be computed.
filtered_length_rdd = length_rdd.filter(lambda pair: pair[1] is not None)

# Build the final table T(id, length).
output_table = filtered_length_rdd.map(lambda pair: (pair[0], pair[1]))

# Show the resulting table.
print(output_table.collect())


[(2, 6.0), (3, 6.0), (4, 6.0)]


vectors=[(3, 'y', 2), (3, 'z', 4), (3, 'x', 4)], val=4
vectors=[(4, 'y', 4), (4, 'z', 2), (4, 'x', 4)], val=4
vectors=[(2, 'z', 4), (2, 'x', 2), (2, 'y', 4)], val=2
