In [1]:
import os
import subprocess
def module(*args):
    """Run an Environment Modules command and apply its result to this process.

    Invokes ``/usr/bin/modulecmd python <args...>`` and executes whatever that
    command writes to stdout (presumably Python statements that update
    ``os.environ``; verify against the local modulecmd installation).

    Parameters
    ----------
    *args : str, or a single list/tuple of str
        The module sub-command and its arguments, given either spelled out,
        ``module('load', 'name')``, or as one sequence,
        ``module(['load', 'name'])``.
    """
    # Accept either one sequence of arguments or the arguments spelled out.
    # The `args and` guard keeps a bare `module()` from raising IndexError.
    if args and isinstance(args[0], (list, tuple)):
        args = list(args[0])
    else:
        args = list(args)

    process = subprocess.Popen(
        ['/usr/bin/modulecmd', 'python'] + args,
        stdout=subprocess.PIPE,
    )
    # stderr is not piped, so the second item returned here is always None.
    output, _ = process.communicate()

    # NOTE(security): this runs whatever modulecmd printed, unchecked. That is
    # only acceptable because the command path is fixed and trusted; never
    # feed this function arguments that come from untrusted input.
    exec(output)
# Load the JDK 8 environment module, then set PYSPARK_PYTHON to the Python
# interpreter of the `jupyter-spark` conda environment under $HOME.
module('load', 'apps/java/jdk1.8.0_102/binary')
os.environ['PYSPARK_PYTHON'] = '{}{}'.format(
    os.environ['HOME'], '/.conda/envs/jupyter-spark/bin/python'
)

In [2]:
import pyspark
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session that runs locally with two worker threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("COM6012 PCA")
    .getOrCreate()
)

# Handle to the underlying SparkContext, needed for the RDD-based API.
sc = spark.sparkContext

## 1. 基于RDD的数据类型：

### 1.1 Local vector：dense and sparse vector

A local vector has integer-typed and **0-based indices** and double-typed values, **stored on a single machine**.

In [5]:
import numpy as np

from pyspark.mllib.linalg import Vectors # from pyspark.ml.linalg import Vectors

# Sparse vector of size 3: only the non-zero entries are stored
# (1.0 at index 0 and 3.0 at index 2).
sv1 = Vectors.sparse(3, {0: 1.0, 2: 3.0})

# Dense vector: every entry is stored explicitly.
dv1 = Vectors.dense(1, 2, 3)

print("sparse vector: ", sv1)
print("dense vector: ", dv1)

sparse vector:  (3,[0,2],[1.0,3.0])
dense vector:  [1.0,2.0,3.0]


### 1.2 labeled point：

In [7]:
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# A labeled point pairs a label (0.0 here) with a feature vector; the
# features are a sparse vector of size 3 with values at indices 0 and 2.
neg = LabeledPoint(0.0, SparseVector(3, [0, 2], [1.0, 3.0]))

print("label: ", neg.label)
print("features: ", neg.features)

laebl:  0.0
features:  (3,[0,2],[1.0,3.0])


### 1.3 Local matrix：dense and sparse

**Matrices.sparse(numRows, numCols, colPtrs, rowIndices, values) 使用 CSC（按列压缩）格式存储稀疏矩阵。colPtrs 的长度为 numCols + 1：第一个值恒为 0，之后每个值等于前一个值加上对应列的非零元素个数（即非零元素个数的累计和）；rowIndices 和 values 则按列依次给出各非零元素的行索引和取值。**

In [8]:
from pyspark.mllib.linalg import Matrix, Matrices

# Dense 3x2 matrix; the values are listed column by column.
dm = Matrices.dense(numRows=3, numCols=2, values=[1, 3, 5, 2, 4, 6])

# Sparse 3x2 matrix in CSC form, equal to ((9.0, 0.0), (0.0, 8.0), (0.0, 6.0)).
# colPtrs holds the running count of non-zeros per column:
# 0, 0+1, 0+1+2  =>  0, 1, 3
sm = Matrices.sparse(
    numRows=3,
    numCols=2,
    colPtrs=[0, 1, 3],
    rowIndices=[0, 2, 1],
    values=[9, 6, 8],
)

print(dm)
print(sm)

DenseMatrix([[1., 2.],
             [3., 4.],
             [5., 6.]])
3 X 2 CSCMatrix
(0,0) 9.0
(2,1) 6.0
(1,1) 8.0


### 1.4 distributed matrix：

* RowMatrix
* IndexedRowMatrix
* CoordinateMatrix
* BlockMatrix

In [13]:
from pyspark.mllib.linalg.distributed import RowMatrix

# Distribute four 3-element rows as an RDD.
rows = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])

# Wrap the RDD in a row-oriented distributed matrix.
mat = RowMatrix(rows)

# Dimensions of the matrix: 4 rows by 3 columns.
m, n = mat.numRows(), mat.numCols()

# The matrix exposes its rows as an RDD of vectors; collect them to print.
rowsRDD = mat.rows
print(rowsRDD.collect())

<pyspark.mllib.linalg.distributed.RowMatrix object at 0x2ae2f75bc9e8>
[DenseVector([1.0, 2.0, 3.0]), DenseVector([4.0, 5.0, 6.0]), DenseVector([7.0, 8.0, 9.0]), DenseVector([10.0, 11.0, 12.0])]


## 2. PCA：see week7

### 2.1 Dataframe：

In [14]:
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

# Three 5-dimensional samples (one sparse, two dense). Each is wrapped in a
# 1-tuple so that it becomes one row of a single-column DataFrame.
data = [
    (Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
    (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
    (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),),
]
df = spark.createDataFrame(data, schema=["features"])
df.show()

# Fit PCA on the "features" column, keeping three principal components.
pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(df)

# Project the samples and show the reduced features without truncation.
result = model.transform(df).select("pcaFeatures")
result.show(truncate=False)

+--------------------+
|            features|
+--------------------+
| (5,[1,3],[1.0,7.0])|
|[2.0,0.0,3.0,4.0,...|
|[4.0,0.0,0.0,6.0,...|
+--------------------+

+-----------------------------------------------------------+
|pcaFeatures                                                |
+-----------------------------------------------------------+
|[1.6485728230883807,-4.013282700516296,-5.524543751369388] |
|[-4.645104331781534,-1.1167972663619026,-5.524543751369387]|
|[-6.428880535676489,-5.337951427775355,-5.524543751369389] |
+-----------------------------------------------------------+



In [15]:
# Proportion of variance explained by each principal component of the fitted PCA model.
model.explainedVariance

DenseVector([0.7944, 0.2056, 0.0])

### 2.2 RDD：

### 特征分解：

In [22]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix

# Three 5-dimensional samples (one sparse, two dense) as an RDD of vectors.
rows = sc.parallelize([
    Vectors.sparse(5, [1, 3], [1.0, 7.0]),
    Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),
    Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),
])
mat = RowMatrix(rows)

# Top three principal components, returned as a local matrix with one
# component per column.
pc = mat.computePrincipalComponents(3)
print(pc)

# Multiply the rows by the components to get the dimension-reduced dataset.
projected = mat.multiply(pc)
print(projected.rows.collect())

DenseMatrix([[-0.44859172, -0.28423808,  0.08344545],
             [ 0.13301986, -0.05621156,  0.04423979],
             [-0.12523156,  0.76362648, -0.57807123],
             [ 0.21650757, -0.56529588, -0.79554051],
             [-0.84765129, -0.11560341, -0.15501179]])
[DenseVector([1.6486, -4.0133, -5.5245]), DenseVector([-4.6451, -1.1168, -5.5245]), DenseVector([-6.4289, -5.338, -5.5245])]


* 特征向量矩阵 pc = mat.computePrincipalComponents(3) 


* 降维后的数据 = 原数据乘特征向量 projected = mat.multiply(pc) 

**注意：computePrincipalComponents 基于协方差矩阵求主成分，相当于自动对数据做了中心化处理；而 mat.multiply(pc) 是将未中心化的原始数据投影到主成分上。**

### 奇异值分解：

In [21]:
# Compute the top 3 singular values and the corresponding singular vectors.
svd = mat.computeSVD(3, computeU=True)

# U is a RowMatrix, s is a local dense vector holding the singular values,
# and V is a local dense matrix (the eigenvectors).
U, s, V = svd.U, svd.s, svd.V

print(s)
print(V)

[13.029275535600473,5.368578733451684,2.5330498218813755]
DenseMatrix([[-0.31278534,  0.31167136,  0.30366911],
             [-0.02980145, -0.17133211, -0.02226069],
             [-0.12207248,  0.15256471, -0.95070998],
             [-0.71847899, -0.68096285, -0.0172245 ],
             [-0.60841059,  0.62170723,  0.05606596]])


* **mat.computeSVD(3, computeU=True) 不会自动对数据进行中心化；若要用 SVD 实现 PCA，需要先手动对数据进行中心化处理。**

In [23]:
from pyspark.mllib.feature import StandardScaler

# Center the data: subtract the per-column mean (withMean=True) but do not
# rescale to unit variance (withStd=False).
standardizer = StandardScaler(withMean=True, withStd=False)
model = standardizer.fit(rows)
centeredRows = model.transform(rows)
centeredmat = RowMatrix(centeredRows)

# Compute the top 3 singular values and corresponding singular vectors of
# the centered matrix.
svd = centeredmat.computeSVD(3, computeU=True)
U = svd.U       # The U factor is a RowMatrix.
s = svd.s       # The singular values are stored in a local dense vector.
V = svd.V       # The V factor is a local dense matrix.

print(V)

DenseMatrix([[-0.44859172, -0.28423808, -0.81664677],
             [ 0.13301986, -0.05621156, -0.03622012],
             [-0.12523156,  0.76362648, -0.34267361],
             [ 0.21650757, -0.56529588, -0.13898906],
             [-0.84765129, -0.11560341,  0.44162541]])
