In [3]:
# Reference: 
# http://stackoverflow.com/questions/42889965/multiply-two-numpy-matrices-in-pyspark

In [2]:
from pyspark.sql import SparkSession
# Build (or reuse, via getOrCreate) the SparkSession that the later cells
# access through `spark.sparkContext`.
spark = (
    SparkSession.builder
    .appName("Python Spark Matrix multiplication with python example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [42]:
import numpy as np
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix

# 1. Simple demo for small dense matrix

In [66]:
from pyspark.mllib.linalg.distributed import *

def as_block_matrix(rdd, rowsPerBlock=2, colsPerBlock=3):
    """Turn an RDD of matrix rows into a distributed block matrix.

    Args:
        rdd: RDD whose elements are the rows of the matrix (in this notebook
            it is obtained by parallelizing a 2-D NumPy array).
        rowsPerBlock: forwarded to ``toBlockMatrix`` as the block row count.
        colsPerBlock: forwarded to ``toBlockMatrix`` as the block column count.

    Returns:
        The result of ``IndexedRowMatrix(...).toBlockMatrix(rowsPerBlock,
        colsPerBlock)``.
    """
    def to_indexed_row(pair):
        # zipWithIndex yields (element, index); IndexedRow wants (index, vector).
        return IndexedRow(pair[1], pair[0])

    indexed_rows = rdd.zipWithIndex().map(to_indexed_row)
    row_matrix = IndexedRowMatrix(indexed_rows)
    return row_matrix.toBlockMatrix(rowsPerBlock, colsPerBlock)

In [58]:
# 2 x 3 float64 demo matrix, written out row by row.
A = np.array([[1, 1, 3], [4, 5, 3]], dtype=np.float64)

In [59]:
A

array([[ 1.,  1.,  3.],
       [ 4.,  5.,  3.]])

In [60]:
# Distribute A as an RDD and wrap it with as_block_matrix (default block size).
matrixA = as_block_matrix(spark.sparkContext.parallelize(A))

In [62]:
# Transpose of the distributed matrix; used below as the right-hand operand.
tA = matrixA.transpose()

In [63]:
# Convert the transposed matrix to a local matrix so the cell can display it.
tA.toLocalMatrix()

DenseMatrix(3, 2, [1.0, 1.0, 3.0, 4.0, 5.0, 3.0], 0)

In [64]:
# Distributed product of matrixA with its transpose.
mul = matrixA.multiply(tA)

In [65]:
# Convert the product to a local matrix so the cell can display it.
mul.toLocalMatrix()

DenseMatrix(2, 2, [11.0, 18.0, 18.0, 50.0], 0)

# 2. Simple demo for small sparse matrix

In [96]:
from scipy import sparse
from numpy import array
# COO triplets: entry k of the matrix is V[k] at row I[k], column J[k].
I, J, V = map(array, ([0, 3, 1, 0], [0, 3, 1, 2], [4, 5, 7, 9]))
A = sparse.coo_matrix((V, (I, J)), shape=(4, 4))

In [102]:
# Dense array copy of the sparse matrix A; this is what gets parallelized below.
dA = A.toarray()

In [103]:
from pyspark.mllib.linalg.distributed import *

def as_block_matrix(rdd, rowsPerBlock=4, colsPerBlock=4):
    """Turn an RDD of matrix rows into a distributed block matrix.

    Args:
        rdd: RDD whose elements are the rows of the matrix (in this notebook
            it is obtained by parallelizing a 2-D NumPy array).
        rowsPerBlock: forwarded to ``toBlockMatrix`` as the block row count.
        colsPerBlock: forwarded to ``toBlockMatrix`` as the block column count.

    Returns:
        The result of ``IndexedRowMatrix(...).toBlockMatrix(rowsPerBlock,
        colsPerBlock)``.
    """
    def to_indexed_row(pair):
        # zipWithIndex yields (element, index); IndexedRow wants (index, vector).
        return IndexedRow(pair[1], pair[0])

    indexed_rows = rdd.zipWithIndex().map(to_indexed_row)
    row_matrix = IndexedRowMatrix(indexed_rows)
    return row_matrix.toBlockMatrix(rowsPerBlock, colsPerBlock)

In [104]:
# Distribute the dense array dA as an RDD and wrap it with as_block_matrix.
matrixA = as_block_matrix(spark.sparkContext.parallelize(dA))

# 3. Demo for large matrix

In [89]:
import numpy as np
from pyspark.mllib.linalg.distributed import RowMatrix
# Two separate but identical 1024 x 1024 float32 matrices whose entries count
# up from 0 to 1024**2 - 1 in row-major order.
A, B = (
    np.arange(1024 * 1024, dtype=np.float32).reshape((1024, 1024))
    for _ in range(2)
)

In [90]:
A

array([[  0.00000000e+00,   1.00000000e+00,   2.00000000e+00, ...,
          1.02100000e+03,   1.02200000e+03,   1.02300000e+03],
       [  1.02400000e+03,   1.02500000e+03,   1.02600000e+03, ...,
          2.04500000e+03,   2.04600000e+03,   2.04700000e+03],
       [  2.04800000e+03,   2.04900000e+03,   2.05000000e+03, ...,
          3.06900000e+03,   3.07000000e+03,   3.07100000e+03],
       ..., 
       [  1.04550400e+06,   1.04550500e+06,   1.04550600e+06, ...,
          1.04652500e+06,   1.04652600e+06,   1.04652700e+06],
       [  1.04652800e+06,   1.04652900e+06,   1.04653000e+06, ...,
          1.04754900e+06,   1.04755000e+06,   1.04755100e+06],
       [  1.04755200e+06,   1.04755300e+06,   1.04755400e+06, ...,
          1.04857300e+06,   1.04857400e+06,   1.04857500e+06]], dtype=float32)

In [91]:
from pyspark.mllib.linalg.distributed import *

def as_block_matrix(rdd, rowsPerBlock=1024, colsPerBlock=1024):
    """Turn an RDD of matrix rows into a distributed block matrix.

    The default block dimensions equal the 1024 x 1024 shape of the demo
    matrices built in this section.

    Args:
        rdd: RDD whose elements are the rows of the matrix (in this notebook
            it is obtained by parallelizing a 2-D NumPy array).
        rowsPerBlock: forwarded to ``toBlockMatrix`` as the block row count.
        colsPerBlock: forwarded to ``toBlockMatrix`` as the block column count.

    Returns:
        The result of ``IndexedRowMatrix(...).toBlockMatrix(rowsPerBlock,
        colsPerBlock)``.
    """
    def to_indexed_row(pair):
        # zipWithIndex yields (element, index); IndexedRow wants (index, vector).
        return IndexedRow(pair[1], pair[0])

    indexed_rows = rdd.zipWithIndex().map(to_indexed_row)
    row_matrix = IndexedRowMatrix(indexed_rows)
    return row_matrix.toBlockMatrix(rowsPerBlock, colsPerBlock)

In [92]:
# Time the conversion of A and B into distributed block matrices.
%time matrixA = as_block_matrix(spark.sparkContext.parallelize(A))
%time matrixB = as_block_matrix(spark.sparkContext.parallelize(B))

CPU times: user 36.8 ms, sys: 12.9 ms, total: 49.6 ms
Wall time: 1.09 s
CPU times: user 30.6 ms, sys: 8.93 ms, total: 39.5 ms
Wall time: 980 ms


In [93]:
# Time converting matrixA back to a local matrix (result kept in `aaa`).
%time aaa= matrixA.toLocalMatrix()

CPU times: user 1.21 s, sys: 64.8 ms, total: 1.28 s
Wall time: 5.51 s


In [94]:
# Time the distributed multiplication of matrixA by matrixB.
%time product = matrixA.multiply(matrixB)

CPU times: user 2.47 ms, sys: 1.62 ms, total: 4.09 ms
Wall time: 6.86 s


In [95]:
# Time converting the product to a local matrix; the cell also displays it.
%time product.toLocalMatrix()

CPU times: user 1.21 s, sys: 67 ms, total: 1.28 s
Wall time: 6.24 s


DenseMatrix(1024, 1024, [365967179776.0, 915186122752.0, 1.4644, 2.0136, 2.5628, 3.1120, 3.6612, 4.2104, ..., 5.5946, 5.6001, 5.6056, 5.6111, 5.6166, 5.6221, 5.6276, 5.6331], 0)