In [1]:
import numpy as np

In [2]:
# Given 2 sparse matrices A and B, return the result of matrix multiplication C = AB

class SparseMatrix(object):
    """Sparse matrix stored as a dictionary of its truthy entries.

    Attributes:
        nrow: number of rows.
        ncol: number of columns.
        S: dict mapping ``(row, col)`` to the value stored at that position;
            positions that are absent from the dict are treated as 0.
    """

    def __init__(self, nrow, ncol, S):
        """Wrap an existing ``{(row, col): value}`` mapping ``S``."""
        self.nrow = nrow
        self.ncol = ncol
        self.S = S

    @staticmethod
    def to_sparse(M):
        """Return ``{(row, col): value}`` for every truthy entry of dense ``M``."""
        return {
            (r, c): value
            for r, row in enumerate(M)
            for c, value in enumerate(row)
            if value
        }

    @classmethod
    def from_dense(cls, M):
        """Build a SparseMatrix from a dense row-major 2-D sequence.

        An empty ``M`` yields a 0 x 0 matrix instead of raising IndexError.
        """
        nrow = len(M)
        # len() rather than truthiness: a NumPy array has no unambiguous bool().
        ncol = len(M[0]) if nrow else 0
        return cls(nrow, ncol, cls.to_sparse(M))

    @classmethod
    def from_sparse(cls, nrow, ncol, S):
        """Build a SparseMatrix from dimensions and a ``{(row, col): value}`` dict."""
        return cls(nrow, ncol, S)

    def matmul(self, B):
        """Return the matrix product ``self @ B`` as a new SparseMatrix.

        Args:
            B: SparseMatrix whose row count equals ``self.ncol``.

        Returns:
            SparseMatrix with ``self.nrow`` rows and ``B.ncol`` columns.

        Raises:
            ValueError: if the inner dimensions do not match.
        """
        if self.ncol != B.nrow:
            raise ValueError(
                "dimension mismatch: cannot multiply %dx%d by %dx%d"
                % (self.nrow, self.ncol, B.nrow, B.ncol)
            )

        # Group B's entries by row so each non-zero of A visits only the
        # entries it can pair with, instead of probing every column of B.
        b_rows = {}
        for (b_r, b_c), b_val in B.S.items():
            b_rows.setdefault(b_r, []).append((b_c, b_val))

        C = {}
        for (a_r, a_c), a_val in self.S.items():
            for b_c, b_val in b_rows.get(a_c, ()):
                key = (a_r, b_c)
                C[key] = C.get(key, 0) + a_val * b_val

        # Terms can cancel to 0; drop them so the result stores only truthy
        # values, like the dicts produced by to_sparse().
        C = {key: value for key, value in C.items() if value}
        return self.from_sparse(self.nrow, B.ncol, C)

    def to_dense(self):
        """Return the matrix as a list of ``nrow`` lists of length ``ncol``."""
        M = [[0] * self.ncol for _ in range(self.nrow)]
        for (r, c), value in self.S.items():
            M[r][c] = value
        return M

In [7]:
def multiply(A, B):
    """Multiply two dense matrices by way of their sparse representations.

    Both operands are converted with ``SparseMatrix.from_dense``, multiplied
    with ``SparseMatrix.matmul``, and the product is converted back with
    ``to_dense`` (a list of row lists).
    """
    lhs = SparseMatrix.from_dense(A)
    rhs = SparseMatrix.from_dense(B)
    return lhs.matmul(rhs).to_dense()

In [8]:
# Dense example operands: matA is 2x3 and matB is 3x3, both mostly zeros.
matA = np.array([[1, 0, 0], [-1, 0, 3]])
matB = np.array([[7, 0, 0], [0, 0, 0], [0, 0, 1]])

In [9]:
# Product computed through SparseMatrix; multiply() hands back a dense list of row lists.
matC = multiply(matA, matB)

In [10]:
matC

[[7, 0, 0], [-7, 0, 3]]

### Jaccard Similarity
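The code below computes the pairwise Jaccard similarities between the columns of a sparse matrix by working directly on the raw arrays (`data`, `indices`) of a `scipy.sparse` matrix.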

In [15]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, csc_matrix

In [16]:
def jaccard_similarities(mat):
    """Pairwise Jaccard similarity between the columns of ``mat``.

    Each column is treated as a set: row ``i`` belongs to column ``j``'s set
    when ``mat[i, j]`` is non-zero. The magnitude of the stored values is
    ignored, so weighted input gives the same result as its 0/1 pattern.

    Args:
        mat: sparse matrix (any scipy.sparse format) or dense 2-D array.
            It is not modified.

    Returns:
        CSR matrix of shape ``(n_cols, n_cols)`` whose entry ``(i, j)`` is
        ``|i & j| / |i | j|``. Pairs with an empty intersection are not
        stored, so they read as 0.
    """
    # Work on a private 0/1 copy: set sizes are counted from the non-zero
    # pattern, so the intersection counts must come from the same pattern.
    pattern = csc_matrix(mat, copy=True)
    pattern.eliminate_zeros()  # explicitly stored zeros are not set members
    pattern.data = np.ones_like(pattern.data, dtype=np.int64)

    set_sizes = pattern.getnnz(axis=0)

    # `@` is always a matrix product (`*` is element-wise on sparse arrays).
    # CSR layout: data is grouped by row, `indices` holds the column ids.
    intersections = (pattern.T @ pattern).tocsr()

    # Size of the row's set, repeated once per stored entry in that row.
    row_sizes = np.repeat(set_sizes, np.diff(intersections.indptr))
    # Size of the column's set for every stored entry.
    col_sizes = set_sizes[intersections.indices]

    similarities = intersections.copy()
    # |A | B| = |A| + |B| - |A & B|; it is >= 1 for every stored entry
    # because a stored entry means the intersection is non-empty.
    similarities.data = intersections.data / (
        row_sizes + col_sizes - intersections.data
    )

    return similarities

In [22]:
# Toy indicator table: one row per user, one 0/1 column per feature.
data = [
    [0, 1, 0, 0, 0, 1, 0, 0, 1],
    [0, 0, 1, 0, 0, 0, 0, 0, 1],
    [1, 1, 0, 0, 0, 1, 0, 0, 0],
]
R = pd.DataFrame(
    data,
    index=["User%d" % u for u in range(1, 4)],
    columns=["F%d" % f for f in range(9)],
)
R

Unnamed: 0,F0,F1,F2,F3,F4,F5,F6,F7,F8
User1,0,1,0,0,0,1,0,0,1
User2,0,0,1,0,0,0,0,0,1
User3,1,1,0,0,0,1,0,0,0


In [23]:
# User-user similarities: R.T has one column per user, and
# jaccard_similarities compares the columns of the matrix it is given.
jac_sim = jaccard_similarities(csc_matrix(R.T))

In [24]:
print(jac_sim)

  (0, 1)	0.25
  (0, 2)	0.5
  (0, 0)	1.0
  (1, 0)	0.25
  (1, 1)	1.0
  (2, 0)	0.5
  (2, 2)	1.0


In [25]:
jac_sim.toarray()

array([[1.  , 0.25, 0.5 ],
       [0.25, 1.  , 0.  ],
       [0.5 , 0.  , 1.  ]])

In [26]:
# Another way: the same user-user Jaccard matrix built from dense NumPy arrays.
n = len(R)  # number of users (rows of R)
X = R.T.values  # features x users
# Binarise so that only the presence of a feature matters.
csr = csr_matrix((X > 0).astype(int))
# intersect[i, j] = number of features shared by users i and j.
intersect = csr.T.dot(csr).toarray()
# Features per user, counted on the same binarised view as `intersect`;
# summing the raw values of R would disagree for any entry that is not 0/1.
rowsum = (X > 0).sum(axis=0)
rsumtile = np.repeat(rowsum.reshape((n, 1)), n, axis=1)
# |A | B| = |A| + |B| - |A & B|
union = rsumtile.T + rsumtile - intersect
# NOTE: a user with no features makes the diagonal of `union` 0, giving 0/0 = NaN there.
jaccard_similarity = intersect / union
jaccard_similarity

array([[1.  , 0.25, 0.5 ],
       [0.25, 1.  , 0.  ],
       [0.5 , 0.  , 1.  ]])