# scipy.sparse cheatsheet

## Быстрое создание CSR матрицы

In [1]:
import numpy as np
from scipy import sparse

In [2]:
# Describe every filled cell as a (row id, column id, value) triple ...
cells = [(i, i, i) for i in range(10)]

# ... then unzip the triples into the three parallel lists expected by the
# (data, (rows, cols)) form of the csr_matrix constructor.
rows, cols, data = (list(column) for column in zip(*cells))

sm = sparse.csr_matrix((data, (rows, cols)))
sm.todense()

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 2, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 3, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 4, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 5, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 6, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 7, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 8, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 9]], dtype=int32)

## Рандомная матрица с определенной плотностью и типом

In [4]:
# 10x10 CSR matrix in which 10% of the cells hold a random float64 value
sm = sparse.rand(
    10, 10,
    density=0.1,
    format='csr',
)
sm

<10x10 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

## Количество ненулевых элементов

In [6]:
# 1000x1000 CSR matrix with 0.1% of its cells filled (1000 stored values)
sm = sparse.rand(
    1000, 1000,
    density=0.001,
    format='csr',
)
sm

<1000x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 1000 stored elements in Compressed Sparse Row format>

In [7]:
# whole matrix: total number of stored entries
sm.nnz

1000

In [11]:
from sklearn.preprocessing import binarize

In [12]:
%%time
# в строках
binarize(sm).sum(axis=1).A1

Wall time: 495 µs


array([5.])

In [13]:
%%time
# в столбцах
binarize(sm).sum(axis=0).A1

Wall time: 2.97 ms


array([1., 1., 0., 0., 0., 1., 0., 0., 1., 1.])

## Сортировка ненулевого содержимого строки в CSR

In [16]:
# Single-row CSR matrix with 10 columns, half of them filled
sm = sparse.rand(1, 10, density=0.5, format='csr')
sm.todense()

matrix([[0.68159831, 0.51341214, 0.16990306, 0.18732587, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.17114247]])

In [17]:
# Walk the stored values of the first row from largest to smallest.
# row.data holds the values and row.indices the matching column ids, so one
# ordering of positions is applied to both arrays.
row = sm[0]
order = np.argsort(row.data)[::-1]
for col_id, val in zip(row.indices[order], row.data[order]):
    print("col_id:", col_id, "val:", val)

col_id: 0 val: 0.6815983082427854
col_id: 1 val: 0.5134121353165945
col_id: 3 val: 0.18732586621903047
col_id: 9 val: 0.1711424692899004
col_id: 2 val: 0.16990305583784593


## Поиск top-N значений строки в CSR

In [18]:
# Single-row CSR matrix with 100 columns, half of them filled
sm = sparse.rand(1, 100, density=0.5, format='csr')

In [19]:
top = 5
row = sm[0]
# Positions of the stored values, ordered from smallest to largest value.
order = np.argsort(row.data)
# Take the tail through an explicit start index: the shortcut order[-top:]
# would return *every* element for top == 0, because -0 == 0.
start = max(len(order) - top, 0)
# Printed from the smallest to the largest of the top values.
for arg_id in order[start:]:
    print("col_id:", row.indices[arg_id], "val:", row.data[arg_id])

col_id: 25 val: 0.940574505548259
col_id: 29 val: 0.9710635234321112
col_id: 89 val: 0.9820824888289549
col_id: 48 val: 0.9968481699573234
col_id: 75 val: 0.9985655881547676


## Поиск cosine similarity между строками матрицы

In [20]:
# 5 rows x 100 columns, half of the cells filled
sm = sparse.rand(5, 100, density=0.5, format='csr')

In [22]:
from sklearn.preprocessing import normalize

# Rescale every row with normalize() (sklearn defaults: L2 norm, per row);
# entry (i, j) of the product with the transpose is then the cosine
# similarity between rows i and j.
n_sm = normalize(sm)
sim_m = n_sm @ n_sm.T
sim_m  # careful: the result can turn out to be a very dense matrix

<5x5 sparse matrix of type '<class 'numpy.float64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [23]:
# Dense view of the row-by-row similarity matrix, for display
sim_m.todense()

matrix([[1.        , 0.4043736 , 0.28862666, 0.49324888, 0.34978905],
        [0.4043736 , 1.        , 0.33011815, 0.42464815, 0.39766884],
        [0.28862666, 0.33011815, 1.        , 0.34860013, 0.35535945],
        [0.49324888, 0.42464815, 0.34860013, 1.        , 0.44088587],
        [0.34978905, 0.39766884, 0.35535945, 0.44088587, 1.        ]])

## Быстрое зануление диагонали

In [24]:
# Ones on the main diagonal, same shape as sim_m. sparse.eye builds this
# directly, with no hand-written index lists.
eye = sparse.eye(*sim_m.shape, format='csr')
# multiply() is element-wise, so sim_m.multiply(eye) keeps only the diagonal
# entries; subtracting them leaves every off-diagonal value untouched.
sim_m = sim_m - sim_m.multiply(eye)

In [25]:
# Dense view after the diagonal has been zeroed, for display
sim_m.todense()

matrix([[0.        , 0.4043736 , 0.28862666, 0.49324888, 0.34978905],
        [0.4043736 , 0.        , 0.33011815, 0.42464815, 0.39766884],
        [0.28862666, 0.33011815, 0.        , 0.34860013, 0.35535945],
        [0.49324888, 0.42464815, 0.34860013, 0.        , 0.44088587],
        [0.34978905, 0.39766884, 0.35535945, 0.44088587, 0.        ]])

## Зануление значений матрицы по маске

In [26]:
# Random 0/1 mask shaped (1, 5); the upper bound 2 is exclusive
mask = np.random.randint(low=0, high=2, size=(1, 5))
mask

array([[0, 1, 1, 0, 0]])

In [27]:
# 5x5 CSR matrix with about half of the cells filled
sm = sparse.rand(5, 5, density=0.5, format='csr')
sm.todense()

matrix([[0.96984044, 0.1955809 , 0.24296056, 0.        , 0.814099  ],
        [0.        , 0.        , 0.        , 0.        , 0.02146882],
        [0.        , 0.26044102, 0.        , 0.        , 0.79646274],
        [0.95040723, 0.        , 0.        , 0.        , 0.        ],
        [0.47264014, 0.40863907, 0.60154649, 0.        , 0.90618502]])

In [24]:
# mask applied along each row: the (1, 5) mask is broadcast over all rows,
# so every column whose mask value is 0 is zeroed
sm.multiply(mask).todense()

matrix([[0.       , 0.       , 0.       , 0.       , 0.       ],
        [0.       , 0.       , 0.       , 0.       , 0.       ],
        [0.       , 0.       , 0.       , 0.       , 0.       ],
        [0.       , 0.       , 0.       , 0.       , 0.       ],
        [0.       , 0.       , 0.5784594, 0.       , 0.       ]])

In [28]:
# mask applied along each column: mask.T has shape (5, 1),
# so every row whose mask value is 0 is zeroed
sm.multiply(mask.T).todense()

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.02146882],
        [0.        , 0.26044102, 0.        , 0.        , 0.79646274],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 0.        ]])

## Зануление ячеек матрицы A, присутствующих в матрице B

In [29]:
# Matrix A (dense view, for display)
sm.todense()

matrix([[0.96984044, 0.1955809 , 0.24296056, 0.        , 0.814099  ],
        [0.        , 0.        , 0.        , 0.        , 0.02146882],
        [0.        , 0.26044102, 0.        , 0.        , 0.79646274],
        [0.95040723, 0.        , 0.        , 0.        , 0.        ],
        [0.47264014, 0.40863907, 0.60154649, 0.        , 0.90618502]])

In [30]:
# Matrix B: another 5x5 CSR matrix with about half of the cells filled
sm1 = sparse.rand(5, 5, density=0.5, format='csr')
sm1.todense()

matrix([[0.04163227, 0.        , 0.11128635, 0.49039824, 0.        ],
        [0.09116916, 0.        , 0.25556216, 0.29867247, 0.58953823],
        [0.        , 0.        , 0.34420871, 0.        , 0.44112522],
        [0.        , 0.        , 0.        , 0.        , 0.94890424],
        [0.40620949, 0.        , 0.20677508, 0.        , 0.        ]])

In [31]:
# Boolean sparse mask of the cells that hold a non-zero value in B (sm1).
# Comparing with 0 also catches negative values, which a threshold-0
# binarize() would treat as absent.
present_in_b = sm1 != 0
# Subtract from A exactly those of its values that sit in B's cells;
# all other values of A are left untouched.
sm = sm - sm.multiply(present_in_b)
sm.todense()

matrix([[0.        , 0.1955809 , 0.        , 0.        , 0.814099  ],
        [0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.26044102, 0.        , 0.        , 0.        ],
        [0.95040723, 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.40863907, 0.        , 0.        , 0.90618502]])