# 压缩格式存储

In [4]:
import numpy as np
from scipy.sparse import coo_matrix,csr_matrix, csc_matrix
#  示例解读
indptr = np.array([0, 2, 3, 6])
indices = np.array([0, 2, 2, 0, 1, 2])
data = np.array([1, 2, 3, 4, 5, 6])
csr_matrix((data, indices, indptr), shape=(3, 3)).toarray()
# 按row行来压缩
# 对于第i行，非0数据列是indices[indptr[i]:indptr[i+1]] 数据是data[indptr[i]:indptr[i+1]]
# 在本例中
# 第0行，有非0的数据列是indices[indptr[0]:indptr[1]] = indices[0:2] = [0,2]
# 数据是data[indptr[0]:indptr[1]] = data[0:2] = [1,2],所以在第0行第0列是1，第2列是2
# 第1行，有非0的数据列是indices[indptr[1]:indptr[2]] = indices[2:3] = [2]
# 数据是data[indptr[1]:indptr[2] = data[2:3] = [3],所以在第1行第2列是3
# 第2行，有非0的数据列是indices[indptr[2]:indptr[3]] = indices[3:6] = [0,1,2]
# 数据是data[indptr[2]:indptr[3]] = data[3:6] = [4,5,6],所以在第2行第0列是4，第1列是5,第2列是6

array([[1, 0, 2],
       [0, 0, 3],
       [4, 5, 6]])

In [5]:
indptr = np.array([0, 2, 3, 6])
indices = np.array([0, 2, 2, 0, 1, 2])
data = np.array([1, 2, 3, 4, 5, 6])
csc_matrix((data, indices, indptr), shape=(3, 3)).toarray()

# 按col列来压缩
# i = 0
# 对于第i列，非0数据行是indices[indptr[i]:indptr[i+1]] 数据是data[indptr[i]:indptr[i+1]]
# 在本例中
# 第0列，有非0的数据行是indices[indptr[0]:indptr[1]] = indices[0:2] = [0,2]
# 数据是data[indptr[0]:indptr[1]] = data[0:2] = [1,2],所以在第0列第0行是1，第2行是2
# 第1行，有非0的数据行是indices[indptr[1]:indptr[2]] = indices[2:3] = [2]
# 数据是data[indptr[1]:indptr[2] = data[2:3] = [3],所以在第1列第2行是3
# 第2行，有非0的数据行是indices[indptr[2]:indptr[3]] = indices[3:6] = [0,1,2]
# 数据是data[indptr[2]:indptr[3]] = data[3:6] = [4,5,6],所以在第2列第0行是4，第1行是5,第2行是6

array([[1, 0, 4],
       [0, 0, 5],
       [2, 3, 6]])

In [23]:
from numpy import array
from scipy.sparse import coo_matrix
row  = array([0,0,1,3,1,0,0])
col  = array([0,2,1,3,1,0,0])
data = array([2,1,3,1,2,1,4])
A = coo_matrix((data,(row,col)), shape=(4,4))
print(A)
print("----------Array----------")
print(A.toarray())

B = A.tocsc()
print("---------tocsc-----------")
print(B)

C = B.todense()
print("--------todense------------")
print(C)

  (0, 0)	2
  (0, 2)	1
  (1, 1)	3
  (3, 3)	1
  (1, 1)	2
  (0, 0)	1
  (0, 0)	4
----------Array----------
[[7 0 1 0]
 [0 5 0 0]
 [0 0 0 0]
 [0 0 0 1]]
---------tocsc-----------
  (0, 0)	7
  (1, 1)	5
  (0, 2)	1
  (3, 3)	1
--------todense------------
[[7 0 1 0]
 [0 5 0 0]
 [0 0 0 0]
 [0 0 0 1]]


# 读取邻接矩阵
采用压缩文件格式，51984 => 38464, 压缩率74%

In [7]:
import numpy as np

data = np.load('./data/pemsd7-m/adj.npz')
print(data.files)

['indices', 'indptr', 'format', 'shape', 'data']


In [18]:
print(data['indices'].shape)
# ele ∈ [0, 227]

(19118,)


In [17]:
# shape: (229,)
print(data['indptr'])

[    0    48   123   212   318   407   494   587   645   731   836   945
  1024  1117  1184  1293  1376  1430  1493  1596  1642  1747  1823  1925
  2011  2110  2209  2311  2358  2436  2541  2647  2754  2860  2960  3065
  3168  3267  3366  3407  3506  3553  3652  3731  3777  3831  3885  3991
  4090  4191  4278  4364  4465  4568  4653  4786  4869  4956  5092  5226
  5329  5432  5540  5625  5731  5835  5969  6102  6208  6338  6446  6533
  6619  6722  6802  6900  6994  7085  7215  7305  7392  7485  7607  7695
  7808  7928  8057  8150  8271  8358  8452  8556  8650  8757  8851  8949
  9074  9199  9293  9373  9495  9613  9706  9808  9882  9986 10060 10150
 10267 10333 10391 10427 10456 10520 10549 10618 10673 10731 10786 10855
 10899 10959 11011 11063 11116 11187 11240 11308 11374 11430 11486 11547
 11587 11648 11712 11768 11799 11860 11914 11970 12033 12079 12193 12328
 12445 12509 12647 12782 12862 12996 13063 13140 13233 13326 13416 13509
 13591 13661 13743 13814 13862 13934 13966 14037 14

In [11]:
print(data['format'])

print(data['shape'])

b'csc'
[228 228]


In [13]:
print(data['data'].shape)

(19118,)


# 读取数据文件vel.csv


In [14]:
with open('./data/pemsd7-m/vel.csv', encoding='utf-8') as f:
    data_2 = np.loadtxt(f, delimiter=",")
    print(data_2.shape)

(12672, 228)


# sp.load_npz()

In [26]:
import scipy.sparse as sp

data = sp.load_npz('./data/pemsd7-m/adj.npz')
data1 = data
print(data)

print(data.tocsc())
if data1 == data:
    print(1)

  (0, 0)	1.0
  (1, 0)	0.874723658573536
  (2, 0)	0.3612846282098772
  (3, 0)	0.150751115862349
  (4, 0)	0.44773753416261347
  (7, 0)	0.9366682108438592
  (8, 0)	0.6294460508233404
  (11, 0)	0.7839333057097398
  (12, 0)	0.3309976422216514
  (15, 0)	0.6802770768089623
  (58, 0)	0.10052330878754748
  (65, 0)	0.10478651398778062
  (68, 0)	0.12590298087652654
  (108, 0)	0.5608901322885123
  (109, 0)	0.6899634477684644
  (112, 0)	0.20452601018760724
  (114, 0)	0.32572485193348727
  (115, 0)	0.8756092297080661
  (116, 0)	0.1010639188922501
  (118, 0)	0.30866295600959187
  (120, 0)	0.7924221612667589
  (123, 0)	0.9318102066961166
  (124, 0)	0.4643323538620246
  (126, 0)	0.5328487422206488
  (127, 0)	0.8412964212185728
  :	:
  (185, 227)	0.406905517928299
  (186, 227)	0.15049343836804044
  (187, 227)	0.18220630019898112
  (188, 227)	0.3979505510953252
  (192, 227)	0.17455919974749026
  (193, 227)	0.4049386337428703
  (195, 227)	0.13101336046275158
  (197, 227)	0.13362495666231547
  (203, 227)	0

  exec(code_obj, self.user_global_ns, self.user_ns)


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all().

In [20]:
import numpy as np

data = np.load('./data/pemsd7-m/adj.npz')
print(data.files)

['indices', 'indptr', 'format', 'shape', 'data']


In [21]:
print(len(data['data']))
print(data['data'])

19118
[1.         0.87472366 0.36128463 ... 0.11876059 1.         1.        ]


In [30]:
import math

print(math.log(1/2))

-0.6931471805599453


In [47]:
import numpy as np

def matrix_to_csc(matrix):
    rows, cols = matrix.shape
    non_zero_indices = np.transpose(np.nonzero(matrix))  # 获取非零元素的坐标并转置
    non_zero_indices_sorted = non_zero_indices[np.argsort(non_zero_indices[:, 1])]
    print(non_zero_indices_sorted)
    # 构建CSC格式的三个数组
    indptr = [0]
    indices = []
    data = []

    current_col = 0
    for row_idx,col_idx  in non_zero_indices_sorted:

            # 更新indptr数组
        while col_idx > current_col:
            indptr.append(len(indices))
            current_col += 1


        # 存储非零元素的行索引和值
        indices.append(row_idx)
        data.append(matrix[row_idx, col_idx])


    indptr.append(len(indices))
    current_col += 1
    return indptr, indices, data

# 示例使用
matrix = np.array([[1, 0, 3],
                   [0, 0, 2],
                   [4, 0, 5]])

indptr, indices, data = matrix_to_csc(matrix)

print("indptr:", indptr)  # [0 2 3 5]
print("indices:", indices)  # [0 2 0 2 2]
print("data:", data)  # [1 4 3 2 5]


[[0 0]
 [2 0]
 [0 2]
 [1 2]
 [2 2]]
indptr: [0, 2, 2, 5]
indices: [0, 2, 0, 1, 2]
data: [1, 4, 3, 2, 5]
