# View all the matrix info in the dataset.

In [1]:
import os, sys
import numpy as np
import csv
from tqdm import tqdm

In [2]:
# Root of the WACO baseline: $AUTOSPARSE_HOME/baseline/waco.
# os.path.join raises TypeError here when AUTOSPARSE_HOME is not set.
waco_prefix = os.environ.get("AUTOSPARSE_HOME")
waco_prefix = os.path.join(waco_prefix, "baseline", "waco")


In [4]:
def from_csr(filename):
    """Load a ``.csr`` matrix file and return its nonzero pattern as COO pairs.

    The file is read from
    ``$AUTOSPARSE_HOME/baseline/waco/pretrained/dataset/<filename>.csr`` as a
    flat array of little-endian 32-bit integers laid out as
    ``[num_row, num_col, nnz, row_ptr (num_row + 1 entries), col_idx ...]``.

    Args:
        filename: Matrix name without the ``.csr`` extension.

    Returns:
        ``(num_row, num_col, nnz, coo)`` where ``coo`` is an ``(nnz, 2)``
        integer array whose columns are the row and column index of each
        stored entry. Returns ``None`` (after printing an error) when the
        ``AUTOSPARSE_HOME`` environment variable is not set.
    """
    autosparse_home = os.getenv("AUTOSPARSE_HOME")
    if autosparse_home is None:
        # Name the variable that is actually read, so the user sets the
        # right one.
        print("Err : environment variable AUTOSPARSE_HOME is not defined")
        return None

    filepath = os.path.join(
        autosparse_home, "baseline", "waco", "pretrained", "dataset",
        filename + ".csr",
    )
    csr = np.fromfile(filepath, dtype="<i4")
    num_row, num_col, nnz = csr[0], csr[1], csr[2]

    # Row pointers occupy the num_row + 1 slots right after the 3-int header;
    # everything after them is taken as the column indices.
    row_ptr = csr[3:num_row + 4]
    coo = np.zeros((nnz, 2), dtype=int)
    coo[:, 1] = csr[num_row + 4:]  # col
    # Consecutive row-pointer differences give the entry count of each row;
    # repeating each row index that many times expands CSR rows to COO rows.
    coo[:, 0] = np.repeat(np.arange(num_row), np.diff(row_ptr))
    return num_row, num_col, nnz, coo

In [5]:
def get_all_files_in_directory(directory):
    """Return the names of the regular files located directly in ``directory``.

    Sub-directories are skipped and not descended into. Names are returned
    without the directory prefix, in the order ``os.listdir`` yields them.
    """
    return [
        entry
        for entry in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, entry))
    ]

In [10]:
# Read the 3-integer header of every .csr matrix in the dataset directory and
# write the matrices whose row and column counts are both multiples of 32 to
# a CSV summary file.
prefix = os.getenv("AUTOSPARSE_HOME")
if prefix is None:
    # Fail with a clear message instead of the TypeError that
    # os.path.join(None, ...) would raise below.
    raise RuntimeError("environment variable AUTOSPARSE_HOME is not defined")
waco_prefix = os.path.join(prefix, "baseline", "waco")
dataset_info = []
csv_file_path = os.path.join(waco_prefix, 'dataset_analyse_32.csv')
print("[INFO] analyse dataset info.")
# NOTE(review): matrices are listed from <prefix>/pretrained/dataset, while
# from_csr reads <prefix>/baseline/waco/pretrained/dataset - confirm which
# location is intended.
files_in_directory = get_all_files_in_directory(
    os.path.join(prefix, "pretrained", "dataset")
)
for file_name in tqdm(
    files_in_directory, total = len(files_in_directory),
    desc = "Analyse csr file"
):
    # Match on the extension only; a substring test would also accept names
    # such as "x.csr.bak".
    if not file_name.endswith(".csr"):
        continue
    mtx_filepath = os.path.join(prefix, "pretrained", "dataset", file_name)
    # Only the header (rows, cols, nnz) is needed, so read just three
    # little-endian int32 values instead of the whole file.
    num_row, num_col, num_nonezero = np.fromfile(
        mtx_filepath, count=3, dtype = '<i4'
    )

    dataset_info.append([
        # Strip only the trailing ".csr" so matrix names that contain dots
        # are kept intact.
        os.path.splitext(file_name)[0], num_row, num_col,
        # Reported under "Sparsity": fraction of entries that are nonzero,
        # nnz / (rows * cols).
        num_nonezero, num_nonezero*1.0/num_row/num_col,
        os.path.getsize(mtx_filepath)
    ])

# Write to a csv file
with open(csv_file_path, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow([
        "File Name", "Num Rows", "Num Cols", "Num Nonzero", 
        "Sparsity", "Filesize"
    ])
    for data_row in tqdm(
        dataset_info, total = len(dataset_info), desc="Write analyse csv file"
    ):
        # Keep only matrices whose row and column counts are multiples of 32.
        if data_row[1] % 32 == 0 and data_row[2] % 32 == 0:
            csv_writer.writerow(data_row)

[INFO] analyse dataset info.


Analyse csr file: 100%|██████████| 975/975 [00:00<00:00, 19848.98it/s]
Write analyse csv file: 100%|██████████| 975/975 [00:00<00:00, 466405.84it/s]
