In [1]:
import numpy as np
from scipy.sparse import csr_matrix, csc_matrix, lil_matrix, vstack, hstack, save_npz, load_npz, block_diag, identity, random
from scipy.sparse.linalg import inv, spsolve, splu
from scipy.linalg import lu
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Pool, shared_memory
import matplotlib.pyplot as plt
import time
import cProfile
import pstats 
import csv
import os 
from tqdm import tqdm

In [2]:
def lower_block_bidiagonal_nonsingular(n_blocks, block_size):
    """
    Generate a nonsingular sparse lower block bidiagonal matrix in CSR format.

    Parameters:
        n_blocks (int): Number of diagonal blocks.
        block_size (int): Size of each square block.

    Returns:
        scipy.sparse.csr_matrix: The resulting nonsingular sparse matrix.
        numpy.ndarray: The corresponding RHS vector.
    """
    N = n_blocks * block_size  # Total size of the matrix
    data, row_indices, col_indices = [], [], []

    # Generate diagonal (B_i) and lower diagonal (L_i) blocks
    for i in range(n_blocks):
        row_offset = i * block_size
        col_offset = i * block_size

        # Ensure nonzero entries in the main diagonal block (B_i)
        block_main = np.random.rand(block_size, block_size) + np.eye(block_size)  # Make B_i non-singular
        for r in range(block_size):
            for c in range(block_size):
                val = block_main[r, c]
                data.append(val)
                row_indices.append(row_offset + r)
                col_indices.append(col_offset + c)

        # Lower block (L_i), ensuring nonzero entries
        if i < n_blocks - 1:
            row_offset = (i + 1) * block_size
            col_offset = i * block_size
            block_lower = np.random.rand(block_size, block_size)  # Random values ensure nonzero entries

            for r in range(block_size):
                for c in range(block_size):
                    val = block_lower[r, c]
                    data.append(val)
                    row_indices.append(row_offset + r)
                    col_indices.append(col_offset + c)

    # Create sparse CSR matrix
    sparse_matrix = csr_matrix((data, (row_indices, col_indices)), shape=(N, N))

    # Generate a random RHS vector (column vector)
    rhs_vector = np.random.rand(N, 1)  # Nx1 dense vector

    return sparse_matrix, rhs_vector

In [3]:
p = 4 # no. of processors
k = 2 # recursion/iteration depth
n = int(p*2**k)

n_blocks = n + 1
block_size = 2

# M, f = lower_block_bidiagonal_nonsingular(n_blocks, block_size)
# x = spsolve(M,f)

# save_folder = "LBBM_p4" # Lower block bidiagonal matrix for 4 processors
# save_npz(f"{save_folder}/n_{n_blocks}_mat.npz",M)
# np.save(f"{save_folder}/n_{n_blocks}_rhs.npy",f)
# np.save(f"{save_folder}/n_{n_blocks}_sol.npy",x)

In [4]:
save_folder = "LBBM_p4"
M,f,x = load_npz(f"{save_folder}/n_{n_blocks}_mat.npz"), np.load(f"{save_folder}/n_{n_blocks}_rhs.npy"), np.load(f"{save_folder}/n_{n_blocks}_sol.npy")

In [30]:
def placeholder_name(M, f, block_size : int, processors : int):
    """ 
    Performs Block Cyclic Reduction (BCR) in parallel for solving lower block bidiagonal systems.

    Parameters:
    -----------
    M : scipy.sparse.csr_matrix or numpy.ndarray
        The coefficient matrix of size (N, N), where N = (n+1) * block_size.
        Must be a square lower block bidiagonal matrix.

    f : numpy.ndarray
        The right-hand side (RHS) vector of size (N, 1), corresponding to Mx = f.

    block_size : int
        The size of each block in the matrix.

    processors : int
        The number of processors used for parallel block cyclic reduction.

    Returns:
    --------
    x : numpy.ndarray
        The solution vector of size (N, 1) satisfying Mx = f.
    """
    N, L = M.shape
    assert N == L,  f"M must be sqaure but has dimensions {N}x{L}"
    n = (N - 1) // block_size
    assert n % processors == 0, f"M must have size (n+1)*block_size x (n+1)*block_size, where n = p * 2**k. p is not a multiple of n."
    nbyp = n // processors 
    assert ((nbyp & (nbyp-1) == 0) and nbyp != 0), f"M must have size (n+1)*block_size x (n+1)*block_size, where n = p * 2**k. n/p is not a power of two."

    row_index_start = block_size
    row_index_end = block_size*(1+nbyp)
    col_index_start = 0
    col_index_end = block_size*(nbyp+1)
    
    # Divide among the processors
    for processor in range(processors):
        M_copy = M[row_index_start:row_index_end, col_index_start:col_index_end]
        f_copy = f[row_index_start:row_index_end]
        M_k, f_k, A_k_s, B_k_s, f_k_s = forward_placeholder(M_copy, f_copy, block_size, processors)
        break
        row_index_start = row_index_end
        row_index_end += nbyp*block_size 
        col_index_start = col_index_end - block_size
        col_index_end += block_size*nbyp
    
    sol_x = []
    x0 = spsolve(M[:block_size,:block_size],f[:block_size])
    B_k = M_k[:,:block_size]
    A_k = M_k[:,block_size:]
    x8 = spsolve(A_k,f_k-B_k@x0)
    print(f"x0: {x0}")
    print(f"x8: {x8}")

    for i in range(len(A_k_s)):
        something =  spsolve(A_k_s[i], f_k_s[i].flatten() - B_k_s[i]@x0)
        print("x?: ",something)
    # print(len(A_k_s))
    # x4 = spsolve(A_k_s[2], f_k_s[2].flatten() - B_k_s[2]@x0)
    # print(f"x4: {x4}")
    

def forward_placeholder(M, f, block_size : int, processors : int, A_s = [], B_s = [], f_s = []):
    n,m = M.shape
    if n == block_size:
        print(f"Forward step finished!")
        return M,f,A_s,B_s,f_s
    
    M_next = csr_matrix((n//2,n//2+block_size))
    f_next = np.zeros(n//2)
    # Do one step
    for i in range(0,n,2*block_size):
        # Extract block elements from input
        B1 = M[i:i+block_size, i:i+block_size] 
        A1 = M[i:i+block_size, i + block_size: i + 2*block_size] 
        B2 = M[i + block_size: i + 2*block_size,i + block_size: i + 2*block_size ]
        A2 = M[i + block_size: i + 2*block_size,i + 2*block_size: i + 3*block_size]
        f1 = f[i:i+block_size]
        f2 = f[i + block_size: i + 2*block_size]

        # Compute inverses and values for the next depth. This is equivalent to removing all odd indices from the input
        A1_inv = inv(A1)
        B2_inv = inv(B2)
        new_B1 = A1_inv@B1
        new_A1 = -B2_inv@A2
        new_f1 = A1_inv@f1 - B2_inv@f2

        A_s.append(new_A1)
        B_s.append(new_B1)
        f_s.append(new_f1)

        # Set the new values to obtain a reduced system of half the original size
        j = i//2
        M_next[j:j+block_size,j:j+block_size] = new_B1
        M_next[j:j+block_size,j+block_size:j+2*block_size] = new_A1
        f_next[j:j+block_size] = new_f1.flatten()

    # Recursively apply the same procedure
    return forward_placeholder(M_next,f_next,block_size,processors,A_s,B_s,f_s)
    

np.set_printoptions(precision=4, suppress=True)
placeholder_name(M,f,block_size=block_size,processors=p)
print(x)


Forward step finished!
x0: [0.0805 0.2187]
x8: [0.1314 0.4535]
x?:  [ 0.3779 -0.0888]
x?:  [0.144  0.4618]
x?:  [0.1314 0.4535]
[ 0.0805  0.2187 -0.3295  0.4999  0.3779 -0.0888  1.0978 -0.5688  0.1314
  0.4535  0.0751  0.3689 -0.5048  0.9562  0.1716  0.4375  0.0131 -0.0088
 -0.0643  0.6311  0.4779  0.2102  0.4897 -0.3163 -0.214   0.9643 -0.3173
  0.4477  0.2401  0.2972  0.1356  0.6062  0.1661 -0.0249]


  return splu(A).solve
  Ainv = spsolve(A, I)
  self._set_arrayXarray_sparse(i, j, x)
