In [None]:
import numpy as np
from matplotlib import pyplot as plt
from scipy.fft import dct, idct
from skimage.io import imread

%matplotlib inline
%config InlineBackend.figure_format='retina'

In [None]:
# Base directory used to build data paths (the image is read from f"{rootfolder}/data/...")
rootfolder = ".."

# 2D-DCT

The goal of this section is to define and use the dictionary representing the 2D-DCT.

Let $s$ be a $p\times p$ patch extracted from an image. We can see $s$ as a vector of dimension $M=p^2$. We can move from one representation to the other through a `reshape` operation.


The following code defines two functions: `dct2` and `idct2`. The `dct2` function computes the coefficients $x\in\mathbb{R}^M$ of a patch $s\in\mathbb{R}^M$, while `idct2` is the inverse function:

$$
x = \text{dct2}(s) = D^Ts, \qquad s = \text{idct2}(x) = Dx
$$

NB: here $D$ is _not_ the 1D-DCT matrix.


In [None]:
def dct2(s):
    """Return the 2D DCT of ``s`` built from two orthonormal 1D DCT passes.

    The first pass transforms ``s.T`` along its last axis; the result is
    transposed back and transformed along its last axis again. Both passes
    use ``norm="ortho"``. The output has the same shape as ``s``.
    """
    first_pass = dct(s.T, norm="ortho")
    second_pass = dct(first_pass.T, norm="ortho")
    return second_pass


def idct2(x):
    """Return the inverse 2D DCT of ``x`` built from two orthonormal 1D passes.

    The first pass applies the inverse DCT to ``x.T`` along its last axis;
    the result is transposed back and inverse-transformed along its last
    axis again. Both passes use ``norm="ortho"``. The output has the same
    shape as ``x``.
    """
    first_pass = idct(x.T, norm="ortho")
    second_pass = idct(first_pass.T, norm="ortho")
    return second_pass

Helper function to plot the 2D-DCT dictionary


In [None]:
def get_dictionary_img(D):
    """Tile the columns of ``D`` into a single 2D array for display.

    ``p`` is the rounded square root of ``D.shape[0]``. For each index ``i``
    in ``range(D.shape[0])``, column ``i`` is reshaped to ``p x p``, rescaled
    to ``[0, 1]`` unless it is constant, and written into the tile at row
    ``i % p`` and column ``i // p``. Neighbouring tiles are separated by a
    2-pixel gap, which keeps the initial value 1.

    Returns the assembled array of side ``p * p + 2 * (p - 1)``.
    """
    dim = D.shape[0]
    p = int(round(np.sqrt(dim)))
    gap = 2
    stride = p + gap  # distance between the origins of two adjacent tiles
    side = p * p + gap * (p - 1)
    canvas = np.ones((side, side))

    for idx in range(dim):
        tile_col, tile_row = divmod(idx, p)
        top = tile_row * stride
        left = tile_col * stride

        atom = D[:, idx].reshape((p, p))
        lo, hi = atom.min(), atom.max()
        if lo < hi:
            # Stretch the atom to the full [0, 1] range; constant atoms are left as they are
            atom = (atom - lo) / (hi - lo)
        canvas[top : top + p, left : left + p] = atom

    return canvas

Set the patch size


In [None]:
# Side length of the square patch
p = 8
# Length of the patch once unrolled into a vector (p * p entries)
M = p * p

Load the image and extract a patch


In [None]:
# Read the image and divide by 255 to bring it into the [0, 1] range
# (assumes 8-bit pixel values - TODO confirm against the data file)
img = imread(f"{rootfolder}/data/cameraman.png") / 255

# Top-left p x p block of the image
s_block = img[:p, :p]

# Vector form of the block (a flattened copy)
s = s_block.flatten()

# Display the block in grayscale
plt.imshow(s_block, cmap="gray")

## Compute the 2D DCT matrix (M x M)

To build the matrix corresponding to the 2D DCT we can use the `idct2` function.


In [None]:
# Column cnt of D is the flattened patch obtained by applying idct2 to a
# coefficient block that is zero everywhere except for a 1 at position (i, j).
# np.ndindex walks the (i, j) grid with the last index varying fastest, so
# cnt = i * p + j.
D = np.zeros((M, M))
for cnt, (i, j) in enumerate(np.ndindex(p, p)):
    impulse = np.zeros((p, p))
    impulse[i, j] = 1
    D[:, cnt] = idct2(impulse).flatten()

Verify that D is orthonormal


In [None]:
# Gram matrix of the atoms: it equals the identity when the columns of D are orthonormal
gram = D.T @ D
is_D_orth = np.allclose(gram, np.eye(M))
print(f"DCT dictionary is orthogonal: {is_D_orth}")

All the atoms in $D$ can be seen as patches. Let's plot them!


In [None]:
# Tile the columns of D into one image with get_dictionary_img, then show it in grayscale
D_img = get_dictionary_img(D)
plt.imshow(D_img, cmap="gray")

Compute the representation w.r.t. the 2D DCT matrix


In [None]:
# 2D-DCT coefficients of the patch: dct2 is applied to the 2D block and the result is flattened
x = dct2(s_block).flatten()

## Separable 2D DCT

Build the 1D DCT matrix


In [None]:
# Column k of D1 is the orthonormal 1D DCT of the k-th column of the p x p identity
identity = np.eye(p)
D1 = np.zeros((p, p))
for k, impulse in enumerate(identity.T):
    D1[:, k] = dct(impulse, norm="ortho")

Compute the dct2 exploiting the separability. In this case do not unroll the patch: the coefficients will be organized in a $p\times p$ matrix.


In [None]:
# Multiply the patch by D1 on the left and by D1.T on the right; the result stays a 2D block
x_block = D1 @ s_block @ D1.T

Check whether the coefficients computed using the two methods are equal


In [None]:
# Entry-wise gap between the separable result (unrolled) and the dct2 result
coeff_gap = x_block.ravel() - x
# Equal when every entry differs by strictly less than 1e-10 in magnitude
is_coeff_equal = (np.abs(coeff_gap) < 1e-10).all()
print(f"The two vectors of coefficients are the same: {is_coeff_equal}")

# JPEG Compression

The goal of this section is to implement the compression algorithm at the core of JPEG compression.

The idea is to divide the image into $8\times 8$ non-overlapping patches and to compress each patch separately. More precisely, given a patch $s$ and its coefficient vector $x$ w.r.t. the DCT basis, we keep only the coefficients having a magnitude larger than a compression threshold $\tau$. In practice, a larger $\tau$ yields a greater compression ratio, where we define the compression ratio as

$$
\text{Comp Ratio} = 1 - \frac{\#\text{Non zero coefficients}}{\#\text{pixel in the image}}
$$

The larger the compression ratio, the smaller the amount of memory we need to store the compressed image.

Another figure of merit we use to evaluate the compression is the PSNR:

$$
\text{PSNR} = 10\log_{10}\frac{1}{\text{MSE}(Y, \widehat Y)}
$$

where $Y$ is the original, uncompressed image, $\widehat Y$ is the compressed image, and $\text{MSE}$ is the Mean Squared Error. The numerator is $1$ because the image values lie in $[0,1]$.


## threshold = 0.1


In [None]:
# Set the threshold for the compression: in the patch loop, every coefficient
# other than the first one is kept only if its magnitude is >= threshold
threshold = 0.1

Initialize the compressed image and the number of nonzero coefficients


In [None]:
# Output image, same shape and dtype as img; the patch loop fills it block by block
img_comp = np.zeros_like(img)
# Running count of the non-zero coefficients kept by the patch loop (first coefficient of each patch excluded)
nnz_coeff = 0

Process the image patch-wise


In [None]:
# Reset the counter in this cell so that re-running it does not double count
nnz_coeff = 0

# Visit the image in non-overlapping patches whose top-left corners lie on a grid of step p
for i in range(0, img.shape[0], p):
    for j in range(0, img.shape[1], p):
        # Extract the 2D patch (do NOT flatten yet). Slicing clips at the image
        # border, so the last patches are smaller than p x p when the image
        # size is not a multiple of p.
        s_block = img[i : i + p, j : j + p]

        # Compute 2D DCT coefficients of the patch (same shape as the patch)
        x_block = dct2(s_block)
        x = x_block.flatten()  # Flatten for thresholding

        # Hard thresholding (skip DC component at index 0): coefficients with
        # magnitude below the threshold are set to zero, the others are kept
        x_HT = x.copy()
        x_HT[1:] = np.where(np.abs(x[1:]) >= threshold, x[1:], 0)

        # Reconstruct the patch from the thresholded coefficients. Reshape to
        # the shape of this patch rather than a fixed (p, p), which would raise
        # a ValueError on clipped border patches.
        x_HT_block = x_HT.reshape(x_block.shape)
        s_hat = idct2(x_HT_block)  # Inverse 2D DCT

        # Insert reconstructed patch into compressed image
        img_comp[i : i + p, j : j + p] = s_hat

        # Count non-zero coefficients (excluding DC)
        nnz_coeff += np.count_nonzero(x_HT[1:])

Compute the PSNR


In [None]:
# Mean squared error between the original and the compressed image
mse = np.mean(np.square(img - img_comp))
# PSNR in dB with a peak value of 1
psnr = 10 * np.log10(1 / mse)
psnr

Compute the compression ratio


In [None]:
# Number of patches visited by the patch loop. The loop steps through
# range(0, size, p) on each axis, so counting those same ranges also includes
# the partial border patches that floor division (size // p) would miss when
# the image size is not a multiple of p.
num_patches = len(range(0, img.shape[0], p)) * len(range(0, img.shape[1], p))
num_pixels = img.shape[0] * img.shape[1]
# nnz_coeff excludes the DC coefficient, which is always kept: add one per patch
num_nonzero_coeff = nnz_coeff + num_patches

# Compute compression ratio
comp_ratio = 1 - num_nonzero_coeff / num_pixels

Show the original image and the compressed one:


In [None]:
# Side-by-side view: original on the left, reconstruction on the right
fig, ax = plt.subplots(1, 2, figsize=(16, 12))
left_panel, right_panel = ax

left_panel.imshow(img, cmap="gray")
left_panel.set_title("Original Image")

right_panel.imshow(img_comp, cmap="gray")
# The title reports both figures of merit, rounded to two decimals
right_panel.set_title(
    f"Compressed Image,\nPSNR = {psnr:.2f}, compression_ratio = {comp_ratio:.2f}"
)


## The threshold = 0.2


In [None]:
# Set the threshold for the compression
threshold = 0.2
# Output image, same shape and dtype as img; the patch loop fills it block by block
img_comp = np.zeros_like(img)
# Running count of the non-zero coefficients kept by the patch loop (first coefficient of each patch excluded)
nnz_coeff = 0

In [None]:
# Reset the counter in this cell so that re-running it does not double count
nnz_coeff = 0

# Visit the image in non-overlapping patches whose top-left corners lie on a grid of step p
for i in range(0, img.shape[0], p):
    for j in range(0, img.shape[1], p):
        # Extract the 2D patch (do NOT flatten yet). Slicing clips at the image
        # border, so the last patches are smaller than p x p when the image
        # size is not a multiple of p.
        s_block = img[i : i + p, j : j + p]

        # Compute 2D DCT coefficients of the patch (same shape as the patch)
        x_block = dct2(s_block)
        x = x_block.flatten()  # Flatten for thresholding

        # Hard thresholding (skip DC component at index 0)
        x_HT = x.copy()
        x_HT[1:] = np.where(np.abs(x[1:]) >= threshold, x[1:], 0)

        # Reconstruct the patch from the thresholded coefficients. Reshape to
        # the shape of this patch rather than a fixed (p, p), which would raise
        # a ValueError on clipped border patches.
        x_HT_block = x_HT.reshape(x_block.shape)
        s_hat = idct2(x_HT_block)  # Inverse 2D DCT

        # Insert reconstructed patch into compressed image
        img_comp[i : i + p, j : j + p] = s_hat

        # Count non-zero coefficients (excluding DC)
        nnz_coeff += np.count_nonzero(x_HT[1:])

# PSNR in dB with a peak value of 1
psnr = 10 * np.log10(1 / (np.mean((img - img_comp) ** 2)))

# Count the patches with the same ranges the loop iterates over, so partial
# border patches are included (floor division would miss them)
num_patches = len(range(0, img.shape[0], p)) * len(range(0, img.shape[1], p))
num_pixels = img.shape[0] * img.shape[1]
# nnz_coeff excludes the DC coefficient, which is always kept: add one per patch
num_nonzero_coeff = nnz_coeff + num_patches

# Compute compression ratio
comp_ratio = 1 - num_nonzero_coeff / num_pixels

# Side-by-side view: original on the left, reconstruction on the right
fig, ax = plt.subplots(1, 2, figsize=(16, 12))
ax[0].imshow(img, cmap="gray")
ax[0].set_title("Original Image")

ax[1].imshow(img_comp, cmap="gray")
ax[1].set_title(
    f"Compressed Image,\nPSNR = {psnr:.2f}, compression_ratio = {comp_ratio:.2f}"
)

## Threshold = 3


In [None]:
# Set the threshold for the compression
threshold = 3
# Output image (same shape and dtype as img) and counter of kept non-zero
# coefficients; both are re-created here, so the cell can be re-run safely
img_comp = np.zeros_like(img)
nnz_coeff = 0

# Visit the image in non-overlapping patches whose top-left corners lie on a grid of step p
for i in range(0, img.shape[0], p):
    for j in range(0, img.shape[1], p):
        # Extract the 2D patch (do NOT flatten yet). Slicing clips at the image
        # border, so the last patches are smaller than p x p when the image
        # size is not a multiple of p.
        s_block = img[i : i + p, j : j + p]

        # Compute 2D DCT coefficients of the patch (same shape as the patch)
        x_block = dct2(s_block)
        x = x_block.flatten()  # Flatten for thresholding

        # Hard thresholding (skip DC component at index 0)
        x_HT = x.copy()
        x_HT[1:] = np.where(np.abs(x[1:]) >= threshold, x[1:], 0)

        # Reconstruct the patch from the thresholded coefficients. Reshape to
        # the shape of this patch rather than a fixed (p, p), which would raise
        # a ValueError on clipped border patches.
        x_HT_block = x_HT.reshape(x_block.shape)
        s_hat = idct2(x_HT_block)  # Inverse 2D DCT

        # Insert reconstructed patch into compressed image
        img_comp[i : i + p, j : j + p] = s_hat

        # Count non-zero coefficients (excluding DC)
        nnz_coeff += np.count_nonzero(x_HT[1:])

# PSNR in dB with a peak value of 1
psnr = 10 * np.log10(1 / (np.mean((img - img_comp) ** 2)))

# Count the patches with the same ranges the loop iterates over, so partial
# border patches are included (floor division would miss them)
num_patches = len(range(0, img.shape[0], p)) * len(range(0, img.shape[1], p))
num_pixels = img.shape[0] * img.shape[1]
# nnz_coeff excludes the DC coefficient, which is always kept: add one per patch
num_nonzero_coeff = nnz_coeff + num_patches

# Compute compression ratio
comp_ratio = 1 - num_nonzero_coeff / num_pixels

# Side-by-side view: original on the left, reconstruction on the right
fig, ax = plt.subplots(1, 2, figsize=(16, 12))
ax[0].imshow(img, cmap="gray")
ax[0].set_title("Original Image")

ax[1].imshow(img_comp, cmap="gray")
ax[1].set_title(
    f"Compressed Image,\nPSNR = {psnr:.2f}, compression_ratio = {comp_ratio:.2f}"
)

Observations:

- The compressed image is much more pixelated than the original one.
- The PSNR is quite high, meaning that the difference between the original and the compressed image is not very noticeable.
- The larger the compression ratio, the smaller the amount of memory we need to store the compressed image.
