# Import packages


In [None]:
# ! pip install joblib

In [None]:
import gc
from pathlib import Path
from typing import Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import skimage
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from joblib import Parallel, delayed
from PIL import Image
from scipy.stats import pearsonr, spearmanr
from skimage import transform
from skimage.color import rgb2hed, rgba2rgb
from skimage.io import imread
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchsummary import summary
from torchvision import datasets
from tqdm.auto import tqdm

import wandb

In [None]:
# https://www.kaggle.com/bminixhofer/deterministic-neural-networks-using-pytorch
def seed_everything(seed=1234):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and the current
    CUDA device), exports ``PYTHONHASHSEED`` and switches cuDNN to
    deterministic mode.

    Args:
        seed (int): The seed value applied to all generators.
    """
    # Imported here because the notebook's import cell does not bring in
    # `os` or `random`; without these the function fails with a NameError.
    import os
    import random

    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


# seed_everything()

In [None]:
def setup_seed(seed):
    """Seed NumPy and PyTorch and force deterministic cuDNN kernels.

    Args:
        seed (int): Seed passed to NumPy, ``torch.manual_seed`` and
            ``torch.cuda.manual_seed_all``.
    """
    np.random.seed(seed)
    for seed_torch in (torch.manual_seed, torch.cuda.manual_seed_all):
        seed_torch(seed)
    torch.backends.cudnn.deterministic = True

In [None]:
# Seed NumPy and PyTorch once up front so later random draws start from a fixed state
setup_seed(42)

# Image Data Download

In [None]:
# !wget https://warwick.ac.uk/fac/sci/dcs/teaching/material/cs909/patches_256.zip
# !unzip /content/patches_256.zip -d /content/

# Wandb

In [None]:
# wandb.init(
#     project="data-mining",
#     config={"learning_rate": 0.001, "architecture": "CNN", "epochs": 5},
# )

# Utility functions:


Define some utility functions for working with images.


In [None]:
def read_image(image_id: str, image_folder="data/patches_256") -> np.ndarray:
    """Reads an image from the dataset

    Args:
        image_id (str): The id of the image to be read. The file is looked up
            as ``<image_folder>/<image_id>.png``.
        image_folder (str or Path): Directory that holds the image patches.
            Defaults to ``data/patches_256``, the location that used to be
            hard-coded inside the function.

    Returns:
        np.ndarray: The image as a numpy array. If the image has a fourth
            (alpha) channel, it is dropped.
    """
    image_path = Path(image_folder) / f"{image_id}.png"

    rgb_image = imread(image_path)

    # if the image has an alpha channel, remove it; the ndim guard keeps a
    # 2-D (single-channel) image whose width happens to be 4 from being
    # indexed with three axes
    if rgb_image.ndim == 3 and rgb_image.shape[-1] == 4:
        rgb_image = rgb_image[:, :, :3]

    return rgb_image

In [None]:
def convert_rgb_to_hed(input_rgb_image: np.array) -> np.array:
    """
    Map an RGB image into the HED color space as 32-bit floats.

    Parameters:
        input_rgb_image (np.array): The RGB image to convert.

    Returns:
        np.array: Output of ``rgb2hed`` cast to ``np.float32``.
    """
    return rgb2hed(input_rgb_image).astype(np.float32)

In [None]:
def calculate_intensity_avg(input_image: np.array, channel: int) -> float:
    """
    Mean pixel value of one channel of a three-axis image.

    Parameters:
        input_image (np.array): Image indexed as [row, column, channel] (RGB or HED).
        channel (int): Index of the channel to average.

    Returns:
        float: Mean over all pixels of the selected channel.
    """
    # copy the channel into a 1-D array before reducing
    channel_pixels = input_image[:, :, channel].flatten()
    return np.mean(channel_pixels)

In [None]:
def calculate_intensity_std(input_image: np.array, channel: int) -> float:
    """
    Standard deviation of the pixel values of one channel of a three-axis image.

    Parameters:
        input_image (np.array): Image indexed as [row, column, channel] (RGB or HED).
        channel (int): Index of the channel to measure.

    Returns:
        float: Standard deviation (NumPy default, ddof=0) over all pixels of the channel.
    """
    # copy the channel into a 1-D array before reducing
    channel_pixels = input_image[:, :, channel].flatten()
    return np.std(channel_pixels)

In [None]:
def calculate_avg_h_intensity(image_id: str) -> dict:
    """
    Load an image and report the mean of its H channel (HED channel 0).

    Parameters:
        image_id (str): The ID of the image.

    Returns:
        dict: ``{"image_id": ..., "avg_h_intensity": ...}`` for the given image.
    """
    hed_image = convert_rgb_to_hed(read_image(image_id))

    result = {"image_id": image_id}
    result["avg_h_intensity"] = calculate_intensity_avg(hed_image, 0)
    return result

In [None]:
def load_data() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Download the protein expression table and split it by specimen.

    Two columns are derived from ``VisSpot``: ``specimen_id`` (the third
    "-"-separated piece) and ``image_id`` (``specimen_id`` + "_" + ``id``).
    The table is indexed and sorted by ``image_id``.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: Rows of specimens A1, B1 and D1
            (training) and rows of specimen C1 (testing).
    """
    data_url = "https://warwick.ac.uk/fac/sci/dcs/teaching/material/cs909/protein_expression_data.csv"
    expression = pd.read_csv(data_url)

    # the specimen label is the third "-"-separated piece of VisSpot
    specimen = expression.VisSpot.apply(lambda spot: spot.split("-")[2])
    expression["specimen_id"] = specimen
    expression["image_id"] = specimen + "_" + expression.id

    expression = expression.set_index("image_id").sort_index()

    def rows_for(specimens):
        return expression.loc[expression["specimen_id"].isin(specimens)]

    return rows_for(["A1", "B1", "D1"]), rows_for(["C1"])

In [None]:
def calculate_evaluation_metrics(y_true, y_pred):
    """Score predictions against the true values with four regression metrics.

    Args:
        y_true: True target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        dict: Keys ``rmse``, ``r2``, ``pearson_corr`` and ``spearman_corr``.
            The p-values returned by the two correlation tests are discarded.
    """
    pearson_result = pearsonr(y_true, y_pred)
    spearman_result = spearmanr(y_true, y_pred)

    return {
        "rmse": root_mean_squared_error(y_true, y_pred),
        "r2": r2_score(y_true, y_pred),
        # element 0 is the correlation coefficient, element 1 the p-value
        "pearson_corr": pearson_result[0],
        "spearman_corr": spearman_result[0],
    }

# Load data


Load the protein expression data after splitting it into training and testing:

In [None]:
# training_data holds specimens A1, B1 and D1; testing_data holds specimen C1
training_data, testing_data = load_data()

In [None]:
# Report how many rows ended up in each split
print(f"Number of training samples: {len(training_data)}")
print(f"Number of testing samples: {len(testing_data)}")

# Question No. 1: (Data Analysis)


For the following questions, we will use only the `training_data`


## Counting Examples:


In [None]:
# Count the rows (non-null "id" entries) per specimen, largest group first
(
    training_data.groupby("specimen_id", as_index=False)
    .agg(n_sample=("id", "count"))
    .sort_values("n_sample", ascending=False)
)

We see that specimen `A1` has the largest number of spots (almost double).

## Protein Expression Histograms


In [None]:
# Protein columns to plot and one color per protein (matched by position).
# NOTE: the misspelled name `protiens` is referenced by the plotting cells, so it is left as is.
protiens = ["NESTIN", "cMYC", "MET"]
colors = ["#1f77b4", "#ff7f0e", "#2ca02c"]

In [None]:
# plot protein histograms for specimen A1
fig, ax = plt.subplots(1, 3, figsize=(15, 5))

# filter once instead of re-running the same query for every statistic
specimen_rows = training_data.query("specimen_id == 'A1'")

for i, protein in enumerate(protiens):
    values = specimen_rows[protein]

    sns.histplot(specimen_rows, x=protein, ax=ax[i], color=colors[i])

    # summary statistics shown in the title; `min_value`/`max_value` are used
    # so that the builtins `min` and `max` are not overwritten for later cells
    mean = values.mean()
    std = values.std()
    skew = values.skew()
    min_value = values.min()
    max_value = values.max()

    # smaller font so the multi-line title fits above the axes
    ax[i].set_title(
        f"Specimen A1\nmean: {mean:.2f}\nstd: {std:.2f}\nskew: {skew:.2f}\nmin: {min_value:.2f}, max: {max_value:.2f}",
        fontsize=10,
    )
    ax[i].set_xlabel(protein)
    ax[i].set_ylabel("Frequency")


plt.show()

In [None]:
# plot protein histograms for specimen B1
fig, ax = plt.subplots(1, 3, figsize=(15, 5))

# filter once instead of re-running the same query for every statistic
specimen_rows = training_data.query("specimen_id == 'B1'")

for i, protein in enumerate(protiens):
    values = specimen_rows[protein]

    sns.histplot(specimen_rows, x=protein, ax=ax[i], color=colors[i])

    # summary statistics shown in the title; `min_value`/`max_value` are used
    # so that the builtins `min` and `max` are not overwritten for later cells
    mean = values.mean()
    std = values.std()
    skew = values.skew()
    min_value = values.min()
    max_value = values.max()

    ax[i].set_title(
        f"Specimen B1\nmean: {mean:.2f}\nstd: {std:.2f}\nskew: {skew:.2f}\nmin: {min_value:.2f}, max: {max_value:.2f}",
        fontsize=10,
    )
    ax[i].set_xlabel(protein)
    ax[i].set_ylabel("Frequency")


plt.show()

In [None]:
# plot protein histograms for specimen D1
fig, ax = plt.subplots(1, 3, figsize=(15, 5))

# filter once instead of re-running the same query for every statistic
specimen_rows = training_data.query("specimen_id == 'D1'")

for i, protein in enumerate(protiens):
    values = specimen_rows[protein]

    sns.histplot(specimen_rows, x=protein, ax=ax[i], color=colors[i])

    # summary statistics shown in the title; `min_value`/`max_value` are used
    # so that the builtins `min` and `max` are not overwritten for later cells
    mean = values.mean()
    std = values.std()
    skew = values.skew()
    min_value = values.min()
    max_value = values.max()

    ax[i].set_title(
        f"Specimen D1\nmean: {mean:.2f}\nstd: {std:.2f}\nskew: {skew:.2f}\nmin: {min_value:.2f}, max: {max_value:.2f}",
        fontsize=10,
    )
    ax[i].set_xlabel(protein)
    ax[i].set_ylabel("Frequency")


plt.show()

From the above plots, we notice the following:

1. All histograms have a distribution very similar to the normal distribution. There's an apparent left *skewness* in the histograms. This suggests that the protein expression values, across the specimens, have some infrequent negative values.

2. Different proteins have different ranges. `NESTIN` has a slightly smaller range than `cMYC` and `MET`. The ranges are consistent across the different specimens. However, the `NESTIN` range for `D1` is narrower and has less extreme negative values. The large negative values in `NESTIN`, which are present in `A1` and `B1` but absent in `D1`, might make it harder for the learning algorithm to learn the patterns.

3. Spread and central tendency: overall, the different proteins across the different specimens have a mean close to 0 and a standard deviation close to 1.

4. Outliers: the extreme negative points can be thought of as outliers and might be challenging for the model to learn.


## Image Pre-processing


To have a good understanding of the potential usefulness of the HED color space, we'll select examples from the training data where NESTIN has very low, average, and very high values.

This will hopefully allow us to see whether we can use the HED color space as a more effective representation of the original images.

In [None]:
def _select_nestin_examples(data, specimen, mean_nestin, n=3, tolerance=0.1):
    """Pick example rows of one specimen at the low, middle and high end of NESTIN.

    Args:
        data (pd.DataFrame): Table indexed by ``image_id`` with ``specimen_id``
            and ``NESTIN`` columns.
        specimen (str): Specimen whose rows are considered.
        mean_nestin (float): Centre of the "around the mean" window.
        n (int): Number of rows taken from each of the three groups.
        tolerance (float): Half-width of the (exclusive) window around ``mean_nestin``.

    Returns:
        pd.DataFrame: ``image_id`` and ``NESTIN`` columns holding, in order, the
            ``n`` smallest values, the first ``n`` values inside the window and
            the ``n`` largest values.
    """
    specimen_rows = data.loc[data["specimen_id"] == specimen]

    # rows whose NESTIN value lies strictly inside the window around the mean
    near_mean = specimen_rows.loc[
        (specimen_rows["NESTIN"] > mean_nestin - tolerance)
        & (specimen_rows["NESTIN"] < mean_nestin + tolerance),
        "NESTIN",
    ]

    return pd.concat(
        [
            # select images with the smallest NESTIN values
            specimen_rows.nsmallest(n, "NESTIN")
            .reset_index()
            .loc[:, ["image_id", "NESTIN"]],
            # select images with NESTIN values around the mean
            near_mean.head(n).reset_index(),
            # select images with the largest NESTIN values
            specimen_rows.nlargest(n, "NESTIN")
            .reset_index()
            .loc[:, ["image_id", "NESTIN"]],
        ]
    )


mean_nestin_a = training_data.loc[training_data["specimen_id"] == "A1", "NESTIN"].mean()
mean_nestin_b = training_data.loc[training_data["specimen_id"] == "B1", "NESTIN"].mean()
mean_nestin_d = training_data.loc[training_data["specimen_id"] == "D1", "NESTIN"].mean()

# select images from specimen A1
sample_images_A1 = _select_nestin_examples(training_data, "A1", mean_nestin_a)

# select images from specimen B1
sample_images_B1 = _select_nestin_examples(training_data, "B1", mean_nestin_b)

# select images from specimen D1
sample_images_D1 = _select_nestin_examples(training_data, "D1", mean_nestin_d)

In [None]:
# Stack the three per-specimen selections, renumber the rows, and order them by NESTIN value
all_sample_data = (
    pd.concat([sample_images_A1, sample_images_B1, sample_images_D1])
    .reset_index(drop=True)
    .sort_values("NESTIN")
)

In [None]:
# Display the selected sample rows
all_sample_data

In [None]:
# Number of selected example images across the three specimens
print(f"Number of samples: {len(all_sample_data)}")

In [None]:
# Titles of the three HED channels, in channel order
hed_channel_titles = ("Hematoxylin", "Eosin", "DAB")

for index, row in all_sample_data.iterrows():
    image_id, nestin_value = row["image_id"], row["NESTIN"]

    img = read_image(image_id)
    hed_img = convert_rgb_to_hed(img)

    fig, ax = plt.subplots(1, 4, figsize=(15, 5))

    # first panel: the RGB patch together with its NESTIN value
    ax[0].imshow(img)
    ax[0].set_title(f"Image ID: {image_id} NESTIN = {nestin_value:.2f}")

    # remaining panels: one grayscale rendering per HED channel
    for channel, title in enumerate(hed_channel_titles):
        ax[channel + 1].imshow(hed_img[:, :, channel], cmap="gray")
        ax[channel + 1].set_title(title)

    plt.show()

Without expert domain knowledge of tissue images and of how the different proteins are measured, it is hard to accurately describe what this conversion is doing and why certain images have low values for NESTIN while others have higher values.

However, we can quickly see that HED color space is a good representation of the image. In particular, the H and D channels.

The H channel highlights the cellular nuclei while the D channel seems to detect the *pink* areas.

In most cases, the E channel seems unresponsive.

## H-channel Analysis


In [None]:
# Compute the mean H-channel value of every training image in parallel
# (n_jobs=-1 lets joblib use all available cores; verbose=10 prints progress)
avg_h_intensity_list = Parallel(n_jobs=-1, verbose=10)(
    delayed(calculate_avg_h_intensity)(image_id) for image_id in training_data.index
)

In [None]:
# Peek at the first three result dictionaries
avg_h_intensity_list[:3]

In [None]:
# One row per image, indexed by image_id
avg_h_intensity_df = pd.DataFrame(avg_h_intensity_list).set_index("image_id")

In [None]:
# add NESTIN and specimen_id to the dataframe (joined on the shared image_id index)
avg_h_intensity_df = avg_h_intensity_df.join(training_data[["NESTIN", "specimen_id"]])

In [None]:
# Display the first rows of the joined table
avg_h_intensity_df.head()

In [None]:
# correlation between mean H intensity and NESTIN (pandas default method), shown in the title
h_intensity = avg_h_intensity_df["avg_h_intensity"]
correlation = h_intensity.corr(avg_h_intensity_df["NESTIN"])

# one point per image, colored by specimen; low alpha keeps dense regions readable
ax = sns.scatterplot(
    x="avg_h_intensity",
    y="NESTIN",
    hue="specimen_id",
    alpha=0.2,
    data=avg_h_intensity_df,
)

ax.set_title(f"Average H intensity vs NESTIN expression\nCorrelation: {correlation:.2f}")
ax.set_xlabel("Average H intensity")
ax.set_ylabel("NESTIN expression")

From the scatter plot and the correlation value we can see that there is a positive relation between the average intensity value of the `H` channel and the expression levels of `NESTIN`.

However, this correlation is weak and won't capture the true relation of the target variable.


## Performance Metrics for Prediction


Our problem is a regression problem where we want to predict a continuous value and compare it with the target variable.

The target variable (`NESTIN` for example) takes on values between -6 and 2.

Some of the common regression performance metrics:

1. **Mean Absolute Error**: this metric takes the absolute difference between predicted values and true values. Lower values (closer to 0) mean better predictions.

2. **Mean Squared Error**: this metric takes the squared difference between predicted values and true values. Because the differences are squared, the result is expressed in squared units of the target variable, which makes its values harder to interpret.

3. **Root Mean Squared Error**: to address the issue of interpreting the results in MSE, this metric takes the square root of the MSE value, resulting in a value in the same unit as the target variable.

# Question No. 2: (Feature Extraction and Classical Regression)

## Feature Extraction:

The purpose of this section is to extract candidate features _f<sub>1</sub>_, _f<sub>2</sub>_, _f<sub>3</sub>_, ..., _f<sub>n</sub>_ from the original images, and measure the correlation between these features and the target variable.

Then, use the candidate features to perform regression using traditional machine learning algorithms.

Since the work on this and the following questions will require experimenting with different approaches and judging their performance on held-out data, and in the spirit of avoiding any leakage from the test data, we split our original training data into **training** and **validation**.

For **training** we use specimens `A1` and `D1` and for **validation** we use specimen `B1`.

This way, we ensure that the final **testing** data of specimen `C1` will not influence the feature engineering or model development and will be used only at the very end to measure the performance of different approaches.

In [None]:
# split training data into training and validation sets
# validation_data (B1) is taken first because training_data is overwritten on the next line
validation_data = training_data[training_data["specimen_id"].isin(["B1"])]
training_data = training_data[training_data["specimen_id"].isin(["A1", "D1"])]

In [None]:
# Report the size of each of the three splits
print(f"Number of training samples: {len(training_data)}")
print(f"Number of validation samples: {len(validation_data)}")
print(f"Number of testing samples: {len(testing_data)}")

Before we dive in, we resize our original images from `256`x`256` to `64`x`64`.

While there will be some loss of information, this is essential for computational feasibility.

Let's first resize a single image and see how it compares to the original image:

In [None]:
# Load one example patch and downscale it to 64x64 with anti-aliasing
original_image = read_image("A1_0x40")
resized_image = skimage.transform.resize(original_image, (64, 64), anti_aliasing=True)

In [None]:
# Compare the array shapes before and after resizing
print(f"Original image shape: {original_image.shape}")
print(f"Resized image shape: {resized_image.shape}")

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# left panel: original patch, right panel: resized patch (axes hidden on both)
image_panels = (
    (original_image, "Original image"),
    (resized_image, "Resized image"),
)
for panel, (picture, caption) in zip(ax, image_panels):
    panel.imshow(picture)
    panel.set_title(caption)
    panel.axis("off")

plt.show()

We quickly see that the image has lost some of its quality, but the spatial information is still recognizable and can be fed into a machine learning algorithm.

Let's see the change in the actual values of the pixels:

In [None]:
# Compare the pixel value ranges before and after resizing
print(f"Original image: (min, max) = ({original_image.min()}, {original_image.max()})")
print(f"Resized image: (min, max) = ({resized_image.min()}, {resized_image.max()})")

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# pixel-value histograms (all channels pooled) of the original and the resized patch
histogram_panels = (
    (original_image, "blue", "Original image"),
    (resized_image, "red", "Resized image"),
)
for panel, (picture, bar_color, caption) in zip(ax, histogram_panels):
    panel.hist(picture.flatten(), bins=100, color=bar_color, alpha=0.7)
    panel.set_title(caption)

plt.show()

After resizing, the image was normalized to the range `[0, 1]`, but it preserved the same distribution. Therefore, the image can be used for further tasks such as PCA.

### Channel statistics

In [None]:
def calculate_image_channel_stats(image_id: str, resize=False):
    """
    Compute the mean and standard deviation of the H, R, G and B channels of an image.

    Args:
        image_id (str): The ID of the image.
        resize (bool): When true, the RGB image is resized to 64x64 (with
            anti-aliasing) before the HED conversion and before any statistic
            is computed.

    Returns:
        dict: The image ID followed by an average/std pair per channel, in this order:
            - "image_id": The ID of the image.
            - "h_intensity_avg" / "h_intensity_std": H channel (index 0 of the HED image).
            - "r_intensity_avg" / "r_intensity_std": R channel (index 0 of the RGB image).
            - "g_intensity_avg" / "g_intensity_std": G channel (index 1 of the RGB image).
            - "b_intensity_avg" / "b_intensity_std": B channel (index 2 of the RGB image).
    """
    rgb_image = read_image(image_id)
    if resize:
        rgb_image = skimage.transform.resize(rgb_image, (64, 64), anti_aliasing=True)
    hed_image = convert_rgb_to_hed(rgb_image)

    # (key prefix, source image, channel index), in the order the keys appear in the result
    channel_sources = (
        ("h", hed_image, 0),
        ("r", rgb_image, 0),
        ("g", rgb_image, 1),
        ("b", rgb_image, 2),
    )

    stats = {"image_id": image_id}
    for prefix, image, channel in channel_sources:
        stats[f"{prefix}_intensity_avg"] = calculate_intensity_avg(image, channel)
        stats[f"{prefix}_intensity_std"] = calculate_intensity_std(image, channel)

    return stats

In [None]:
# Channel statistics for every training image, computed in parallel on the
# 64x64 resized version (the second positional argument is resize=True)
training_images_channels_stats_list = Parallel(n_jobs=-1, verbose=10)(
    delayed(calculate_image_channel_stats)(image_id, True)
    for image_id in training_data.index
)

In [None]:
# One row of channel statistics per image, indexed by image_id
training_images_channels_stats_df = pd.DataFrame(
    training_images_channels_stats_list
).set_index("image_id")

In [None]:
# Attach the NESTIN target and the specimen label (joined on the image_id index)
training_images_channels_stats_df = training_images_channels_stats_df.join(
    training_data[["NESTIN", "specimen_id"]]
)

In [None]:
# Display the first rows of the statistics table
training_images_channels_stats_df.head()

The above dataframe contains the image channel statistics (mean and standard deviation) for each image in the training data (specimens `A1` and `D1`).

### PCA

In [None]:
def prepare_images_np_dataset(input_df):
    """Load every image listed in ``input_df`` as 64x64 RGB and HED arrays.

    Args:
        input_df (pd.DataFrame): Table whose index holds the image IDs to load.

    Returns:
        tuple: ``(rgb_images, hed_images)`` — two NumPy arrays built from the
            resized RGB images and their HED conversions, in index order.
    """
    image_pairs = []

    for image_id in tqdm(input_df.index):
        # resize first, then convert the resized image to HED
        small_rgb = skimage.transform.resize(
            read_image(image_id), (64, 64), anti_aliasing=True
        )
        image_pairs.append((small_rgb, convert_rgb_to_hed(small_rgb)))

    rgb_images = np.array([rgb for rgb, _ in image_pairs])
    hed_images = np.array([hed for _, hed in image_pairs])

    return rgb_images, hed_images

In [None]:
# Resized RGB and HED arrays for the training specimens
train_rgb_images, train_hed_images = prepare_images_np_dataset(training_data)

In [None]:
# Resized RGB and HED arrays for the validation specimen
val_rgb_images, val_hed_images = prepare_images_np_dataset(validation_data)

In [None]:
# Shapes of the training image arrays
train_rgb_images.shape, train_hed_images.shape

In [None]:
# Shapes of the validation image arrays
val_rgb_images.shape, val_hed_images.shape

In [None]:
# TODO:
#   investigate the correct usage of gc.collect()
# Explicitly trigger a garbage-collection pass after building the image arrays
gc.collect()

In [None]:
# One PCA per pixel representation (RGB, HED, H+D channels, H channel only).
# n_components is left unset here; the number of components to keep is chosen
# later from the cumulative explained variance. random_state is fixed for the
# randomized SVD solver.
pca_rgb = PCA(svd_solver="randomized", random_state=42)
pca_hed = PCA(svd_solver="randomized", random_state=42)
pca_hd = PCA(svd_solver="randomized", random_state=42)
pca_h = PCA(svd_solver="randomized", random_state=42)

In [None]:
n_train_samples = train_rgb_images.shape[0]

# Flatten every training image into one row vector (one row per sample)
X_train_rgb = train_rgb_images.reshape(n_train_samples, -1)
X_train_hed = train_hed_images.reshape(n_train_samples, -1)

# H and D channels
X_train_hd = train_hed_images[:, :, :, [0, 2]].reshape(n_train_samples, -1)

# only H channel
X_train_h = train_hed_images[:, :, :, 0].reshape(n_train_samples, -1)

In [None]:
n_val_samples = val_rgb_images.shape[0]

# Flatten every validation image into one row vector (one row per sample)
X_val_rgb = val_rgb_images.reshape(n_val_samples, -1)
X_val_hed = val_hed_images.reshape(n_val_samples, -1)

# H and D channels
X_val_hd = val_hed_images[:, :, :, [0, 2]].reshape(n_val_samples, -1)

# only H channel
X_val_h = val_hed_images[:, :, :, 0].reshape(n_val_samples, -1)

In [None]:
# Shapes of the flattened training matrices
print(f"X_train_rgb shape: {X_train_rgb.shape}")
print(f"X_train_hed shape: {X_train_hed.shape}")
print(f"X_train_hd shape: {X_train_hd.shape}")
print(f"X_train_h shape: {X_train_h.shape}")

# Shapes of the flattened validation matrices
print(f"X_val_rgb shape: {X_val_rgb.shape}")
print(f"X_val_hed shape: {X_val_hed.shape}")
print(f"X_val_hd shape: {X_val_hd.shape}")
print(f"X_val_h shape: {X_val_h.shape}")

In [None]:
# Fit the PCA on the flattened RGB training images only
# takes one minute and 28 seconds
pca_rgb.fit(X_train_rgb)

In [None]:
# Fit the PCA on the flattened HED training images only
# takes 51 seconds
pca_hed.fit(X_train_hed)

In [None]:
# Fit the PCA on the flattened H and D channels of the training images only
# this takes 42 seconds
pca_hd.fit(X_train_hd)

In [None]:
# Fit the PCA on the flattened H channel of the training images only
# this takes 25 seconds
pca_h.fit(X_train_h)

In [None]:
# find the number of components that explain 95% of the variance
# (argmax returns the 0-based index of the first component at which the
# cumulative ratio reaches 0.95, hence the +1 to turn it into a count)

n_components_rgb = np.argmax(np.cumsum(pca_rgb.explained_variance_ratio_) >= 0.95) + 1
n_components_hed = np.argmax(np.cumsum(pca_hed.explained_variance_ratio_) >= 0.95) + 1
n_components_hd = np.argmax(np.cumsum(pca_hd.explained_variance_ratio_) >= 0.95) + 1
n_components_h = np.argmax(np.cumsum(pca_h.explained_variance_ratio_) >= 0.95) + 1

# plot a vertical line where 95% of the variance is explained

fig, ax = plt.subplots(1, 4, figsize=(15, 5))

# (fitted PCA, number of components reaching 95%, label used in the title)
pca_panels = [
    (pca_rgb, n_components_rgb, "RGB"),
    (pca_hed, n_components_hed, "HED"),
    (pca_hd, n_components_hd, "H and D channels"),
    (pca_h, n_components_h, "H channel"),
]

for panel, (fitted_pca, n_components, label) in zip(ax, pca_panels):
    cumulative_variance = np.cumsum(fitted_pca.explained_variance_ratio_)

    # The x axis starts at 1 so that position k means "the first k components".
    # Plotting against the default 0-based index would put the marker one
    # position to the right of the point it refers to.
    component_counts = np.arange(1, len(cumulative_variance) + 1)

    panel.plot(component_counts, cumulative_variance)
    panel.axvline(n_components, color="red", linestyle="--")
    panel.set_title(
        f"Explained variance by PCA ({label})\n{n_components} components", fontsize=10
    )
    panel.set_xlabel("Number of components")

# the y label is only shown on the left-most panel
ax[0].set_ylabel("Cumulative explained variance")

plt.show()

### GLCM

### Transfer Learning

## Regression Models

### SVR

In [None]:
# RBF-kernel support vector regressor with fixed, hand-picked hyperparameters
SVR_model = SVR(kernel="rbf", C=1, gamma=0.1, epsilon=0.1)

In [None]:
# NOTE(review): 1227 is hard-coded; presumably it is the 95%-variance component
# count computed earlier as n_components_h — confirm against that value
n_comp = 1227
# Project onto the fitted H-channel PCA basis and keep the first n_comp components
X_train_h_pca = pca_h.transform(X_train_h)[:,:n_comp]
X_val_h_pca = pca_h.transform(X_val_h)[:,:n_comp]

In [None]:
# Fit the SVR on the PCA features of the training images, with NESTIN as the target
SVR_model.fit(X_train_h_pca, training_data["NESTIN"])

In [None]:
# Predict NESTIN for the validation images and fetch the matching true values
y_val_pred = SVR_model.predict(X_val_h_pca)
y_val_true = validation_data["NESTIN"]

In [None]:
# Sanity check: predictions and targets should have the same length
y_val_pred.shape, y_val_true.shape

In [None]:
# RMSE, R2, Pearson and Spearman correlation of the SVR on the validation specimen
calculate_evaluation_metrics(y_val_true, y_val_pred)

# Question No. 3 (Using Convolutional Neural Networks)


In [None]:
def count_parameters(model):
    """Return the total number of elements across the trainable parameters of ``model``.

    Parameters with ``requires_grad`` set to False are not counted.
    """
    total = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            total += parameter.numel()
    return total

In [None]:
# Pick the compute device in order of preference: CUDA, then MPS, then the CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [None]:
# Show which device was selected
print(f"Using {device} device")

In [None]:
# Hyper parameters
num_epochs = 5  # intended number of training epochs
batch_size = 100  # samples per DataLoader batch
learning_rate = 0.001  # step size handed to the SGD optimizer

In [None]:
class ConvNet(nn.Module):
    """Two-block convolutional network that outputs one value per image.

    Each block is convolution -> 2x2 max-pooling -> ReLU. The final linear
    layer expects 32 feature maps of 17x17, which is what the two blocks
    produce for a single-channel 64x64 input (64 -> 32 after the first block,
    then 34 after the padded 3x3 convolution and 17 after the second pooling).
    """

    def __init__(self):
        super().__init__()

        # block 1: 1 -> 16 channels; the 5x5 kernel with padding 2 keeps the
        # spatial size, the pooling halves it
        self.conv_layer_1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
            nn.MaxPool2d(2, stride=2),
            nn.ReLU(),
        )

        # block 2: 16 -> 32 channels; the 3x3 kernel with padding 2 grows each
        # spatial side by 2 before the pooling halves it
        self.conv_layer_2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=2),
            nn.MaxPool2d(2, stride=2),
            nn.ReLU(),
        )

        # regression head: flattened feature maps -> one output value
        flattened_features = 32 * 17 * 17
        self.fc = nn.Linear(flattened_features, 1)

    def forward(self, x):
        """Return a (batch, 1) tensor of predictions for a batch of images."""
        features = self.conv_layer_2(self.conv_layer_1(x))
        flat = features.view(features.size(0), -1)
        return self.fc(flat)

In [None]:
# Instantiate the network and move its parameters to the selected device
model = ConvNet().to(device)

In [None]:
# wandb.watch(model, log_freq=100)

In [None]:
# print the number of trainable parameters in the model
n_params = count_parameters(model)
print(f"Number of parameters in the model: {n_params}")

In [None]:
# Pull the image ids (the DataFrame index) and the NESTIN targets out as
# NumPy arrays, one pair per split

# training data
training_image_ids = training_data.index.to_numpy()
training_labels = training_data["NESTIN"].to_numpy()

# validation data
validation_image_ids = validation_data.index.to_numpy()
validation_labels = validation_data["NESTIN"].to_numpy()

# testing data
testing_image_ids = testing_data.index.to_numpy()
testing_labels = testing_data["NESTIN"].to_numpy()

In [None]:
class RGBToHEDTransform:
    """Callable transform that converts an RGB image to HED and keeps the H channel."""

    def __call__(self, pic):
        """
        Convert an RGB image to the HED color space and return its H channel.

        Parameters:
            pic (PIL Image, numpy.ndarray or torch.Tensor): Image to be
                converted. PIL images and arrays are used with the channel
                axis last; tensors are expected channel-first (C, H, W), as
                produced by ``transforms.ToTensor``.

        Returns:
            Tensor: float32 tensor of shape (1, H, W) holding the H channel.

        Raises:
            TypeError: If ``pic`` is none of the supported types.
        """
        if isinstance(pic, Image.Image):
            img_array = np.array(pic)
        elif isinstance(pic, np.ndarray):
            img_array = pic
        elif torch.is_tensor(pic):
            # (C, H, W) -> (H, W, C). Swapping only axes 0 and 2 would yield
            # (W, H, C), i.e. a spatially transposed image.
            img_array = pic.detach().cpu().permute(1, 2, 0).numpy()
        else:
            raise TypeError(
                "img should be PIL Image, ndarray or Tensor. Got {}".format(type(pic))
            )

        # Convert RGB to HED. The output array from rgb2hed can have negative values,
        # so it's important to scale and shift the values to bring them into a suitable range (e.g., 0 to 1) if necessary.
        hed_img = rgb2hed(img_array)

        # return the H channel with a leading channel axis; built directly with
        # torch because no `F` (torchvision functional) alias is imported in
        # this notebook
        h_channel = np.ascontiguousarray(hed_img[:, :, 0], dtype=np.float32)
        return torch.from_numpy(h_channel).unsqueeze(0)

In [None]:
class CustomDataset(Dataset):
    """Dataset that loads image patches by ID and pairs each with its label.

    Args:
        image_ids (np.ndarray): IDs passed to ``read_image`` one at a time.
        labels (np.ndarray): Target values, aligned with ``image_ids``.
        transform (callable, optional): Applied to the loaded RGB image. When
            omitted, the untransformed image is returned.
    """

    def __init__(self, image_ids: np.ndarray, labels: np.ndarray, transform=None):
        self.image_ids = image_ids
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair stored at position ``idx``."""
        image_id = self.image_ids[idx]
        label = self.labels[idx]

        # read the image
        rgb_image = read_image(image_id)

        # Without a transform the raw image is returned; previously this path
        # failed because the transformed image was never assigned.
        if self.transform is None:
            return rgb_image, label

        return self.transform(rgb_image), label

In [None]:
# Order matters: the image is first turned into a tensor, then resized to
# 64x64, and only then converted to the H channel of the HED color space
transformations = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Resize((64, 64)),
        RGBToHEDTransform(),
    ]
)

In [None]:
# One dataset per split; all three share the same transformation pipeline
training_dataset = CustomDataset(training_image_ids, training_labels, transformations)
validation_dataset = CustomDataset(
    validation_image_ids, validation_labels, transformations
)
testing_dataset = CustomDataset(testing_image_ids, testing_labels, transformations)

In [None]:
# Only the training loader shuffles; validation and test batches keep the dataset order
train_dataloader = DataLoader(training_dataset, batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(
    validation_dataset, batch_size=batch_size, shuffle=False
)
test_dataloader = DataLoader(testing_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Mean-squared-error loss, optimised with plain SGD at the configured learning rate
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [None]:
def train(
    dataloader: DataLoader,
    model: ConvNet,
    loss_fn: nn.MSELoss,
    optimizer: torch.optim.SGD,
    epoch: int,
):
    """Train ``model`` for one pass over ``dataloader``.

    Batches are cast to float32 and moved to the module-level ``device``.
    Every fifth batch the current batch loss is printed.

    Args:
        dataloader (DataLoader): Yields ``(X, y)`` batches of images and targets.
        model (ConvNet): Network to optimise; switched to training mode.
        loss_fn (nn.MSELoss): Loss applied to predictions and targets.
        optimizer (torch.optim.SGD): Optimizer that updates the model parameters.
        epoch (int): Epoch number; currently unused, kept for the callers.

    Returns:
        float: Mean of the per-batch losses (0.0 if the dataloader yields no batch).
    """
    size = len(dataloader.dataset)
    model.train()

    train_loss = 0.0
    num_batches = 0

    for batch, (X, y) in enumerate(dataloader):
        # Cast before moving: float64 batches cannot be placed on some
        # backends (e.g. MPS). `.to` is used because the batches are already
        # tensors, so `torch.tensor(...)` would only re-wrap them.
        X = X.to(torch.float32).to(device)
        y = y.to(torch.float32).to(device)

        # Compute prediction error. The targets are reshaped to the shape of
        # the predictions; comparing (batch, 1) predictions with (batch,)
        # targets would broadcast to a (batch, batch) error matrix.
        pred = model(X)
        loss = loss_fn(pred, y.reshape(pred.shape))

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        train_loss += batch_loss
        num_batches += 1

        if batch % 5 == 0:
            current = (batch + 1) * len(X)
            print(f"loss: {batch_loss:>7f}  [{current:>5d}/{size:>5d}]")
            # wandb.log({"loss": batch_loss})

    # loss_fn already averages within a batch, so average over batches here
    return train_loss / max(num_batches, 1)

In [None]:
def test(dataloader, model, loss_fn, epoch):
    """Evaluate ``model`` on ``dataloader`` without updating its parameters.

    Batches are cast to float32 and moved to the module-level ``device``.
    The average loss is printed at the end.

    Args:
        dataloader (DataLoader): Yields ``(X, y)`` batches of images and targets.
        model (nn.Module): Network to evaluate; switched to evaluation mode.
        loss_fn: Loss applied to predictions and targets.
        epoch (int): Epoch number; currently unused, kept for the callers.

    Returns:
        float: Mean of the per-batch losses (0.0 if the dataloader yields no batch).
    """
    num_batches = len(dataloader)

    model.eval()

    test_loss = 0.0

    with torch.no_grad():
        for X, y in dataloader:
            # Cast before moving: float64 batches cannot be placed on some
            # backends (e.g. MPS). `.to` is used because the batches are
            # already tensors, so `torch.tensor(...)` would only re-wrap them.
            X = X.to(torch.float32).to(device)
            y = y.to(torch.float32).to(device)

            pred = model(X)

            # Reshape the targets to the shape of the predictions; comparing
            # (batch, 1) with (batch,) would broadcast to (batch, batch).
            test_loss += loss_fn(pred, y.reshape(pred.shape)).item()

    # loss_fn already averages within a batch, so average over batches here
    avg_val_loss = test_loss / max(num_batches, 1)
    # wandb.log({"val_loss": avg_val_loss})

    # report the average rather than the running sum, as the label says
    print(f"Test Error: \n Avg loss: {avg_val_loss:>8f} \n")

    return avg_val_loss

In [None]:
# Number of trainable parameters, via the helper defined earlier
print(f"Number of parameters in the model: {count_parameters(model)}")

In [None]:
# TODO:
#   add code here to track history of the training and validation loss
#   and then plot the loss over the epochs

# Use the epoch count from the hyper-parameter cell instead of a second hard-coded 5
epochs = num_epochs
for t in range(epochs):

    print(f"Epoch {t+1}\n-------------------------------")

    # one optimisation pass over the training set, then an evaluation on the validation set
    train(train_dataloader, model, loss_fn, optimizer, epoch=t + 1)
    test(validation_dataloader, model, loss_fn, epoch=t + 1)

print("Done!")