# CNN-based feature fusion and quality monitoring

In this paper, a feature-level multi-sensor fusion method based on a 2D CNN is developed for quality monitoring, using the converted acoustic and photodiode images as the inputs. A 10-layer CNN was developed to identify the three part-quality levels by fusing the two types of sensing data. The proposed CNN model has 4 convolutional layers, 4 max-pooling layers, and 2 fully-connected layers. Additionally, a rectified linear unit (ReLU) activation function and a dropout strategy are applied after the first fully-connected layer to mitigate overfitting. The numbers of kernels in convolutional layers 1 to 4 are 16, 32, 32, and 64, respectively. The kernel size of each convolutional operation is 5 × 5 and the step size is 1. The kernel size of all the max-pooling layers is 2 × 2 and the step size is 2. The numbers of output features of the two fully-connected layers are 256 and 3, respectively. Note that the default slider values used later in this notebook (16, 32, 64, and 64 kernels, and a 128-unit first fully-connected layer) differ slightly from the values stated here.

<img src="./figures/CNN.png" alt="Sample Image" width="50%">

In [None]:
import torch
import torch.utils.data as Data
import torchvision
import torch.nn as nn
from torchvision import datasets,transforms,models
import time
import os
import matplotlib.pyplot as plt
import numpy as np
import cv2
import scipy.io as sio
from sklearn.model_selection import train_test_split
from PIL import Image
import pywt
from ipywidgets import interact, IntSlider
import ipywidgets as widgets
import warnings
from torchsummary import summary
from torch.utils.tensorboard import SummaryWriter
import torch.onnx
from tqdm import tqdm

# Silence UserWarning messages whose originating module matches
# 'torchvision.io.image'; warnings from other modules are still shown.
warnings.filterwarnings("ignore", category=UserWarning, module='torchvision.io.image')

# from jupyterthemes import get_themes
# import jupyterthemes as jt
# from jupyterthemes.stylefx import set_nb_theme
# !jt -t grade3 -T -N

# 1. Data Preprocessing

## Load microphone data

In [None]:
# Collect one [class_index, image_path] pair per microphone image.
# Every sub-directory of the microphone folder is treated as one class; the
# class index is the position of that sub-directory in the listing below.
all_path_microphone = []
signal_microphone = os.listdir("../image_data/Microphone")
# Remove the macOS '.DS_Store' entry by name rather than dropping whichever
# entry happens to be listed first: os.listdir() returns entries in arbitrary
# order, so slicing with [1:] could silently discard a real class folder (or
# keep '.DS_Store').  This matches how the photodiode folder is read.
signal_microphone = [x for x in signal_microphone if x != '.DS_Store']
for f, fsignal in enumerate(signal_microphone):
    filepath = "../image_data/Microphone" + "/" + fsignal
    # '.DS_Store' may also appear inside a class folder; it is not an image.
    filename = [file for file in os.listdir(filepath) if file != '.DS_Store']
    for fname in filename:
        ffpath = filepath + "/" + fname
        path = [f, ffpath]
        all_path_microphone.append(path)
# print(len(all_path_microphone))

def display_image_mcp(index):
    """Display the microphone image stored at position ``index``.

    The file path is taken from ``all_path_microphone[index][1]``; the image
    is read with OpenCV (read flag 0) and drawn with a gray colormap and no
    axes on a 10x6 inch figure.
    """
    image_file = all_path_microphone[index][1]
    pixels = cv2.imread(image_file, 0)

    plt.figure(figsize=(10, 6))
    plt.imshow(pixels, cmap='gray')
    plt.axis('off')  # no ticks or frame around the image
    plt.show()

# Interactive viewer: the slider selects an index into all_path_microphone
# (0 .. len - 1) and display_image_mcp is called with the selected value.
interact(display_image_mcp, index=IntSlider(min=0, max=len(all_path_microphone)-1, step=1, value=0,description='Microphone'))


## Load photodiode data

In [None]:
# Collect one [class_index, image_path] pair per photodiode image.
# Each sub-directory is one class; its index is its position in the filtered
# directory listing.  '.DS_Store' entries are skipped at both levels.
all_path_photodiode = []
signal_photodiode = [
    entry
    for entry in os.listdir("../image_data/Photodiode")
    if entry != '.DS_Store'
]

for f, fsignal in enumerate(signal_photodiode):
    filepath = os.path.join("../image_data/Photodiode", fsignal)
    all_path_photodiode.extend(
        [f, os.path.join(filepath, fname)]
        for fname in os.listdir(filepath)
        if fname != '.DS_Store'
    )

# all_path_photodiode = []
# signal_photodiode = os.listdir("../image_data/Photodiode")
# signal_photodiode = signal_photodiode[1:]
# for f, fsignal in enumerate(signal_photodiode):
#     filepath = "../image_data/Photodiode" + "/" + fsignal
#     filename = os.listdir(filepath)
#     for fname in filename:
#         ffpath = filepath + "/" + fname
#         path = [f, ffpath]
#         all_path_photodiode.append(path)
# # print(len(all_path_photodiode))

def display_image_phd(index):
    """Display the photodiode image stored at position ``index``.

    The file path is taken from ``all_path_photodiode[index][1]``; the image
    is read with OpenCV (read flag 0) and drawn with a gray colormap and no
    axes on a 10x6 inch figure.
    """
    image_file = all_path_photodiode[index][1]
    pixels = cv2.imread(image_file, 0)

    plt.figure(figsize=(10, 6))
    plt.imshow(pixels, cmap='gray')
    plt.axis('off')  # no ticks or frame around the image
    plt.show()
    
# Interactive viewer: the slider selects an index into all_path_photodiode
# (0 .. len - 1) and display_image_phd is called with the selected value.
interact(display_image_phd, index=IntSlider(min=0, max=len(all_path_photodiode)-1, step=1, value=0,description='Photodiode'))


In [None]:
# Build the paired microphone / photodiode tensors and the data loaders.
data_x1_list = []
data_x2_list = []
data_y1_list = []
data_y2_list = []

# Both path lists hold [class_index, image_path] entries and are walked in
# lockstep (zip stops at the shorter list).  A pair is kept only when the two
# entries carry the same class index.
for item1, item2 in zip(all_path_microphone, all_path_photodiode):
    if item1[0] != item2[0]:
        continue

    img1 = cv2.imread(item1[1], 0)
    img2 = cv2.imread(item2[1], 0)
    # cv2.imread returns None instead of raising when a file cannot be read;
    # fail here with the offending path rather than later inside np.stack.
    if img1 is None:
        raise ValueError(f"Could not read microphone image: {item1[1]}")
    if img2 is None:
        raise ValueError(f"Could not read photodiode image: {item2[1]}")

    data_x1_list.append(np.asarray(img1, dtype="float32"))
    data_x2_list.append(np.asarray(img2, dtype="float32"))
    data_y1_list.append(item1[0])
    data_y2_list.append(item2[0])

# Stack along a new sample axis and insert a channel axis at position 1.
data_x1 = np.stack(data_x1_list, axis=0)[:, np.newaxis, :, :]
data_y1 = np.stack(data_y1_list, axis=0)
data_x2 = np.stack(data_x2_list, axis=0)[:, np.newaxis, :, :]
data_y2 = np.stack(data_y2_list, axis=0)

# print(data_x1.shape)
# print(data_y1.shape)
# print(data_x2.shape)
# print(data_y2.shape)

# Scale pixel values by 1/255.
data_x1 = data_x1 / 255
data_x2 = data_x2 / 255

data_x1 = torch.from_numpy(data_x1)
data_y1 = torch.from_numpy(data_y1).long()
data_x2 = torch.from_numpy(data_x2)
data_y2 = torch.from_numpy(data_y2).long()

# Split both sensors in a single call so that sample k of X1_* and sample k of
# X2_* always come from the same original pair.  (Two separate calls only stay
# aligned as long as seed, test size and labels are kept identical by hand.)
(X1_train, X1_test,
 X2_train, X2_test,
 Y1_train, Y1_test,
 Y2_train, Y2_test) = train_test_split(data_x1,
                                       data_x2,
                                       data_y1,
                                       data_y2,
                                       test_size=0.2,
                                       random_state=999,
                                       stratify=data_y1)


# Each dataset item is (microphone image, photodiode image, label).
data1 = Data.TensorDataset(X1_train, X2_train, Y1_train)
data2 = Data.TensorDataset(X1_test, X2_test, Y1_test)
train_loader = Data.DataLoader(data1, batch_size=24,shuffle=True)
valid_loader = Data.DataLoader(data2, batch_size=24)
# print(len(data1))
# print(len(data2))

# 2. Build the CNN 

## Convolutional Neural Network (CNN) hyperparameters:
1. **Output Channels (out_channels):**
    1. Description: This parameter determines the number of filters (or kernels) used in each convolutional layer. Each filter extracts different features from the input image, so increasing the number of output channels generally allows the network to capture more complex features.
    2. Effect of Increasing:
       1. Increases the computational complexity of the model.
       2. Can capture more detailed features from the input, potentially improving model performance on complex tasks.
    3. Common Range: Typically starts from 16 and may go up to 128 or more in deeper layers.
2. **Kernel Size (kernel_size):**
    1. Description: The size of the filter that is applied to the input or the previous layer to produce a feature map. It's usually a square shape.
    2. Effect of Increasing:
        1. Larger kernels cover more area of the input image, capturing more global features.
        2. Can lead to more computational cost and potentially smoother features, but might lose small, detailed features.
    3. Common Range: Often 3x3, 5x5, sometimes 7x7.
3. **Stride (stride):**
    1. Description: The number of pixels the kernel moves across the image or feature map during the convolution operation. A stride of 1 moves the filter one pixel at a time.
    2. Effect of Increasing:
        1. Increases the downsampling effect, reducing the size of the output feature map faster.
        2. Reduces the overlap between receptive fields, potentially losing fine details.
    3. Common Range: 1 or 2. A stride of more than 2 is less common but might be used in very deep networks or specific architectures.
4. **Padding (padding):**
    1. Description: Padding adds zeros around the border of the input image or feature map. This allows the convolutional layer to produce feature maps that are the same size as the input, maintaining the spatial dimensions after convolution.
    2. Effect of Increasing:
        1. Ensures that the feature map does not shrink after applying the filter, which is especially important in deep networks to keep useful information.
        2. Allows the filter to properly operate on the elements at the edge of the input.
    3. Common Range: Often set to zero (valid padding) or to a value that makes the output size equal to the input size (same padding, often kernel_size/2 for odd kernels).

In [None]:
# class CNN(nn.Module):
#     def __init__(self):
#         super(CNN, self).__init__()   # inherit the parent class __init__ behaviour
#         ## First convolutional layer
#         self.conv1 = nn.Sequential(
#             # input [1,64,64]
#             nn.Conv2d(
#                 in_channels=1,    # depth (number of channels) of the input image
#                 out_channels=16,  # depth (number of channels) of the output
#                 kernel_size=5,    # 5x5 convolution kernel, i.e. the filter
#                 stride=1,         # the kernel slides over the image one pixel per step
#                 padding=2,        # pad the image border with zeros
#             ),
#             # after the convolution the output [16,64,64] goes to the pooling layer
#             nn.MaxPool2d(kernel_size=2)   # after pooling the output [16,32,32] goes to the next convolution
#         )
#         ## Second convolutional layer
#         self.conv2 = nn.Sequential(
#             nn.Conv2d(
#                 in_channels=16,    # same as above
#                 out_channels=32,
#                 kernel_size=5,
#                 stride=1,
#                 padding=2
#             ),
#             # after the convolution the output [32, 32, 32] goes to the pooling layer
#             nn.MaxPool2d(kernel_size=2)  # after pooling the output [32,16,16] goes to the output layer
#         )
#         ## Third convolutional layer
#         self.conv3 = nn.Sequential(
#             nn.Conv2d(
#                 in_channels=32,    # same as above
#                 out_channels=64,
#                 kernel_size=5,
#                 stride=1,
#                 padding=2
#             ),
#             # after the convolution the output [64, 16, 16] goes to the pooling layer
#             nn.MaxPool2d(kernel_size=2)  # after pooling the output [64,8,8] goes to the output layer
#         )
#           ## Fourth convolutional layer
#         self.conv4 = nn.Sequential(
#             nn.Conv2d(
#                 in_channels=64,    # same as above
#                 out_channels=64,
#                 kernel_size=5,
#                 stride=1,
#                 padding=2
#             ),
#             # after the convolution the output [64, 8, 8] goes to the pooling layer
#             nn.MaxPool2d(kernel_size=2)  # after pooling the output [64,4,4] goes to the output layer
#         )

#         ## Output layer

#         self.output = nn.Sequential(
#             nn.Linear(in_features=64*2*2*2, out_features=128),
#             nn.ReLU(),
#             nn.Dropout(p=0.5),
#             nn.Linear(in_features=128, out_features=3)
#         )


#     def forward(self, x1, x2):           # [64×64×1]
#         x1 = self.conv1(x1)           # [64×64×16]
#         x1 = self.conv2(x1)           # [64×64×16]

#         x2 = self.conv1(x2)           # [64×64×16]
#         x2 = self.conv2(x2)           # [64×64×16] 


#         x = torch.cat((x1,x2),3)

#         x = self.conv3(x)
#         x = self.conv4(x)

#         x = x.view(x.size(0), -1)   # keep the batch dimension and flatten the remaining ones together [batch, 32*7*7]

#         output = self.output(x)     # output [50,10]
#         return output

# model = CNN()
# model

In [None]:
class CNN(nn.Module):
    """Two-input CNN that fuses both sensor images at feature level.

    Each input image goes through the shared ``conv1`` and ``conv2`` stages.
    The two resulting feature maps are concatenated along dimension 3, passed
    through ``conv3`` and ``conv4``, flattened, and classified into 3 classes
    by a two-layer fully connected head (ReLU and dropout after the first
    layer).

    Args:
        conv_params: Mapping with keys ``'conv1'`` .. ``'conv4'``; each value
            is a 4-item sequence ``(out_channels, kernel_size, stride,
            padding)`` for that convolution.
        input_size: ``(height, width)`` of each single-channel input image.
            It is used only to size the first fully connected layer.  With the
            default ``(32, 32)`` and size-preserving convolutions this yields
            ``c4_out * 4 * 2`` features, the value that used to be hard-coded.

    Raises:
        RuntimeError: Propagated from PyTorch if the chosen kernel, stride and
            padding values cannot be applied to an image of ``input_size``
            (for example when a feature map shrinks below the kernel size).
    """

    def __init__(self, conv_params, input_size=(32, 32)):
        super().__init__()
        # Unpack (out_channels, kernel_size, stride, padding) for each layer
        c1_out, c1_kernel, c1_stride, c1_padding = conv_params['conv1']
        c2_out, c2_kernel, c2_stride, c2_padding = conv_params['conv2']
        c3_out, c3_kernel, c3_stride, c3_padding = conv_params['conv3']
        c4_out, c4_kernel, c4_stride, c4_padding = conv_params['conv4']

        # Every stage is a convolution followed by 2x2 max-pooling
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, c1_out, c1_kernel, c1_stride, c1_padding),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(c1_out, c2_out, c2_kernel, c2_stride, c2_padding),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(c2_out, c3_out, c3_kernel, c3_stride, c3_padding),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(c3_out, c4_out, c4_kernel, c4_stride, c4_padding),
            nn.MaxPool2d(kernel_size=2)
        )

        # The flattened feature count depends on kernel size, stride and
        # padding, all of which are configurable.  Measure it with a dry run
        # on zeros instead of assuming one particular geometry, so that any
        # valid configuration produces a model whose forward pass works.
        with torch.no_grad():
            probe = torch.zeros(1, 1, *input_size)
            n_features = self._extract_features(probe, probe).size(1)

        self.output = nn.Sequential(
            nn.Linear(n_features, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 3)
        )

    def _extract_features(self, x1, x2):
        """Run the convolutional stages and return the flattened features."""
        x1 = self.conv1(x1)
        x1 = self.conv2(x1)

        x2 = self.conv1(x2)
        x2 = self.conv2(x2)

        # Feature-level fusion: join the two feature maps along dimension 3
        x = torch.cat((x1, x2), 3)
        x = self.conv3(x)
        x = self.conv4(x)

        # Keep the batch dimension, flatten everything else
        return x.view(x.size(0), -1)

    def forward(self, x1, x2):
        """Return the 3-class scores for a batch of image pairs."""
        return self.output(self._extract_features(x1, x2))


# Function to create a slider with customized width
def create_custom_slider(description, value, min, max, step):
    """Return an ``IntSlider`` with the given range, start value and label.

    The description width is set to ``'initial'`` so the label is not cut
    off, and the widget layout width is set to 50%.
    """
    slider_range = dict(
        value=value,
        min=min,
        max=max,
        step=step,
        description=description,
    )
    return widgets.IntSlider(
        **slider_range,
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='50%'),
    )

# Example usage in a model parameter context
def create_param_sliders():
    """Create four sliders for each of the four convolutional layers.

    Returns:
        dict: layer name (``'conv1'`` .. ``'conv4'``) mapped to a list of
        sliders in the order out_channels, kernel_size, stride, padding.
    """
    # Start values per layer: out_channels, kernel_size, stride, padding
    defaults = {
        'conv1': [16, 5, 1, 2],
        'conv2': [32, 5, 1, 2],
        'conv3': [64, 5, 1, 2],
        'conv4': [64, 5, 1, 2]
    }
    # (label suffix, minimum, maximum, step) for each slider, in order
    slider_specs = [
        ('out_channels', 16, 128, 16),
        ('kernel_size', 3, 7, 1),
        ('stride', 1, 3, 1),
        ('padding', 0, 4, 1),
    ]
    return {
        layer: [
            create_custom_slider(f'{layer} {label}', start, low, high, step)
            for start, (label, low, high, step) in zip(values, slider_specs)
        ]
        for layer, values in defaults.items()
    }

# Create the sliders and show them: the sliders of each layer are grouped in
# their own VBox, and those boxes are stacked inside an outer VBox.
sliders = create_param_sliders()
ui = widgets.VBox([widgets.VBox(s) for s in sliders.values()])
display(ui)

def on_button_clicked(b):
    """Rebuild the global ``model`` from the current slider values.

    ``b`` is the argument supplied by the click callback; it is not used.
    The new model is printed after it has been created.
    """
    global model
    conv_params = {}
    for layer_name, layer_sliders in sliders.items():
        conv_params[layer_name] = [slider.value for slider in layer_sliders]
    model = CNN(conv_params)
    print(model)
    
# Button that rebuilds the model from the current slider values.
button = widgets.Button(description="Update Model")
button.on_click(on_button_clicked)
display(button)

# Build the model once with the initial slider values so that `model` exists
# even if the button is never clicked (e.g. when the notebook is executed with
# "Run All"); the cells below use `model` directly.
on_button_clicked(button)

def confusion_matrix(labels, preds, conf_matrix):
    """Add one batch of predictions to ``conf_matrix`` and return it.

    For every (label, prediction) pair the cell at row ``label`` and column
    ``prediction`` is increased by one.  ``conf_matrix`` is updated in place;
    pairing stops at the shorter of the two sequences.
    """
    for row, col in zip(labels, preds):
        conf_matrix[row, col] = conf_matrix[row, col] + 1
    return conf_matrix

In [None]:
# Print a layer-by-layer summary for two inputs of size (1, 32, 32) each.
# `model` is the global assigned by on_button_clicked.
summary(model, input_size=[(1, 32, 32), (1, 32, 32)])  # Specify the input size of the network

In [None]:
# writer = SummaryWriter('runs/model_visualization')
# dummy_input1 = torch.randn(1, 1, 32, 32)
# dummy_input2 = torch.randn(1, 1, 32, 32)
# writer.add_graph(model, (dummy_input1, dummy_input2))
# writer.close()
# torch.onnx.export(model, (dummy_input1, dummy_input2), "model.onnx")
# !netron model.onnx

# 3. Model training

In [None]:
# Train the fusion CNN and evaluate it on the validation loader every epoch.
loss_f = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
epochs = 30
Train_epoch, Test_epoch, Train_accuracy, Test_accuracy, Loss = [], [], [], [], []
Train_time, Test_time = [], []
time0 = time.time()
Predict_label, True_label = [], []

for epoch in range(epochs):
    time1 = time.time()
    Train_epoch.append(epoch + 1)
    running_loss, running_correct = 0, 0
    print("Epoch {}/{}".format(epoch + 1, epochs))
    print("-" * 10)

    # Training mode: dropout is active.
    model.train()
    for x1_batch, x2_batch, y_batch in tqdm(train_loader, desc="Training"):
        # Model and data are assumed to be on the same device.
        y_pred = model(x1_batch, x2_batch)
        loss = loss_f(y_pred, y_batch)
        pred = torch.max(y_pred, 1)[1]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # loss is the mean over the batch; weight it by the batch size so the
        # division by the dataset size below gives a true per-sample average.
        running_loss += loss.item() * y_batch.size(0)
        running_correct += torch.sum(pred == y_batch).item()

    train_loss = running_loss / len(train_loader.dataset)
    train_acc = running_correct / len(train_loader.dataset) *100
    print(f"Train Loss: {train_loss:.4f}, Train ACC: {train_acc:.4f}%")

    Train_time.append(time.time() - time1)
    Loss.append(train_loss)
    # train_acc is already a percentage; store it on the same scale as
    # Test_accuracy instead of multiplying by 100 a second time.
    Train_accuracy.append(train_acc)

    # Validation: switch dropout off and skip gradient tracking.
    test_loss, test_correct = 0, 0
    conf_matrix = torch.zeros(3, 3)
    model.eval()
    with torch.no_grad():
        for x1_batch, x2_batch, y_batch in tqdm(valid_loader, desc="Validation"):
            outputs = model(x1_batch, x2_batch)
            pred = torch.max(outputs, 1)[1]
            loss = loss_f(outputs, y_batch)

            # Labels and the confusion matrix are collected for the last epoch only.
            if epoch == epochs-1:
                Predict_label.append(pred.numpy())
                True_label.append(y_batch.numpy())
                conf_matrix = confusion_matrix(y_batch, pred, conf_matrix)

            test_loss += loss.item() * y_batch.size(0)
            test_correct += torch.sum(pred == y_batch).item()

    test_loss /= len(valid_loader.dataset)
    test_accuracy = test_correct / len(valid_loader.dataset) *100
    print(f"Valid Loss: {test_loss:.4f}, Valid ACC: {test_accuracy:.4f}%")

    Test_epoch.append(epoch + 1)
    Test_accuracy.append(test_accuracy)
    # NOTE(review): measured from the start of the epoch, so this value
    # includes the training time of the epoch - confirm this is intended.
    Test_time.append(time.time() - time1)

# The last batch may be smaller than the others, so the per-batch arrays can
# have different lengths; join them into one flat vector each before saving.
if Predict_label:
    Predict_label = np.concatenate(Predict_label)
    True_label = np.concatenate(True_label)

# Save results to files and print the confusion matrix
np.savetxt('CNN1-train_time_two_sensor_feature_micpho1_32.txt', Train_time, fmt="%.4f")
np.savetxt('CNN1-test_time_two_sensor_feature_micpho1_32.txt', Test_time, fmt="%.4f")
save_fn = 'CNN1_two_sensor_feature_micpho1_32.mat'
sio.savemat(save_fn, {'train_epoch': Train_epoch, 'train_accuracy': Train_accuracy,
                              'test_epoch': Test_epoch, 'test_accuracy': Test_accuracy,
                              'train_loss': Loss, 
                              'predict_label': Predict_label, 
                              'true_label': True_label})
print(conf_matrix)

# 4. Model performance

In [None]:
import seaborn as sn
import itertools
import pandas as pd

# Row-normalise the confusion matrix and draw it as an annotated heat map.
# Accept both the tensor produced by the training cell and the ndarray left
# behind by a previous run of this cell, so the cell can be executed again.
if isinstance(conf_matrix, torch.Tensor):
    conf_matrix = conf_matrix.numpy()
conf_matrix = conf_matrix.astype('float')
row_totals = conf_matrix.sum(axis=1)[:, np.newaxis]
# A row without any sample would divide 0 by 0 and show NaN; keep it at 0.
row_totals[row_totals == 0] = 1
conf_matrix = conf_matrix / row_totals
conf_matrix = np.around(conf_matrix, decimals=4)

plt.rc('font',family='Times New Roman',size=16)
plt.rcParams['xtick.direction'] = 'in'
plt.rcParams['ytick.direction'] = 'in'
# Rows are indexed by the true label and columns by the predicted label.
df_cm = pd.DataFrame(conf_matrix,
                     index = ["Low quality","Medium quality","High quality"],
                     columns = ["Low quality","Medium quality","High quality"])

plt.figure(figsize = (8,6))
sn.heatmap(df_cm, annot=True,annot_kws={"size": 24},cmap="Blues",fmt='.4f')
plt.gca().set_title('Confusion matrix',fontsize=24)
plt.gca().set_xlabel('Predict label',fontsize=24)
plt.gca().set_ylabel('True label',fontsize=24)
#plt.gca().xaxis.set_ticks_position('none') 
#plt.gca().yaxis.set_ticks_position('none')
plt.gca().set_yticklabels(plt.gca().get_yticklabels(), rotation=0)
plt.grid(True, which='minor', linewidth=0.8 , linestyle='-')
plt.subplots_adjust(top = 0.99, bottom = 0.12, right = 1.02, left = 0.12, hspace = 0, wspace = 0) # adjust the figure margins
plt.margins(0,0)
plt.show()

## Sample output

<img src="./figures/confusion_matrix.png" alt="Sample Image" width="50%">