In [None]:
from zipfile import ZipFile
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
from torchvision import transforms
import torch.optim as optim
import numpy as np
from torch.utils.data import Subset
import matplotlib.pyplot as plt
import os
import cv2
import shutil

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.layers import Dense, Flatten, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

from tensorflow.keras import mixed_precision
from tensorflow.keras.layers import LayerNormalization, MultiHeadAttention, Add


In [None]:
# PatchEmbedding Class
class PatchEmbedding(layers.Layer):
    """Split a 4-D feature map into square patches and project each patch.

    Non-overlapping `patch_size` x `patch_size` windows are taken with
    `tf.image.extract_patches` (VALID padding, stride equal to the window
    size), flattened into a sequence, and passed through a `Dense` layer with
    `projection_dim` units.

    Args:
        patch_size: Edge length of each square patch.
        projection_dim: Number of units of the projection layer.
    """

    def __init__(self, patch_size, projection_dim):
        super().__init__()
        self.patch_size = patch_size
        self.projection_dim = projection_dim
        self.projection = tf.keras.layers.Dense(projection_dim)

    def call(self, cnn_features):
        """Return projected patch embeddings for `cnn_features`.

        The output is reshaped to (batch, rows * cols, patch_depth) before the
        projection, where rows/cols are the static grid dimensions reported by
        `extract_patches`.
        """
        edge = self.patch_size
        window = [1, edge, edge, 1]

        # Window and stride are identical, so the patches do not overlap.
        grid = tf.image.extract_patches(
            images=cnn_features,
            sizes=window,
            strides=window,
            rates=[1, 1, 1, 1],
            padding='VALID',
        )

        # The patch grid and depth come from the static shape; only the batch
        # dimension is read dynamically.
        rows, cols, depth = grid.shape[1], grid.shape[2], grid.shape[-1]
        n_samples = tf.shape(cnn_features)[0]

        tokens = tf.reshape(grid, (n_samples, rows * cols, depth))
        return self.projection(tokens)

# TransformerEncoderLayer Class
class TransformerEncoderLayer(layers.Layer):
    """One encoder block: normalised self-attention and MLP, each with a skip.

    Args:
        num_heads: Number of attention heads.
        embedding_dim: Passed to `MultiHeadAttention` as its key dimension and
            used as the output width of the MLP.
        mlp_dim: Hidden width of the MLP.
        dropout_rate: Rate of the dropout applied to the block output.
    """

    def __init__(self, num_heads, embedding_dim, mlp_dim, dropout_rate=0.1):
        super().__init__()
        self.layer_norm1 = layers.LayerNormalization()
        self.multi_head_attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embedding_dim
        )
        self.add1 = layers.Add()
        self.layer_norm2 = layers.LayerNormalization()
        feed_forward = [
            layers.Dense(mlp_dim, activation='relu'),
            layers.Dense(embedding_dim),
        ]
        self.mlp = models.Sequential(feed_forward)
        self.add2 = layers.Add()
        self.dropout = layers.Dropout(rate=dropout_rate)

    def call(self, x):
        """Apply attention and MLP sub-blocks to `x` and return the result."""
        # Self-attention on the normalised input, added back onto the raw input.
        normed = self.layer_norm1(x)
        attended = self.multi_head_attention(normed, normed)
        after_attention = self.add1([x, attended])

        # MLP on the normalised intermediate, added back onto the intermediate.
        mlp_out = self.mlp(self.layer_norm2(after_attention))
        combined = self.add2([after_attention, mlp_out])

        # Dropout is applied to the summed output of the whole block.
        return self.dropout(combined)

# CNN_EmotionClassifier Class (PyTorch)
class CNN_EmotionClassifier(nn.Module):
    """Convolutional classifier for single-channel images (PyTorch).

    Layout: three conv -> batch-norm -> ReLU stages (1 -> 32 -> 64 -> 128
    channels), one residual block at 128 channels, then two fully connected
    layers (128 * 6 * 6 -> 512 -> num_classes). Every stage is followed by an
    adaptive pooling step to 6x6, so the flattened size seen by `fc1` does not
    depend on the input resolution.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=7):
        super(CNN_EmotionClassifier, self).__init__()

        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(128)

        self.res_conv1 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1)
        self.res_bn1 = nn.BatchNorm2d(128)
        self.res_conv2 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1)
        self.res_bn2 = nn.BatchNorm2d(128)

        # forward() pools through `self.adaptive_pool`, which was never
        # created, so every forward pass raised AttributeError. The 6x6 target
        # matches the 128 * 6 * 6 input width of `fc1`.
        # NOTE(review): the original pooling type is not recoverable from this
        # file; average pooling is assumed - confirm against the intended design.
        self.adaptive_pool = nn.AdaptiveAvgPool2d((6, 6))

        self.fc1 = nn.Linear(128 * 6 * 6, 512)
        self.fc2 = nn.Linear(512, num_classes)

        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for input `x`.

        `x` must have one channel (conv1 is built with in_channels=1).
        """
        x = self.adaptive_pool(F.relu(self.bn1(self.conv1(x))))
        x = self.adaptive_pool(F.relu(self.bn2(self.conv2(x))))
        x = self.adaptive_pool(F.relu(self.bn3(self.conv3(x))))

        # Residual block: two conv/bn layers with an identity skip connection.
        residual = x
        x = F.relu(self.res_bn1(self.res_conv1(x)))
        x = self.res_bn2(self.res_conv2(x))
        x = F.relu(x + residual)

        x = self.adaptive_pool(x)
        # Flatten per sample; unlike view(-1, ...), a size mismatch surfaces as
        # a shape error in fc1 instead of silently changing the batch size.
        x = x.flatten(start_dim=1)

        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)

        return x

# CNN_Transformer_Model Class
class CNN_Transformer_Model(tf.keras.Model):
    """CNN feature extractor followed by a transformer encoder and classifier.

    Pipeline in `call`: `cnn` -> `PatchEmbedding` -> a stack of
    `TransformerEncoderLayer` blocks -> global average pooling over the token
    axis -> softmax `Dense` head.

    Args:
        cnn: Keras model/layer producing the 4-D feature map that is patched.
        image_size: Edge length used only to derive `num_patches`.
        patch_size: Edge length of the square patches cut from the feature map.
        num_heads: Attention heads per transformer block.
        projection_dim: Patch embedding width.
        mlp_dim: Hidden width of each transformer MLP.
        num_classes: Number of softmax outputs.
        num_transformer_layers: Number of transformer blocks.
        dropout_rate: Dropout rate passed to each transformer block.
        **kwargs: Forwarded to `tf.keras.Model` (e.g. `name`), which is needed
            when the model is rebuilt from a saved config.
    """

    def __init__(self, cnn, image_size, patch_size, num_heads, projection_dim, mlp_dim, num_classes, num_transformer_layers, dropout_rate=0.1, **kwargs):
        super(CNN_Transformer_Model, self).__init__(**kwargs)
        self.cnn = cnn
        # Constructor arguments are kept so get_config() can reproduce them.
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_heads = num_heads
        self.projection_dim = projection_dim
        self.mlp_dim = mlp_dim
        self.num_classes = num_classes
        self.num_transformer_layers = num_transformer_layers
        self.dropout_rate = dropout_rate

        # Informational only: call() takes the patch count from the feature map.
        self.num_patches = (image_size // patch_size) ** 2
        self.patch_embedding = PatchEmbedding(patch_size, projection_dim)
        self.transformer_layers = [TransformerEncoderLayer(num_heads, projection_dim, mlp_dim, dropout_rate) for _ in range(num_transformer_layers)]
        self.global_pool = layers.GlobalAveragePooling1D()
        self.dense = layers.Dense(num_classes, activation='softmax')

    def call(self, inputs):
        """Return class probabilities for a batch of `inputs`."""
        cnn_features = self.cnn(inputs)  # Pass inputs through CNN
        x = self.patch_embedding(cnn_features)
        for layer in self.transformer_layers:
            x = layer(x)
        x = self.global_pool(x)
        return self.dense(x)

    def get_config(self):
        """Return a serialisable config.

        `cnn` is a model object rather than a plain value, so it is serialised
        explicitly; Keras cannot derive a config for it automatically, which
        `.keras` saving relies on.
        """
        config = super().get_config()
        config.update({
            "cnn": tf.keras.utils.serialize_keras_object(self.cnn),
            "image_size": self.image_size,
            "patch_size": self.patch_size,
            "num_heads": self.num_heads,
            "projection_dim": self.projection_dim,
            "mlp_dim": self.mlp_dim,
            "num_classes": self.num_classes,
            "num_transformer_layers": self.num_transformer_layers,
            "dropout_rate": self.dropout_rate,
        })
        return config

    @classmethod
    def from_config(cls, config, custom_objects=None):
        """Rebuild the model from `get_config()` output."""
        config = dict(config)
        cnn_config = config.get("cnn")
        # The nested CNN arrives as a serialised dict and must be revived first.
        if isinstance(cnn_config, dict):
            config["cnn"] = tf.keras.utils.deserialize_keras_object(
                cnn_config, custom_objects=custom_objects
            )
        return cls(**config)

# Instantiate the CNN models

# PyTorch classifier; the Keras model built in this cell does not use it.
# UNCOMMENT the .to('cuda') part of this line if you're using GPU
cnn_model = CNN_EmotionClassifier(num_classes=7)#.to('cuda')
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn_model.parameters(), lr=0.001)
# Set the PyTorch model to evaluation mode
cnn_model.eval()

# Keras convolutional feature extractor used by the CNN-Transformer model.
# - The input shape is declared with an explicit Input instead of
#   `input_shape=` on the first layer.
# - Rescaling maps the raw pixel values loaded from disk ([0, 255]) to [0, 1];
#   feeding unscaled pixels into the network inflates the initial loss.
# It must keep a 4-D (batch, height, width, channels) output because
# PatchEmbedding extracts spatial patches from it (no Flatten/Dense head).
cnn_model_tf = tf.keras.models.Sequential([
    tf.keras.Input(shape=(224, 224, 3)),
    layers.Rescaling(1.0 / 255),
    layers.Conv2D(5, 5, activation='relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Conv2D(10, 5, activation='relu'),
    layers.MaxPooling2D(pool_size=(2, 2))
])

# Instantiate the CNN-Transformer model

# Hyperparameters
image_size = 224   # edge length of the input images
patch_size = 32    # patch edge expressed in input-image pixels
num_heads = 8
projection_dim = 128
mlp_dim = 256
num_transformer_layers = 4
dropout_rate = 0.1
num_classes = 7

# Patches are cut from the CNN feature map, not from the raw image. The feature
# map is much smaller than the image (53x53 for a 224x224 input with the two
# 5x5 conv + 2x2 pool stages of cnn_model_tf), so a 32-pixel patch would fit
# only once and the transformer would see a single token. Scale the patch edge
# to the feature map so the grid matches (image_size // patch_size) ** 2.
feature_map_size = cnn_model_tf.output_shape[1]
feature_patch_size = max(1, patch_size * feature_map_size // image_size)

cnn_transformer_model = CNN_Transformer_Model(
    cnn=cnn_model_tf,
    image_size=feature_map_size,
    patch_size=feature_patch_size,
    num_heads=num_heads,
    projection_dim=projection_dim,
    mlp_dim=mlp_dim,
    num_classes=num_classes,
    num_transformer_layers=num_transformer_layers,
    dropout_rate=dropout_rate
)

# Compile the model
cnn_transformer_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

from google.colab import drive

drive.mount('/content/gdrive')
dataset_path = '/content/gdrive/MyDrive/APS360 Project/Data/train'


def _load_split(subset):
    """Load one side ("training" or "validation") of the 80/20 split.

    Both sides use the same seed so the split is consistent between calls.
    The result is cached and prefetched.
    """
    split = tf.keras.preprocessing.image_dataset_from_directory(
        dataset_path,
        validation_split=0.2,
        subset=subset,
        seed=123,
        image_size=(224,224),
        batch_size=32,
        label_mode='categorical'
    )
    # NOTE(review): cache() with no filename keeps the decoded images in
    # memory - confirm the dataset fits in RAM at this resolution.
    return split.cache().prefetch(tf.data.experimental.AUTOTUNE)


# Load and prepare the dataset
train_ds = _load_split("training")
val_ds = _load_split("validation")

# Train the model
# - EarlyStopping patience must be smaller than the epoch count, otherwise the
#   callback can never fire (patience=10 with epochs=10 was a no-op).
# - The best-epoch checkpoint goes to its own file so the final save below does
#   not overwrite it with the last-epoch weights.
history = cnn_transformer_model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3),
        tf.keras.callbacks.ModelCheckpoint('cnn_transformer_model_best.keras', save_best_only=True),
        tf.keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=1)
    ]
)

# Evaluate the model (weights as of the last completed epoch)
final_val_loss, final_val_accuracy = cnn_transformer_model.evaluate(val_ds)
print(f"Final Validation Loss: {final_val_loss:.4f}, Final Validation Accuracy: {final_val_accuracy:.4f}")

# Save the final model; the best checkpoint is kept separately in
# 'cnn_transformer_model_best.keras'.
cnn_transformer_model.save('cnn_transformer_model.keras')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Mounted at /content/gdrive
Found 28710 files belonging to 7 classes.
Using 22968 files for training.
Found 28710 files belonging to 7 classes.
Using 5742 files for validation.
Epoch 1/10




[1m 48/718[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m48:07[0m 4s/step - accuracy: 0.1706 - loss: 330.6751