In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
import numpy as np
import os

In [None]:
# Root folder of the dataset; each class is read from a sub-folder named after an
# entry of `classes`.
# NOTE(review): the path looks like a mounted Google Drive location (Colab) — confirm
# before running elsewhere.
dataset_path = "/content/drive/MyDrive/KTH_TIPS"
# Class (sub-folder) names. The list order defines the integer label assigned when
# images are loaded with enumerate(classes).
classes = ["aluminium_foil", "corduroy", "cotton", "linen", "sandpaper", "sponge", "styrofoam"]

In [None]:
from sklearn.model_selection import train_test_split
import shutil
import glob

# Derived split folders are created next to the raw class folders.
train_dir = os.path.join(dataset_path, "train")
val_dir = os.path.join(dataset_path, "val")

os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

for cls in classes:
    cls_path = os.path.join(dataset_path, cls)
    if not os.path.exists(cls_path):
        print(f" Missing class folder: {cls_path}")
        continue

    # glob returns files in arbitrary (filesystem-dependent) order. Sorting makes
    # the seeded split below pick the same files on every run.
    imgs = sorted(glob.glob(os.path.join(cls_path, "*.png")))
    if len(imgs) == 0:
        print(f"No images found in {cls_path}, skipping...")
        continue

    # Rebuild the per-class output folders from scratch. Copying on top of a
    # previous (possibly different) split would leave the same image in both
    # train/ and val/, i.e. leak validation data into training.
    for split_root in (train_dir, val_dir):
        split_cls_dir = os.path.join(split_root, cls)
        if os.path.isdir(split_cls_dir):
            shutil.rmtree(split_cls_dir)
        os.makedirs(split_cls_dir)

    print(f"Found {len(imgs)} images for '{cls}'")

    # Split safely only if more than 1 image
    if len(imgs) > 1:
        train_imgs, val_imgs = train_test_split(imgs, test_size=0.2, random_state=42)
    else:
        train_imgs, val_imgs = imgs, []

    # Copy to respective folders
    for t in train_imgs:
        shutil.copy(t, os.path.join(train_dir, cls))
    for v in val_imgs:
        shutil.copy(v, os.path.join(val_dir, cls))

print("\nDataset successfully split into 'train' and 'val' folders.")

Found 81 images for 'aluminium_foil'
Found 81 images for 'corduroy'
Found 81 images for 'cotton'
Found 81 images for 'linen'
Found 81 images for 'sandpaper'
Found 81 images for 'sponge'
Found 81 images for 'styrofoam'

Dataset successfully split into 'train' and 'val' folders.


In [None]:
# Input geometry and batching shared by both directory iterators.
img_size = (224, 224)
batch_size = 32

# Both generators only rescale pixel values by 1/255; no augmentation is applied.
train_datagen = ImageDataGenerator(rescale=1./255)
val_datagen = ImageDataGenerator(rescale=1./255)

# Options that are identical for the training and validation iterators.
_flow_options = dict(
    target_size=img_size,
    batch_size=batch_size,
    class_mode='categorical',
)

# Training batches are shuffled; validation batches keep directory order so that
# predictions can later be lined up with val_gen.classes.
train_gen = train_datagen.flow_from_directory(train_dir, shuffle=True, **_flow_options)
val_gen = val_datagen.flow_from_directory(val_dir, shuffle=False, **_flow_options)


Found 448 images belonging to 7 classes.
Found 119 images belonging to 7 classes.


In [None]:
from sklearn.model_selection import train_test_split
import cv2
from tensorflow.keras.utils import to_categorical
import glob

# --- Load every class image into memory as an RGB float array in [0, 1] ---
image_list = []
label_list = []

for idx, cls in enumerate(classes):
    folder = os.path.join(dataset_path, cls)
    # Sorted so the sample order (and therefore the seeded split below) does not
    # depend on the filesystem's listing order.
    for f in sorted(glob.glob(os.path.join(folder, "*.png"))):
        img = cv2.imread(f)
        if img is None:
            # cv2.imread returns None for unreadable files; skip them.
            continue
        # OpenCV decodes to BGR. Convert to RGB so these arrays use the same
        # channel order as the ImageDataGenerator batches.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # cv2.resize expects (width, height) whereas img_size follows the Keras
        # target_size convention (height, width).
        img = cv2.resize(img, (img_size[1], img_size[0]))
        image_list.append(img)
        label_list.append(idx)

if not image_list:
    # Fail with a clear message instead of an opaque error from train_test_split.
    raise ValueError(f"No readable .png images found under {dataset_path}")

data = np.array(image_list, dtype="float32") / 255.0
labels = np.array(label_list)

print(f"Loaded {len(data)} images from {len(classes)} classes.")

# --- STEP 4: Train-Test Split ---
# Stratified 80/20 split of the in-memory arrays. This split is independent of
# the train/ and val/ directories consumed by the directory iterators.
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, stratify=labels, random_state=42)
y_train = to_categorical(y_train, num_classes=len(classes))
y_test = to_categorical(y_test, num_classes=len(classes))
print("Training samples:", X_train.shape[0], "Testing samples:", X_test.shape[0])

Loaded 567 images from 7 classes.
Training samples: 453 Testing samples: 114


In [None]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from tensorflow.keras import layers, Model
from tensorflow.keras.applications import efficientnet
from tensorflow.keras.optimizers import Adam

# Hyper-parameters used by the GAN and backbone cells below.
IMG_SIZE = 224  # side length (pixels) of the square images used by the discriminator and EfficientNet input
BATCH_SIZE = 16  # number of noise vectors drawn per GAN step
EPOCHS = 30  # NOTE(review): not referenced in the visible cells (fit uses epochs=10) — confirm it is still needed
LATENT_DIM = 100  # length of the generator's input noise vector

def build_generator():
    """Build the GAN generator mapping a LATENT_DIM noise vector to a 224x224x3 image.

    A dense layer is reshaped to a 7x7x256 feature map and upsampled by four
    transposed convolutions (7 -> 14 -> 28 -> 56 -> 224). The final sigmoid
    keeps output pixels in [0, 1].

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential([
        # Explicit Input layer instead of the deprecated `input_shape=` kwarg on
        # the first layer (which triggers a Keras UserWarning).
        layers.Input(shape=(LATENT_DIM,)),
        layers.Dense(7*7*256, use_bias=False),
        layers.BatchNormalization(),
        layers.LeakyReLU(),
        layers.Reshape((7, 7, 256)),
        layers.Conv2DTranspose(128, (5,5), strides=(2,2), padding='same', use_bias=False), # Output: 14x14
        layers.BatchNormalization(),
        layers.LeakyReLU(),
        layers.Conv2DTranspose(64, (5,5), strides=(2,2), padding='same', use_bias=False), # Output: 28x28
        layers.BatchNormalization(),
        layers.LeakyReLU(),
        layers.Conv2DTranspose(32, (5,5), strides=(2,2), padding='same', use_bias=False), # Output: 56x56
        layers.BatchNormalization(),
        layers.LeakyReLU(),
        layers.Conv2DTranspose(3, (5,5), strides=(4,4), padding='same', use_bias=False, activation='sigmoid'), # Output: 224x224
    ])
    return model

generator = build_generator()

# --- Discriminator ---
def build_discriminator():
    """Build the GAN discriminator scoring an IMG_SIZE x IMG_SIZE x 3 image as real or fake.

    Two strided 5x5 convolutions (each followed by LeakyReLU and 30% dropout)
    downsample the image, then the flattened features feed a single sigmoid unit.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model with one sigmoid output.
    """
    model = tf.keras.Sequential([
        # Explicit Input layer instead of the deprecated `input_shape=` kwarg on
        # the first layer (which triggers a Keras UserWarning).
        layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3)),
        layers.Conv2D(64, (5,5), strides=(2,2), padding='same'),
        layers.LeakyReLU(),
        layers.Dropout(0.3),
        layers.Conv2D(128, (5,5), strides=(2,2), padding='same'),
        layers.LeakyReLU(),
        layers.Dropout(0.3),
        layers.Flatten(),
        layers.Dense(1, activation='sigmoid')
    ])
    return model

# Build and compile the stand-alone discriminator first, while its weights are
# still trainable; this compiled model is the one called via train_on_batch below.
discriminator = build_discriminator()
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# --- Combined GAN ---
# The discriminator is frozen before it is stacked on the generator; the intent is
# that gan.train_on_batch only updates the generator's weights.
# NOTE(review): this pattern relies on the trainable flag being captured when each
# model is compiled. Confirm that holds for the installed Keras version, otherwise
# the stand-alone discriminator may stop learning after this line.
discriminator.trainable = False
gan_input = layers.Input(shape=(LATENT_DIM,))
fake_img = generator(gan_input)
gan_output = discriminator(fake_img)
gan = models.Model(gan_input, gan_output)
gan.compile(optimizer='adam', loss='binary_crossentropy')

# --- Train GAN briefly for data augmentation ---
# One pass of 100 alternating discriminator / generator updates.
for epoch in range(1):
    for step in range(100):
        # Sample latent vectors and synthesise a fake batch. verbose=0 stops
        # predict() from printing a progress bar on every single step.
        noise = np.random.normal(0, 1, (BATCH_SIZE, LATENT_DIM))
        gen_imgs = generator.predict(noise, verbose=0)
        # Real batch from the directory iterator; the class labels are not needed.
        real_imgs, _ = next(train_gen)

        # Labels: 1 for real images, 0 for generated ones. Sized from the actual
        # batches because the iterator's last batch may be smaller.
        real_y = np.ones((real_imgs.shape[0], 1))
        fake_y = np.zeros((gen_imgs.shape[0], 1))

        # Train discriminator
        d_loss_real = discriminator.train_on_batch(real_imgs, real_y)
        d_loss_fake = discriminator.train_on_batch(gen_imgs, fake_y)

        # Train generator: fresh noise labelled as "real" so that the stacked
        # model's loss rewards fooling the discriminator. The latent size comes
        # from LATENT_DIM rather than a hard-coded 100.
        noise = np.random.normal(0, 1, (BATCH_SIZE, LATENT_DIM))
        g_loss = gan.train_on_batch(noise, np.ones((BATCH_SIZE, 1)))

print("✅ GAN training done — augmented data ready.")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 298ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 151ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 226ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 252ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [None]:
def build_cnn_branch(input_shape=(img_size[0], img_size[1], 3)):
    """Create the small convolutional feature extractor named "CNN_Branch".

    Three 3x3 ReLU convolutions with 32, 64 and 128 filters; the first two are
    each followed by 2x2 max-pooling. The result is global-average-pooled and
    projected to a 256-dimensional ReLU embedding.

    Args:
        input_shape: shape of one input image, without the batch dimension.

    Returns:
        A Keras ``Model`` mapping an image tensor to a 256-d feature vector.
    """
    inputs = layers.Input(shape=input_shape)

    features = inputs
    # Conv + pool stages; the last conv stage below is deliberately not pooled.
    for n_filters in (32, 64):
        features = layers.Conv2D(n_filters, (3,3), activation='relu', padding='same')(features)
        features = layers.MaxPooling2D(2)(features)
    features = layers.Conv2D(128, (3,3), activation='relu', padding='same')(features)

    pooled = layers.GlobalAveragePooling2D()(features)
    embedding = layers.Dense(256, activation='relu')(pooled)
    return Model(inputs, embedding, name="CNN_Branch")

cnn_branch = build_cnn_branch()

In [None]:
# EfficientNetB0 backbone without its classification head (include_top=False),
# initialised with ImageNet weights and fixed to IMG_SIZE x IMG_SIZE x 3 inputs.
base = EfficientNetB0(include_top=False, weights='imagenet', input_shape=(IMG_SIZE,IMG_SIZE,3))
base.trainable = False  # freeze for initial training

# 3) Manual ViT block utilities
class ViTBlock(layers.Layer):
    """Transformer encoder block: self-attention and an MLP, each with a residual connection.

    Layer normalisation is applied after each residual sum. One Dropout layer
    is shared by the attention and MLP residual paths.

    Args:
        num_heads: number of attention heads.
        projection_dim: per-head key size and output width of the MLP; it must
            equal the token embedding size so the residual sums are valid.
        mlp_dim: hidden width of the MLP.
        dropout: dropout rate applied to both residual branches.
        **kwargs: standard ``Layer`` arguments (e.g. ``name``, ``dtype``).
    """

    def __init__(self, num_heads, projection_dim, mlp_dim, dropout=0.1, **kwargs):
        # Forward name/dtype/etc. to the base Layer, as AddPositionEmbedding does.
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can reproduce the layer.
        self.num_heads = num_heads
        self.projection_dim = projection_dim
        self.mlp_dim = mlp_dim
        self.dropout_rate = dropout

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=projection_dim)
        self.norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.mlp = tf.keras.Sequential([
            layers.Dense(mlp_dim, activation='gelu'),
            layers.Dense(projection_dim),
        ])
        self.dropout = layers.Dropout(dropout)

    def call(self, x, training=False):
        """Apply attention and MLP sub-blocks to ``x``; dropout is active only when ``training`` is true."""
        # Self-attention: the token sequence is both query and value.
        attn = self.att(x, x)
        x = self.norm1(x + self.dropout(attn, training=training))
        mlp_out = self.mlp(x)
        x = self.norm2(x + self.dropout(mlp_out, training=training))
        return x

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and re-created."""
        config = super().get_config()
        config.update({
            "num_heads": self.num_heads,
            "projection_dim": self.projection_dim,
            "mlp_dim": self.mlp_dim,
            "dropout": self.dropout_rate,
        })
        return config

# Positional embedding layer that learns embeddings per token count
class AddPositionEmbedding(layers.Layer):
    """Add a trainable position embedding to a token sequence.

    The embedding weight is created in ``build`` with shape
    ``(1, input_shape[1], input_shape[2])`` and is broadcast over the batch
    dimension when added to the input.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        # input_shape: (batch_size, num_patches, embedding_dim)
        num_tokens = input_shape[1]
        embed_dim = input_shape[2]
        self.pos_emb = self.add_weight(
            name="pos_emb",
            shape=(1, num_tokens, embed_dim),
            initializer="random_normal",
            trainable=True,
        )

    def call(self, inputs):
        positioned = inputs + self.pos_emb
        return positioned

In [None]:
# Single image input shared by all three branches. Pixels arrive in [0, 1]
# because train_gen / val_gen are created with rescale=1./255.
inp = layers.Input(shape=(img_size[0], img_size[1], 3))

# CNN branch
cnn_feat = build_cnn_branch(input_shape=(img_size[0], img_size[1], 3))(inp) # Pass the input tensor to the built model

# EfficientNet shared feature map
# Keras EfficientNet models include their own input normalisation and expect raw
# pixel values in [0, 255]. Undo the generators' 1/255 rescaling for this branch
# only, so the ImageNet weights see the input range they were trained on.
eff_inp = layers.Rescaling(255.0, name="to_0_255_for_efficientnet")(inp)
feat_map = base(eff_inp)   # shape e.g., (None, 7,7,1280)

# EfficientNet pooling branch
eff_feat = layers.GlobalAveragePooling2D()(feat_map)
eff_feat = layers.Dense(256, activation='relu')(eff_feat)
eff_feat = layers.Dropout(0.3)(eff_feat)


# ViT branch: a 1x1 convolution projects every spatial location of the feature
# map to proj_dim channels, then the grid is flattened into a token sequence.
proj_dim = 64
vit_tokens = layers.Conv2D(proj_dim, kernel_size=1, activation='relu')(feat_map)
shape = tf.keras.backend.int_shape(vit_tokens)
seq_len = shape[1] * shape[2]
vit_tokens = layers.Reshape((seq_len, proj_dim))(vit_tokens)

# Add positional embeddings
vit_tokens = AddPositionEmbedding()(vit_tokens)

# Apply several ViT blocks
for _ in range(3):
    vit_tokens = ViTBlock(num_heads=4, projection_dim=proj_dim, mlp_dim=proj_dim*2)(vit_tokens)

# Pool and project
vit_feat = layers.GlobalAveragePooling1D()(vit_tokens)
vit_feat = layers.Dense(256, activation='relu')(vit_feat)
vit_feat = layers.Dropout(0.3)(vit_feat)

# --- Fusion ---
# Concatenate the three 256-d branch embeddings and classify with a small MLP head.
concat = layers.Concatenate()([cnn_feat, eff_feat, vit_feat])
x = layers.Dense(512, activation='relu')(concat)
x = layers.Dropout(0.4)(x)
out = layers.Dense(len(classes), activation='softmax')(x)

fusion_model = models.Model(inputs=inp, outputs=out, name="Fusion_CNN_Eff_ViT")
fusion_model.compile(optimizer=tf.keras.optimizers.Adam(1e-4), loss='categorical_crossentropy', metrics=['accuracy'])

fusion_model.summary()

In [None]:
# Fit the fused classifier for 10 epochs on the directory iterators: train_gen
# supplies the training batches and val_gen is passed as validation data.
# The returned History object is kept in `history`.
print("\nTraining fused model (CNN + EfficientNet + ViT)...")
history = fusion_model.fit(train_gen, validation_data=val_gen, epochs=10)


Training fused model (CNN + EfficientNet + ViT)...
Epoch 1/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 7s/step - accuracy: 0.8640 - loss: 0.3333 - val_accuracy: 0.9160 - val_loss: 0.2399
Epoch 2/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 7s/step - accuracy: 0.8627 - loss: 0.3192 - val_accuracy: 0.9328 - val_loss: 0.2385
Epoch 3/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 8s/step - accuracy: 0.8839 - loss: 0.2769 - val_accuracy: 0.8739 - val_loss: 0.3005
Epoch 4/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 8s/step - accuracy: 0.9125 - loss: 0.2630 - val_accuracy: 0.9160 - val_loss: 0.2250
Epoch 5/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 7s/step - accuracy: 0.8959 - loss: 0.2687 - val_accuracy: 0.8992 - val_loss: 0.2341
Epoch 6/10
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 7s/step - accuracy: 0.9131 - loss: 0.2377 - val_accuracy: 0.9328 - va

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# val_gen was created with shuffle=False, so prediction rows line up with val_gen.classes.
preds = fusion_model.predict(val_gen)
y_true = val_gen.classes
y_pred = np.argmax(preds, axis=1)

# Take the class names from the generator that produced y_true, ordered by their
# integer index rather than by dict insertion order.
class_names = [name for name, _ in sorted(val_gen.class_indices.items(), key=lambda item: item[1])]
# Passing the full label list keeps the report and matrix at one row per class
# even if a class is absent from both y_true and y_pred.
label_ids = list(range(len(class_names)))

print(classification_report(y_true, y_pred, labels=label_ids, target_names=class_names))
cm = confusion_matrix(y_true, y_pred, labels=label_ids)
print("Confusion matrix:\n", cm)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 5s/step
                precision    recall  f1-score   support

aluminium_foil       0.94      1.00      0.97        17
      corduroy       1.00      0.71      0.83        17
        cotton       0.93      0.82      0.88        17
         linen       0.84      0.94      0.89        17
     sandpaper       1.00      1.00      1.00        17
        sponge       0.77      1.00      0.87        17
     styrofoam       1.00      0.94      0.97        17

      accuracy                           0.92       119
     macro avg       0.93      0.92      0.91       119
  weighted avg       0.93      0.92      0.91       119

Confusion matrix:
 [[17  0  0  0  0  0  0]
 [ 0 12  0  0  0  5  0]
 [ 0  0 14  3  0  0  0]
 [ 0  0  1 16  0  0  0]
 [ 0  0  0  0 17  0  0]
 [ 0  0  0  0  0 17  0]
 [ 1  0  0  0  0  0 16]]
