<a href="https://colab.research.google.com/github/Rahul20037237/Build_your_own_NN/blob/main/Vit_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
# Print how many physical GPU devices TensorFlow reports for this runtime.
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available: 1


In [None]:
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import layers,Sequential
import tensorflow as tf
from PIL import Image
import timeit
from matplotlib import pyplot as plt

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

# Patch Embedding Layer
class PatchEmbedding(layers.Layer):
    """Convert an image batch into a sequence of embedded patch tokens.

    A convolution whose kernel size and stride both equal ``patch_size``
    projects each non-overlapping tile to an ``embed_size`` vector.  A
    trainable class token is prepended to the sequence and a trainable
    position embedding is added to every token.

    Args:
        patch_size: Side length of each square patch.
        embed_size: Width of every output token.
        img_size: Side length of the (square) input images; it fixes the
            length of the position embedding.
    """

    def __init__(self, patch_size=16, embed_size=768, img_size=224):
        super().__init__()
        self.proj = layers.Conv2D(
            embed_size,
            kernel_size=patch_size,
            strides=patch_size,
            padding="VALID",
            use_bias=False,
        )
        self.patch_size = patch_size
        self.embed_size = embed_size
        # Patches per side, squared.
        self.num_patches = (img_size // patch_size) ** 2
        self.cls_token = self.add_weight(
            name="cls_token",
            shape=(1, 1, embed_size),
            initializer="zeros",
            trainable=True,
        )
        # One position vector per patch plus one for the class token.
        self.pos_embedding = self.add_weight(
            name="pos_embedding",
            shape=(1, self.num_patches + 1, embed_size),
            initializer="random_normal",
            trainable=True,
        )

    def call(self, inputs):
        """Return tokens of shape (batch, num_patches + 1, embed_size)."""
        n = tf.shape(inputs)[0]
        # Flatten the spatial grid produced by the convolution into a sequence.
        tokens = tf.reshape(self.proj(inputs), [n, -1, self.embed_size])
        cls = tf.broadcast_to(self.cls_token, [n, 1, self.embed_size])
        sequence = tf.concat([cls, tokens], axis=1)
        return sequence + self.pos_embedding

# Transformer Encoder Block
class TransferEncoder(layers.Layer):
    """One encoder block: self-attention then a dense layer.

    Each sub-layer is followed by dropout, a residual add and layer
    normalization, in that order.

    Args:
        embed_size: Width of the token embeddings; also the width of the
            dense sub-layer.
        num_heads: Number of attention heads.
    """

    def __init__(self, embed_size=768, num_heads=8):
        super().__init__()
        self.norm1 = layers.LayerNormalization()
        # ``key_dim`` is the size of EACH head, not of all heads combined, so
        # the embedding width is split across the heads.  Passing
        # ``embed_size`` directly would make the attention projections
        # ``num_heads`` times wider than the embedding.
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=max(1, embed_size // num_heads),
        )
        self.dropout1 = layers.Dropout(0.1)
        self.add1 = layers.Add()
        self.norm2 = layers.LayerNormalization()
        self.dense = layers.Dense(embed_size, activation="relu")
        self.dropout2 = layers.Dropout(0.1)
        self.add2 = layers.Add()

    def call(self, x):
        """Apply the block to ``x`` and return a tensor of the same shape."""
        # Self-attention: query, value and key are all the input sequence.
        attn_output = self.attention(x, x, x)
        attn_output = self.dropout1(attn_output)
        x = self.add1([x, attn_output])
        x = self.norm1(x)

        # Position-wise dense sub-layer with its own residual connection.
        dense_output = self.dense(x)
        dense_output = self.dropout2(dense_output)
        x = self.add2([x, dense_output])
        x = self.norm2(x)
        return x

# Transformer Encoder Stack
class TransformerEncoder(layers.Layer):
    """Sequential stack of ``depth`` ``TransferEncoder`` blocks.

    Args:
        embed_size: Embedding width handed to every block.
        num_heads: Attention head count handed to every block.
        depth: Number of blocks in the stack.
    """

    def __init__(self, embed_size, num_heads, depth):
        super().__init__()
        blocks = []
        for _ in range(depth):
            blocks.append(TransferEncoder(embed_size, num_heads))
        self.encoder_layers = blocks

    def call(self, x):
        """Feed ``x`` through every block in order and return the result."""
        out = x
        for block in self.encoder_layers:
            out = block(out)
        return out

# Classification Head
class ClassificationHead(layers.Layer):
    """Pool the token sequence and project it to class logits.

    Args:
        emb_size: Accepted for signature compatibility; not used by the
            layers created here.
        n_classes: Number of output logits.
    """

    def __init__(self, emb_size: int = 768, n_classes: int = 100):
        super().__init__()
        self.pool = layers.GlobalAveragePooling1D()
        self.norm = layers.LayerNormalization(epsilon=1e-6, name="layer_norm")
        self.dense = layers.Dense(n_classes)

    def call(self, x):
        """Average over the sequence axis, normalize, and return logits."""
        pooled = self.pool(x)
        normalized = self.norm(pooled)
        return self.dense(normalized)

# Vision Transformer Model
class ViT(tf.keras.Model):
    """Vision Transformer: patch embedding, encoder stack, classification head.

    Args:
        in_channels: Accepted for signature compatibility; not used here.
        patch_size: Side length of each square patch.
        emb_size: Token embedding width.
        img_size: Side length of the square input images.
        depth: Number of encoder blocks.
        n_classes: Number of output logits.
    """

    def __init__(self, in_channels: int = 3, patch_size: int = 16, emb_size: int = 768, img_size: int = 224, depth: int = 12, n_classes: int = 100):
        super().__init__()
        self.patch_embedding = PatchEmbedding(patch_size, emb_size, img_size)
        # The encoder always uses 8 attention heads.
        self.transformer_encoder = TransformerEncoder(emb_size, num_heads=8, depth=depth)
        self.classification_head = ClassificationHead(emb_size, n_classes)

    def call(self, inputs):
        """Run the three stages in order and return the logits."""
        stages = (
            self.patch_embedding,
            self.transformer_encoder,
            self.classification_head,
        )
        x = inputs
        for stage in stages:
            x = stage(x)
        return x

class WarmupCosineDecayScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Linear warm-up to ``initial_lr`` followed by cosine decay.

    For ``step < warmup_steps`` the rate is ``initial_lr * step / warmup_steps``.
    Afterwards it follows ``CosineDecay(initial_lr, decay_steps)`` evaluated at
    ``step - warmup_steps``.

    Args:
        initial_lr: Learning rate reached at the end of warm-up.
        decay_steps: Number of steps over which the cosine decay runs,
            counted from the end of warm-up.
        warmup_steps: Number of warm-up steps.
    """

    def __init__(self, initial_lr, decay_steps, warmup_steps):
        super().__init__()
        self.initial_lr = initial_lr
        self.decay_steps = decay_steps
        self.warmup_steps = warmup_steps
        self.cosine_decay = tf.keras.optimizers.schedules.CosineDecay(initial_lr, decay_steps)

    def __call__(self, step):
        """Return the learning rate for optimizer step ``step``."""
        step = tf.cast(step, tf.float32)
        warmup_lr = self.initial_lr * (step / tf.cast(self.warmup_steps, tf.float32))
        # The cosine curve starts counting where the warm-up ends.
        cosine_lr = self.cosine_decay(step - self.warmup_steps)
        return tf.where(step < self.warmup_steps, warmup_lr, cosine_lr)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized.

        Without this, saving an optimizer or model that uses the schedule
        fails, because the base class does not know how to describe it.
        """
        return {
            "initial_lr": self.initial_lr,
            "decay_steps": self.decay_steps,
            "warmup_steps": self.warmup_steps,
        }

In [None]:
def get_optimizer(initial_lr=1e-4, decay_steps=10000, warmup_steps=1000, weight_decay=1e-4):
    """Build an AdamW optimizer driven by a warm-up + cosine-decay schedule.

    The defaults reproduce the previously hard-coded configuration, so
    ``get_optimizer()`` behaves as before.

    Args:
        initial_lr: Peak learning rate reached after warm-up.
        decay_steps: Length of the cosine decay, in optimizer steps.
        warmup_steps: Length of the linear warm-up, in optimizer steps.
        weight_decay: Weight-decay coefficient passed to AdamW.

    Returns:
        A ``tf.keras.optimizers.AdamW`` instance.
    """
    lr_schedule = WarmupCosineDecayScheduler(
        initial_lr=initial_lr,
        decay_steps=decay_steps,
        warmup_steps=warmup_steps,
    )
    return tf.keras.optimizers.AdamW(learning_rate=lr_schedule, weight_decay=weight_decay)
def preprocess_data(image, label):
    """Resize ``image`` to 112x112 and divide its values by 255.

    The label is returned unchanged.
    """
    # 112x112 is half of the 224x224 size used elsewhere in this notebook.
    resized = tf.image.resize(image, (112, 112))
    return resized / 255.0, label

def prepare_dataset(dataset, batch_size, shuffle=True, shuffle_buffer=1000):
    """Preprocess, cache, optionally shuffle, batch and prefetch a dataset.

    Args:
        dataset: ``tf.data.Dataset`` yielding ``(image, label)`` pairs.
        batch_size: Number of examples per batch.
        shuffle: Whether to shuffle the cached examples.  Defaults to True,
            which matches the previous behaviour; pass False for evaluation
            data where shuffling serves no purpose.
        shuffle_buffer: Size of the shuffle buffer.

    Returns:
        The prepared ``tf.data.Dataset``.
    """
    # Parallel map keeps element order (map is deterministic by default) while
    # letting tf.data overlap the per-image resize work.
    dataset = dataset.map(preprocess_data, num_parallel_calls=tf.data.AUTOTUNE)
    # NOTE(review): cache() keeps every preprocessed image in memory —
    # confirm this fits the runtime for the datasets used.
    dataset = dataset.cache()
    if shuffle:
        # Shuffle after caching so each epoch gets a fresh order.
        dataset = dataset.shuffle(shuffle_buffer)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [None]:
# Load CIFAR-100 and wrap both splits in tf.data pipelines.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar100.load_data()
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
# prepare_dataset resizes to 112x112, rescales, caches, shuffles and batches.
train_dataset = prepare_dataset(train_dataset, batch_size=32)
test_dataset = prepare_dataset(test_dataset, batch_size=32)

# Initialize Model
# img_size=112 matches the resize done in preprocess_data.
vit = ViT(patch_size=16, emb_size=512, img_size=112, depth=8, n_classes=100)
# NOTE(review): get_optimizer's schedule covers 1000 warm-up + 10000 decay
# steps, while the progress output shows 1563 steps per epoch (31260 steps for
# 20 epochs) — presumably the learning rate has finished decaying well before
# training ends; confirm this is intended.
optimizer = get_optimizer()
# from_logits=True because the classification head ends in a Dense layer with
# no activation.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Compile Model
vit.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"])

# Train Model
# The CIFAR-100 test split doubles as the validation set here.
vit.fit(train_dataset, validation_data=test_dataset, epochs=20)

Epoch 1/20
[1m   4/1563[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m14:08:26[0m 33s/step - accuracy: 0.0046 - loss: 5.4880

In [None]:
import pandas as pd
import numpy as np
import re

# Paths to the WFLW 98-point annotation lists (train and test splits) under
# /content/drive/MyDrive.
WFLW_annotations_path_train = '/content/drive/MyDrive/dataset/WFLW_annotations/list_98pt_rect_attr_train_test/list_98pt_rect_attr_train.txt'
WFLW_annotations_path_test = '/content/drive/MyDrive/dataset/WFLW_annotations/list_98pt_rect_attr_train_test/list_98pt_rect_attr_test.txt'
# Read each file fully into a list of raw text lines; the lines are parsed
# further down by process_data.
with open(WFLW_annotations_path_train, 'r') as f:
    train_data = f.readlines()
with open(WFLW_annotations_path_test, 'r') as f:
    test_data = f.readlines()
def process_data(data):
    """Parse WFLW annotation lines into per-sample tuples.

    Every non-blank line is split on whitespace and read positionally:

    * fields 0-195   -> 196 landmark values (98 points as x, y pairs),
    * fields 196-199 -> 4 bounding-box values,
    * fields 200-205 -> 6 integer attribute flags,
    * field  206     -> image path relative to the image directory.

    Blank or whitespace-only lines (for example a trailing newline at the end
    of the file) are skipped instead of crashing the parse.

    Args:
        data: Iterable of raw annotation lines.

    Returns:
        A list of ``(landmarks, bbox, attrib, image_name)`` tuples, where the
        first three items are NumPy arrays (float32, float32, int32) and the
        last is a string.

    Raises:
        ValueError: If a non-blank line has fewer than 207 fields, or a
            numeric field cannot be converted.
    """
    landmarks_end = 196  # 98 landmarks * 2 coordinates
    bbox_end = 200       # 4 bounding-box values
    attrib_end = 206     # 6 attribute flags; the image path follows

    processed_data = []
    for line_number, line in enumerate(data, start=1):
        parts = line.split()  # Split on any run of whitespace
        if not parts:
            continue
        if len(parts) <= attrib_end:
            raise ValueError(
                f"Annotation line {line_number} has {len(parts)} fields; "
                f"expected at least {attrib_end + 1}"
            )
        landmarks = np.array(parts[:landmarks_end], dtype=np.float32)
        bbox = np.array(parts[landmarks_end:bbox_end], dtype=np.float32)
        attrib = np.array(parts[bbox_end:attrib_end], dtype=np.int32)
        image_name = parts[attrib_end]
        processed_data.append((landmarks, bbox, attrib, image_name))
    return processed_data

# Parse both splits and wrap them in DataFrames, one row per annotation line.
train_processed = process_data(train_data)
test_processed = process_data(test_data)
# Each cell of Landmarks / bbox / Attrib holds a NumPy array; ImagePath is a string.
train_df=pd.DataFrame(train_processed, columns=['Landmarks','bbox','Attrib', 'ImagePath'])
test_df=pd.DataFrame(test_processed, columns=['Landmarks','bbox','Attrib', 'ImagePath'])
# Despite the label, head() prints the first five rows, followed by the shape
# of the first landmark array.
print("First row of training data:")
print(train_df.head(),train_df['Landmarks'][0].shape)
# Row/column counts of the train and test frames.
print(train_df.shape,test_df.shape)

First row of training data:
                                           Landmarks  \
0  [309.307, 538.369, 317.85733, 560.12085, 322.2...   
1  [579.003, 167.764, 579.68207, 179.84132, 580.3...   
2  [249.128, 175.463, 249.33041, 188.4176, 249.39...   
3  [812.989, 627.505, 813.7136, 633.25446, 814.45...   
4  [507.288, 280.026, 507.36008, 288.5583, 507.74...   

                           bbox              Attrib  \
0  [306.0, 308.0, 696.0, 870.0]  [0, 0, 1, 0, 0, 0]   
1   [586.0, 73.0, 746.0, 317.0]  [0, 0, 1, 0, 0, 0]   
2  [260.0, 104.0, 420.0, 330.0]  [0, 0, 1, 0, 0, 0]   
3  [809.0, 582.0, 901.0, 697.0]  [0, 0, 0, 0, 0, 1]   
4  [503.0, 212.0, 632.0, 379.0]  [0, 0, 0, 0, 0, 1]   

                                           ImagePath  
0     51--Dresses/51_Dresses_wearingdress_51_377.jpg  
1             19--Couple/19_Couple_Couple_19_340.jpg  
2  15--Stock_Market/15_Stock_Market_Stock_Market_...  
3       44--Aerobics/44_Aerobics_Aerobics_44_543.jpg  
4  5--Car_Accident/5_Car_Acci

In [None]:
import tensorflow as tf
import os
import numpy as np
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt

# Preprocessing function
def preprocess(file_path, landmark, attrib):
    """Load one image and rescale its landmarks to match the resized image.

    Args:
        file_path: Path of a JPEG file.
        landmark: Flat tensor of landmark coordinates in original-image
            pixels, ordered x0, y0, x1, y1, ...
        attrib: Attribute tensor; returned unchanged.

    Returns:
        ``(img, landmark, attrib)`` where ``img`` is a 224x224x3 float image
        with values divided by 255, and ``landmark`` has shape
        ``(num_landmarks, 2)`` holding 224-pixel-space coordinates divided by
        255.
    """
    # Load and decode the image.
    img = tf.io.read_file(file_path)
    img = tf.image.decode_jpeg(img, channels=3)

    # Remember the original size before resizing; it is needed to rescale the
    # landmark coordinates.
    original_size = tf.shape(img)
    h = tf.cast(original_size[0], tf.float32)
    w = tf.cast(original_size[1], tf.float32)

    # Resize to 224x224 and normalize pixel values to [0, 1].
    img = tf.image.resize(img, [224, 224])
    img = img / 255.0

    # Map landmark coordinates from the original image to the resized one:
    # x is scaled by 224 / width, y by 224 / height.
    landmark = tf.reshape(landmark, (-1, 2))
    scaling_factors = tf.stack([224.0 / w, 224.0 / h])
    landmark = landmark * scaling_factors

    # NOTE(review): the coordinates are in 224-pixel space at this point, yet
    # they are divided by 255 (giving a maximum of about 0.88), not by 224.
    # The divisor is kept as-is so existing consumers see the same values —
    # confirm whether 224 was intended.
    return img, landmark / 255, attrib

# Loading data function
def load_data(dir, annotation, batch_size=16):
    """Build a batched ``tf.data`` pipeline from an annotation DataFrame.

    Args:
        dir: Directory that the ``ImagePath`` entries are relative to.  (The
            name shadows the builtin but is kept because it is part of the
            call signature.)
        annotation: DataFrame with ``ImagePath``, ``Landmarks`` and ``Attrib``
            columns.
        batch_size: Number of samples per batch.  Defaults to 16, the value
            that was previously hard-coded.

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, landmarks, attributes)``
        batches in annotation order (no shuffling is applied).
    """
    image_paths = [os.path.join(dir, path) for path in annotation['ImagePath'].tolist()]
    # Every landmark row has the same length, so a dense tensor is enough; it
    # also keeps the static shape, which a ragged constant would drop.
    landmarks = tf.convert_to_tensor(annotation['Landmarks'].values.tolist(), dtype=tf.float32)
    attributes = tf.convert_to_tensor(annotation['Attrib'].values.tolist(), dtype=tf.int32)

    # Create TensorFlow dataset.  preprocess already returns float32 tensors,
    # so no further conversion pass is needed.
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, landmarks, attributes))
    dataset = dataset.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)

    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Example Usage
# Root directory that the ImagePath entries in the annotation frames are joined to.
img_dir = '/content/drive/MyDrive/dataset/WFLW_images'

# Assuming `train_df` and `test_df` are pandas DataFrames with required annotations
dataset = load_data(img_dir, train_df)
# NOTE: this rebinds `test_dataset`, which earlier held the CIFAR-100 test pipeline.
test_dataset = load_data(img_dir, test_df)

# Check the shape of a batch
for i, j,k in dataset.take(1):  # Take one batch
    print(i.shape, j.shape,k.shape)  # (images, landmarks, attributes)

(16, 224, 224, 3) (16, 98, 2) (16, 6)


In [None]:
# `dataset` yields (images, landmarks, attributes) triples; bounding boxes are
# not part of it, so unpacking four values would raise ValueError.  The box
# for the displayed sample is taken from `train_df` instead.
for images, landmarks, attributes in dataset.take(1):
    # Convert the image to uint8 for visualization
    img_array = np.uint8(images[0].numpy() * 255)
    image = Image.fromarray(img_array)  # Convert to PIL Image
    draw = ImageDraw.Draw(image)

    # load_data keeps annotation order and does not shuffle, so the first
    # sample of the first batch corresponds to the first row of train_df.
    # The box is stored in original-image pixels and must be scaled to the
    # 224x224 image exactly like the landmarks are in preprocess.
    with Image.open(os.path.join(img_dir, train_df['ImagePath'].iloc[0])) as original:
        orig_w, orig_h = original.size
    scale_x, scale_y = 224 / orig_w, 224 / orig_h
    x_min, y_min, x_max, y_max = (float(v) for v in train_df['bbox'].iloc[0])
    bbox = np.array(
        [x_min * scale_x, y_min * scale_y, x_max * scale_x, y_max * scale_y],
        dtype=np.float32,
    )

    # Draw the bounding box on the image (top-left, bottom-right)
    s = ((float(bbox[0]), float(bbox[1])), (float(bbox[2]), float(bbox[3])))
    draw.rectangle(s, outline="red", width=3)

    # preprocess divides the 224-pixel-space landmarks by 255; multiply by the
    # same constant to get pixel coordinates back before drawing.
    landmark_pixels = landmarks[0].numpy() * 255
    for (x, y) in landmark_pixels:
        x, y = float(x), float(y)
        # Draw a small circle for each landmark
        draw.ellipse([x - 2, y - 2, x + 2, y + 2], fill='blue', outline='blue')

    # Display the image with bounding box and landmarks
    plt.imshow(image)
    plt.axis("off")
    plt.show()

    # Print the bounding box and landmarks
    print(f"BBox: {bbox}")
    print(f"Landmarks: {landmarks[0]}")

    # Print dataset shapes
    print(f"Images shape: {images.shape}")  # Example: (batch_size, 224, 224, 3)
    print(f"Landmarks shape: {landmarks.shape}")
    print(f"BBox shape: {bbox.shape}")
    print(f"Attributes shape: {attributes.shape}")

In [None]:
import timeit
import tensorflow as tf
# Smoke-test PatchEmbedding on a single image resized to the layer's default
# 224x224 input size.  (A stray `timeit.timeit()` call was removed: with no
# arguments it only times an empty statement one million times and the result
# was discarded.)
img_path = "/content/gratisography-cool-cat-800x525.jpg"
img = tf.keras.preprocessing.image.load_img(img_path)
img_array = tf.keras.preprocessing.image.img_to_array(img)
img_array = tf.image.resize(img_array, [224, 224])
# Add the batch dimension expected by the layer.
img_array = tf.expand_dims(img_array, 0)
print(img_array.shape)
patch_emb=PatchEmbedding()(img_array)
print(patch_emb.shape)

(1, 224, 224, 3)
(1, 197, 768)


In [None]:
class PatchEmbedding(layers.Layer):
    """Split images into patches and embed them as a token sequence.

    The projection is a convolution with kernel size and stride equal to
    ``patch_size``.  A trainable class token is placed in front of the patch
    tokens and a trainable position embedding is added to the whole sequence.

    Args:
        patch_size: Side length of each square patch.
        embed_size: Width of every output token.
        img_size: Side length of the square input images.
    """

    def __init__(self, patch_size=16, embed_size=768,img_size=224):
        super().__init__()
        # Conv output size per side follows ((n + 2p - k) / s) + 1.
        self.proj = layers.Conv2D(
            embed_size,
            kernel_size=patch_size,
            strides=patch_size,
            padding="VALID",
            use_bias=False,
        )
        self.patch_size = patch_size
        self.embed_size = embed_size
        patches_per_side = img_size // patch_size
        self.num_patches = patches_per_side ** 2
        self.cls_token = self.add_weight(
            name="cls_token",
            shape=(1, 1, embed_size),
            initializer="zeros",
            trainable=True,
        )
        # The extra row is the position of the class token.
        self.pos_embedding = self.add_weight(
            name="pos_embedding",
            shape=(1, self.num_patches + 1, embed_size),
            initializer="random_normal",
            trainable=True,
        )

    def call(self, inputs):
        """Return tokens of shape (batch, num_patches + 1, embed_size)."""
        batch = tf.shape(inputs)[0]
        projected = self.proj(inputs)
        # Collapse the spatial grid into one sequence axis.
        tokens = tf.reshape(projected, [batch, -1, self.embed_size])
        cls = tf.broadcast_to(self.cls_token, [batch, 1, self.embed_size])
        return tf.concat([cls, tokens], axis=1) + self.pos_embedding

In [None]:
class TransferEncoder(layers.Layer):
    """One encoder block: self-attention then a dense layer.

    Each sub-layer is followed by dropout, a residual add and layer
    normalization, in that order.

    Args:
        embed_size: Width of the token embeddings; also the width of the
            dense sub-layer.
        num_heads: Number of attention heads.
        depth: Stored on the instance for reference; a single block does not
            use it.
    """

    def __init__(self, embed_size=768, num_heads=8, depth=12):
        super().__init__()
        self.embed_size = embed_size
        self.num_heads = num_heads
        self.depth = depth
        self.norm1 = layers.LayerNormalization()
        # ``key_dim`` is the size of EACH head, so the embedding width is
        # split across the heads; passing ``embed_size`` itself would make
        # the attention projections ``num_heads`` times wider than needed.
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=max(1, embed_size // num_heads),
        )
        self.dropout1 = layers.Dropout(0.1)
        self.add1 = layers.Add()
        self.norm2 = layers.LayerNormalization()
        self.dense = layers.Dense(embed_size, activation="relu")
        self.dropout2 = layers.Dropout(0.1)
        self.add2 = layers.Add()

    # No ``build`` override: the previous one assigned ``self.build = True``,
    # which replaced the method with a boolean.  The base class already
    # builds the sub-layers on first call.

    def call(self, x):
        """Apply the block to ``x`` and return a tensor of the same shape."""
        # Self-attention: query, value and key are all the input sequence.
        attn_output = self.attention(x, x, x)
        attn_output = self.dropout1(attn_output)
        x = self.add1([x, attn_output])
        x = self.norm1(x)

        # Position-wise dense sub-layer with its own residual connection.
        dense_output = self.dense(x)
        dense_output = self.dropout2(dense_output)
        x = self.add2([x, dense_output])
        x = self.norm2(x)
        return x

In [None]:
class TransformerEncoder(layers.Layer):
    """Chain of ``depth`` ``TransferEncoder`` blocks.

    Args:
        embed_size: Embedding width handed to every block.
        num_heads: Attention head count handed to every block.
        depth: Number of blocks to create.
    """

    def __init__(self, embed_size, num_heads, depth):
        super().__init__()
        stack = []
        for _ in range(depth):
            stack.append(TransferEncoder(embed_size, num_heads))
        self.encoder_layers = stack

    def call(self, x):
        """Pass ``x`` through each block in turn and return the final output."""
        hidden = x
        for encoder in self.encoder_layers:
            hidden = encoder(hidden)
        return hidden
# Run a single default-configured encoder block on the patch embeddings
# computed in an earlier cell and show the resulting shape.
tt=TransferEncoder()(patch_emb)
tt.shape

TensorShape([1, 197, 768])

In [None]:

class ClassificationHead(layers.Layer):
    """Average the token sequence, normalize it and project to class logits.

    Args:
        n_classes: Number of output logits.
    """

    def __init__(self, n_classes: int = 1000):
        super().__init__()
        head = Sequential()
        # Mean over the sequence axis collapses (batch, tokens, width) to
        # (batch, width).
        head.add(layers.Lambda(lambda x: tf.reduce_mean(x, axis=1)))
        head.add(layers.LayerNormalization(epsilon=1e-6, name="layer_norm"))
        head.add(layers.Dense(n_classes))
        self.model = head

    def call(self, x):
        """Return the logits produced by the pooled-and-normalized tokens."""
        logits = self.model(x)
        return logits

In [None]:
class ViT(tf.keras.Model):
    """Vision Transformer: patch embedding, encoder stack, classification head.

    Args:
        in_channels: Accepted for signature compatibility; the patch
            projection is not given a channel count, so it is not used here.
        patch_size: Side length of each square patch.
        emb_size: Token embedding width.
        img_size: Side length of the square input images.
        depth: Number of encoder blocks.
        n_classes: Number of output logits.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, in_channels: int = 3, patch_size: int = 16, emb_size: int = 768, img_size: int = 224, depth: int = 12, n_classes: int = 1000, **kwargs):
        super().__init__(**kwargs)
        # PatchEmbedding takes (patch_size, embed_size, img_size) and has no
        # channel parameter; passing ``in_channels`` positionally raised a
        # TypeError.  Keywords keep the arguments bound to the right names.
        self.patch_embedding = PatchEmbedding(
            patch_size=patch_size,
            embed_size=emb_size,
            img_size=img_size,
        )
        self.transformer_encoder = TransformerEncoder(emb_size, num_heads=8, depth=depth)
        # The head only needs the class count; passing ``emb_size`` as well
        # raised a TypeError with the single-parameter ClassificationHead.
        self.classification_head = ClassificationHead(n_classes=n_classes)

    def call(self, inputs):
        """Run the three stages in order and return the logits."""
        x = self.patch_embedding(inputs)
        x = self.transformer_encoder(x)
        x = self.classification_head(x)
        return x