In [None]:
# Mount Google Drive under /content/drive so later cells can read the dataset
# archive from it and write model weights back to it. force_remount=True is
# passed so the mount is redone even when re-running this cell.
from google.colab import drive 
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [None]:
# Copy the dataset archive from the mounted Drive into the working directory
# (-a keeps file attributes, -v echoes the copied path).
# NOTE(review): the relative 'drive/...' path assumes the working directory is
# the parent of the Drive mount point (/content) — confirm.
!cp -av 'drive/MyDrive/CV/final_project/data_pure.zip' './' 
# Extract quietly (-q); -u only writes files that are new or newer than what
# is already on disk, so re-running the cell is cheap.
!unzip -u -q "data_pure.zip"

'drive/MyDrive/CV/final_project/data_pure.zip' -> './data_pure.zip'


In [None]:
%tensorflow_version 2.x
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import pandas as pd
import numpy as np
import tensorflow.keras.layers as L
from tensorflow.keras.losses import MeanAbsoluteError

# Global configuration shared by the cells below.
seed = 1
base_dir = './'
image_size=224

# Seed the NumPy and TensorFlow global generators too: `seed` on its own only
# reaches the data generators, leaving weight initialisation and dropout
# unseeded.
np.random.seed(seed)
tf.random.set_seed(seed)

# Training metadata; the `id` column holds the image basename (no extension).
df = pd.read_csv(base_dir+"data/train.csv")
# Vectorised string concatenation instead of a Python-level row-wise apply.
# astype(str) keeps this working if the ids were parsed as numbers.
df['filename'] = df['id'].astype(str) + '.jpg'

def prep_fn(img):
    """Convert an image array to float32 and rescale it by 1/255.

    Passed as ``preprocessing_function`` to the image data generators, so
    pixel values in the 0-255 range end up in 0-1.
    """
    as_float = img.astype(np.float32)
    # A further shift to [-1, 1] via (img - 0.5) * 2 is left disabled.
    return np.divide(as_float, 255.0)

# Augmentation applied to training images only. Flips are disabled.
# NOTE(review): the shifts/rotation/zoom move image content but the (x, y)
# targets are passed through unchanged — confirm that is valid for what the
# targets represent.
data_gen_args = dict(preprocessing_function=prep_fn,
                     width_shift_range=0.2,
                     height_shift_range=0.2,
                     zoom_range=0.1,
                     rotation_range=20,
                     horizontal_flip=False,
                     vertical_flip=False,
                     validation_split=0.1)

train_datagen = ImageDataGenerator(**data_gen_args)
# Validation images are only rescaled, never augmented. The split fraction is
# read from data_gen_args so both generators always partition `df` identically.
val_datagen = ImageDataGenerator(preprocessing_function=prep_fn,
                                 validation_split=data_gen_args['validation_split'])

# Arguments common to both subsets; class_mode='raw' yields the numeric
# y_col values as-is, i.e. regression targets.
flow_args = dict(dataframe=df,
                 directory=base_dir+'data/train/',
                 x_col='filename',
                 y_col=['x','y'],
                 batch_size = 32,
                 seed=seed,
                 class_mode='raw',
                 target_size=(image_size,image_size))

train_generator = train_datagen.flow_from_dataframe(subset="training",
                                                    shuffle=True,
                                                    **flow_args)
# shuffle=False: validation must be evaluated on the same images in the same
# order every epoch, otherwise a truncated validation pass scores a different
# random subset each time and the early-stopping signal becomes noisy.
val_generator = val_datagen.flow_from_dataframe(subset="validation",
                                                shuffle=False,
                                                **flow_args)

Found 6750 validated image filenames.
Found 750 validated image filenames.


In [None]:
# Test-set metadata, resolved against base_dir like the training CSV.
df_test = pd.read_csv(base_dir+"data/imagenames.csv")
# Vectorised string concatenation instead of a row-wise apply.
df_test['filename'] = df_test['id'].astype(str) + '.jpg'
# Placeholder targets: class_mode='raw' needs numeric y_col columns to exist.
df_test['x']=0
df_test['y']=0
test_datagen = ImageDataGenerator(preprocessing_function=prep_fn)
# shuffle=False keeps predictions aligned with the row order of df_test.
# target_size follows image_size so it cannot drift from the model's input.
test_generator = test_datagen.flow_from_dataframe(dataframe=df_test,
                                                directory=base_dir+'data/test/',
                                                x_col='filename',
                                                y_col=['x','y'],
                                                batch_size = 32,
                                                shuffle=False,
                                                class_mode='raw',
                                                target_size=(image_size,image_size))

Found 1200 validated image filenames.


In [None]:
# Vision Transformer hyper-parameters.
patch_size = 14                # side length, in pixels, of each square patch
# Patches per image: patches along one side, squared.
num_patches = (image_size // patch_size) * (image_size // patch_size)
projection_dim = 64            # width of every patch embedding
num_heads = 4                  # attention heads per transformer block
# Hidden widths of the MLP inside each transformer block.
transformer_units = [2 * projection_dim, projection_dim]
transformer_layers = 5         # number of stacked transformer blocks

In [None]:
def mlp(x, hidden_units, dropout_rate):
    """Stack one GELU Dense layer plus Dropout per entry of ``hidden_units``.

    Args:
        x: Input tensor.
        hidden_units: Iterable of Dense layer widths, applied in order.
        dropout_rate: Rate used by the Dropout following every Dense layer.

    Returns:
        The tensor produced by the final Dropout (``x`` itself when
        ``hidden_units`` is empty).
    """
    out = x
    for width in hidden_units:
        dense = L.Dense(width, activation = tf.nn.gelu)
        drop = L.Dropout(dropout_rate)
        out = drop(dense(out))
    return out

In [None]:
class Patches(L.Layer):
    """Cut a batch of images into non-overlapping square patches.

    Each patch is flattened, so the output has shape
    ``(batch_size, num_patches, patch_dims)``.

    Args:
        patch_size: Side length of each square patch, in pixels.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, patch_size, **kwargs):
        super(Patches, self).__init__(**kwargs)
        self.patch_size = patch_size

    def call(self, images):
        # The batch size is only known at run time, so read it dynamically.
        batch_size = tf.shape(images)[0]
        # Stride equal to the patch size gives non-overlapping patches;
        # 'VALID' drops any partial patch at the right/bottom border.
        patches = tf.image.extract_patches(
            images = images,
            sizes = [1, self.patch_size, self.patch_size, 1],
            strides = [1, self.patch_size, self.patch_size, 1],
            rates = [1, 1, 1, 1],
            padding = 'VALID',
        )
        patch_dims = patches.shape[-1]
        # Collapse the patch grid into one axis:
        # (batch_size, num_patches, patch_dims).
        return tf.reshape(patches, [batch_size, -1, patch_dims])

    def get_config(self):
        """Return the constructor arguments so Keras can serialise/clone the layer."""
        config = super(Patches, self).get_config()
        config.update({'patch_size': self.patch_size})
        return config

In [None]:
from tensorflow.keras.optimizers import Adam

In [None]:
class PatchEncoder(L.Layer):
    """Project flattened patches linearly and add a learned position embedding.

    Args:
        num_patches: Number of patches per image; also the size of the
            position-embedding table.
        projection_dim: Width of the projected patch embedding.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, num_patches, projection_dim, **kwargs):
        super(PatchEncoder, self).__init__(**kwargs)
        self.num_patches = num_patches
        # Kept only so get_config() can reproduce the constructor call.
        self.projection_dim = projection_dim
        self.projection = L.Dense(units = projection_dim)
        self.position_embedding = L.Embedding(
            input_dim = num_patches, output_dim = projection_dim
        )

    def call(self, patch):
        # One embedding row per patch index, broadcast across the batch.
        positions = tf.range(start = 0, limit = self.num_patches, delta = 1)
        encoded = self.projection(patch) + self.position_embedding(positions)
        return encoded

    def get_config(self):
        """Return the constructor arguments so Keras can serialise/clone the layer."""
        config = super(PatchEncoder, self).get_config()
        config.update({
            'num_patches': self.num_patches,
            'projection_dim': self.projection_dim,
        })
        return config

In [None]:
def convolution_block(img_input):
    """Five conv-conv-pool stages followed by Flatten and a 512-unit Dense.

    Every stage applies two same-padded ReLU convolutions and one max-pool
    whose stride equals its window size.

    Args:
        img_input: Image tensor fed to the first convolution.

    Returns:
        The output tensor of the final ``Dense(512, relu)`` layer.
    """
    # (filters, conv kernel, conv-name prefix, pool window, pool layer name).
    # Pool names are listed verbatim because 'block2pool' has no underscore.
    stages = (
        (32,  (5, 5), 'block1', (3, 3), 'block1_pool'),
        (64,  (3, 3), 'block2', (3, 3), 'block2pool'),
        (128, (3, 3), 'block3', (3, 3), 'block3_pool'),
        (256, (3, 3), 'block4', (2, 2), 'block4_pool'),
        (256, (3, 3), 'block5', (2, 2), 'block5_pool'),
    )

    features = img_input
    for filters, kernel, prefix, pool_window, pool_name in stages:
        for conv_idx in (1, 2):
            features = L.Conv2D(
                filters,
                kernel,
                activation='relu',
                padding='same',
                name='{}_conv{}'.format(prefix, conv_idx),
            )(features)
        features = L.MaxPooling2D(pool_window, strides=pool_window, name=pool_name)(features)

    flat = L.Flatten()(features)
    return L.Dense(512,activation='relu')(flat)

In [None]:
def vision_transformer():
    """Build the (uncompiled) Vision Transformer regression model.

    Reads the module-level hyper-parameters ``image_size``, ``patch_size``,
    ``num_patches``, ``projection_dim``, ``num_heads``, ``transformer_units``
    and ``transformer_layers``.

    Returns:
        A ``tf.keras.Model`` mapping ``(image_size, image_size, 3)`` images to
        two unbounded outputs.
    """
    inputs = L.Input(shape = (image_size, image_size, 3))

    # Split the image into flattened patches, then embed them with positions.
    patches = Patches(patch_size)(inputs)
    encoded_patches = PatchEncoder(num_patches, projection_dim)(patches)

    # Stack of pre-norm Transformer blocks.
    for _ in range(transformer_layers):
        # Layer normalization 1.
        x1 = L.LayerNormalization(epsilon = 1e-6)(encoded_patches)

        # Multi-head self-attention (query and value are the same tensor).
        attention_output = L.MultiHeadAttention(
            num_heads = num_heads, key_dim = projection_dim, dropout = 0.1
        )(x1, x1)

        # Skip connection 1.
        x2 = L.Add()([attention_output, encoded_patches])

        # Layer normalization 2.
        x3 = L.LayerNormalization(epsilon = 1e-6)(x2)

        # MLP.
        x3 = mlp(x3, hidden_units = transformer_units, dropout_rate = 0.1)

        # Skip connection 2.
        encoded_patches = L.Add()([x3, x2])

    # Reduce (batch, num_patches, projection_dim) to (batch, projection_dim).
    # No class token is prepended to the sequence, so index 0 is just the
    # first patch; reading only that token would bias the head towards a
    # single image position. Average over all patch tokens instead.
    # NOTE: this changes the readout relative to the token-0 variant, so
    # weights trained with that variant should be retrained.
    representation = L.LayerNormalization(epsilon = 1e-6)(encoded_patches)
    representation = L.GlobalAveragePooling1D()(representation)

    # Regression head. (Features from convolution_block(inputs) could be
    # concatenated with vit_features here for a hybrid CNN + ViT variant.)
    vit_features = L.Dense(128,activation='relu')(representation)
    out = L.Dense(2)(vit_features)

    # Create the model.
    model = tf.keras.Model(inputs = inputs, outputs = out)

    return model

In [None]:
# Instantiate the ViT and attach the training objective: Adam with a 1e-3
# learning rate minimising mean absolute error.
model = vision_transformer()
model.compile(
    loss=MeanAbsoluteError(),
    optimizer=Adam(learning_rate=1e-3),
)

In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model_17"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_23 (InputLayer)           [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
patches_22 (Patches)            (None, None, 588)    0           input_23[0][0]                   
__________________________________________________________________________________________________
patch_encoder_20 (PatchEncoder) (None, 256, 256)     216320      patches_22[0][0]                 
__________________________________________________________________________________________________
layer_normalization_204 (LayerN (None, 256, 256)     512         patch_encoder_20[0][0]           
___________________________________________________________________________________________

In [None]:
with tf.device('/device:GPU:0'):
    # Uncomment to resume from previously saved weights.
    #model.load_weights("drive/MyDrive/CV/Model/vit")
    # Stop once val_loss has not improved for 3 epochs. restore_best_weights
    # rolls the model back to the best epoch; without it the model keeps the
    # weights of the last (worse) epoch, and those are what get saved and
    # used for prediction afterwards.
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3,
                                                restore_best_weights=True)
    # Take the step counts from the generators instead of hard-coding them, so
    # every training and validation batch is used exactly once per epoch even
    # if the batch size or dataset size changes.
    r=model.fit(train_generator, validation_data=val_generator,
                steps_per_epoch=len(train_generator),
                validation_steps=len(val_generator),
                epochs=100,callbacks=[callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


In [None]:
# Persist the trained weights (weights only, not the architecture) under the
# Drive folder, using the same path the commented-out load_weights call reads.
model.save_weights("drive/MyDrive/CV/Model/vit")

In [None]:
# Run inference over every batch of the test generator on the first GPU.
with tf.device('/device:GPU:0'):
    ypred = model.predict(test_generator)
# Display the prediction array's shape as a sanity check.
ypred.shape

(1200, 2)

In [None]:
# Store the two predicted output columns alongside the test ids.
df_test['x'] = ypred[:,0]
df_test['y'] = ypred[:,1]
# errors='ignore' makes the drop a no-op when the cell is re-run and the
# helper column is already gone.
df_test = df_test.drop(columns=['filename'], errors='ignore')
df_test.to_csv('pred_vit.csv',index=False)
# Preview the written rows; as the cell's last expression it is displayed.
df_test.head()