In [1]:
import os
import sys
import numpy as np
from numpy import asarray,zeros
import pandas as pd 
from sklearn.model_selection import train_test_split
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import matplotlib.pyplot as plt
import copy
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
import transformers
from transformers import BertTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel, AutoConfig, AdamW, get_linear_schedule_with_warmup
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import timm

In [2]:
# Device selection.
# FORCE_CPU pins all work to the CPU; set it to False to pick a CUDA device when
# torch reports one as available.
# To restrict which GPUs are visible, set the CUDA_VISIBLE_DEVICES environment
# variable before starting the kernel rather than from inside the notebook.
FORCE_CPU = True
device = torch.device("cuda" if (not FORCE_CPU and torch.cuda.is_available()) else "cpu")
print("Using device", device)

Using device cpu


In [3]:
# Read the four saved arrays (images, text, labels, ids) from the data directory.
img_data, txt_data, labels_data, ids_data = (
    np.load(npy_path)
    for npy_path in (
        "../data/image_array.npy",
        "../data/text_array.npy",
        "../data/labels.npy",
        "../data/ids.npy",
    )
)
# Report each array's shape, in the same order as they were loaded.
for loaded_array in (img_data, txt_data, labels_data, ids_data):
    print(loaded_array.shape)

(11766, 2, 224, 224, 3)
(11766, 2)
(11766, 1)
(11766, 1)


In [4]:
# Convert the images from channels-last to channels-first layout:
#   (num_images, sources, width, height, num_channels)
#   -> (num_images, sources, num_channels, width, height)
# NOTE: Image data can only be converted to tensors inside the training loop
# when using a very small batch size.
num_images, sources, width, height, num_channels = img_data.shape
# The channel axis must be *moved*, not reinterpreted. np.reshape keeps the flat
# element order, so it would mix values from different pixels and channels;
# np.transpose permutes the axes so each pixel keeps its own channel values.
img_data_reshape = np.transpose(img_data, axes=(0, 1, 4, 2, 3))
assert img_data_reshape.shape == (num_images, sources, num_channels, width, height)
# Index 0 / 1 along the `sources` axis select the target / source image of each pair.
img_data_target = torch.tensor(img_data_reshape[:, 0])  # Don't convert to GPU
img_data_source = torch.tensor(img_data_reshape[:, 1])  # Don't convert to GPU
print('New Target Shape', img_data_target.shape)
print('New Source Shape', img_data_source.shape)

New Target Shape torch.Size([11766, 3, 224, 224])
New Source Shape torch.Size([11766, 3, 224, 224])


In [16]:
# Vision Model
class ViTBottom(nn.Module):
    """Feature extractor built from all but the last two child modules of a model.

    The retained children are chained, in their registration order, into a
    single ``nn.Sequential`` stored as ``self.features``.

    Only child *modules* are carried over; anything the original model applies
    in its own ``forward`` (or holds as a bare parameter) is not part of
    ``features``.
    NOTE(review): with the timm ViT used in this notebook the output is
    (batch, 196, 768), which suggests no class token is prepended and the
    positional embedding may be skipped -- confirm against timm's forward pass.

    Args:
        original_model: module whose children (minus the final two) are reused.
    """

    def __init__(self, original_model):
        super().__init__()
        kept_layers = list(original_model.children())[:-2]
        self.features = nn.Sequential(*kept_layers)

    def forward(self, x):
        """Run ``x`` through the retained layers and return the result."""
        return self.features(x)

In [17]:
# Build the ViT with pretrained=True, wrap it in ViTBottom (which drops its last
# two child modules), then print the full original architecture for reference.
vit_arch = 'vit_base_patch16_224'
pretrained_v = timm.create_model(vit_arch, pretrained=True)
vit_inside = ViTBottom(original_model=pretrained_v)
print(pretrained_v)

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU()
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn

In [18]:
# Display the truncated model's module tree to check which layers were kept.
vit_inside

ViTBottom(
  (features): Sequential(
    (0): PatchEmbed(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      (norm): Identity()
    )
    (1): Dropout(p=0.0, inplace=False)
    (2): Sequential(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU()
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
      (1): Block(
        (norm1): LayerNorm((768,), e

In [23]:
# Smoke test: push the first 32 source images through the truncated ViT.
sample_batch = img_data_source[:32]
out = vit_inside(sample_batch)
print(out.shape)
# Max-pool across dim 1: the kernel height equals the size of dim 1, so that
# dimension collapses to length 1 and is then squeezed away.
num_tokens = out.shape[1]
maxpool = nn.MaxPool2d(kernel_size=(num_tokens, 1))
pooled_out = maxpool(out).squeeze(dim=1)
print(pooled_out.shape)

torch.Size([32, 196, 768])
torch.Size([32, 768])
