![mmeda2.jpg](mmeda2.jpg)

In [1]:
#! tar -xvf ../images.tar

In [2]:
# import tarfile
# my_tar = tarfile.open('../dataset/images.tar')
# my_tar.extractall('../dataset/images/') # specify which folder to extract to
# my_tar.close()

In [3]:
import torchvision.models as models
import torch
from torchvision import datasets, transforms
from PIL import Image
import os
import numpy as np

In [4]:
import torch.nn as nn
import torch.functional as F

#### Data Loading

In [5]:
# Side length, in pixels, of the square crops fed to the network.
input_size = 224
# Directory holding the extracted images (note the trailing slash).
# Alternative nested layout: '../dataset/images/images/'
data_dir = '../dataset/images/'

In [6]:
# Per-image preprocessing: a random crop resized to input_size x input_size,
# conversion to a tensor, then channel-wise normalisation.
# NOTE: RandomResizedCrop is stochastic, so two runs over the same file give
# different tensors. The mean/std values look like the usual ImageNet channel
# statistics -- confirm if that matters for your data.
data_transforms = transforms.Compose(
    [
        transforms.RandomResizedCrop(size=input_size),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# os.listdir returns entries in arbitrary order.
# NOTE(review): later cells pair image i with row i of X1 -- confirm that X1 was
# built using this same file ordering.
image_files = os.listdir(data_dir)
# Unused ImageFolder/DataLoader alternative:
# image_datasets = datasets.ImageFolder(data_dir, data_transforms)
# dataloaders = torch.utils.data.DataLoader(image_datasets, batch_size=batch_size, shuffle=True, num_workers=4)

In [7]:
# Load every file listed in image_files into a single tensor of shape
# (N, 3, input_size, input_size).
# Each transformed image is appended to a list and the list is stacked once at
# the end; concatenating onto a growing tensor inside the loop would re-copy all
# previously loaded images on every iteration.
image_batches = []
error_files = []  # names of files that could not be opened or transformed
for count, image_file in enumerate(image_files):
    try:
        # The context manager closes the underlying file handle promptly.
        with Image.open(os.path.join(data_dir, image_file)) as img:
            image_batches.append(data_transforms(img.convert('RGB')))
    except Exception:
        # Best-effort load: remember the offending file and carry on.
        # `except Exception` (not a bare `except:`) still lets
        # KeyboardInterrupt / SystemExit stop the cell.
        error_files.append(image_file)
    # Progress marker every 100 files (the very first file is not announced).
    if count and count % 100 == 0:
        print(count)

# NOTE(review): a skipped file shifts every later row up by one, which would
# misalign image_tensor with X1 -- check that error_files is empty.
image_tensor = torch.stack(image_batches)
del image_batches  # the stacked copy is all that is needed from here on

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900


In [8]:
# Sanity check: expect (number of images, 3, input_size, input_size).
image_tensor.shape

torch.Size([5000, 3, 224, 224])

In [9]:
# torch.save(image_tensor, "image_tensor.pkl")

In [10]:
# Location of the pre-computed embedding matrix
# (alternative: "./word2vec_emb_tensor.pkl").
X1_file = "../dataset/word2vec_emb_tensor.pkl"
# SECURITY: torch.load unpickles the file, which can execute arbitrary code --
# only load files that come from a trusted source.
X1 = torch.load(X1_file)
# Expect one row per book and one column per embedding dimension.
print(X1.shape)

torch.Size([5000, 100])


In [11]:
# Transposed view of X1: row j of X1_t is column j of X1.
X1_t = torch.transpose(X1, 0, 1)

In [12]:
# Convolution Encoder
class ConvEncoder(nn.Module):
    """Small CNN that turns a batch of RGB images into embedding vectors.

    Two conv + ReLU + 2x2 max-pool stages are followed by two ReLU-activated
    linear layers. Because the last layer is also passed through ReLU, the
    returned embedding is non-negative.

    Args:
        embedding_dim: size of the vector produced for each image.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        # Convolutional feature extractor (3 -> 16 -> 4 channels, 3x3 kernels).
        self.conv1 = nn.Conv2d(3, 16, 3, padding=1)
        self.conv2 = nn.Conv2d(16, 4, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        # 12544 = 4 channels * 56 * 56, i.e. what a 224x224 input becomes after
        # the two 2x2 poolings; other input sizes will not fit lin1.
        self.lin1 = nn.Linear(12544, 512)
        self.lin2 = nn.Linear(512, embedding_dim)

    def forward(self, x):
        """Encode `x` (batch, 3, 224, 224) into (batch, embedding_dim)."""
        feats = self.pool(torch.relu(self.conv1(x)))
        feats = self.pool(torch.relu(self.conv2(feats)))
        flat = torch.flatten(feats, start_dim=1)
        hidden = torch.relu(self.lin1(flat))
        return torch.relu(self.lin2(hidden))


In [13]:
# Encoder for real numbered matrix
class Encoder(nn.Module):
    """Two-layer MLP that compresses a real-valued vector into an embedding.

    Args:
        input_dim: length of each input vector.
        embedding_dim: length of the produced embedding.

    Attributes:
        emb: output of the most recent forward pass (None before the first one).
    """

    def __init__(self, input_dim, embedding_dim):
        super(Encoder, self).__init__()
        hidden_dim = input_dim // 2  # bottleneck at half the input width
        self.enc_linear1 = nn.Linear(input_dim, hidden_dim)
        self.enc_linear2 = nn.Linear(hidden_dim, embedding_dim)
        self.emb = None

    def forward(self, x):
        """Return the embedding of `x` and cache it on `self.emb`."""
        code = self.enc_linear2(torch.relu(self.enc_linear1(x)))
        # The same tensor is cached and returned; no ReLU after the last layer.
        self.emb = code
        return code
    

In [14]:
# Convolution Decoder
class ConvDecoder(nn.Module):
    """Maps an embedding back to a 3-channel 224x224 image.

    Two ReLU-activated linear layers expand the embedding to 12544 values,
    which are reshaped to a single-channel 112x112 map. The first transposed
    convolution upsamples it to 226x226 and the second trims it to 224x224.
    No activation is applied to the final output.

    Args:
        embedding_dim: length of each input embedding.
    """

    def __init__(self, embedding_dim):
        super().__init__()
        self.lin1 = nn.Linear(embedding_dim, 512)
        self.lin2 = nn.Linear(512, 12544)  # 12544 = 1 * 112 * 112
        self.deconv1 = nn.ConvTranspose2d(1, 16, 3, stride=2, output_padding=1)
        self.deconv2 = nn.ConvTranspose2d(16, 3, 3, stride=1, padding=2, output_padding=0)

    def forward(self, x):
        """Decode `x` (batch, embedding_dim) into (batch, 3, 224, 224)."""
        hidden = torch.relu(self.lin1(x))
        grid = torch.relu(self.lin2(hidden)).view(-1, 1, 112, 112)
        upsampled = torch.relu(self.deconv1(grid))
        return self.deconv2(upsampled)


In [15]:
# Decoder for real numbered matrix
class Decoder(nn.Module):
    """Two-layer MLP that expands an embedding back to the input width.

    Args:
        input_dim: length of each reconstructed vector.
        embedding_dim: length of each incoming embedding.
    """

    def __init__(self, input_dim, embedding_dim):
        super(Decoder, self).__init__()
        hidden_dim = input_dim // 2
        # The attributes keep their `enc_` prefix so state_dict keys stay stable.
        self.enc_linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.enc_linear2 = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        """Return the reconstruction of `x`; no activation on the output layer."""
        hidden = torch.relu(self.enc_linear1(x))
        return self.enc_linear2(hidden)

In [16]:
class Arch(nn.Module):
    """Joint autoencoder over an image batch and a real-valued matrix.

    Each row entity has an image and a matrix row. The two are encoded
    separately, concatenated, and fused by a linear layer into one row
    embedding. A batch of matrix columns is encoded separately.

    Args:
        embedding_dim: size of every embedding produced by the encoders.
        book_size: length of a matrix column (input width of the column branch).
        word2vec_size: length of a matrix row (input width of the row branch).
    """

    def __init__(self, embedding_dim, book_size, word2vec_size):
        super(Arch, self).__init__()
        self.conv_encoder = ConvEncoder(embedding_dim)
        self.encoder_row = Encoder(word2vec_size, embedding_dim)
        self.encoder_col = Encoder(book_size, embedding_dim)
        self.conv_decoder = ConvDecoder(embedding_dim)
        self.decoder_row = Decoder(word2vec_size, embedding_dim)
        self.decoder_col = Decoder(book_size, embedding_dim)
        # Fuses [image embedding | row embedding] back down to embedding_dim.
        self.intermediate_lin1 = nn.Linear(embedding_dim * 2, embedding_dim)

    def forward(self, matrices): # replace with one cell recon
        """Run one forward pass.

        Args:
            matrices: sequence `[image batch, matrix row batch, matrix column
                batch]`. The entity of interest must be the row entity, e.g.
                for books x images the books are the rows.

        Returns:
            Tuple `(X0_prime, X1_prime, m_row_emb, X1_row_prime, X1_col_prime)`:
            the image reconstruction from the fused embedding; the matrix block
            rebuilt as fused row embeddings times column embeddings; the fused
            row embedding; the row reconstruction from the row-branch embedding;
            and the column reconstruction from the column-branch embedding.
        """
        conv_code = self.conv_encoder(matrices[0])
        row_code = self.encoder_row(matrices[1])
        col_code = self.encoder_col(matrices[2])

        # Fuse the two views of each row entity into a single embedding.
        fused = self.intermediate_lin1(torch.cat((conv_code, row_code), dim=1))

        X0_prime = self.conv_decoder(fused)
        X1_row_prime = self.decoder_row(row_code)
        X1_col_prime = self.decoder_col(col_code)

        # (rows, emb) @ (emb, cols) -> reconstructed (rows, cols) block.
        X1_prime = torch.mm(fused, torch.transpose(col_code, 0, 1))
        return X0_prime, X1_prime, fused, X1_row_prime, X1_col_prime
        


In [17]:
# Hyperparams
# -- training schedule --
epoch_count = 50                # number of passes train() makes over the data
convergence_threshold = 0.001   # loss-difference cut-off for train()'s early stop
batch_size = 50                 # slice length for both the row and column batches
# -- data dimensions --
book_size, word2vec_size = 5000, 100   # rows / columns of the matrix X1

In [18]:
# Size of the fused embedding. It is named here, where the model is built,
# instead of being passed as a bare literal, so that the model and any code that
# reads `embedding_dim` share one value.
embedding_dim = 50
net = Arch(embedding_dim, book_size, word2vec_size)
# All four reconstruction terms in train() use mean squared error.
criterion = nn.MSELoss()
# Plain SGD over every parameter of the combined model.
optimizer = torch.optim.SGD(net.parameters(), lr=0.01)

In [19]:
def test(sample):
    """Return the fused row embedding for one (image, matrix row) pair.

    Args:
        sample: sequence whose first item is a single image tensor and whose
            second item is the matching matrix row; each gets a batch
            dimension of 1 added before encoding.

    Returns:
        Output of `net.intermediate_lin1` for the concatenated image and row
        embeddings, with a leading batch dimension of 1.
    """
    conv_code = net.conv_encoder(sample[0].unsqueeze(0))
    row_code = net.encoder_row(sample[1].unsqueeze(0))
    fused_input = torch.cat((conv_code, row_code), dim=1)
    return net.intermediate_lin1(fused_input)

In [20]:
def get_embeddings():
    """Compute the fused embedding of every book and save them to disk.

    Row i of the saved array is `test([image_tensor[i], X1[i]])`. The result is
    written to "embeddings_alt_arch_with_decoder.npy"; nothing is returned.
    """
    # Take the embedding width from the model itself, so it always matches
    # `net` and does not depend on a separately defined global.
    emb_width = net.intermediate_lin1.out_features
    embeddings = np.empty((book_size, emb_width), float)
    # Inference only: skip building an autograd graph for each sample.
    with torch.no_grad():
        for i in range(book_size):
            sample = [image_tensor[i], X1[i]]
            # test() returns shape (1, emb_width); numpy drops the leading 1.
            embeddings[i] = test(sample).detach().numpy()
    np.save("embeddings_alt_arch_with_decoder.npy", embeddings)
    # return torch.Tensor(embeddings)

In [21]:
def train():
    """Train `net` on (image, matrix row, matrix column) mini-batches.

    For each block of `batch_size` rows, the matching images and rows of X1 are
    paired with each block of `batch_size` columns of X1 (taken as rows of
    X1_t). The loss is the sum of four MSE terms: image reconstruction, the
    rows-by-columns block of X1 rebuilt from the embeddings, row reconstruction
    and column reconstruction.

    The mean batch loss is printed every 10 epochs. Training stops early once
    at least `min_epochs_before_stop` epochs have run and the mean loss changed
    by less than `convergence_threshold` from the previous epoch.
    """
    # Early stopping is only considered after this many epochs. With
    # epoch_count <= 101 the check therefore never fires.
    min_epochs_before_stop = 100

    prev_losses = []  # mean batch loss of each finished epoch (plain floats)
    for epoch in range(epoch_count):
        total_loss = 0.0
        batches = 0
        for r_count in range(0, book_size, batch_size):
            matrix0 = image_tensor[r_count:r_count + batch_size]
            matrix1 = X1[r_count:r_count + batch_size]
            for c_count in range(0, word2vec_size, batch_size):
                matrix2 = X1_t[c_count:c_count + batch_size]
                # Call the module, not .forward(), so registered hooks run.
                m0_prime, m1_prime, _, X1_row_prime, X1_col_prime = net(
                    [matrix0, matrix1, matrix2]
                )
                # criterion(prediction, target) for each reconstruction.
                loss = (
                    criterion(m0_prime, matrix0)
                    + criterion(m1_prime, matrix1[:, c_count:c_count + batch_size])
                    + criterion(X1_row_prime, matrix1)
                    + criterion(X1_col_prime, matrix2)
                )
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                batches += 1

        per_epoch_loss = total_loss / batches
        if epoch % 10 == 0:
            print(f"Average loss for epoch {epoch} = {per_epoch_loss}")
        # Compare against the PREVIOUS epoch's mean, then record this epoch.
        if (
            epoch > min_epochs_before_stop
            and prev_losses
            and abs(prev_losses[-1] - per_epoch_loss) < convergence_threshold
        ):
            print('Convergence!')
            break
        prev_losses.append(per_epoch_loss)
        #torch.save(net.state_dict(), f"./model/alternate_arch_with_decoder_epoch{epoch}.pkl")
        #torch.save(net.state_dict(), f"./model/alternate_arch_with_decoder_epoch{epoch}.pkl")


In [22]:
# from google.colab import drive
# drive.mount('/content/drive')

In [23]:
# Run the training loop; it prints the average loss every 10 epochs.
train()

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Average loss for epoch 0 = 2.216674911379814
Average loss for epoch 10 = 1.9791384303569795
Average loss for epoch 20 = 1.8730588269233703
Average loss for epoch 30 = 1.8653783345222472
Average loss for epoch 40 = 1.8646367859840394


In [24]:
# Embedding width read by get_embeddings(); it must equal the first argument
# that was passed to Arch when `net` was created.
embedding_dim = 50

In [25]:
# Compute every book's embedding and write them to
# embeddings_alt_arch_with_decoder.npy.
get_embeddings()

In [26]:
# torch.save(emb, "embeddings_new_arch_2.pkl")

In [27]:
# from google.colab import files
# files.download('embeddings_new_arch_2.npy')