In [16]:
import os
import json
import time
import numpy as np
from PIL import Image
from shutil import copyfile

In [17]:
def read_cap(filepath):
    """Parse a caption token file into a dict of image id -> list of captions.

    Each non-blank line is expected to look like
    ``<image id>#<caption index>\\t<caption text>``. Only the part of the
    first field before ``#`` is used as the key, so all captions of the
    same image end up in one list, in file order.

    Args:
        filepath: Path of the token file to read.

    Returns:
        dict mapping each image id (str) to a list of caption strings.

    Raises:
        IndexError: If a non-blank line contains no tab separator.
    """
    cap_dict = {}
    with open(filepath) as cap:
        for line in cap:
            # Strip only the line terminator. Slicing with [:-1] would cut
            # off the last character of the final caption whenever the file
            # does not end with a newline.
            line = line.rstrip('\n')
            if not line.strip():
                continue  # tolerate blank lines (e.g. at the end of the file)
            line_split = line.split('\t', 1)  # split on the first tab only
            caption = line_split[1]
            imgid = line_split[0].split('#', 1)[0]
            cap_dict.setdefault(imgid, []).append(caption)
    return cap_dict


In [18]:
# Build the path of the caption token file inside the caption folder.
dir_cap = 'f8k_cap'
filename_token = 'Flickr8k.token.txt'
filepath_token = os.path.join(dir_cap, filename_token)
# Parse the token file into {image id: [captions]} using read_cap.
cap_dict = read_cap(filepath_token)

In [21]:
# Spot-check: show the captions collected for a single image id.
cap_dict['1000268201_693b08cb0e.jpg']

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

In [22]:
def get_id(filepath):
    """Read a split file and return the ids it lists, one per line.

    Args:
        filepath: Path of a text file containing one id per line.

    Returns:
        list of id strings in file order; blank lines are skipped.
    """
    ids = []
    with open(filepath) as file:
        for line in file:
            # Remove only the newline. Slicing with [:-1] would chop the
            # last character off the final id when the file has no trailing
            # newline.
            image_id = line.rstrip('\n')
            if image_id.strip():
                ids.append(image_id)
    return ids

In [35]:
def copyfiles(dir_output, dir_input, ids):
    """Copy the files named in ``ids`` from ``dir_input`` into ``dir_output``.

    Args:
        dir_output: Destination directory; created (with parents) when it
            does not exist yet.
        dir_input: Directory that currently holds the files.
        ids: Iterable of file names, relative to ``dir_input``.
    """
    target_missing = not os.path.exists(dir_output)
    if target_missing:
        os.makedirs(dir_output)
    for name in ids:
        # File contents are copied under the same name in the destination.
        copyfile(os.path.join(dir_input, name), os.path.join(dir_output, name))
            

In [37]:
def write_cap(dir_output, ids, cap_dict):
    """Write the captions of the given ids to ``<dir_output>/captions.txt``.

    The file contains one JSON object per line, each of the form
    ``{"<id>": [<caption>, ...]}``, which is the format ``load_cap`` parses.

    Args:
        dir_output: Existing directory that receives ``captions.txt``.
        ids: Iterable of image ids to export, in output order.
        cap_dict: Mapping of image id -> list of captions.

    Raises:
        KeyError: If an id is missing from ``cap_dict``.
    """
    path_out = os.path.join(dir_output, 'captions.txt')
    # Serialize the {id: captions} mapping. Dumping only the id would leave
    # the file without any captions and unreadable by load_cap.
    output = [json.dumps({i: cap_dict[i]}) for i in ids]
    with open(path_out, mode='w') as file:
        file.write('\n'.join(output))
            

In [5]:
def load_cap(cap_dir):
    """Load ``<cap_dir>/captions.txt`` into a single dict.

    Every non-blank line must be a JSON object; the objects are merged in
    file order, so a key that appears again later overrides the earlier one.

    Args:
        cap_dir: Directory that contains ``captions.txt``.

    Returns:
        dict with the merged key/value pairs of all lines.
    """
    # Separate names for the path and the handle; the same name was
    # previously rebound from a str to a file object.
    cap_path = os.path.join(cap_dir, 'captions.txt')
    cap_dict = {}
    with open(cap_path) as cap_file:
        for line in cap_file:
            if not line.strip():
                continue  # json.loads would raise on an empty line
            cap_dict.update(json.loads(line))
    return cap_dict

In [34]:
def re_allocate(dir_img, token, cap_path):
    """Sort a mixed image folder into train/val/test folders with captions.

    For each split, the ids listed in the split file are read, the matching
    files are copied from ``dir_img`` into the split's output folder
    (``train``, ``val`` or ``test``, relative to the working directory), and
    a ``captions.txt`` with that split's captions is written next to them.

    Args:
        dir_img: Directory holding the images of all splits.
        token: Path of the caption token file (parsed with ``read_cap``).
        cap_path: Mapping with the keys ``'train'``, ``'val'`` and
            ``'test'``, each giving the path of that split's id list file.

    Raises:
        FileNotFoundError: If ``dir_img`` is not an existing directory.
    """
    dir_out = {'train': 'train', 'val': 'val', 'test': 'test'}
    splits = ('train', 'val', 'test')

    cap_dict = read_cap(token)  # image id -> captions

    # Fail before creating any output. This replaces an unused
    # os.listdir(dir_img) call that read the whole directory only to
    # discard the result.
    if not os.path.isdir(dir_img):
        raise FileNotFoundError('image directory not found: {!r}'.format(dir_img))

    # Same phase order as before: read all id lists, copy all images, then
    # write all caption files.
    split_ids = {split: get_id(cap_path[split]) for split in splits}

    for split in splits:
        copyfiles(dir_out[split], dir_img, split_ids[split])

    for split in splits:
        write_cap(dir_out[split], split_ids[split], cap_dict)

In [38]:
# Input locations: one folder with all images, one with the text files.
dir_img='f8k_img'
dir_text='f8k_text'
# File names of the caption token file and the per-split id lists.
file_token='Flickr8k.token.txt'
file_train='Flickr_8k.trainImages.txt'
file_val='Flickr_8k.devImages.txt'
file_test='Flickr_8k.testImages.txt'
# NOTE(review): an earlier cell built filepath_token under 'f8k_cap'; this
# rebinds it under 'f8k_text' — confirm which folder holds the token file.
filepath_token=os.path.join(dir_text, file_token)

# Split name -> path of the file listing that split's image ids.
cap_path={'train': os.path.join(dir_text, file_train), 'val': os.path.join(dir_text, file_val), 
         'test': os.path.join(dir_text, file_test)}

# Copy the images into train/val/test folders and write each captions.txt.
re_allocate(dir_img, filepath_token, cap_path)


In [1]:
import os
import torch
import numpy as np
from PIL import Image
import torch.nn as nn
from torch.autograd import Variable
import torchvision.models as models
import torchvision.transforms as transforms


class Alexnet(nn.Module):
    """Image encoder: torchvision AlexNet with a replaced final layer.

    The network is created with ``pretrained=True`` and the classifier
    entry at index 6 is swapped for a new linear layer that outputs
    ``embedding_dim`` values. The new layer is also reachable as
    ``self.linear``.
    """

    def __init__(self, embedding_dim=512):
        """Build the network.

        Args:
            embedding_dim: Output size of the replacement linear layer.
        """
        super().__init__()
        self.alexnet = models.alexnet(pretrained=True)
        replaced_layer = self.alexnet.classifier[6]
        self.linear = nn.Linear(replaced_layer.in_features, embedding_dim)
        self.alexnet.classifier[6] = self.linear
        self.init_weights()

    def init_weights(self):
        """Initialise the new layer: weights ~ N(0, 0.02), bias = 0."""
        head = self.linear
        head.weight.data.normal_(0.0, 0.02)
        head.bias.data.fill_(0)

    def forward(self, images):
        """Run ``images`` through the modified AlexNet and return its output.

        Assumes ``images`` is a batch in the layout torchvision's AlexNet
        expects — TODO confirm against the caller.
        """
        return self.alexnet(images)

In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable


class RNN(nn.Module):
    """Caption decoder: word embedding -> LSTM -> linear layer over the vocabulary."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        """Create the layers.

        Args:
            embedding_dim: Size of each word embedding and the LSTM input size.
            hidden_dim: Size of the LSTM hidden state.
            vocab_size: Number of embedding rows and of output scores per step.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.linear = nn.Linear(hidden_dim, vocab_size)
        self.init_weights()

    def init_weights(self):
        """Draw embedding and output weights from U(-0.1, 0.1); zero the output bias."""
        for layer in (self.word_embeddings, self.linear):
            layer.weight.data.uniform_(-0.1, 0.1)
        self.linear.bias.data.fill_(0)

    def forward(self, features, caption):
        """Score the vocabulary at every step of one caption.

        ``features`` is concatenated in front of the embedded caption along
        dim 0 and the result is fed to the LSTM with a singleton dim inserted
        at position 1.

        Assumes ``features`` is (1, embedding_dim) and ``caption`` is a 1-D
        tensor of word indices — TODO confirm against the caller.

        Returns:
            Tensor with ``len(caption) + 1`` rows of vocabulary scores.
        """
        word_vectors = self.word_embeddings(caption)
        sequence = torch.cat((features, word_vectors), 0)
        hidden_states, _ = self.lstm(sequence.unsqueeze(1))
        n_steps = len(caption) + 1
        return self.linear(hidden_states.view(n_steps, -1))

    # Disabled draft of greedy decoding, kept as an inert string literal.
    '''def greedy(self, cnn_out, seq_len = 20):
        ip = cnn_out
        hidden = None
        ids_list = []
        for t in range(seq_len):
            lstm_out, hidden = self.lstm(ip.unsqueeze(1), hidden)
            # generating single word at a time
            linear_out = self.linear(lstm_out.squeeze(1))
            word_caption = linear_out.max(dim=1)[1]
            ids_list.append(word_caption)
            ip = self.word_embeddings(word_caption)
        return ids_list'''