In [1]:
import json
import pandas as pd
import numpy as np
import pickle
import h5py
from collections import Counter
import nltk
#nltk.download('punkt') if needed
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, LSTM, Flatten, Embedding, Merge
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import base_filter
from keras.utils import np_utils
from keras.layers import Embedding
import scipy as sc
from keras.layers.core import Reshape

Using TensorFlow backend.


In [3]:
# Locations of the question / answer JSON files for the training and validation splits.
train_question_file = 'data/Q_T.json'
train_answer_file = 'data/A_T.json'
val_question_file = 'data/Q_V.json'
val_answer_file = 'data/A_V.json'

# Top-level JSON key that holds the records in each kind of file:
#   question files -> 'questions'
#   answer files   -> 'annotations'

def json_pandas(json_file,column):
    """Load one top-level entry of a JSON file into a DataFrame.

    json_file -- path of the JSON document to read
    column    -- top-level key whose value is handed to ``pd.DataFrame``

    Returns the resulting DataFrame.
    """
    with open(json_file) as handle:
        payload = json.load(handle)

    return pd.DataFrame(payload[column])

# Read the four raw tables: questions live under 'questions', answers under 'annotations'.
train_question, train_answer = (
    json_pandas(train_question_file, 'questions'),
    json_pandas(train_answer_file, 'annotations'),
)
test_question, test_answer = (
    json_pandas(val_question_file, 'questions'),
    json_pandas(val_answer_file, 'annotations'),
)

In [4]:
def clean_raw_data(ques_df,ans_df):
    """Join questions with their answers into one compact frame.

    ques_df -- question table; the columns at positions 0, 2 and 3 are kept
               and one of them must be named 'question_id' (it is dropped)
    ans_df  -- answer table; the column at position 3 becomes 'answer'
               (rows are matched to ``ques_df`` by index label)

    Returns a new DataFrame; neither input is modified.

    NOTE(review): columns are picked by position, so the result depends on
    the column order of the incoming frames — confirm against the JSON schema.
    """
    # `.iloc` replaces the removed `.ix` indexer; `.copy()` gives us our own
    # frame so adding a column below does not write into a view of ques_df.
    ques = ques_df.iloc[:, [0, 2, 3]].copy()
    ques['answer'] = ans_df.iloc[:, 3]
    return ques.drop(['question_id'], axis=1)

# Reduce each split to its image_id / question / answer columns.
train, test = (
    clean_raw_data(train_question, train_answer),
    clean_raw_data(test_question, test_answer),
)

In [6]:
# Cast the text columns to plain `str` before they are written to the store.
train['question'] = train['question'].astype(str)
train['answer'] = train['answer'].astype(str)
test['question'] = test['question'].astype(str)
test['answer'] = test['answer'].astype(str)

# Persist both splits in 'data.h5'. The context manager closes the store even
# when one of the writes raises, so the file handle is never left open.
with pd.HDFStore('data.h5') as hdf:
    hdf.put('train',train,format='table',data_columns=True)
    hdf.put('test',test,format='table',data_columns=True)

In [2]:
# Reload both splits from the HDF5 store written by this notebook.
test, train = pd.read_hdf('data.h5', key='test'), pd.read_hdf('data.h5', key='train')

In [3]:
def dataframe_list(df):
    """Split a cleaned frame into three parallel Python lists.

    Returns ``(questions, answers, image_ids)`` read from the ``question``,
    ``answer`` and ``image_id`` columns, in row order.
    """
    wanted = ('question', 'answer', 'image_id')
    ques_list, ans_list, image_list = [df[name].values.tolist() for name in wanted]
    return ques_list, ans_list, image_list

# Unpack the training frame into parallel question / answer / image-id lists.
train_q,train_a,train_img = dataframe_list(train)
# The same unpacking for the test split is currently disabled.
#test_q,test_a,test_img = dataframe_list(test)

In [9]:
def get_image_features(img_ids,vgg_model_path):
    """Collect one VGG feature vector per entry of ``img_ids``.

    img_ids        -- sequence of strings; each entry is split on whitespace
                      and read as ``"<image id> <feature column>"``
                      (NOTE(review): the notebook passes the ``image_id``
                      column values here — confirm they have this form)
    vgg_model_path -- path of a MATLAB ``.mat`` file whose ``'feats'`` variable
                      holds one feature vector per column

    Returns a float array of shape ``(len(img_ids), n_features)`` whose row
    ``j`` is the feature column registered for ``img_ids[j]``.
    """
    # Imported explicitly: `import scipy as sc` alone does not guarantee that
    # the `scipy.io` submodule has been loaded.
    from scipy.io import loadmat

    VGGfeatures = loadmat(vgg_model_path)['feats']

    # image id -> column of VGGfeatures
    id_map = {}
    for entry in img_ids:
        id_split = entry.split()
        id_map[id_split[0]] = int(id_split[1])

    nb_samples = len(img_ids)
    nb_dimensions = VGGfeatures.shape[0]
    image_matrix = np.zeros((nb_samples, nb_dimensions))
    for j, entry in enumerate(img_ids):
        # Look up by the same key the map was built with (the first token).
        image_matrix[j,:] = VGGfeatures[:,id_map[entry.split()[0]]]
    return image_matrix

# NOTE(review): the feature-file path passed here is an empty string and the
# result is not assigned to a name — supply the real .mat path before relying on this cell.
get_image_features(train_img,'')

In [4]:
def create_dic_question(ques_list,file_path='glove.840B.300d.txt',ques_size=25):
    """Build the word -> id vocabulary from a GloVe-style text file.

    ques_list -- accepted for call compatibility; not used
    file_path -- text file with one ``<word> <v1> ... <vd>`` entry per line
    ques_size -- accepted for call compatibility; not used

    Side effect: writes the vectors to dataset ``'matrix'`` of
    ``'embedded_matrix.h5'`` (overwriting it); row ``i`` is the vector of the
    word whose id is ``i``.

    Returns a dict mapping each word to its row id.
    Raises ValueError when the first entry carries no vector.

    NOTE(review): ids start at 0, the value get_ques_matrix also writes for
    padding and unknown words — confirm that overlap is acceptable.
    """
    embedding_index = {}
    dim = None
    with open(file_path) as f:
        for line in f:
            values = line.split()
            if not values:
                continue
            if dim is None:
                # The vector length is taken from the first entry.
                dim = len(values) - 1
                if dim < 1:
                    raise ValueError('no vector found on the first line of %s' % file_path)
            # Some tokens contain spaces themselves, so the vector is always
            # the trailing `dim` fields and whatever precedes it is the word.
            word = ' '.join(values[:-dim])
            coefs = np.asarray(values[-dim:], dtype='float32')
            embedding_index[word] = coefs

    # Freeze one ordering and use it for both the matrix rows and the ids.
    words = list(embedding_index)
    embedding_matrix = np.asarray([embedding_index[w] for w in words], dtype='float32')

    # The context manager closes the file even if the write fails.
    with h5py.File('embedded_matrix.h5', 'w') as hdf:
        hdf.create_dataset('matrix', data=embedding_matrix)

    return {word: ids for ids, word in enumerate(words)}

# Build the word -> id vocabulary from the default GloVe file; as a side effect
# this (re)writes 'embedded_matrix.h5'.
dic = create_dic_question(train_q)

In [7]:
# Save the vocabulary dict to 'emb_dic.pickle'.
with open('emb_dic.pickle', 'wb') as f:
    pickle.dump(dic, f)

In [4]:
# Reload the vocabulary dict from 'emb_dic.pickle'.
# Unpickling can execute arbitrary code, so only load a file this notebook produced.
with open('emb_dic.pickle', 'rb') as f:
    dic = pickle.load(f)

In [16]:
def create_ans_dic(train_a,max_val=1000):
    """Map the most frequent answer tokens to consecutive class ids.

    train_a -- iterable of answers; each is passed through ``str`` and
               ``nltk.word_tokenize`` and its tokens are lower-cased
    max_val -- keep at most this many tokens

    Returns a dict ``token -> id`` where id 0 is the most frequent token.
    An empty ``train_a`` yields an empty dict.

    NOTE(review): the keys are lower-cased *tokens*, while get_ans_matrix
    looks up whole, unmodified answer strings — multi-word or mixed-case
    answers will therefore never match. Confirm which granularity is intended.
    """
    freq = Counter()
    for line in train_a:
        for word in nltk.word_tokenize(str(line)):
            freq[word.lower()] += 1

    # Stable sort: tokens with equal counts keep their first-seen order.
    ranked = sorted(freq.items(), key=lambda item: item[1], reverse=True)[0:max_val]
    return {word: ids for ids, (word, _count) in enumerate(ranked)}

In [8]:
def get_ques_matrix(ques_list,dic,ques_size=25,tokenizer=None):
    """Encode questions as fixed-width rows of vocabulary ids.

    ques_list -- list of question strings
    dic       -- mapping ``token -> id``; tokens missing from it become 0
    ques_size -- row width
    tokenizer -- callable ``str -> list of tokens``; defaults to
                 ``nltk.word_tokenize``

    Returns a float array of shape ``(len(ques_list), ques_size)``. Tokens
    are right-aligned and the row is zero-filled on the left. A question with
    more than ``ques_size`` tokens keeps its last ``ques_size`` tokens.
    """
    if tokenizer is None:
        tokenizer = nltk.word_tokenize

    ques_matrix = np.zeros(shape=(len(ques_list),ques_size))
    if ques_size <= 0:
        # Nothing to fill (and `[-0:]` below would keep every token).
        return ques_matrix

    for i, question in enumerate(ques_list):
        # Truncating first keeps every index below inside the row; without it
        # an over-long question indexes before column 0 and raises.
        words = tokenizer(question)[-ques_size:]
        offset = ques_size - len(words)
        for j, word in enumerate(words):
            ques_matrix[i, offset + j] = dic.get(word, 0)

    return ques_matrix

In [9]:
# Encode every training question as one row of vocabulary ids (row width = the function's default ques_size).
X_train = get_ques_matrix(train_q,dic)

In [17]:
# Class ids for the most frequent answer tokens of the training set (size capped by the function's default max_val).
ans_dic = create_ans_dic(train_a)

In [56]:
def get_ans_matrix(ans_list,dic,max_value=1001):
    """One-hot encode answers, with a catch-all class in the last column.

    ans_list  -- list of answers
    dic       -- mapping ``answer -> class id``
    max_value -- number of columns; the last one (``max_value - 1``) is the
                 "other" class

    Returns a float array of shape ``(len(ans_list), max_value)`` with exactly
    one 1 per row. An answer that is missing from ``dic``, or whose id does
    not fit in the columns before the last one, is assigned to "other".

    NOTE(review): the lookup uses the answer string as-is, so it only matches
    keys spelled exactly the same way (case included).
    """
    ans_matrix = np.zeros(shape=(len(ans_list),max_value))
    other = max_value - 1

    # Every row is encoded (the loop used to stop after the first ten).
    for i, answer in enumerate(ans_list):
        idx = dic.get(answer, other)
        if not 0 <= idx < other:
            idx = other
        ans_matrix[i, idx] = 1
    return ans_matrix

In [57]:
# One-hot training targets built from the answer vocabulary (column count = the function's default max_value).
Y_train = get_ans_matrix(train_a,ans_dic)

In [None]:
# Settings read by tokeniz and by the model cell.
max_word = 1000  # how many answer tokens tokeniz keeps in 'answer' mode
max_seq = 20  # length questions are padded / truncated to in 'question' mode

def dataframe_list(df):
    """Split a cleaned frame into three parallel Python lists.

    Returns ``(questions, answers, image_ids)`` read from the ``question``,
    ``answer`` and ``image_id`` columns, in row order.

    The texts are returned unchanged: forcing them through
    ``.encode('ascii')`` raised on any non-ASCII character and produced
    ``bytes`` on Python 3.

    NOTE(review): this notebook defines ``dataframe_list`` in an earlier cell
    as well; this copy is kept behaviourally identical to it. Consider
    deleting one of the two.
    """
    ques_list = df.question.values.tolist()
    ans_list = df.answer.values.tolist()
    image_list = df.image_id.values.tolist()
    return ques_list,ans_list,image_list

# Unpack both splits into parallel question / answer / image-id lists.
train_q,train_a,train_img = dataframe_list(train)
test_q,test_a,test_img = dataframe_list(test)

def tokeniz(txt,mode='default',max_word_size=None):
    """Turn raw texts into model inputs with a Keras ``Tokenizer``.

    txt           -- list of strings
    mode          -- 'default'  : list of integer sequences, one per text
                     'question' : integer matrix, each row padded or
                                  truncated to ``max_seq``
                     'answer'   : one-hot matrix of shape
                                  ``(len(txt), max_word)``
    max_word_size -- vocabulary cap given to the tokenizer in 'question' mode

    Reads the notebook-level settings ``max_seq`` and ``max_word``.
    Raises ValueError for any other ``mode``.

    NOTE(review): a new tokenizer is fitted on ``txt`` at every call, so the
    indices produced by separate calls (e.g. train vs. test) are unrelated.
    NOTE(review): in 'answer' mode the classes are single tokens; an answer
    that is not exactly one of the kept tokens gets an all-zero row. Confirm
    whether whole answers and an explicit "other" class are wanted instead.
    """
    # Modes are compared by value; `is` tests object identity and only
    # happens to work for some string literals.
    if mode == 'default':
        tokenizer = Tokenizer(nb_words=None, filters=base_filter(), lower=True, split=" ")
        tokenizer.fit_on_texts(txt)
        return tokenizer.texts_to_sequences(txt)

    if mode == 'question':
        tokenizer = Tokenizer(nb_words=max_word_size, filters=base_filter(), lower=True, split=" ")
        tokenizer.fit_on_texts(txt)
        sequences = tokenizer.texts_to_sequences(txt)
        return pad_sequences(sequences, maxlen=max_seq)

    if mode == 'answer':
        tokenizer = Tokenizer(nb_words=None, filters=base_filter(), lower=True, split=" ")
        tokenizer.fit_on_texts(txt)
        # word_index maps token -> rank with 1 = most frequent, so the most
        # common tokens are the ones with the smallest rank.
        ranked = sorted(tokenizer.word_index.items(), key=lambda item: item[1])[0:max_word]
        column_of = {token: col for col, (token, _rank) in enumerate(ranked)}

        data = np.zeros(shape=(len(txt), max_word))
        for i, answer in enumerate(txt):
            col = column_of.get(answer)
            if col is not None:
                data[i, col] = 1
        return data

    raise ValueError("unknown mode: %r (expected 'default', 'question' or 'answer')" % (mode,))

# Encode questions and answers of both splits.
# NOTE(review): 'question' mode is called without max_word_size, so the tokenizer vocabulary is not capped.
# NOTE(review): tokeniz fits a new Tokenizer on the texts it receives, so the train and test
# encodings below do not share one index space.
X_train = tokeniz(train_q,'question')
Y_train = tokeniz(train_a,'answer')
X_test = tokeniz(test_q,'question')
Y_test = tokeniz(test_a,'answer')

In [None]:
def get_image_labels(pickle_file_path):
    """Return the ``image_id`` column of a pickled DataFrame.

    pickle_file_path -- path handed to ``pd.read_pickle``

    Returns a 2-D array with a single column (one row per record).
    """
    return pd.read_pickle(pickle_file_path)[['image_id']].values



# Image features for both splits.
# NOTE(review): get_image_features is declared in this notebook with two required
# parameters (img_ids, vgg_model_path); these calls pass only the image ids.
X_train_img = get_image_features(train_img)
X_test_img = get_image_features(test_img)

In [None]:
embed_dim = 300

# Pickles are binary, so the files are opened in 'rb' mode and the open handle
# `f` is what gets unpickled.
# Unpickling can execute arbitrary code — only load files from a trusted source.
with open('embeddings/embedding_matrix','rb') as f:
    embedding = pickle.load(f)
with open('embeddings/word_idx','rb') as f:
    word_idx = pickle.load(f)

# NOTE(review): `word_index` is not assigned at notebook level in the cells
# shown here (tokeniz only keeps it as a local) — it must be provided before
# this cell can run.
# One row per tokenizer index, plus row 0.
embedding_matrix = np.zeros(shape=(len(word_index)+1, embed_dim))
for word,idx in word_index.items():
    # Words without a pretrained vector keep their all-zero row.
    if word in word_idx:
        embedding_matrix[idx] = embedding[word_idx[word]]

# Model

In [None]:
# NOTE(review): `dropout_rate` was used without ever being defined; 0.5 is a
# placeholder so the cell can run — set the intended value.
dropout_rate = 0.5

# Image branch placed before the question: 4096-d features projected to the
# embedding width and reshaped into a one-step sequence so that it can be
# concatenated with the word sequence along the time axis.
left_vgg = Sequential()
left_vgg.add(Dense(embed_dim, input_dim=4096, activation='relu'))
left_vgg.add(Reshape((1, embed_dim)))

# Question branch: frozen pretrained embeddings, max_seq steps.
centre_w2vec = Sequential()
embedding_layer = Embedding(len(word_index) + 1,embed_dim,weights=[embedding_matrix],input_length=max_seq,trainable=False)
centre_w2vec.add(embedding_layer)

# Image branch placed after the question (same layout as the first one).
right_vgg = Sequential()
right_vgg.add(Dense(embed_dim, input_dim=4096, activation='relu'))
right_vgg.add(Reshape((1, embed_dim)))

# Concatenate along the time axis: 1 + max_seq + 1 steps of width embed_dim.
merge_layer = Merge([left_vgg,centre_w2vec,right_vgg], mode='concat', concat_axis=1)

lstm_model = Sequential()
lstm_model.add(merge_layer)
lstm_model.add(Dropout(dropout_rate))
lstm_model.add(LSTM(1000, input_shape=(1+max_seq+1,embed_dim)))
lstm_model.add(Dense(1000, activation='softmax'))

lstm_model.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['accuracy'])

In [None]:
# Train on [image, question, image] inputs — the same order as the branches merged
# into lstm_model — leaving every other fit() setting at its default.
lstm_model.fit([X_train_img, X_train, X_train_img], Y_train)

In [2]:
# All-zero 400 x 500 array bound to `embedding_matrix`
# (presumably a stand-in so create_network can be built without the real embeddings — confirm).
embedding_matrix = np.zeros(shape=(400,500))

In [20]:
# Show the second dimension of embedding_matrix (used as the embedding width by create_network).
embedding_matrix.shape[1]

500

In [22]:
def create_network():
    """Assemble the image + question LSTM classifier and return it uncompiled.

    Reads the notebook-level ``embedding_matrix``: its first dimension is the
    vocabulary size and its second the embedding width. The question branch
    embeds 25 word ids with frozen weights; the image branch projects a
    4096-d vector to the embedding width and reshapes it to a single step.
    Both are concatenated along axis 1 and fed to an LSTM followed by a
    1001-way softmax.
    """
    vocab_rows, embed_width = embedding_matrix.shape[0], embedding_matrix.shape[1]

    # Question branch
    question_branch = Sequential()
    question_branch.add(
        Embedding(
            vocab_rows,
            embed_width,
            weights=[embedding_matrix],
            input_length=25,
            trainable=False,
        )
    )

    # Image branch
    picture_branch = Sequential()
    picture_branch.add(Dense(embed_width, input_dim=4096, activation='linear'))
    picture_branch.add(Reshape((1, embed_width)))

    # Image step first, then the question steps
    network = Sequential()
    network.add(Merge([picture_branch, question_branch], mode='concat', concat_axis=1))
    network.add(LSTM(1001))
    network.add(Dense(1001, activation='softmax'))
    return network

In [23]:
# Build the network; create_network returns it without calling compile().
model = create_network()

In [16]:
from keras.utils.visualize_util import plot

In [19]:
# Write a diagram of the model to 'model1.png', with show_shapes switched on.
plot(model, to_file='model1.png',show_shapes=True)

# ROUGH WORK

In [6]:
cd ..

/home/tron/Desktop/VQA


In [4]:
ls

A_T.json  A_V.json  Q_T.json  Q_V.json
