In [1]:
import os
from util.text2vec import get_text_encoder

# Root of the VisualSearch data tree, located under the user's home directory.
rootpath = os.path.join(os.environ['HOME'], 'VisualSearch')

# Text encoder style and the word2vec data it is initialised with.
w2v_style = "word2vec_filterstop"
w2v_data_path = os.path.join(rootpath, "word2vec", 'flickr', 'vec500flickr30m')

# get_text_encoder returns a factory; calling it with the data path builds the encoder.
text_encoder_factory = get_text_encoder(w2v_style)
w2v2vec = text_encoder_factory(w2v_data_path)

Using TensorFlow backend.


10/12/2019 12:44:45 INFO [util/text2vec.pyc.AveWord2VecFilterStop] initializing ...
[BigFile] 1743364x500 instances loaded from /home/oleh/VisualSearch/word2vec/flickr/vec500flickr30m


In [2]:
%%time

def get_text_features(filepath):
    """Parse a caption file into a list of per-caption feature records.

    Every line is split as ``<line_id> <text>`` on the first space, and
    ``line_id`` is split on ``#`` into exactly three parts, of which the first
    (image id) and the last (caption number) are kept.

    The text is encoded with the notebook-level ``w2v2vec`` encoder; when
    ``w2v2vec.mapping`` returns ``None`` a zero vector of length
    ``w2v2vec.ndims`` is used instead.

    Returns a list of dicts with the keys ``"img_id"``, ``"num"`` and
    ``"features"``, in file order. A line that does not match the expected
    layout raises ``ValueError`` from the tuple unpacking.
    """
    parsed_text = []

    with open(filepath) as caption_file:
        for raw_line in caption_file:
            line_id, text = raw_line.rstrip().split(" ", 1)

            text_features = w2v2vec.mapping(text)
            if text_features is None:
                # Fall back to an all-zero vector when the encoder yields nothing.
                text_features = [0] * w2v2vec.ndims

            img_id, _, num = line_id.split("#")
            record = {
                "img_id": img_id,
                "num": num,
                "features": text_features
            }
            parsed_text.append(record)

    return parsed_text

# Caption files of the test / val / train collections. The paths are derived from
# `rootpath` (defined in the first cell as $HOME/VisualSearch) instead of a
# hard-coded absolute home directory, so the cell also runs on other machines.
caption_collections = ['data_w2vvtest', 'data_w2vvval', 'data_w2vvtrain']
filepath = [os.path.join(rootpath, collection, 'TextData', '%s.caption.txt' % collection)
            for collection in caption_collections]

# Concatenate the parsed records of all collections, in the order listed above.
text_features = []
for f in filepath:
    text_features += get_text_features(f)

CPU times: user 2min 11s, sys: 6.23 s, total: 2min 18s
Wall time: 2min 18s


In [3]:
from sklearn.model_selection import train_test_split
from util.losser import get_losser
import numpy as np


# Partition the caption records by their `num` tag: records tagged "1" become the
# queries (X), records tagged "0" become the retrieval candidates (y).
X = np.array([entry for entry in text_features if entry['num'] == "1"])
y = np.array([entry for entry in text_features if entry['num'] == "0"])

# Hold out 1000 query/candidate pairs with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=1000,
    random_state=1234,
)

# Feature matrices: the held-out queries are compared against ALL candidates in `y`
# (not only `y_test`).
X_test_features = np.array([entry["features"] for entry in X_test])
y_features = np.array([entry["features"] for entry in y])

cosine_losser_factory = get_losser("cosine")
losser = cosine_losser_factory()

In [4]:
similarity = np.array(losser.calculate(X_test_features, y_features))
similarity.shape

(1000, 45643)

In [5]:
# For every held-out query keep the image ids of the 10 candidates with the
# smallest score (argsort is ascending).
predictions = []
for scores in similarity:
    nearest_candidates = y[scores.argsort()[:10]]
    predictions.append([candidate["img_id"] for candidate in nearest_candidates])

In [7]:
def list2str(l):
    """Return the items of `l` converted with str() and joined by single spaces."""
    return " ".join([str(item) for item in l])

# One line per held-out query: "<query img_id>#<space-separated predicted img_ids>".
with open('text_similiarity_result.txt', 'w') as f:
    for i, ranked_ids in enumerate(predictions):
        query_img_id = X_test[i]["img_id"]
        f.write('%s#%s\n' % (query_img_id, list2str(ranked_ids)))

# Word2VisualVec for Sentence Representation

This note answers the following two questions:
1. How to load a trained Word2VisualVec model?
2. How to predict visual features from a new sentence?

## 0. Setup

Use the following script to download and extract a Word2VisualVec model trained on flickr30k.
To download the dataset, please refer to [the required-data instructions](https://github.com/danieljf24/w2vv#required-data).


```shell
ROOTPATH=$HOME/trained_w2vv_model
mkdir -p $ROOTPATH && cd $ROOTPATH

# download and extract the pre-trained model
wget http://lixirong.net/data/w2vv-tmm2018/flickr30k_trained_model.tar.gz
tar zxf flickr30k_trained_model.tar.gz
```

In [1]:
import os
import keras
from basic.common import readPkl
from w2vv_pred import W2VV_MS_pred, pred_mutual_error_ms
from util.text import encode_text
from util.text2vec import get_text_encoder
from util.util import readImgSents 
from simpleknn.bigfile import BigFile
from util.losser import get_losser
from util.evaluation import i2t

Using TensorFlow backend.


In [2]:
# Switch between the pre-trained flickr30k model and the locally trained one.
use_flickr = False

if use_flickr:
    model_name = "flickr30k_trained_model"
    trainCollection = "flickr30kenctrain"
else:
    # Alternative local model used previously: "1000chars_description_trained_model"
    model_name = "14_epoch_1000_descr"
    trainCollection = 'data_w2vvtrain'

testCollection = 'data_w2vvtest'

## 1. Load a trained Word2VisualVec model

In [3]:
# The trained model lives in $HOME/trained_w2vv_model/<model_name>.
home_dir = os.environ['HOME']
model_path = os.path.join(home_dir, 'trained_w2vv_model/' + model_name)

# Both files are handed to W2VV_MS_pred, which builds the predictor from them.
abs_model_path = os.path.join(model_path, 'model.json')
weight_path = os.path.join(model_path, 'best_model.h5')
predictor = W2VV_MS_pred(abs_model_path, weight_path)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
10/12/2019 14:08:51 INFO [w2vv_pred.pyc.W2VV_MS_pred] loaded a trained Word2VisualVec model successfully


## 2. Precision of prediction on test dataset

In [6]:
# setup multi-scale sentence vectorization
# NOTE(review): readPkl presumably unpickles the file - only load option.pkl from a trusted source.
opt = readPkl(os.path.join(model_path, 'option.pkl'))
# Optional override, currently disabled: opt.n_caption = 2

rootpath = os.path.join(os.environ['HOME'], 'VisualSearch')

# opt.text_style holds the three encoder styles joined by '@'.
rnn_style, bow_style, w2v_style = opt.text_style.strip().split('@')

vocabulary_root = os.path.join(rootpath, trainCollection, "TextData", "vocabulary")
text_data_path = os.path.join(vocabulary_root, "bow", opt.rnn_vocab)
bow_data_path = os.path.join(vocabulary_root, bow_style, opt.bow_vocab)
w2v_data_path = os.path.join(rootpath, "word2vec", opt.corpus, opt.word2vec)

# One encoder per text style; each factory is called with its data path.
text2vec = get_text_encoder(rnn_style)(text_data_path)
bow2vec = get_text_encoder(bow_style)(bow_data_path)
w2v2vec = get_text_encoder(w2v_style)(w2v_data_path)

# similarity function
losser = get_losser(opt.simi_fun)()

10/12/2019 14:08:58 INFO [util/text2vec.pyc.Index2Vec] initializing ...
10/12/2019 14:08:58 INFO [util/text2vec.pyc.BoW2VecFilterStop] initializing ...
10/12/2019 14:08:58 INFO [util/text2vec.pyc.BoW2VecFilterStop] 50105 words
10/12/2019 14:08:58 INFO [util/text2vec.pyc.AveWord2VecFilterStop] initializing ...
[BigFile] 1743364x500 instances loaded from /home/oleh/VisualSearch/word2vec/flickr/vec500flickr30m


In [6]:
# img2vec: image features of the test collection
img_feats_path = os.path.join(rootpath, testCollection, 'FeatureData', opt.img_feature)
img_feats = BigFile(img_feats_path)

# Captions of the test collection and their pairwise errors against the images.
test_sent_file = os.path.join(rootpath, testCollection, 'TextData','%s.caption.txt' % testCollection)
img_list, sents_id, sents = readImgSents(test_sent_file)
all_errors = pred_mutual_error_ms(img_list, sents, predictor, text2vec, bow2vec, w2v2vec, img_feats, losser, opt=opt)


# compute performance
# NOTE(review): the five values are presumably recall@1/5/10, median rank and mean rank -
# confirm against util.evaluation.i2t.
(r1i, r5i, r10i, medri, meanri) = i2t(all_errors, n_caption=opt.n_caption)
# Parenthesised single-argument print: identical output on Python 2, also valid on
# Python 3, and consistent with the print(...) calls used in the other cells.
print("Image to text: %.1f, %.1f, %.1f, %.1f, %.1f" % (r1i, r5i, r10i, medri, meanri))

[BigFile] 45643x2048 instances loaded from /home/oleh/VisualSearch/data_w2vvtest/FeatureData/pyresnet152-pool5os
embedding all sentences ...
len(text_batch) 2000
len(text_batch[0]) 50605
len(text_rnn_batch) 2000

embedding all images ...
matching image and text ...
(2000, 1000)
Image to text: 22.6, 42.8, 52.6, 9.0, 60.7


Image to text(flickr run) : 45.6, 72.1, 81.5, 2.0, 13.3

Image to text(up to 1000 words, vocab from flickr): 1.2, 3.3, 6.4, 115.0, 122.5

Image to text(entire article, flickr vocab): 0.4, 2.1, 3.7, 362.0, 405.4

## 3. Specific Output Example

### Read Data

In [7]:
import numpy as np
import string
import json
import shutil

import os
from os import listdir, mkdir
from os.path import isfile, isdir, join, exists, abspath
from keras.preprocessing import image
from IPython.display import display, Image
import regex as re

In [8]:
def _remove_punctuation(text):
    return re.sub(ur"\p{P}+", "", text)

def _getJSON(path):
    """Load a doubly-encoded JSON file and return the inner decoded object.

    The file content is decoded once with json.load; the result is then decoded
    a second time with json.loads, so the file must hold a JSON string whose
    value is itself a JSON document.
    """
    with open(path) as json_file:
        encoded_document = json.load(json_file)
    return json.loads(encoded_document)

def _getTextFeatures(text_path):
    """Load an article's text JSON and replace its "text" with a cleaned excerpt.

    Newlines are turned into spaces and punctuation is removed; the result is cut
    to its first 1000 characters and everything after the last space of that cut
    is dropped. The loaded dict is returned with "text" overwritten.
    """
    data = _getJSON(text_path)

    single_line = data['text'].replace("\n", " ")
    excerpt = _remove_punctuation(single_line)[:1000]

    # TODO (onyshchak): only the first 1000 characters are used; a proper summary
    # still needs to be extracted.
    # NOTE: the rsplit drops the last space-separated token even when the text is
    # shorter than 1000 characters.
    data["text"] = excerpt.rsplit(' ', 1)[0]
    return data

def _getImagesMeta(path):
    """Return the value stored under 'img_meta' in the JSON file at `path`."""
    meta_document = _getJSON(path)
    return meta_document['img_meta']

def _getValidImagePaths(article_path):
    """List the paths of the regular files in ``<article_path>/img/`` ending in ".jpg".

    The extension check is case-insensitive. Entries are returned in the order
    produced by listdir.
    """
    img_path = join(article_path, 'img/')

    valid_paths = []
    for entry in listdir(img_path):
        candidate = join(img_path, entry)
        if not isfile(candidate):
            continue
        if entry[-4:].lower() == ".jpg":
            valid_paths.append(candidate)
    return valid_paths

def _dump(path, data):
    """Write `data` to `path` as UTF-8 encoded JSON, indented by 2, keeping non-ASCII text.

    The builtin open() of Python 2 (which the rest of this notebook targets) does
    not accept an `encoding` argument, so the file is opened through io.open,
    which takes it on both Python 2 and Python 3.
    """
    import io  # local import so the helper does not depend on another cell

    payload = json.dumps(data, indent=2, ensure_ascii=False)
    if isinstance(payload, bytes):
        # On Python 2 json.dumps may return a byte string; io.open in text mode
        # only accepts unicode.
        payload = payload.decode('utf8')

    with io.open(path, 'w', encoding='utf8') as outfile:
        outfile.write(payload)

def GetArticleData(article_path):
    """Load one article directory into a dict.

    The cleaned text record comes from ``<article_path>/text.json``; the image
    metadata from ``<article_path>/img/meta.json`` is stored under the "img" key.
    """
    text_file = join(article_path, 'text.json')
    meta_file = join(article_path, 'img/', 'meta.json')

    article_data = _getTextFeatures(text_file)
    article_data["img"] = _getImagesMeta(meta_file)
    return article_data

def ReadArticles(data_path, offset=0, limit=None):
    """Read the article sub-directories of `data_path` into a list of article dicts.

    Parameters:
        data_path: directory whose immediate sub-directories each hold one article.
        offset: index of the first article directory to read.
        limit: maximum number of articles to read; a falsy value (None or 0)
            means "everything from `offset` to the end".

    Returns:
        A list with one GetArticleData(...) result per directory that was read.

    Unlike the previous version, a `limit` reaching past the last directory is
    clamped instead of raising IndexError, and the progress messages report the
    number of articles actually read.

    NOTE: the directories are visited in the order returned by listdir, which
    is not guaranteed to be stable across machines; code that addresses
    articles by position depends on that order.
    """
    print("Reading in progress...")
    article_paths = [join(data_path, f) for f in listdir(data_path) if isdir(join(data_path, f))]

    if limit:
        end = min(offset + limit, len(article_paths))
    else:
        end = len(article_paths)

    articles = []
    for i in range(offset, end):
        articles.append(GetArticleData(article_paths[i]))
        # Single pre-formatted string: prints the same text on Python 2 and 3.
        if len(articles) % 250 == 0:
            print("%d articles have been read" % len(articles))

    print("%d articles have been read" % len(articles))
    return articles

In [9]:
import random
import hashlib
from urllib import quote

random.seed(1234)

def get_matched_article_id(img_features, articles):
    """Randomly probe up to 50 articles for one whose own image is ranked in the top 10.

    For each probe a random index into `articles` is drawn, the article text is
    encoded and passed through the predictor, and the predicted vector is scored
    against `img_features`. The 10 lowest-scoring entries of the notebook-level
    `images` array are checked against the article's own image filenames.

    Returns the index of the first article with such a hit, or -1 when none of
    the 50 probes succeeds. Relies on the notebook globals `opt`, `text2vec`,
    `bow2vec`, `w2v2vec`, `predictor`, `losser` and `images`.
    """
    for _ in range(50):
        i = int(random.random() * len(articles))
        page = articles[i]
        print(i, page['title'])

        rnn_vec, bow_w2v_vec = encode_text(opt, text2vec, bow2vec, w2v2vec, page["text"])
        predicted_features = predictor.predict_one(rnn_vec, bow_w2v_vec).reshape(1, -1)

        similarity = np.array(losser.calculate(predicted_features, img_features)[0])
        own_filenames = [img_meta["filename"] for img_meta in page["img"]]

        top_ranked = images[similarity.argsort()[:10]]
        for candidate in top_ranked:
            if candidate["filename"] not in own_filenames:
                continue
            print("FOUND", candidate["filename"])
            print(candidate["url"])
            return i

    return -1

def get_url(img_title, size=600):
    """Build a Wikimedia Commons thumbnail URL for an image title.

    Parameters:
        img_title: image title; double quotes are removed and each of the
            characters ``: * ? / \\`` and space is replaced by an underscore.
        size: thumbnail width in pixels, used in the ``<size>px-`` prefix.

    Returns:
        ``<prefix>/<h>/<hh>/<name>/<size>px-<name>`` where ``h`` and ``hh`` are
        the first one and two hex digits of the MD5 of the UTF-8 encoded name
        and ``<name>`` is URL-quoted. ".jpg" is appended unless the URL already
        ends in ".jpg" or "jpeg".

    The extension test is case-insensitive, so a title ending in ".JPG" or
    ".JPEG" no longer gets a second ".jpg" suffix; this matches the
    case-insensitive ".jpg" check in _getValidImagePaths.
    """
    img_name = img_title.replace("\"", "")
    for forbidden in ':*?/\\ ':
        img_name = img_name.replace(forbidden, '_')

    # The hash is taken over the UTF-8 bytes of the sanitised (unquoted) name.
    img_name = img_name.encode('utf-8')
    url_prefix = "https://upload.wikimedia.org/wikipedia/commons/thumb/"
    md5 = hashlib.md5(img_name).hexdigest()
    sep = "/"

    img_name = quote(img_name)
    url = url_prefix + sep.join((md5[0], md5[:2], img_name)) + sep + str(size) + "px-" + img_name
    if url[-4:].lower() not in (".jpg", "jpeg"):
        url += ".jpg"

    return url

In [10]:
%%time
articles = ReadArticles('../data/', offset=0, limit=None)

Reading in progress...
(250, 'articles have been read')
(501, 'articles have been read')
(752, 'articles have been read')
(1003, 'articles have been read')
(1254, 'articles have been read')
(1505, 'articles have been read')
(1756, 'articles have been read')
(2007, 'articles have been read')
(2258, 'articles have been read')
(2509, 'articles have been read')
(2760, 'articles have been read')
(3011, 'articles have been read')
(3262, 'articles have been read')
(3513, 'articles have been read')
(3764, 'articles have been read')
(4015, 'articles have been read')
(4266, 'articles have been read')
(4517, 'articles have been read')
(4768, 'articles have been read')
(5019, 'articles have been read')
(5270, 'articles have been read')
(5521, 'articles have been read')
(5638, 'articles have been read')
CPU times: user 51.4 s, sys: 6.05 s, total: 57.5 s
Wall time: 1min 2s


In [17]:
# Index every image of every article by filename (a later duplicate overwrites an
# earlier one), then keep only the entries that carry a "features" field.
images_by_filename = {}
for article in articles:
    for img_meta in article['img']:
        images_by_filename[img_meta["filename"]] = img_meta

images = np.array([img_meta for img_meta in images_by_filename.values() if "features" in img_meta])

### Text Similarity Model

In [11]:
# Read back the text-similarity results: every line is "<label>#<space-separated predictions>".
y = []
predictions = []
with open("text_similiarity_result.txt") as f:
    for result_line in f:
        label, ranked = result_line.rstrip().split("#")
        y.append(label)
        predictions.append(ranked.split())

In [13]:
def get_precision(predictions, labels):
    """Return the hit rates (in percent) of `labels` within the ranked `predictions`.

    For every index i, labels[i] is looked up in predictions[i]. Three counters
    are kept: the label is the first entry, the label is among the first three
    entries, and the label occurs anywhere in the list.

    Returns a float32 array ``[top1, top3, anywhere]``: the counts divided by
    len(predictions) and multiplied by 100.
    """
    hits_first = hits_top3 = hits_any = 0

    for position, ranked in enumerate(predictions):
        target = labels[position]
        hits_first += int(target == ranked[0])
        hits_top3 += int(target in ranked[:3])
        hits_any += int(target in ranked)

    counts = np.array([hits_first, hits_top3, hits_any], dtype=np.float32)
    return counts / len(predictions) * 100

get_precision(predictions, y)

array([ 2.3,  7.8, 17.7], dtype=float32)

In [14]:
def get_article_by_img(img_id):
    """Return the first article owning an image whose filename contains `img_id`.

    Searches the notebook-level `articles` list; raises IndexError when no
    article matches.
    """
    matches = [
        article
        for article in articles
        for img_meta in article['img']
        if img_id in img_meta["filename"]
    ]
    return matches[0]

# Article whose label sits at position `article_index` of the text-similarity results.
article_index = 17
a = get_article_by_img(y[article_index])

In [15]:
for x in a["img"]:
    img_url = get_url(x['title'])
    display(Image(url=img_url))

In [18]:
def get_id(filename):
    """Return `filename` without its last dot-separated suffix."""
    return filename.rsplit('.', 1)[0]

# Show the images predicted for the selected article (in `images` order, not rank order).
pred = predictions[article_index]
pred_img = [img_meta for img_meta in images if get_id(img_meta['filename']) in pred]
for img_meta in pred_img:
    display(Image(url=get_url(img_meta['title'])))

In [19]:
a2 = get_article_by_img("eea20277ca3be6faca8b1e90b8364294")
a2["text"]

u'John Sidney McCain III August 29 1936\xa0 August 25 2018 was an American politician and military officer who served as a United States senator from Arizona from January 1987 until his death in 2018 He previously served two terms in the United States House of Representatives and was the Republican nominee for president of the United States in the 2008 election which he lost to Barack Obama  McCain graduated from the United States Naval Academy in 1958 and received a commission in the United States Navy He became a naval aviator and flew groundattack aircraft from aircraft carriers During the Vietnam War he almost died in the 1967 USS Forrestal fire While on a bombing mission during Operation Rolling Thunder over Hanoi in October 1967 he was shot down seriously injured and captured by the North Vietnamese He was a prisoner of war until 1973 He experienced episodes of torture and refused an outofsequence early release During the war he sustained wounds that left him with lifelong physic

In [20]:
for x in a2["img"]:
    img_url = get_url(x['title'])
    display(Image(url=img_url))

In [21]:
pred = predictions[y.index("eea20277ca3be6faca8b1e90b8364294")]
pred_img = [i for i in images if get_id(i['filename']) in pred]
for x in pred_img:
    img_url = get_url(x['title'])
    display(Image(url=img_url))

In [22]:
del y
del predictions
del pred_img
del pred

### Well-performing case of 'Maserati MC12' article

In [23]:
img_features = np.array([x["features"] for x in images], dtype=np.float32)

In [24]:
# Position in `articles` of the article examined below. It is hard-coded in place of
# the commented-out search call, so it is only meaningful while `articles` keeps the
# same order (ReadArticles builds that list from a listdir call).
matched_article_id = 837  # get_matched_article_id(img_features, articles)

In [25]:
# page = [x for x in articles if x["title"] == "Barack Obama"][0]
page = articles[matched_article_id]
text = page["text"]
# text = page["img"][1]["description"]
print(text)
rnn_vec, bow_w2v_vec = encode_text(opt, text2vec, bow2vec, w2v2vec, text)
predicted_features = predictor.predict_one(rnn_vec, bow_w2v_vec).reshape(1, -1)
predicted_features

The Maserati MC12 Tipo M144S is a limited production twoseater sports car produced by Italian car maker Maserati to allow a racing variant to compete in the FIA GT Championship The car entered production in 2004 with 25 cars produced A further 25 were produced in 2005 making a total of 50 cars available for customers each of which was presold for €600000 US$670541 With the addition of 12 cars produced for racing only a total of 62 of these cars were ever produced  Maserati designed and built the car on the chassis of the Enzo Ferrari but the final car is much larger and has a lower drag coefficient The MC12 is longer wider and taller and has a sharper nose and smoother curves than the Enzo Ferrari which has faster acceleration better braking performance shorter braking distance and a higher top speed The top speed of the Maserati MC12 is 330 kilometres per hour 205 mph whereas the top speed of the Enzo Ferrari is 350 kilometres per hour 2175 mph  The MC12 was developed to signal



array([[10.244272 ,  2.7473078,  6.2928286, ...,  4.8201237,  7.270011 ,
         6.8462605]], dtype=float32)

In [26]:
similarity = np.array(losser.calculate(predicted_features, img_features)[0])
# res = res + 1
similarity

array([-0.70040362, -0.59022799, -0.74309799, ..., -0.74211699,
       -0.65287192, -0.81330535])

Double-checking that `similarity` and `img_features` have the same order

In [27]:
print(similarity[:3])
print(similarity[-3:])

[-0.70040362 -0.59022799 -0.74309799]
[-0.74211699 -0.65287192 -0.81330535]


In [28]:
print(losser.calculate(img_features[:3], predicted_features))
print(losser.calculate(img_features[-3:], predicted_features))

[[-0.7004036208993722], [-0.590227985478179], [-0.7430979903591116]]
[[-0.7421169916424962], [-0.6528719174246803], [-0.8133053530987427]]


Double-checking that `images` and `img_features` have the same order

In [18]:
def get_features(img):
    """Return the "features" entry of an image record as a float32 array."""
    return np.array(img['features']).astype(np.float32)

# True only when every row of `img_features` equals the features of the image at the same index.
all((get_features(img_meta) == img_features[i]).all() for i, img_meta in enumerate(images))

True

* 1 Double-check that we have the same order, because the similarities are very large and the results are bad.
* 5 If that does not help, train on a single image per article (the most relevant one).
* 4 Finish the text2text similarity (the lowest priority).
* 2 Identify an article with high precision and check its images (is it for real?).
* 3 Check that we get the same precision.

In [29]:
min(similarity), max(similarity)

(-0.9000764798330435, -0.19678824772487769)

In [30]:
page[u"title"]

u'Maserati MC12'

Real images on `Maserati MC12` Wikipedia page

In [31]:
for x in page["img"]:
    img_url = get_url(x['title'])
    display(Image(url=img_url))

Top-10 ranked images predicted by the model for `Maserati MC12` page

In [32]:
print(similarity[similarity.argsort()[:10]])

[-0.90007648 -0.89766622 -0.89473127 -0.89458867 -0.89275377 -0.89187519
 -0.89154553 -0.89129715 -0.89044287 -0.8899288 ]


In [33]:
for x in images[similarity.argsort()[:10]]:
    img_url = get_url(x['title'])
    display(Image(url=img_url))

Note: for this article, performance is poor when the descriptions of its images are used as the input instead of the article text.

### Random Article Performance

In [34]:
def wiki_predict(page, topK=10):
    """Predict images for an article and return (true image URLs, predicted image URLs).

    The article text is encoded and passed through the predictor; the predicted
    vector is scored against the notebook-level `img_features`, and the `topK`
    entries of `images` with the smallest scores are taken as the prediction.
    Both URL lists are built with get_url from the image titles.

    Relies on the notebook globals `opt`, `text2vec`, `bow2vec`, `w2v2vec`,
    `predictor`, `losser`, `img_features` and `images`.
    """
    rnn_vec, bow_w2v_vec = encode_text(opt, text2vec, bow2vec, w2v2vec, page["text"])
    predicted_features = predictor.predict_one(rnn_vec, bow_w2v_vec).reshape(1, -1)

    scores = losser.calculate(predicted_features, img_features)[0]
    ranking = np.array(scores).argsort()

    true_img_url = [get_url(img_meta["title"]) for img_meta in page["img"]]
    pred_img_url = [get_url(img_meta["title"]) for img_meta in images[ranking[:topK]]]

    return true_img_url, pred_img_url

In [36]:
obama_page = [x for x in articles if x["title"] == "Barack Obama"][0]
obama_page["text"]

u'Barack Hussein Obama II  January 20 2009 born August 4 1961 is an American attorney and politician who served as the 44th president of the United States from 2009 to 2017 A member of the Democratic Party he was the first African American to be elected to the presidency He previously served as a US senator from Illinois from 2005 to 2008 and an Illinois state senator from 1997 to 2004  Obama was born in Honolulu Hawaii After graduating from Columbia University in 1983 he worked as a community organizer in Chicago In 1988 he enrolled in Harvard Law School where he was the first black president of the Harvard Law Review After graduating he became a civil rights attorney and an academic teaching constitutional law at the University of Chicago Law School from 1992 to 2004 He represented the 13th district for three terms in the Illinois Senate from 1997 until 2004 when he ran for the US Senate He received national attention in 2004 with his March primary win his wellreceived July Democrati

In [35]:
true_img_url, pred_img_url = wiki_predict(obama_page)

# Images predicted by the model for the Barack Obama article
for predicted_url in pred_img_url:
    display(Image(url=predicted_url))

**TODO:** Make sure you are checking examples from the **test** subset

## 4. Predict visual features of a novel sentence

In [26]:
# Encode a free-form sentence and predict its visual feature vector.
sent = 'a dog is playing with a cat'
rnn_vec, bow_w2v_vec = encode_text(opt, text2vec, bow2vec, w2v2vec, sent)
predicted_text_feat = predictor.predict_one(rnn_vec, bow_w2v_vec)

# Parenthesised single-argument print: same output on Python 2, also valid on
# Python 3, and consistent with the print(...) calls in the other cells.
print(len(predicted_text_feat))
print(predicted_text_feat)

2048
[ 6.1648006  5.2095037  8.546985  ...  7.519165  10.404728   7.7131257]
