In [1]:
from sentence_transformers import SentenceTransformer, util
import json
import torch
import numpy as np
import random
from tqdm.notebook import tqdm
# import os
%config Completer.use_jedi = False
# os.environ["CUDA_VISIBLE_DEVICES"] = str(0)

In [2]:
# Load the sentence encoder on the GPU when one is present, otherwise on the CPU.
# An unconditional `.cuda()` raises on CPU-only machines.
model = SentenceTransformer(
    'multi-qa-MiniLM-L6-cos-v1',
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [29]:
def getCaptionLabels(json_dict):
    """Return parallel sequences of image ids, captions and category labels.

    `json_dict["annotations"]` supplies `image_id -> category` and
    `json_dict["metadata"]` supplies `image_id -> caption`. One
    (id, caption, label) triple is built per annotated image id, so an
    annotated id without a metadata entry raises KeyError.

    Returns a `zip` iterator that unpacks into three tuples:
    `(image_ids, captions, labels)`.
    """
    label_of = dict((ann["image_id"], ann["category"]) for ann in json_dict["annotations"])
    caption_of = dict((meta["image_id"], meta["caption"]) for meta in json_dict["metadata"])

    triples = []
    for image_id, label in label_of.items():
        triples.append((image_id, caption_of[image_id], label))

    # Transpose the list of triples into three parallel tuples.
    return zip(*triples)

In [54]:
## load captions + labels from the GeoImNet metadata file
# (GeoPlaces alternative: "/home/tarun/metadata/geoPlaces_metadata.json")
# Use a context manager so the file handle is closed once the JSON is parsed.
with open("/home/tarun/metadata/geoImnet_metadata.json", encoding="utf-8") as metadata_file:
    geop = json.load(metadata_file)

In [55]:
# Pull the two training splits and unpack each into parallel
# (image ids, captions, labels) tuples.
usa_train, asia_train = geop["usa_train"], geop["asia_train"]
usa_fids, usa_captions, usa_labels = getCaptionLabels(usa_train)
asia_fids, asia_captions, asia_labels = getCaptionLabels(asia_train)

In [69]:
# Lookup table: integer category id -> human-readable category name.
id_to_name = dict(
    (int(category["category_id"]), category["category_name"])
    for category in geop["categories"]
)

In [58]:
# Eyeball 10 random USA training examples as "<image id>:<category name>:<caption>".
annotations = dict((ann["image_id"], ann["category"]) for ann in usa_train["annotations"])
captions = dict((im["image_id"], im["caption"]) for im in usa_train["metadata"])

flickrIds = list(annotations)

for j in range(10):
    fid = random.choice(flickrIds)
    print(f"{fid}:{id_to_name[annotations[fid]]}:{captions[fid]}")

14716148186:palace:Fantasmic! at Disney's Hollywood Studios
8568509223:jean:Outgrowing my Pants
855424645:dik-dik:Günther's Dik-dik
3601624346:promontory:2009-05_sf_visit-02
899731661:ammunition:Spent Shotgun Shells
7717612526:control_center:Launch Control Center
2787788207:leafhopper:A colorful red-banded leafhopper
6358711985:tramway:Queen Street, east from James Street
6087478847:control_center:TWiT Control Center
8670554260:marigold:Marigolds


In [59]:
# Embed the USA training captions with the sentence-transformer model.
usa_caption_embeddings = model.encode(usa_captions)

In [60]:
# Embed the Asia training captions with the sentence-transformer model.
asia_caption_embeddings = model.encode(asia_captions)

In [61]:
def _unit_rows(embed):
    """Return `embed` as a float tensor whose rows are L2-normalised (1-D input becomes one row)."""
    rows = torch.as_tensor(embed).float()
    if rows.dim() == 1:
        rows = rows.unsqueeze(0)
    return torch.nn.functional.normalize(rows, p=2, dim=1)


def get_similarity_acc(source_embed, target_embed, source_label, target_label, within=False, topks=(1, 5)):
    """Label agreement between each source embedding and its most cosine-similar candidates.

    Args:
        source_embed: query embeddings, array-like or tensor of shape (n_source, dim).
        target_embed: candidate embeddings, shape (n_target, dim). Ignored when
            `within=True`.
        source_label: integer label per source row.
        target_label: integer label per target row. Ignored when `within=True`,
            because the neighbours are then rows of the source set.
        within: if True, search neighbours inside the source set itself and
            exclude every row's own entry.
        topks: iterable of k values; only `max(topks)` is used for the top-k figure.

    Returns:
        `(top1_acc, topk_acc)` as 0-dim tensors: the fraction of source rows whose
        nearest neighbour shares their label, and the fraction with at least one
        label match among the `max(topks)` nearest neighbours.

    Raises:
        ValueError: if there is no candidate to compare against.
    """
    topk = max(topks)

    queries = _unit_rows(source_embed)
    if within:
        candidates, candidate_label = queries, source_label
    else:
        candidates = _unit_rows(target_embed).to(queries.device)
        candidate_label = target_label

    # Rows are unit length, so the dot product is the cosine similarity.
    similarity_matrix = queries @ candidates.t()

    if within:
        # Mask the self-match explicitly. Dropping the first top-k column is not
        # reliable: with duplicate embeddings the row itself may rank second and
        # would then be counted as a (trivially correct) neighbour.
        similarity_matrix.fill_diagonal_(float("-inf"))

    n_candidates = similarity_matrix.shape[1] - (1 if within else 0)
    if n_candidates < 1:
        raise ValueError("need at least one candidate embedding to compare against")
    # Asking for more neighbours than exist would make topk() raise.
    topk = min(topk, n_candidates)

    mostSimilar = similarity_matrix.topk(topk, dim=1).indices.cpu()

    # as_tensor keeps integer labels as integers (torch.Tensor would cast to float32).
    candidate_label = torch.as_tensor(candidate_label).cpu()
    source_label = torch.as_tensor(source_label).cpu().view(-1, 1)

    # (n_source, topk) neighbour labels, broadcast against the (n_source, 1) query labels.
    matched_labels = candidate_label[mostSimilar] == source_label

    n_queries = source_label.shape[0]
    top1_acc = matched_labels[:, 0].sum() / n_queries
    topk_acc = matched_labels.any(1).sum() / n_queries

    return top1_acc, topk_acc

In [62]:
# USA captions as queries, Asia captions as candidates -> (top-1, top-k) label-match rate.
get_similarity_acc(usa_caption_embeddings, asia_caption_embeddings, usa_labels, asia_labels)

(tensor(0.4303), tensor(0.5031))

In [63]:
# Reverse direction: Asia captions as queries, USA captions as candidates.
get_similarity_acc(asia_caption_embeddings, usa_caption_embeddings, asia_labels, usa_labels)

(tensor(0.4758), tensor(0.5380))

In [64]:
# Within-domain reference: each USA caption is matched against the other USA captions.
get_similarity_acc(usa_caption_embeddings, usa_caption_embeddings, usa_labels, usa_labels, within=True)

(tensor(0.7101), tensor(0.7818))

In [65]:
# Within-domain reference: each Asia caption is matched against the other Asia captions.
get_similarity_acc(asia_caption_embeddings, asia_caption_embeddings, asia_labels, asia_labels, within=True)

(tensor(0.7500), tensor(0.8121))

In [66]:
# Inspect the dimensions of the USA caption embedding matrix.
usa_caption_embeddings.shape

(154908, 384)

In [67]:
# Cosine similarity of every USA caption (rows) against every Asia caption (columns).
similarity_matrix = util.cos_sim(usa_caption_embeddings, asia_caption_embeddings)

# For each USA row keep the column index of its single most similar Asia caption
# (first column of the top-2 result).
mostSimilar = similarity_matrix.topk(2, 1).indices[:, 0]

In [79]:
# Label of the nearest Asia caption for every USA caption.
asia_label_tensor = torch.Tensor(asia_labels)
mostSimilar_labels = asia_label_tensor[mostSimilar.long()]

# Boolean mask: True where the nearest Asia caption carries the same label as the USA caption.
usa_label_tensor = torch.Tensor(usa_labels)
equal = torch.Tensor(mostSimilar_labels) == usa_label_tensor

# Fraction of USA captions whose nearest Asia caption has a matching label.
equal.sum() / len(usa_labels)

tensor(0.4333)

In [93]:
# NOTE: despite the name, this holds the row indices where `equal` is False,
# i.e. the USA captions whose nearest Asia caption has a *different* label (`~equal`).
equal_ids = torch.where((~equal).float())[0]

In [94]:
# Print 10 randomly drawn label mismatches as
# "<usa id>:<usa category>:<usa caption> <=> <asia id>:<asia category>:<asia caption>".
for j in np.random.choice(equal_ids, 10):
    asia_id = mostSimilar[j]
    usa_side = f"{usa_fids[j]}:{id_to_name[usa_labels[j]]}:{usa_captions[j]}"
    asia_side = f"{asia_fids[asia_id]}:{id_to_name[asia_labels[asia_id]]}:{asia_captions[asia_id]}"
    print(f"{usa_side} <=> {asia_side}\n")

11819430984:traffic_light:Bicycle signs and pavement markings don't match, Pennsylvania Avenue, Washington, DC USA <=> 17341121131:street_sign:Street signs

9425801330:bikini:All Women Lifeguard Tournament 2013 <=> 177367307:billboard:Nike woman

13340337515:beacon:New Jersey's "Big Ass" Lightbulb... <=> 26803144466:bulbul:The lovely bulbul....

10849409385:warplane:Color of NAS Whidbey Island's A-6E Intruder & EA-6B Prowler Gate Guards & Night <=> 8440422131:picket_fence:SDIM1263_5  Dom 5, Village Skrepyaschevo (Скрепящево).   Purple garden gate in picket fence.

2640339648:tender:Tender boat "E-Z Rider" at Scituate Harbor <=> 6194147253:bucket:Sailboat

14108517504:gold_plate:Monmouth University, West Long Branch, New Jersey <=> 8894954789:fortress:The NW Tower

15824366586:cemetery:96.LeonardMatlovich.CongressionalCemetery.WDC.11November2014 <=> 10477340126:fountain:0808 Kerman - Rayen - 239

4969483171:sand:Exodus stretches (and soccer!) <=> 8700098374:african_elephant:Elephant exo

In [None]:
# Print 10 random USA captions next to their nearest Asia caption.
# `mostSimilar` has one entry per USA caption, so sample indices from the real
# number of USA captions; a hard-coded bound larger than that raises IndexError.
for j in np.random.choice(len(usa_fids), 10):
    asia_id = int(mostSimilar[j])
    usa_side = "{}:{}:{}".format(usa_fids[j], id_to_name[usa_labels[j]], usa_captions[j])
    asia_side = "{}:{}:{}".format(asia_fids[asia_id], id_to_name[asia_labels[asia_id]], asia_captions[asia_id])
    print("{} <=> {}\n".format(usa_side, asia_side))

In [None]:
# Print 20 random Asia captions next to their nearest *other* Asia caption.
# `mostSimilar` was computed with USA captions as rows, so it cannot be indexed
# with an Asia row; compute the within-Asia nearest neighbours here instead.
asia_similarity = util.cos_sim(asia_caption_embeddings, asia_caption_embeddings)
asia_similarity.fill_diagonal_(float("-inf"))  # never pair a caption with itself
asia_mostSimilar = asia_similarity.argmax(dim=1)
del asia_similarity  # the full similarity matrix is large; only the indices are needed

# Sample from the real number of Asia captions rather than a hard-coded bound.
for j in np.random.choice(len(asia_fids), 20):
    neighbour_id = int(asia_mostSimilar[j])
    query_side = "{}:{}:{}".format(asia_fids[j], id_to_name[asia_labels[j]], asia_captions[j])
    neighbour_side = "{}:{}:{}".format(
        asia_fids[neighbour_id], id_to_name[asia_labels[neighbour_id]], asia_captions[neighbour_id]
    )
    print("{} <=> {}\n".format(query_side, neighbour_side))

## Train city prediction model using caption input

In [None]:
from sentence_transformers import SentenceTransformer, util
import json
import torch
import numpy as np
import pandas as pd
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
def getCaptionLabels(json_dict, usa=True):
    """Map every image filename in `json_dict` to one of its two BLIP captions.

    `json_dict["images"]` supplies `id -> filename`; `json_dict["metadata"]`
    supplies, per `image_id`, the captions `blip_cap_1` and `blip_cap_2`, one of
    which is picked at random. An image without a metadata entry raises KeyError.

    The `usa` argument is accepted but not used.

    Returns a dict `{filename: caption}`.
    """
    name_of = dict((img["id"], img["filename"]) for img in json_dict["images"])

    # One random draw per metadata entry, in file order.
    caption_of = dict(
        (meta["image_id"], random.choice([meta["blip_cap_1"], meta["blip_cap_2"]]))
        for meta in json_dict["metadata"]
    )

    return {fname: caption_of[image_id] for image_id, fname in name_of.items()}

In [None]:
# Load the GeoPlaces split file; the context manager closes the handle after parsing.
with open("/newfoundland2/tarun/datasets/Places205/data/vision/torralba/deeplearning/GeoDA/geoPlaces.json", encoding="utf-8") as geoplaces_file:
    geop = json.load(geoplaces_file)

In [None]:
# Start the filename -> caption table from the two training splits
# (USA entries first, then Asia entries layered on top).
usa_train, asia_train = geop["usa_train"], geop["asia_train"]
fn2cap = {**getCaptionLabels(usa_train), **getCaptionLabels(asia_train)}

In [None]:
# Extend the filename -> caption table with both test splits.
# (A stray trailing character on the last call made this cell a SyntaxError.)
usa_test = geop["usa_test"]
fn2cap.update(getCaptionLabels(usa_test))

asia_test = geop["asia_test"]
fn2cap.update(getCaptionLabels(asia_test))

In [None]:
# Load the sentence encoder on the GPU when one is present, otherwise on the CPU.
# An unconditional `.cuda()` raises on CPU-only machines.
model = SentenceTransformer(
    'multi-qa-MiniLM-L6-cos-v1',
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [None]:
# Space-separated, header-less table read with pandas in the next cell:
# column 0 is used as the filename key, column 1 as the label.
datafile = "../geoData/places205/latlon_train_cities.txt"

In [None]:
# Read the two-column table and build filename -> label.
latlon = pd.read_csv(datafile, header=None, sep=' ').to_dict('list')
fn2label = dict(zip(latlon[0], latlon[1]))

# Pair each file's caption with its label. A filename that has no caption in
# fn2cap raises KeyError.
cap2label = [(fn2cap[fn], label) for fn, label in tqdm(list(fn2label.items()))]

# Sentence embeddings are the features; the tqdm bar here only tracks building
# the caption list that is handed to the encoder.
features = model.encode([pair[0] for pair in tqdm(cap2label)])
labels = [pair[1] for pair in cap2label]

In [None]:
# Hold out 25% of the examples (fixed seed) and keep the label lists as numpy arrays.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)
y_train, y_test = np.array(y_train), np.array(y_test)

In [None]:
# Sanity-check the sizes of the train / held-out features and labels.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Multinomial logistic regression on the caption embeddings.
clf = LogisticRegression(
    random_state=0,
    multi_class="multinomial",
    max_iter=500,
    verbose=10,
)
clf = clf.fit(X_train, y_train)

In [None]:
# latlon = pd.read_csv(datafile.replace("train", "test"), header=None, sep=' ').to_dict('list')
# fn2label = dict(zip(latlon[0], latlon[1]))

# cap2label = []
# for fn in tqdm(list(fn2label.keys())):
#     if fn in fn2cap:
#         cap2label.append((fn2cap[fn], fn2label[fn]))

# features_test = model.encode([c[0] for c in cap2label])
# labels_test = [c[1] for c in cap2label]

In [None]:
# Raw decision scores of the fitted classifier on the held-out split;
# fed to accuracy() as its `output` argument further down.
predictions = clf.decision_function(X_test)

In [None]:
# Evaluate the fitted classifier on the held-out split.
clf.score(X_test, y_test)

In [None]:
# Top-1 / top-5 accuracy from the decision scores.
# NOTE(review): `accuracy` is defined in the cell *below* this one, so on a fresh
# kernel this cell raises NameError unless that definition cell is run first.
accuracy(torch.Tensor(predictions), torch.Tensor(y_test).view(-1))

In [None]:
def accuracy(output, target, topk=(1,5)):
    """Top-k accuracy, in percent, for each k in `topk`.

    Args:
        output: score tensor with one row per sample and one column per class.
        target: tensor holding the true class index of each sample.
        topk: iterable of k values to report.

    Returns:
        A list of Python floats, one per k, giving the percentage of samples
        whose true class is among the k highest-scoring classes.
    """
    largest_k = max(topk)
    n_samples = target.size(0)

    # Indices of the `largest_k` best classes per sample, best first, transposed
    # so that row r holds every sample's rank-r prediction.
    ranked = output.topk(largest_k, dim=1, largest=True, sorted=True).indices.t()
    hits = ranked.eq(target.view(1, -1).expand_as(ranked))

    percent_per_hit = 100.0 / n_samples
    # A sample counts for k when any of its first k ranked predictions is a hit.
    return [hits[:k].float().sum().mul_(percent_per_hit).item() for k in topk]