# Init

This file contains code to compute the image concreteness. The concreteness has two versions:

- Entropy version
    - Entropy I: Compute the entropy of label_id distribution.
    - Entropy II: 
    
- MNI version (based on the shared paper)

- Distance-based

- Dictionary-based (sum up the concreteness score of all object names in the image)

# Init

This file contains code to compute the image concreteness. The concreteness has two versions:

- Entropy version
    - Entropy I: Compute the entropy of label_id distribution.
    - Entropy II: 
    
- MNI version (based on the shared paper)

- Distance-based

- Dictionary-based (sum up the concreteness score of all object names in the image)

# MNI concreteness

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import os
import PIL
import pyarrow.feather as feather
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
import torchmetrics

from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision.datasets import ImageFolder
from torchvision.models import resnext101_32x8d
from torchvision.transforms import Compose, ToTensor, Normalize, Resize, ToPILImage, CenterCrop, RandomResizedCrop, RandomHorizontalFlip
from tqdm.auto import tqdm

pl.seed_everything(42)

wdir = '/home/yu/OneDrive/Construal'
IMG_DIR = f'{wdir}/data/kickstarter-data'
MODEL_DIR = f'{wdir}/pretrained_models/mmdetection'

os.chdir(wdir)

Global seed set to 42


42

## get representation

For every project image, use a backbone to generate a representation for it.

In [11]:
%%capture

# Get the `pid` of all projects of all projects
pjson = feather.read_feather(f'{WORK_DIR}/data/pjson.feather')
pjson = dt.Frame(pjson)
pids = pjson[(f.category=='Product Design') | (f.category=='Accessories'), f.pid].to_list()[0]

In [12]:
# define dataset
class ReprDataset(Dataset):
    def __init__(self, pids):
        # check all pids exists
        pids = [pid for pid in pids
                if os.path.exists(f'{IMG_DIR}/{pid}/profile_full.jpg')]
        self.pids = pids
        self.transform = Compose([Resize(256),
                                  CenterCrop(224),
                                  ToTensor(),
                                  Normalize(mean=[0.485, 0.456, 0.406],
                                            std=[0.229, 0.224, 0.225])])

    def __len__(self):
        return len(self.pids)

    def __getitem__(self, idx):
        pid = self.pids[idx]
        img_path = f'{IMG_DIR}/{pid}/profile_full.jpg'
        with PIL.Image.open(img_path) as img:
            img = img.convert('RGB')
            img = self.transform(img)
        return pid, img

# load and freeze model     
model = resnext101_32x8d(pretrained=True)   
model.fc = nn.Identity()

device = 'cuda:1'
model.to(device)

# make dataset/dataloader
ds = ReprDataset(pids)
dl = DataLoader(ds, shuffle=False, batch_size=32, drop_last=False)
print(f'{len(dl)=}')
results = {}

# run!
with torch.no_grad():
    for i, (pid, img) in enumerate(tqdm(dl)):
        img = img.to(device)
        img_repr = model(img)

        for p, r in zip(pid, img_repr):
            results[p] = r

torch.save(results, f'{wdir}/data/obj_repr.pt')

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1

len(dl)=118


  0%|          | 0/118 [00:00<?, ?it/s]



## get label-level MNI

In [2]:
# make text description from label name (R)
setwd('/home/yu/OneDrive/Construal')
ld('df_objdet', force=T)
ld('lvis_dist', force=T)

label_id_name_link = lvis_dist[, .(label_id=id, label_name=name)] %>% unique()

project_label_names = df_objdet[label_id_name_link, on=.(label_id)
      ][prob>=0.5, 
        .(label_name, label_id),
        keyby=.(pid)] %>% unique()

sv(project_label_names)

ERROR: Error in ld("df_objdet", force = T): unused argument (force = T)


In [2]:
# Load image reprs into an annoy tree
reprs = torch.load(f'{wdir}/data/obj_repr.pt')
valid_pids = list(reprs.keys())

# load image reprs into an annoy tree
from annoy import AnnoyIndex

t = AnnoyIndex(2048, 'angular')  # Length of item vector that will be indexed

pid2id = {}
for i, (pid, vec) in enumerate(reprs.items()): 
    # create a map from label_id to an int
    pid2id[pid] = i

    # add to annoy tree
    vec = vec.cpu().numpy()
    t.add_item(i, vec)
    
t.build(1000)

True

In [3]:
# build a set for unique objects
ld('project_label_names', 'text')
text = text[[f.pid == p for p in valid_pids],:]
W = dt.unique(text['label_name']).to_list()[0]

# set k
k = 25

def make_mni(k, t):
    mni_dict = {}

    for w in tqdm(W):
        Vw = text[f.label_name==w, [f.pid]]
        Vw = dt.unique(Vw).to_list()[0]
        Vw = [pid2id[pid] for pid in Vw]
        Vw = set(Vw)
        
        a = 0
        for v in Vw:
            NN_v = set(t.get_nns_by_item(v, k)) - set([v])
            a += len(Vw.intersection(NN_v))

        mni = a/len(Vw)
        adj_mni = mni/(len(Vw)*k)*819

        mni_dict[w] = adj_mni

    frame = dt.Frame(label=list(mni_dict.keys()), mni=list(mni_dict.values()))[:,:,dt.sort(-f.mni)]

    return frame

dt_k10 = make_mni(10, t)
dt_k25 = make_mni(25, t)
dt_k50 = make_mni(50, t)
dt_k100 = make_mni(100, t)

sv('dt_k10')
sv('dt_k25')
sv('dt_k50')
sv('dt_k100')

"project_label_names.feather" (231.7 KB) loaded as "text" (<1s) (2021-10-18 3:41 PM)


100%|██████████| 561/561 [01:04<00:00,  8.67it/s]
100%|██████████| 561/561 [01:11<00:00,  7.89it/s]
100%|██████████| 561/561 [01:18<00:00,  7.11it/s]
100%|██████████| 561/561 [01:25<00:00,  6.55it/s]

"dt_k10" saved as "dt_k10.feather" (9.5 KB) (<1s) (2021-10-18 3:47 PM)
"dt_k25" saved as "dt_k25.feather" (9.7 KB) (<1s) (2021-10-18 3:47 PM)
"dt_k50" saved as "dt_k50.feather" (9.8 KB) (<1s) (2021-10-18 3:47 PM)
"dt_k100" saved as "dt_k100.feather" (10.0 KB) (<1s) (2021-10-18 3:47 PM)





In [1]:
ld(pid_weighted_mni_entropy, path='/home/yu/OneDrive/Construal/data/sharing')
pid_weighted_mni_entropy

ERROR: Error in ld(pid_weighted_mni_entropy, path = "/home/yu/OneDrive/Construal/data/sharing"): Cannot find "pid_weighted_mni_entropy" with possible extensions ("rds", "feather")


In [1]:
# R

setwd('/home/yu/OneDrive/Construal')
ld(dt_k10)
ld(dt_k25)
ld(dt_k50)
ld(dt_k100)

setnames(dt_k10, 'mni', 'mni_k10')
setnames(dt_k25, 'mni', 'mni_k25')
setnames(dt_k50, 'mni', 'mni_k50')
setnames(dt_k100, 'mni', 'mni_k100')

mni = dt_k10[dt_k25, on='label'
    ][dt_k50, on='label'
    ][dt_k100, on='label']
sv(mni)

"dt_k10.feather" (9.5 KB) loaded (0.02 secs) (2021-10-18 3:48 PM)
"dt_k25.feather" (9.7 KB) loaded (0 secs) (2021-10-18 3:48 PM)
"dt_k50.feather" (9.8 KB) loaded (0 secs) (2021-10-18 3:48 PM)
"dt_k100.feather" (10 KB) loaded (0 secs) (2021-10-18 3:48 PM)
"mni" saved as "mni.feather" (16.6 KB) (0.01 secs, 2021-10-18 15:48:31)


## get pid-level MNI

> Notes:
> - Only select the "project_full.jpg"

### prob is NOT normalized

In [3]:
wdir = '/home/yu/OneDrive/Construal'
setwd(wdir)

ld(df_objdet, force=T)
ld(mni, force=T)
ld(lvis_dist)

label_id_name_link = lvis_dist[, .(label_id=id, label_name=name)] %>% unique()

pid_mni_weighted = df_objdet[label_id_name_link, on=.(label_id)
    ][jpg=='profile_full.jpg', .(pid, label_name, inst_id, prob)
    ][mni, on=c('label_name==label'), nomatch=NULL
    ][, .(mni_k10_weighted_unnormalized=sum(prob*mni_k10),
          mni_k10_weighted_normalized=sum(prob*mni_k10)/sum(prob),
          mni_k25_weighted_unnormalized=sum(prob*mni_k25),
          mni_k25_weighted_normalized=sum(prob*mni_k25)/sum(prob),
          mni_k50_weighted_unnormalized=sum(prob*mni_k50),
          mni_k50_weighted_normalized=sum(prob*mni_k50)/sum(prob),
          mni_k100_weighted_unnormalized=sum(prob*mni_k100),
          mni_k100_weighted_normalized=sum(prob*mni_k100)/sum(prob)),
      keyby=.(pid)
    ][order(pid)]

sv(pid_mni_weighted)
pid_mni_weighted %>% head()

"df_objdet.feather" (82.1 MB) loaded (0.68 secs) (2021-10-18 3:49 PM)
"mni.feather" (16.6 KB) loaded (0 secs) (2021-10-18 3:49 PM)
"lvis_dist.feather" (80.4 KB) loaded (0 secs) (2021-10-18 3:49 PM)
"pid_mni_weighted" saved as "pid_mni_weighted.feather" (280.5 KB) (0 secs, 2021-10-18 15:49:08)


pid,mni_k10_weighted_unnormalized,mni_k10_weighted_normalized,mni_k25_weighted_unnormalized,mni_k25_weighted_normalized,mni_k50_weighted_unnormalized,mni_k50_weighted_normalized,mni_k100_weighted_unnormalized,mni_k100_weighted_normalized
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1000117510,6.813139,0.5636056,6.839113,0.5657542,6.583594,0.5446169,5.5777506,0.4614101
1000234595,3.579877,0.7318273,3.249338,0.6642557,2.681237,0.54812,2.3289726,0.4761073
1000426032,3.050856,0.6679252,3.516629,0.7698972,3.024262,0.6621031,2.6430163,0.5786367
1001190550,4.963197,1.0224062,4.201131,0.8654224,3.76235,0.7750346,3.2009327,0.6593841
1001259618,1.29799,0.6367848,1.01239,0.4966715,1.152132,0.5652277,0.9731432,0.4774172
1003704820,4.909002,0.6681071,4.994311,0.6797174,4.712035,0.6413001,4.1226527,0.5610862


# Entropy

Image concreteness based entropy. The steps are:
- Compute the entropy for every label_id
- For every PID, aggregate the label-level entropy of all the containing images.

## get label-level entropy

In [4]:
library(arrow)
library(stringr)

wdir = '/home/yu/OneDrive/Construal/'
setwd(wdir)

# Substitute with your data path
ld(df_objdet)

# compute probabilities of labels
obj = df_objdet[jpg=='profile_full.jpg'
    ][, .(prob=sum(prob)), keyby=.(label_id, pid)
    ][, .(prob=prob/sum(prob), id=1:.N), keyby=.(label_id)]

# extroploate so that every label has 819 obs
CJ = data.table(label_id=rep(1:1202, each=819), id=rep(1:819, 1202))
obj = obj[CJ, on=c('label_id', 'id')
    ][is.na(prob), ':='(prob=0)
    ][, ':='(prob=prob+1e-5)
    ][, ':='(prob=prob/sum(prob)), keyby=.(label_id)]

# compute entropy
entropy = obj[, .(entropy=-sum(prob*log(prob))), keyby=.(label_id)]
sv(entropy)
entropy %>% head()

df_objdet (82.1 MB) already loaded, will NOT load again! (0 secs) (2021-10-18 3:49 PM)
"entropy" saved as "entropy.feather" (15.3 KB) (0 secs, 2021-10-18 15:49:53)


label_id,entropy
<int>,<dbl>
1,4.239553
2,5.161556
3,5.32402
4,3.382207
5,6.059459
6,1.802409


## get pid-level entropy

In [5]:
ld(df_objdet, force=T)
ld(entropy, force=T)
ld(lvis_dist)

label_id_name_link = lvis_dist[, .(label_id=id, label_name=name)] %>% unique()

pid_entropy_weighted = df_objdet[label_id_name_link, on=.(label_id)
    ][jpg=='profile_full.jpg', .(pid, label_id, inst_id, prob)
    ][entropy, on=c('label_id'), nomatch=NULL
    ][, .(entropy_weighted_unnormalized=sum(prob*entropy),
          entropy_weighted_normalized=sum(prob*entropy)/sum(prob)),
      keyby=.(pid)
    ][order(pid)]

sv(pid_entropy_weighted)

pid_entropy_weighted %>% head()

"df_objdet.feather" (82.1 MB) loaded (0.52 secs) (2021-10-18 3:50 PM)
"entropy.feather" (15.3 KB) loaded (0 secs) (2021-10-18 3:50 PM)
lvis_dist (80.4 KB) already loaded, will NOT load again! (0 secs) (2021-10-18 3:50 PM)
"pid_entropy_weighted" saved as "pid_entropy_weighted.feather" (102.6 KB) (0 secs, 2021-10-18 15:50:09)


pid,entropy_weighted_unnormalized,entropy_weighted_normalized
<chr>,<dbl>,<dbl>
1000117510,71.16351,5.375395
1000234595,29.46907,5.201943
1000426032,24.25037,5.239405
1001190550,25.03649,4.954908
1001259618,10.87286,5.065028
1003704820,39.81941,5.240943


In [6]:
# Combine both entropy and MNI into one dataset
ld(pid_entropy_weighted)
ld(pid_mni_weighted)

pid_weighted_mni_entropy = pid_entropy_weighted[pid_mni_weighted, on=.(pid), nomatch=NULL]
pid_weighted_mni_entropy %>% head()
sv(pid_weighted_mni_entropy)

pid_entropy_weighted (102.6 KB) already loaded, will NOT load again! (0 secs) (2021-10-18 3:50 PM)
pid_mni_weighted (280.5 KB) already loaded, will NOT load again! (0 secs) (2021-10-18 3:50 PM)


pid,entropy_weighted_unnormalized,entropy_weighted_normalized,mni_k10_weighted_unnormalized,mni_k10_weighted_normalized,mni_k25_weighted_unnormalized,mni_k25_weighted_normalized,mni_k50_weighted_unnormalized,mni_k50_weighted_normalized,mni_k100_weighted_unnormalized,mni_k100_weighted_normalized
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1000117510,71.16351,5.375395,6.813139,0.5636056,6.839113,0.5657542,6.583594,0.5446169,5.5777506,0.4614101
1000234595,29.46907,5.201943,3.579877,0.7318273,3.249338,0.6642557,2.681237,0.54812,2.3289726,0.4761073
1000426032,24.25037,5.239405,3.050856,0.6679252,3.516629,0.7698972,3.024262,0.6621031,2.6430163,0.5786367
1001190550,25.03649,4.954908,4.963197,1.0224062,4.201131,0.8654224,3.76235,0.7750346,3.2009327,0.6593841
1001259618,10.87286,5.065028,1.29799,0.6367848,1.01239,0.4966715,1.152132,0.5652277,0.9731432,0.4774172
1003704820,39.81941,5.240943,4.909002,0.6681071,4.994311,0.6797174,4.712035,0.6413001,4.1226527,0.5610862


"pid_weighted_mni_entropy" saved as "pid_weighted_mni_entropy.feather" (339.8 KB) (0 secs, 2021-10-18 15:50:17)


In [7]:
wdir = '/home/yu/OneDrive/Construal'
setwd(wdir)
ld(pid_weighted_mni_entropy)
fwrite(pid_weighted_mni_entropy, 'data/sharing/pid_weighted_mni_entropy.csv')

pid_weighted_mni_entropy (339.8 KB) already loaded, will NOT load again! (0 secs) (2021-10-18 3:50 PM)


# Image-text Interaction

Distance-baesd concreteness. Steps are:
- For every PID, Tokenize the image headline. 
- Compute the embedding of every token in the headline. These embeddings form a cluster.
- For every label_name of the PID, compute its embedding. Again, the label_names form another cluster.
- Compute the distance:
    - Method I: compute the average distance between the clusters
    - Method II: compute the shorted distance between the clusters

In [1]:
import numpy as np
import spacy
import torch
import torch.nn.functional as F
import torchtext

from datatable import f
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances, manhattan_distances
from sklearn.neighbors import DistanceMetric
from tqdm.auto import tqdm
from utilpy import sv, ld

wdir = '/home/yu/OneDrive/Construal'
IMG_DIR = f'{wdir}/data/kickstarter-data'
MODEL_DIR = f'{wdir}/pretrained_models/mmdetection'

os.chdir(wdir)

## get emb from title text

In [9]:
# tokenize the image titles
nlp = spacy.load("en_core_web_sm")

glove = torchtext.vocab.GloVe()
fasttext = torchtext.vocab.FastText()
charngram = torchtext.vocab.CharNGram()

In [None]:
# ------ covert text to tokens --------------
pjson = ld('pjson')

titles = pjson[
    (f.category=='Product Design') | (f.category=='Accessories'), 
    f.title].to_list()[0]

descs = pjson[
    (f.category=='Product Design') | (f.category=='Accessories'), 
    f.project_desc].to_list()[0]
    
pids = pjson[
    (f.category=='Product Design') | (f.category=='Accessories'), 
    f.pid].to_list()[0]

# get the tokenized title
title_tokens = []
for i, title in enumerate(nlp.pipe(titles)):
    tokens = [t.text for t in title
              if t.pos_ in ['NOUN', 'PROPN', 'PRON']]

    # if there's no NOUN tokens in the title, we include them all
    if len(tokens) == 0:
        tokens = [t.text for t in title]
    title_tokens.append(tokens)

# get the tokenized proj_desc
desc_tokens = []
for i, desc in enumerate(nlp.pipe(descs)):
    tokens = [t.text for t in desc[:200]
              if t.pos_ in ['NOUN', 'PROPN', 'PRON']]

    # if there's no NOUN tokens in the title, we include them all
    if len(tokens) == 0:
        tokens = [t.text for t in title]
    desc_tokens.append(tokens)

In [16]:
# ---------- convert tokens to embs ----------------

def get_emb(vocab, tokens):
    # get the embedding for every title token
    emb_output = {}
    for i, (pid, token) in enumerate(zip(pids, tokens)):
        emb = []
        for t in token:
            t = t.lower() if isinstance(vocab, torchtext.vocab.FastText) else t
            emb.append(vocab[t].squeeze())
        
        emb = torch.stack(emb)

        emb_output[pid] = emb
    return emb_output

title_emb_glove = get_emb(glove, title_tokens)
title_emb_fasttext = get_emb(fasttext, title_tokens)
title_emb_charngram = get_emb(charngram, title_tokens)

desc_emb_glove = get_emb(glove, desc_tokens)
desc_emb_fasttext = get_emb(fasttext, desc_tokens)
desc_emb_charngram = get_emb(charngram, desc_tokens)

torch.save(title_emb_glove, 'data/title_emb_glove.pt')
torch.save(title_emb_fasttext, 'data/title_emb_fasttext.pt')
torch.save(title_emb_charngram, 'data/title_emb_charngram.pt')

torch.save(desc_emb_glove, 'data/desc_emb_glove.pt')
torch.save(desc_emb_fasttext, 'data/desc_emb_fasttext.pt')
torch.save(desc_emb_charngram, 'data/desc_emb_charngram.pt')

## get emb from image obj name

Here we generate the "centrioid" of object name embeddings.

In [1]:
# tokenize the images classes (R)
wdir = '/home/yu/OneDrive/Construal'
setwd(wdir)

ld(df_objdet, force=T)

# objtxt: the text of image objects, grouped by pid
# - Frist we aggreate prob of same labels within a pid
# - Then we normalize prob so that the probs of each pid sum to one
objtxt = df_objdet[!is.na(label_name) & !is.na(prob) & jpg=='profile_full.jpg',
    ][, .(prob=sum(prob), label_name=label_name[1]), keyby=.(pid, label_id)
    ][, ':='(prob=prob/sum(prob)), keyby=.(pid)
    ][, .(pid, prob, label_id, label_name)][order(pid, label_id)]
sv(objtxt)
objtxt %>% head()

"df_objdet.feather" (112.2 MB) loaded (1 secs) (2021-10-18 4:47 PM)
"objtxt" saved as "objtxt.feather" (9.1 MB) (0.05 secs, 2021-10-18 16:47:20)


pid,prob,label_id,label_name
<chr>,<dbl>,<int>,<chr>
1000117510,0.006866673,5,alcohol
1000117510,0.005171574,11,antenna
1000117510,0.006819463,27,avocado
1000117510,0.003172611,61,basket
1000117510,0.00457759,63,bass_horn
1000117510,0.003549974,75,beanie


In [1]:
# tokenize the images classes
wdir = '/home/yu/OneDrive/Construal'
os.chdir(wdir)

nlp = spacy.load("en_core_web_sm")

glove = torchtext.vocab.GloVe()
fasttext = torchtext.vocab.FastText()
charngram = torchtext.vocab.CharNGram()

In [3]:
def get_objname_emb(vocab):
    objname_emb = {}
    
    for pid in tqdm(set(objtxt['pid'].to_list()[0])):
        # get embs from object names
        prob_objname, objname = objtxt[f.pid==pid, [f.prob, f.label_name]].to_list()
        prob_objname = np.array(prob_objname) # (N,)

        embs = []
        for tokens in nlp.pipe(objname):
            tokens = tokens.text.split('_')
            embs_temp = []
            for t in tokens:
                t = t.lower()
                embs_temp.append(vocab[t])
            embs_temp = torch.stack(embs_temp).mean(0).squeeze()
            embs.append(embs_temp)

        objname_emb[pid] = torch.stack(embs).numpy() # (N,300)

    return objname_emb

ld('objtxt', force=True)

objname_emb_glove = get_objname_emb(glove)
objname_emb_fasttext = get_objname_emb(fasttext)
objname_emb_charngram = get_objname_emb(charngram)

torch.save(objname_emb_glove, 'data/objname_emb_glove.pt')
torch.save(objname_emb_fasttext, 'data/objname_emb_fasttext.pt')
torch.save(objname_emb_charngram, 'data/objname_emb_charngram.pt')

  0%|          | 2/3749 [00:00<05:02, 12.38it/s]

"objtxt.feather" (9.1 MB) loaded (<1s) (2021-10-18 4:48 PM)


100%|██████████| 3749/3749 [02:26<00:00, 25.54it/s]
100%|██████████| 3749/3749 [02:27<00:00, 25.42it/s]
100%|██████████| 3749/3749 [03:15<00:00, 19.13it/s]


## compute dist_concreteness

In [20]:
# cretae embeddings
wdir = '/home/yu/OneDrive/Construal'
os.chdir(wdir)

nlp = spacy.load("en_core_web_sm")

title_emb_glove = torch.load('data/title_emb_glove.pt')
title_emb_fasttext = torch.load('data/title_emb_fasttext.pt')
title_emb_charngram = torch.load('data/title_emb_charngram.pt')

desc_emb_glove = torch.load('data/desc_emb_glove.pt')
desc_emb_fasttext = torch.load('data/desc_emb_fasttext.pt')
desc_emb_charngram = torch.load('data/desc_emb_charngram.pt')

objname_emb_glove = torch.load('data/objname_emb_glove.pt')
objname_emb_fasttext = torch.load('data/objname_emb_fasttext.pt')
objname_emb_charngram = torch.load('data/objname_emb_charngram.pt')

In [25]:
def get_distance(text_embs, objname_embs, objtxt):
    import datatable as dt

    valid_pids = set(objtxt['pid'].to_list()[0]).intersection(set(text_embs.keys()))
    valid_pids = list(valid_pids)

    results = []
    for pid in tqdm(valid_pids):
        # get embs from text
        text_emb = text_embs[pid]
        cent_text = text_emb.numpy().mean(0).reshape(1,-1)

        # get embs from object names
        objname_emb = objname_embs[pid]
        objname_prob = np.array(objtxt[f.pid==pid, f.prob].to_list()[0])
        cent_objname = np.matmul(objname_prob, objname_emb).reshape(1,-1)

        # compute cluster avg distance
        linf_dist = DistanceMetric.get_metric('chebyshev')

        cluster_cos_dist = cosine_distances(cent_text, cent_objname)[0][0]
        cluster_l2_dist = euclidean_distances(cent_text, cent_objname)[0][0]
        cluster_l1_dist = manhattan_distances(cent_text, cent_objname)[0][0]
        cluster_linf_dist = linf_dist.pairwise(cent_text, cent_objname)[0][0]

        # compute shortest pairwise dist
        k_pmax = 3
        pmax_idx = objname_prob.argsort()[-k_pmax:]
        objname_emb = objname_emb[pmax_idx, :]

        shortest_cos_dist = cosine_distances(text_emb, objname_emb).min()
        shortest_l2_dist = euclidean_distances(text_emb, objname_emb).min()
        shortest_l1_dist = manhattan_distances(text_emb, objname_emb).min()
        shortest_linf_dist = linf_dist.pairwise(text_emb, objname_emb).min()
        
        # append to results
        results.append((pid, cluster_cos_dist, cluster_l1_dist, cluster_l2_dist, cluster_linf_dist, shortest_cos_dist, shortest_l1_dist, shortest_l2_dist, shortest_linf_dist))

    return dt.Frame(results, names=['pid', 'cluster_cos_dist', 'cluster_l1_dist', 'cluster_l2_dist', 'cluster_linf_dist', 'shortest_cos_dist', 'shortest_l1_dist', 'shortest_l2_dist', 'shortest_linf_dist'])

    
objtxt = ld('objtxt')

# dist_glove_title = get_distance(
#     title_emb_glove, objname_emb_glove, objtxt)
# dist_fasttext_title = get_distance(
#     title_emb_fasttext, objname_emb_fasttext, objtxt)
# dist_charngram_title = get_distance(
#     title_emb_charngram, objname_emb_charngram, objtxt)

# dist_glove_title.to_csv('data/sharing/dist_glove_title.csv')
# dist_fasttext_title.to_csv('data/sharing/dist_fasttext_title.csv')
# dist_charngram_title.to_csv('data/sharing/dist_charngram_title.csv')


dist_glove_desc = get_distance(
    desc_emb_glove, objname_emb_glove, objtxt)
# dist_fasttext_desc = get_distance(
#     desc_emb_fasttext, objname_emb_fasttext, objtxt)
# dist_charngram_desc = get_distance(
#     desc_emb_charngram, objname_emb_charngram, objtxt)

# dist_glove_desc.to_csv('data/sharing/dist_glove_desc.csv')
# dist_fasttext_desc.to_csv('data/sharing/dist_fasttext_desc.csv')
# dist_charngram_desc.to_csv('data/sharing/dist_charngram_desc.csv')

"objtxt.feather" (9.1 MB) loaded (<1s) (2022-01-12 6:23 PM)


  0%|          | 0/3749 [00:00<?, ?it/s]

# Dictionary-based (added in Sep 2023)
Task 1:

- Get the object names in an image
- Get the concreteness score for each object name. The score is based on a dictionary.
- The concreteness score of an image is defined as the sum of the concreteness scores of all the containing objects.

Task 2:
- Same as before, get the object names.
- Get the "frequency" score based on the google dictionary.
- The frequency score is the aggregation of the component object frequency.

In [2]:
suppressMessages({
    library(arrow)
})

wdir = '/home/yu/OneDrive/Construal'
setwd(wdir)

### Concreteness (Task 1)

Notes:
- In the beginning, I only keep objects with prob>=0.5 in the `objdet` dataset. However, this will remove about 1000 pids.
- Now, I lower the threshold to prob>=0.1.

In [3]:
# get the object names
objdet = read_feather('data/df_objdet.feather') %>% setDT()
# objdet[, uniqueN(pid)]

objdet = objdet[prob>=0.1, .(pid, object=label_name, inst_id, prob)]
# objdet[, uniqueN(pid)]

# get pjson
pjson = read_feather('data/pjson.feather') %>% setDT()

# get the dictionary
bscore_dict = fread('data/concreteness_score.csv')[, .(word=str_trim(Word), score=Conc.M)]

In [5]:
# get the bscore of each object
get_bscore <- function(obj) {
    # obj: string, a single object name
    # dict: data.table, bscore dictionary
    # output: numeric, the concreteness score of the object name. 0 if not found.
    bscore_dict[word==obj, fcase(length(score)==0, 0, length(score)>0, score)]
}

bscore = objdet[, 
    .(pid, inst_id, prob, object=str_extract_all(object, '[A-Za-z]+') %>% lapply(str_trim))
    ][, {
    # if object name are composite (e.g., brass handhold), we take the mean 
    score = sapply(object[[1]], get_bscore) %>% mean()
    list(pid, object, prob, inst_id, score)
    },
    keyby=seq_len(nrow(objdet))
    ][, ':='(seq_len=NULL)][]

In [6]:
# compute the bscore for each project
img_conc_bscore = bscore[,
    .(bscore_sum=sum(score),  # sum of all objects' bscore
      bscore_mean=mean(score),  # mean of all objects' bscore 
      bscore_pw_sum=sum(score*prob),  # sum of all objects' bscore weighted by prob
      bscore_pw_mean=sum(score*prob)/sum(prob)), # mean of all objects' bscore weighted by prob 
    keyby=.(pid)
    ]

img_conc_bscore[1]
fwrite(img_conc_bscore, 'data/sharing/img_conc_bscore_dict_based.csv')

pid,bscore_sum,bscore_mean,bscore_pw_sum,bscore_pw_mean
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
1000117510,155.896,4.585176,31.0826,4.593648


### Debug (Task 1)

In [8]:
# get used pids
used_pids = fread('data/sharing/missing-pids/used-pids.csv')[, V1]  # outcome is a vector
length(used_pids)

# pids from img_conc_bscore
copy(img_conc_bscore)[, ':='(is_used=pid %in% used_pids)
    ][, .(total_pids=.N, used_pids=sum(is_used))]

total_pids,used_pids
<int>,<int>
3687,3577


### Frequency (Task 2)

In [9]:
# get the object names
objdet = read_feather('data/df_objdet.feather') %>% setDT()
objdet = objdet[prob>=0.1, .(pid, object=label_name, inst_id, prob)]

# get pjson
pjson = read_feather('data/pjson.feather') %>% setDT()

# get the dictionary
freq_dict = read_feather('data/google_freqdict.feather') %>% setDT()
freq_dict = freq_dict[, .(word=str_trim(word), freq=freq_google_withoutstop)]

In [10]:
# get the freq of each object
get_freq <- function(obj) {
    # obj: string, a single object name
    # dict: data.table, bscore dictionary
    # output: numeric, the concreteness score of the object name. 0 if not found.
    freq_dict[word==obj, fcase(length(freq)==0, 0, length(freq)>0, freq)]
}

freq = objdet[, 
    .(pid, inst_id, prob, object=str_extract_all(object, '[A-Za-z]+') %>% lapply(str_trim))
    ][, {
    # if object name are composite (e.g., brass handhold), we take the mean 
    freq = sapply(object[[1]], get_freq) %>% mean()  
    list(pid, object, prob, inst_id, freq)
    },
    keyby=seq_len(nrow(objdet))
    ]

In [11]:
# compute the freq
img_freq = freq[,
    .(freq_sum=sum(freq),  # sum of all objects' bscore
      freq_mean=mean(freq),  # mean of all objects' bscore 
      freq_pw_sum=sum(freq*prob),  # sum of all objects' bscore weighted by prob
      freq_pw_mean=sum(freq*prob)/sum(prob)), # mean of all objects' bscore weighted by prob 
    keyby=.(pid)
    ]

img_freq[1]
fwrite(img_freq, 'data/sharing/img_freq_dict_based.csv')

pid,freq_sum,freq_mean,freq_pw_sum,freq_pw_mean
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
1000117510,0.9276249,0.02728309,0.1880324,0.02778901


In [12]:
img_freq[, uniqueN(pid)]

# (fiver) Image-text Interaction

Distance-baesd concreteness. Steps are:
- For every PID, Tokenize the image headline. 
- Compute the embedding of every token in the headline. These embeddings form a cluster.
- For every label_name of the PID, compute its embedding. Again, the label_names form another cluster.
- Compute the distance:
    - Method I: compute the average distance between the clusters
    - Method II: compute the shorted distance between the clusters

In [11]:
import numpy as np
import pandas as pd
import os
import spacy
import torch
import torchtext
from pyarrow.feather import write_feather, read_feather

from sklearn.metrics import DistanceMetric
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances, manhattan_distances
from tqdm.auto import tqdm

wdir = '/home/yu/OneDrive/Construal'
os.chdir(wdir)

## get emb from title text

In [61]:
# tokenize the image titles
nlp = spacy.load("en_core_web_sm")

glove = torchtext.vocab.GloVe()

In [62]:
# ------ covert text to tokens --------------
pjson = read_feather('data/fiver/pjson.feather')

titles = pjson.title.to_list()
pids = [str(pid) for pid in pjson.pid.to_list()]

# get the tokenized title
title_tokens = []
for i, title in enumerate(nlp.pipe(titles)):
    tokens = [t.text for t in title
              if t.pos_ in ['NOUN', 'PROPN', 'PRON']]

    # if there's no NOUN tokens in the title, we include them all
    if len(tokens) == 0:
        tokens = [t.text for t in title]
    title_tokens.append(tokens)

In [64]:
# ---------- convert tokens to embs ----------------

def get_emb(vocab, tokens):
    # get the embedding for every title token
    emb_output = {}
    for i, (pid, token) in enumerate(zip(pids, tokens)):
        emb = []
        for t in token:
            t = t.lower() if isinstance(vocab, torchtext.vocab.FastText) else t
            emb.append(vocab[t].squeeze())
        
        emb = torch.stack(emb)

        emb_output[pid] = emb
    return emb_output

title_emb_glove = get_emb(glove, title_tokens)

torch.save(title_emb_glove, 'data/fiver/image-text/title_emb_glove.pt')

In [65]:
len(set(title_emb_glove.keys()))

2615

## get emb from image obj name

Here we generate the "centrioid" of object name embeddings.

In [71]:
# tokenize the images classes (R)
object_detect_res = read_feather('data/fiver/object-detect-res.feather')

# objtxt: the text of image objects, grouped by pid
# - Frist we aggreate prob of same labels within a pid
# - Then we normalize prob so that the probs of each pid sum to one
tmp = object_detect_res
objtxt = tmp[(~tmp.label_name.isna()) & (~tmp.prob.isna())] \
    .groupby(['pid', 'label_id']) \
    .agg({'prob': 'sum', 'label_name': 'first'}) \
    .reset_index() \
    .groupby(['pid'], group_keys=True) \
    .apply(lambda x: x.assign(prob=x.prob/x.prob.sum())) \
    .reset_index(drop=True) \
    .loc[:, ['pid', 'prob', 'label_id', 'label_name']] \
    .sort_values(['pid', 'label_id'])

# write_feather(objtxt, 'data/fiver/image-text/objtxt.feather')
objtxt.head(1)

Unnamed: 0,pid,prob,label_id,label_name
0,1000342273,0.001513,27,avocado


In [8]:
# tokenize the images objects
nlp = spacy.load("en_core_web_sm")
glove = torchtext.vocab.GloVe()

In [9]:
def get_objname_emb(vocab):
    objname_emb = {}
    
    for pid in tqdm(set(objtxt['pid'].to_list())):
        # get embs from object names
        objname = objtxt.loc[objtxt.pid==pid, 'label_name'].to_list()  # list of object names for a pid

        embs = []
        for tokens in nlp.pipe(objname):
            tokens = tokens.text.split('_')
            embs_temp = []
            for t in tokens:
                t = t.lower()
                embs_temp.append(vocab[t])
            embs_temp = torch.stack(embs_temp).mean(0).squeeze()
            embs.append(embs_temp)

        objname_emb[pid] = torch.stack(embs).numpy() # (N,300)

    return objname_emb

objname_emb_glove = get_objname_emb(glove)

torch.save(objname_emb_glove, 'data/fiver/image-text/objname_emb_glove.pt')

100%|██████████| 2604/2604 [02:01<00:00, 21.50it/s]


## compute dist_concreteness

In [10]:
# cretae embeddings
nlp = spacy.load("en_core_web_sm")
title_emb_glove = torch.load('data/title_emb_glove.pt')

In [75]:
def get_distance(text_embs, objname_embs, objtxt):

    valid_pids = set(objtxt['pid'].to_list()) \
        .intersection(set(text_embs.keys()))
    valid_pids = list(valid_pids)

    results = []
    for pid in tqdm(valid_pids):
        # get embs from text
        text_emb = text_embs[pid]  # (n_tokens, dim_emb)
        cent_text = text_emb.numpy().mean(0).reshape(1,-1)  # (1, dim_emb)

        # get embs from object names
        objname_emb = objname_embs[pid]
        objname_prob = np.array(objtxt.loc[objtxt.pid==pid, 'prob'].to_list())
        cent_objname = np.matmul(objname_prob, objname_emb).reshape(1,-1)

        # compute cluster avg distance
        linf_dist = DistanceMetric.get_metric('chebyshev')

        cluster_cos_dist = cosine_distances(cent_text, cent_objname)[0][0]
        cluster_l2_dist = euclidean_distances(cent_text, cent_objname)[0][0]
        cluster_l1_dist = manhattan_distances(cent_text, cent_objname)[0][0]
        cluster_linf_dist = linf_dist.pairwise(cent_text, cent_objname)[0][0]

        # compute shortest pairwise dist
        k_pmax = 3
        pmax_idx = objname_prob.argsort()[-k_pmax:]
        objname_emb = objname_emb[pmax_idx, :]

        shortest_cos_dist = cosine_distances(text_emb, objname_emb).min()
        shortest_l2_dist = euclidean_distances(text_emb, objname_emb).min()
        shortest_l1_dist = manhattan_distances(text_emb, objname_emb).min()
        shortest_linf_dist = linf_dist.pairwise(text_emb, objname_emb).min()
        
        # append to results
        results.append((pid, cluster_cos_dist, cluster_l1_dist, cluster_l2_dist, cluster_linf_dist, shortest_cos_dist, shortest_l1_dist, shortest_l2_dist, shortest_linf_dist))

    return pd.DataFrame(results, columns=['pid', 'cluster_cos_dist', 'cluster_l1_dist', 'cluster_l2_dist', 'cluster_linf_dist', 'shortest_cos_dist', 'shortest_l1_dist', 'shortest_l2_dist', 'shortest_linf_dist'])

    
dist_glove_title = get_distance(
    title_emb_glove, objname_emb_glove, objtxt)

dist_glove_title.to_csv('data/sharing/dist_glove_title.csv')



100%|██████████| 2604/2604 [01:15<00:00, 34.44it/s]


# (fiver) MNI concreteness

In this version I used Pandas. pydatatable is removed.

In [53]:
import numpy as np
import os
import pandas as pd
import PIL
import torch
import torchmetrics

from pyarrow.feather import read_feather, write_feather
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision.models import resnext101_32x8d
from torchvision.transforms import Compose, ToTensor, Normalize, Resize, ToPILImage, CenterCrop, RandomResizedCrop, RandomHorizontalFlip
from tqdm.auto import tqdm

wdir = '/home/yu/OneDrive/Construal'
os.chdir(wdir)

## get representation

For every project image, use a backbone to generate a representation for it.

In [6]:
# Get the `pid` of all projects of all projects
pjson = read_feather('data/fiver/pjson.feather')
pids = [str(pid) for pid in pjson.pid.to_list()]

In [29]:
# define dataset
class ReprDataset(Dataset):
    def __init__(self, pids):
        self.img_dir = '/home/yu/chaoyang/research-resources/kickstart-raw-from-amrita/fiver-image/Image Folder'

        # check all pids exists
        valid_pids = []
        for pid in pids:
            img_path = f'{self.img_dir}/{pid}/{pid}_main.jpg'
            try:
                with PIL.Image.open(img_path) as img:
                    valid_pids.append(pid)
            except:
                pass

        # pids = [pid for pid in pids
        #         if os.path.exists(f'{self.img_dir}/{pid}/{pid}_main.jpg')]
        self.pids = valid_pids
        self.transform = Compose([Resize(256),
                                  CenterCrop(224),
                                  ToTensor(),
                                  Normalize(mean=[0.485, 0.456, 0.406],
                                            std=[0.229, 0.224, 0.225])])

    def __len__(self):
        return len(self.pids)

    def __getitem__(self, idx):
        pid = self.pids[idx]
        img_path = f'{self.img_dir}/{pid}/{pid}_main.jpg'
        with PIL.Image.open(img_path) as img:
            img = img.convert('RGB')
            img = self.transform(img)
        return pid, img

# load and freeze model     
model = resnext101_32x8d(pretrained=True)   
model.fc = nn.Identity()

device = 'cuda:0'
model.to(device)

# make dataset/dataloader
ds = ReprDataset(pids)
dl = DataLoader(ds, shuffle=False, batch_size=32, drop_last=False)
print(f'{len(dl)=}')
results = {}

# run!
with torch.no_grad():
    for i, (pid, img) in enumerate(tqdm(dl)):
        img = img.to(device)
        img_repr = model(img)

        for p, r in zip(pid, img_repr):
            results[p] = r

torch.save(results, 'data/fiver/mni-concreteness/fiverobj_repr.pt')



ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1

len(dl)=82


100%|██████████| 82/82 [00:50<00:00,  1.62it/s]


## get label-level MNI

In [40]:
object_detect_res = read_feather('data/fiver/object-detect-res.feather')
project_label_names = object_detect_res \
    .loc[object_detect_res.prob>=0.5, ['pid', 'label_name', 'label_id']] \
    .drop_duplicates()
    
project_label_names.head(1)

Unnamed: 0,pid,label_name,label_id
0,1630789067,suit_(clothing),1033


In [41]:
# Load image reprs into an annoy tree
reprs = torch.load('data/fiver/mni-concreteness/fiverobj_repr.pt')
valid_pids = list(reprs.keys())

# load image reprs into an annoy tree
from annoy import AnnoyIndex

t = AnnoyIndex(2048, 'angular')  # Length of item vector that will be indexed

pid2id = {}
for i, (pid, vec) in enumerate(reprs.items()): 
    # create a map from label_id to an int
    pid2id[pid] = i

    # add to annoy tree
    vec = vec.cpu().numpy()
    t.add_item(i, vec)
    
t.build(1000)

True

In [54]:
# run neighbor search
text = project_label_names
text = text[text.pid.isin(valid_pids)]
W = text.label_name.unique().tolist()


def make_mni(k, t):
    mni_dict = {}

    for w in tqdm(W):
        # Vw = text[f.label_name==w, [f.pid]]
        # Vw = dt.unique(Vw).to_list()[0]

        Vw = text[text.label_name==w].pid.unique().tolist()
        Vw = [pid2id[pid] for pid in Vw]
        Vw = set(Vw)
        
        a = 0
        for v in Vw:
            NN_v = set(t.get_nns_by_item(v, k)) - set([v])
            a += len(Vw.intersection(NN_v))

        mni = a/len(Vw)
        adj_mni = mni/(len(Vw)*k)*819

        mni_dict[w] = adj_mni

    frame = pd.DataFrame({'label': list(mni_dict.keys()), 'mni': list(mni_dict.values())})
    frame = frame.sort_values('mni', ascending=False)

    return frame

dt_k10 = make_mni(10, t)
dt_k25 = make_mni(25, t)
dt_k50 = make_mni(50, t)
dt_k100 = make_mni(100, t)

100%|██████████| 445/445 [00:07<00:00, 56.20it/s] 
100%|██████████| 445/445 [00:09<00:00, 46.13it/s] 
100%|██████████| 445/445 [00:12<00:00, 36.43it/s] 
100%|██████████| 445/445 [00:17<00:00, 25.40it/s] 


In [60]:
# merge different neighbor sizes
mni = dt_k10.merge(dt_k25, on='label', suffixes=('_k10', '_k25')) \
    .merge(dt_k50, on='label', suffixes=('', '_k50')) \
    .merge(dt_k100, on='label', suffixes=('_k50', '_k100'))

write_feather(mni, 'data/fiver/mni-concreteness/mni.feather')

## get pid-level MNI

In [8]:
suppressMessages({
    library(arrow)
    library(data.table)
})

wdir = '/home/yu/OneDrive/Construal'
setwd(wdir)

object_detect_res = read_feather("data/fiver/object-detect-res.feather") %>% as.data.table()
mni = read_feather('data/fiver/mni-concreteness/mni.feather') %>% as.data.table()

pid_mni_weighted = object_detect_res[, .(pid, label_name, inst_id, prob)
    ][mni, on=c('label_name==label'), nomatch=NULL
    ][, .(mni_k10_weighted_unnormalized=sum(prob*mni_k10),
          mni_k10_weighted_normalized=sum(prob*mni_k10)/sum(prob),
          mni_k25_weighted_unnormalized=sum(prob*mni_k25),
          mni_k25_weighted_normalized=sum(prob*mni_k25)/sum(prob),
          mni_k50_weighted_unnormalized=sum(prob*mni_k50),
          mni_k50_weighted_normalized=sum(prob*mni_k50)/sum(prob),
          mni_k100_weighted_unnormalized=sum(prob*mni_k100),
          mni_k100_weighted_normalized=sum(prob*mni_k100)/sum(prob)),
      keyby=.(pid)
    ][order(pid)]

pid_mni_weighted[1]
fwrite(pid_mni_weighted, 'data/fiver/mni-concreteness/pid_mni_weighted.csv')

pid,mni_k10_weighted_unnormalized,mni_k10_weighted_normalized,mni_k25_weighted_unnormalized,mni_k25_weighted_normalized,mni_k50_weighted_unnormalized,mni_k50_weighted_normalized,mni_k100_weighted_unnormalized,mni_k100_weighted_normalized
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1000342273,12.02429,1.864386,10.59487,1.642752,9.425896,1.461501,7.652324,1.186506
