# Add IV

Steps:
1. For every product title, only extract the nouns,
2. For every noun, find its K nearest neighbors according to word embedding
3. Among the K+1 words, select the most common word as the "generic form" of the noun.
4. Repeat for every noun
5. compute the average concreteness score of the nouns

In [5]:
import datatable as dt
import numpy as np
import os
import torch
import torch.nn.functional as F
import torchtext

from datatable import f, fread
from sklearn.metrics.pairwise import cosine_distances
from tqdm import tqdm
from utilpy import ld, sv

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

WORK_DIR = '/home/yu/OneDrive/Construal/'
os.chdir(WORK_DIR)

## extract the nouns from title


In [2]:
# get titles
pjson = ld('pjson')
titles = pjson[(f.category=='Product Design') | (f.category=='Accessories'), f.title].to_list()[0]
titles = [t.strip() for t in titles]

# get description (only select product design)
descs = pjson[(f.category=='Product Design') | (f.category=='Accessories'), f.project_desc].to_list()[0]
descs = [d.strip() for d in descs]

"pjson.feather" (167.5 MB) loaded (<1s) (2022-09-22 1:03 AM)


In [4]:
# find out all the nouns in the noun_chunk
# here I create two versions:
# - v1: include all the nouns in the noun chunk
# - v2: only the root of the noun chunk
# Output: title_nouns
import spacy

nlp = spacy.load('en_core_web_sm')

# parse title
titles = list(nlp.pipe(titles))

# parse desc (keep first 200 words)
descs = list(nlp.pipe(descs))

In [None]:
def get_nouns(docs, keep_first=None):
    '''
    Args:
        keep_first: only keep first N words
    '''
    output_v1 = []
    output_v2 = []
    for doc in tqdm(docs):
        if keep_first is not None:
            doc = doc[0:keep_first]
        nouns_v1 = []
        nouns_v2 = []
        for chunk in doc.noun_chunks:
            # v1
            for token in chunk:
                if token.pos_ in ['NOUN', 'PROPN']:
                    nouns_v1.append(token.text)   
            # v2
            if chunk.root.pos_ in ['NOUN', 'PROPN']:
                nouns_v2.append(chunk.root.text)
        output_v1.append(nouns_v1)
        output_v2.append(nouns_v2)
    return output_v1, output_v2

title_nouns_v1, title_nouns_v2 = get_nouns(titles)
desc_nouns_v1, desc_nouns_v2 = get_nouns(descs, keep_first=200)

In [None]:
title_nouns_v1 = torch.load('title_nouns_v1.pt')
title_nouns_v2 = torch.load('title_nouns_v2.pt')
desc_nouns_v1 = torch.load('desc_nouns_v1.pt')
desc_nouns_v2 = torch.load('desc_nouns_v2.pt')

## find nearest k neighbors

In [4]:
# load word embeddings

glove = torchtext.vocab.GloVe()
fasttext = torchtext.vocab.FastText()
charngram = torchtext.vocab.CharNGram()

In [5]:
# get k nearest neighbors
def get_most_frequent_token(freqdict, tokens) -> str:
    max_ix = np.array([freqdict.get(t.lower(), -np.inf) for t in tokens]).argmax()
    return tokens[max_ix]
    
def get_title_generic_form(vocab, title_nouns, k_list=[0, 3, 5, 10], device='cuda:0', include_self=True):
    # import frequency dict
    freqdict = fread('data/freqdict.csv')
    freqdict = {token: freq for (token, freq) in freqdict.to_tuples()}

    # get the embedding for every title token
    generic_nouns = []
    vocab_embs = F.normalize(vocab.vectors.to(device), p=2, dim=-1)
    for title_noun in tqdm(title_nouns):
        # title_noun: the List of all nouns in one title
        generic_noun = []
        for noun in title_noun:
            noun = noun.lower() if isinstance(vocab, torchtext.vocab.FastText) else noun
            noun_emb = vocab[noun].unsqueeze(0).to(device)

            with torch.no_grad():
                dist = torch.matmul(vocab_embs, noun_emb.T).squeeze()

            generic_noun_k = {}
            for k in k_list:
                if include_self:
                    knn_ix = dist.argsort(descending=True)[:(k+1)]
                elif not include_self:
                    knn_ix = dist.argsort(descending=True)[1:(k+1)]

                knn_tokens = [vocab.itos[ix] for ix in knn_ix]
                generic_form = get_most_frequent_token(freqdict, knn_tokens)
                generic_noun_k[k] = generic_form
                
            generic_noun.append(generic_noun_k)
        
        generic_nouns.append(generic_noun)
    
    # collate results by k
    from collections import defaultdict

    output = defaultdict(list)
    for k in k_list:
        for title_nouns in generic_nouns:
            values = []
            for d in title_nouns:
                values.append(d[k])
            output[k].append(values)

    # clean cuda memory
    del vocab_embs, noun_emb, dist, knn_ix

    return output

# ---- title ----
# include self
# title_generic_nouns_v1_glove = get_title_generic_form(glove, title_nouns_v1, k_list=[0, 3, 5, 10])
# torch.save(title_generic_nouns_v1_glove, '/home/yu/OneDrive/Construal/data/title_generic_nouns_v1_glove.pt')
# title_generic_nouns_v2_glove = get_title_generic_form(glove, title_nouns_v2, k_list=[0, 3, 5, 10])
# torch.save(title_generic_nouns_v2_glove, '/home/yu/OneDrive/Construal/data/title_generic_nouns_v2_glove.pt')
# title_generic_nouns_v1_fasttext = get_title_generic_form(fasttext, title_nouns_v1, k_list=[0, 3, 5, 10])
# torch.save(title_generic_nouns_v1_fasttext, '/home/yu/OneDrive/Construal/data/title_generic_nouns_v1_fasttext.pt')
# title_generic_nouns_v2_fasttext = get_title_generic_form(fasttext, title_nouns_v2, k_list=[0, 3, 5, 10])
# torch.save(title_generic_nouns_v2_fasttext, '/home/yu/OneDrive/Construal/data/title_generic_nouns_v2_fasttext.pt')

# not include self
title_generic_nouns_v1_glove_exclude_self = get_title_generic_form(glove, title_nouns_v1, k_list=[3, 5, 10], include_self=False)
torch.save(title_generic_nouns_v1_glove_exclude_self, '/home/yu/OneDrive/Construal/data/title_generic_nouns_v1_glove_exclude_self.pt')
title_generic_nouns_v2_glove_exclude_self = get_title_generic_form(glove, title_nouns_v2, k_list=[3, 5, 10], include_self=False)
torch.save(title_generic_nouns_v2_glove_exclude_self, '/home/yu/OneDrive/Construal/data/title_generic_nouns_v2_glove_exclude_self.pt')
title_generic_nouns_v1_fasttext_exclude_self = get_title_generic_form(fasttext, title_nouns_v1, k_list=[3, 5, 10], include_self=False)
torch.save(title_generic_nouns_v1_fasttext_exclude_self, '/home/yu/OneDrive/Construal/data/title_generic_nouns_v1_fasttext_exclude_self.pt')
title_generic_nouns_v2_fasttext_exclude_self = get_title_generic_form(fasttext, title_nouns_v2, k_list=[3, 5, 10], include_self=False)
torch.save(title_generic_nouns_v2_fasttext_exclude_self, '/home/yu/OneDrive/Construal/data/title_generic_nouns_v2_fasttext_exclude_self.pt')

# ---- desc ----
# include self
# desc_generic_nouns_v1_glove = get_title_generic_form(glove, desc_nouns_v1, k_list=[0, 3, 5, 10])
# torch.save(desc_generic_nouns_v1_glove, '/home/yu/OneDrive/Construal/data/desc_generic_nouns_v1_glove.pt')
# desc_generic_nouns_v2_glove = get_title_generic_form(glove, desc_nouns_v2, k_list=[0, 3, 5, 10])
# torch.save(desc_generic_nouns_v2_glove, '/home/yu/OneDrive/Construal/data/desc_generic_nouns_v2_glove.pt')
# desc_generic_nouns_v1_fasttext = get_title_generic_form(fasttext, desc_nouns_v1, k_list=[0, 3, 5, 10])
# torch.save(desc_generic_nouns_v1_fasttext, '/home/yu/OneDrive/Construal/data/desc_generic_nouns_v1_fasttext.pt')
# desc_generic_nouns_v2_fasttext = get_title_generic_form(fasttext, desc_nouns_v2, k_list=[0, 3, 5, 10])
# torch.save(desc_generic_nouns_v2_fasttext, '/home/yu/OneDrive/Construal/data/desc_generic_nouns_v2_fasttext.pt')

# not include self
desc_generic_nouns_v1_glove_exclude_self = get_title_generic_form(glove, desc_nouns_v1, k_list=[3, 5, 10], include_self=False)
torch.save(desc_generic_nouns_v1_glove_exclude_self, '/home/yu/OneDrive/Construal/data/desc_generic_nouns_v1_glove_exclude_self.pt')
desc_generic_nouns_v2_glove_exclude_self = get_title_generic_form(glove, desc_nouns_v2, k_list=[3, 5, 10], include_self=False)
torch.save(desc_generic_nouns_v2_glove_exclude_self, '/home/yu/OneDrive/Construal/data/desc_generic_nouns_v2_glove_exclude_self.pt')
desc_generic_nouns_v1_fasttext_exclude_self = get_title_generic_form(fasttext, desc_nouns_v1, k_list=[3, 5, 10], include_self=False)
torch.save(desc_generic_nouns_v1_fasttext_exclude_self, '/home/yu/OneDrive/Construal/data/desc_generic_nouns_v1_fasttext_exclude_self.pt')
desc_generic_nouns_v2_fasttext_exclude_self = get_title_generic_form(fasttext, desc_nouns_v2, k_list=[3, 5, 10], include_self=False)
torch.save(desc_generic_nouns_v2_fasttext_exclude_self, '/home/yu/OneDrive/Construal/data/desc_generic_nouns_v2_fasttext_exclude_self.pt')


100%|██████████| 3759/3759 [02:09<00:00, 29.08it/s]
100%|██████████| 3759/3759 [00:56<00:00, 66.90it/s]
100%|██████████| 3759/3759 [02:30<00:00, 25.01it/s]
100%|██████████| 3759/3759 [01:05<00:00, 57.00it/s]
100%|██████████| 3759/3759 [23:29<00:00,  2.67it/s]
100%|██████████| 3759/3759 [17:18<00:00,  3.62it/s]
100%|██████████| 3759/3759 [27:25<00:00,  2.28it/s]
100%|██████████| 3759/3759 [20:12<00:00,  3.10it/s]


In [5]:
nouns = torch.load('/home/yu/OneDrive/Construal/data/title_generic_nouns_v2_glove_exclude_self.pt')


In [12]:
nouns[3][4]

['Shade', 'sunglasses', 'Head']

## compute concreteness

In [2]:
def get_avg_bscore(title_generic_nouns):
    # load bscore dict
    bscore_dict = fread('/home/yu/OneDrive/Construal/data/concreteness_score.csv')
    bscore_dict = bscore_dict[:, {'word': f.Word, 'score': f['Conc.M']}].to_tuples()
    bscore_dict = {word.strip(): score for word, score in bscore_dict}

    # compute avg bscore
    output = {}
    for k, generic_nouns in tqdm(title_generic_nouns.items()):
        scores = []
        for doc in generic_nouns:
            score = [bscore_dict.get(token.lower(), None) for token in doc]
            score = list(filter(None, score))
            # some doc may not contain any "concrete" words
            if len(score) == 0:
                avg_score = None
            else:
                avg_score = sum(score)/len(score)
            scores.append(avg_score)
        output[k] = scores

    return output

# ---- title ----
# include self
title_generic_nouns_v1_glove = torch.load('/home/yu/OneDrive/Construal/data/title_generic_nouns_v1_glove.pt')
title_generic_nouns_v2_glove = torch.load('/home/yu/OneDrive/Construal/data/title_generic_nouns_v2_glove.pt')
title_generic_nouns_v1_fasttext = torch.load('/home/yu/OneDrive/Construal/data/title_generic_nouns_v1_fasttext.pt')
title_generic_nouns_v2_fasttext = torch.load('/home/yu/OneDrive/Construal/data/title_generic_nouns_v2_fasttext.pt')

title_bscore_v1_glove = get_avg_bscore(title_generic_nouns_v1_glove)
title_bscore_v2_glove = get_avg_bscore(title_generic_nouns_v2_glove)
title_bscore_v1_fasttext = get_avg_bscore(title_generic_nouns_v1_fasttext)
title_bscore_v2_fasttext = get_avg_bscore(title_generic_nouns_v2_fasttext)

# exclude self
title_generic_nouns_v1_glove_exclude_self = torch.load('/home/yu/OneDrive/Construal/data/title_generic_nouns_v1_glove_exclude_self.pt')
title_generic_nouns_v2_glove_exclude_self = torch.load('/home/yu/OneDrive/Construal/data/title_generic_nouns_v2_glove_exclude_self.pt')
title_generic_nouns_v1_fasttext_exclude_self = torch.load('/home/yu/OneDrive/Construal/data/title_generic_nouns_v1_fasttext_exclude_self.pt')
title_generic_nouns_v2_fasttext_exclude_self = torch.load('/home/yu/OneDrive/Construal/data/title_generic_nouns_v2_fasttext_exclude_self.pt')

title_bscore_v1_glove_exclude_self = get_avg_bscore(title_generic_nouns_v1_glove_exclude_self)
title_bscore_v2_glove_exclude_self = get_avg_bscore(title_generic_nouns_v2_glove_exclude_self)
title_bscore_v1_fasttext_exclude_self = get_avg_bscore(title_generic_nouns_v1_fasttext_exclude_self)
title_bscore_v2_fasttext_exclude_self = get_avg_bscore(title_generic_nouns_v2_fasttext_exclude_self)

# ---- desc ----
# include self
desc_generic_nouns_v1_glove = torch.load('/home/yu/OneDrive/Construal/data/desc_generic_nouns_v1_glove.pt')
desc_generic_nouns_v2_glove = torch.load('/home/yu/OneDrive/Construal/data/desc_generic_nouns_v2_glove.pt')
desc_generic_nouns_v1_fasttext = torch.load('/home/yu/OneDrive/Construal/data/desc_generic_nouns_v1_fasttext.pt')
desc_generic_nouns_v2_fasttext = torch.load('/home/yu/OneDrive/Construal/data/desc_generic_nouns_v2_fasttext.pt')

desc_bscore_v1_glove = get_avg_bscore(desc_generic_nouns_v1_glove)
desc_bscore_v2_glove = get_avg_bscore(desc_generic_nouns_v2_glove)
desc_bscore_v1_fasttext = get_avg_bscore(desc_generic_nouns_v1_fasttext)
desc_bscore_v2_fasttext = get_avg_bscore(desc_generic_nouns_v2_fasttext)

# exclude self
desc_generic_nouns_v1_glove_exclude_self = torch.load('/home/yu/OneDrive/Construal/data/desc_generic_nouns_v1_glove_exclude_self.pt')
desc_generic_nouns_v2_glove_exclude_self = torch.load('/home/yu/OneDrive/Construal/data/desc_generic_nouns_v2_glove_exclude_self.pt')
desc_generic_nouns_v1_fasttext_exclude_self = torch.load('/home/yu/OneDrive/Construal/data/desc_generic_nouns_v1_fasttext_exclude_self.pt')
desc_generic_nouns_v2_fasttext_exclude_self = torch.load('/home/yu/OneDrive/Construal/data/desc_generic_nouns_v2_fasttext_exclude_self.pt')

desc_bscore_v1_glove_exclude_self = get_avg_bscore(desc_generic_nouns_v1_glove_exclude_self)
desc_bscore_v2_glove_exclude_self = get_avg_bscore(desc_generic_nouns_v2_glove_exclude_self)
desc_bscore_v1_fasttext_exclude_self = get_avg_bscore(desc_generic_nouns_v1_fasttext_exclude_self)
desc_bscore_v2_fasttext_exclude_self = get_avg_bscore(desc_generic_nouns_v2_fasttext_exclude_self)

100%|██████████| 4/4 [00:00<00:00, 29.39it/s]
100%|██████████| 4/4 [00:00<00:00, 125.11it/s]
100%|██████████| 4/4 [00:00<00:00, 120.86it/s]
100%|██████████| 4/4 [00:00<00:00, 213.90it/s]
100%|██████████| 3/3 [00:00<00:00, 117.26it/s]
100%|██████████| 3/3 [00:00<00:00, 88.67it/s]
100%|██████████| 3/3 [00:00<00:00, 87.72it/s]
100%|██████████| 3/3 [00:00<00:00, 55.78it/s]
100%|██████████| 4/4 [00:00<00:00,  5.90it/s]
100%|██████████| 4/4 [00:00<00:00, 34.51it/s]
100%|██████████| 4/4 [00:00<00:00, 13.01it/s]
100%|██████████| 4/4 [00:00<00:00, 20.84it/s]
100%|██████████| 3/3 [00:00<00:00, 27.12it/s]
100%|██████████| 3/3 [00:00<00:00, 22.40it/s]
100%|██████████| 3/3 [00:00<00:00,  7.57it/s]
100%|██████████| 3/3 [00:00<00:00, 10.97it/s]


In [3]:
def combine(inputs):
    pjson = ld('pjson')
    pjson = pjson[(f.category=='Product Design') | (f.category=='Accessories'), ['pid', 'category']]

    for emb_type, bscores in inputs.items():
        # variables with "ex" in their names are excluding itself in searching of
        # the generic form.
        if 'excludeself' in emb_type.split('_'):
            doctype, version, vocab, excludeself = emb_type.split('_')
            for k, bscore in bscores.items():
                pjson[f'gbscore_{doctype}_{version}_{vocab}_k{k}_ex'] = dt.Frame(bscore)
        else:
            doctype, version, vocab = emb_type.split('_')
            for k, bscore in bscores.items():
                pjson[f'gbscore_{doctype}_{version}_{vocab}_k{k}'] = dt.Frame(bscore)

    return pjson

generic_bscore = combine({
    'title_v1_glove': title_bscore_v1_glove,
    'title_v1_glove_excludeself': title_bscore_v1_glove_exclude_self,
    'title_v2_glove': title_bscore_v2_glove,
    'title_v2_glove_excludeself': title_bscore_v2_glove_exclude_self,
    'title_v1_fasttext': title_bscore_v1_fasttext,
    'title_v1_fasttext_excludeself': title_bscore_v1_fasttext_exclude_self,
    'title_v2_fasttext': title_bscore_v2_fasttext,
    'title_v2_fasttext_excludeself': title_bscore_v2_fasttext_exclude_self,
    'desc_v1_glove': desc_bscore_v1_glove,
    'desc_v1_glove_excludeself': desc_bscore_v1_glove_exclude_self,
    'desc_v2_glove': desc_bscore_v2_glove,
    'desc_v2_glove_excludeself': desc_bscore_v2_glove_exclude_self,
    'desc_v1_fasttext': desc_bscore_v1_fasttext,
    'desc_v1_fasttext_excludeself': desc_bscore_v1_fasttext_exclude_self,
    'desc_v2_fasttext': desc_bscore_v2_fasttext,
    'desc_v2_fasttext_excludeself': desc_bscore_v2_fasttext_exclude_self
})



# save data
sv(generic_bscore, 'generic_bscore')  # feather
generic_bscore.to_pandas().to_stata('/home/yu/OneDrive/Construal/data/sharing/generic_bscore.dta')  # stata

"pjson.feather" (167.5 MB) loaded (1s) (2022-07-29 2:06 PM)
Saved as "generic_bscore.feather" (1.2 MB) (<1s) (2022-07-29 2:06 PM)


## Generate examples

Steps:

- Given one project title
- Find out the nouns in the title
- Let's pick one noun, "X", for example.
- Find X's closest neighbors based on GloVe.
- Plot them in a 2D space.

In [6]:
import plotly.graph_objs as go
import torchtext
from datatable import f, fread

# init spacy
import spacy
nlp = spacy.load('en_core_web_sm')

# load word embeddings
glove = torchtext.vocab.GloVe()
# fasttext = torchtext.vocab.FastText()
# charngram = torchtext.vocab.CharNGram()

# get freq dict
freqdict = fread('data/freqdict.csv')
freqdict = {token: freq for (token, freq) in freqdict.to_tuples()}

In [7]:
# get titles
pjson = ld('pjson')
titles = pjson[(f.category=='Product Design') | (f.category=='Accessories'), f.title].to_list()[0]
titles = [t.strip() for t in titles]

# tokenize the titles
titles = list(nlp.pipe(titles[:10]))

# get the nouns in the titles
def get_nouns(docs, keep_first=None):
    '''
    Args:
        keep_first: only keep first N words
    '''
    output_v1 = []
    output_v2 = []
    for doc in tqdm(docs):
        if keep_first is not None:
            doc = doc[0:keep_first]
        nouns_v1 = []
        nouns_v2 = []
        for chunk in doc.noun_chunks:
            # v1
            for token in chunk:
                if token.pos_ in ['NOUN', 'PROPN']:
                    nouns_v1.append(token.text)   
            # v2
            if chunk.root.pos_ in ['NOUN', 'PROPN']:
                nouns_v2.append(chunk.root.text)
        output_v1.append(nouns_v1)
        output_v2.append(nouns_v2)
    return output_v1, output_v2

titles_nouns_v1, titles_nouns_v2 = get_nouns(titles)

"pjson.feather" (167.5 MB) loaded (<1s) (2022-09-22 3:42 PM)


100%|██████████| 10/10 [00:00<00:00, 58991.62it/s]


In [18]:
# given a word, visualize its nearest neighbors
noun = 'parasol'  # carafe, 'parasol'
vocab = glove
device = 'cuda:0'
k = 10
include_self = False
title_nouns = titles_nouns_v1

# get the embedding of the token
generic_nouns = []
vocab_embs = F.normalize(vocab.vectors.to(device), p=2, dim=-1)

# find the nearest neighbors 
noun_emb = vocab[noun].unsqueeze(0).to(device)

with torch.no_grad():
    dist = torch.matmul(vocab_embs, noun_emb.T).squeeze()

generic_noun = {}
if include_self:
    knn_ix = dist.argsort(descending=True)[:(k+1)]
elif not include_self:
    knn_ix = dist.argsort(descending=True)[1:(k+1)]

knn_tokens = [vocab.itos[ix] for ix in knn_ix]
knn_embs = [vocab[token].cpu() for token in knn_tokens]

# final output
tokens = [noun] + knn_tokens
embs = [noun_emb.cpu().squeeze()] + knn_embs
embs = np.stack(embs)
freq = [freqdict.get(token, 0) for token in tokens]
# vis
from sklearn.manifold import TSNE

two_dim = TSNE(n_components=2, random_state=0).fit_transform(embs)[:,:3]

trace = go.Scatter(
    x = two_dim[:,0], 
    y = two_dim[:,1],  
    text = tokens,
    textposition = "top center",
    textfont_size = 20,
    mode = 'markers+text',
    marker = {
        'size': 10,
        'opacity': 0.8,
        'color': ['red'] + ['blue'] * (len(tokens)-1),
    })

go.Figure(data=[trace], layout=go.Layout())
print({token: freq for token, freq in zip(tokens, freq)})


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



{'parasol': 1.45193e-05, 'parasols': 0, 'Parasol': 0, 'umbrellas': 6.26483e-05, 'sunshade': 0, 'chaise': 2.59201e-05, 'lounger': 0, 'umbrella': 0.000182717, 'loungers': 0, 'arbour': 0, 'sunloungers': 0}


# (R) B-score of overlapping projects

In [1]:
suppressMessages({
    library(RPostgres)
    library(haven)
    library(DBI)
    library(utilr)
})

WORK_DIR = '/home/yu/OneDrive/Construal'
setwd(WORK_DIR)

Yu's data science toolbox loaded! 


In [17]:
# get bscore of all projects
pjson = fread('/home/yu/OneDrive/Construal/data/sharing/final_dataset.csv')
pjson = pjson[, .(pid, category, created_at, deadline, bscore_nostopwords, bscore_nostopwords_title, 
                  mni_k100_weighted_normalized, fasttext_cluster_cos_dist_title, fasttext_cluster_cos_dist_desc)]
pjson_key = copy(pjson)[, ':='(start=created_at, end=deadline, 
                               bscore_nostopwords=NULL, bscore_nostopwords_title=NULL,
                               mni_k100_weighted_normalized=NULL, fasttext_cluster_cos_dist_title=NULL, fasttext_cluster_cos_dist_desc=NULL)]
pjson_query = copy(pjson)[, .(pid_query=pid, category_query=category, start_query=created_at, end_query=deadline, 
                              bscore_nostopwords, bscore_nostopwords_title,
                              mni_k100_weighted_normalized, fasttext_cluster_cos_dist_title, fasttext_cluster_cos_dist_desc)]

# match by overlapping
output = pjson_key[pjson_query, on=.(start<=end_query, end>=start_query)
    ][pid!=pid_query
    ][, ':='(overlap_days=as.duration(start-created_at) %>% as.numeric('days') %>% round(1))
    ][order(pid), .(pid, category, pid_query, category_query, created_at, deadline, overlap_days, 
                    bscore_nostopwords, bscore_nostopwords_title, 
                    mni_k100_weighted_normalized, fasttext_cluster_cos_dist_title, 
                    fasttext_cluster_cos_dist_desc)]

# compute within/cross bscore
output = output[, .(category=category[1], created_at=created_at[1], deadline=deadline[1],
                    bscore_nostopwords=mean(bscore_nostopwords, na.rm=T), 
                    bscore_nostopwords_title=mean(bscore_nostopwords_title, na.rm=T),
                    mni_k100_weighted_normalized=mean(mni_k100_weighted_normalized, na.rm=T), 
                    fasttext_cluster_cos_dist_title=mean(fasttext_cluster_cos_dist_title, na.rm=T),
                    fasttext_cluster_cos_dist_desc=mean(fasttext_cluster_cos_dist_desc, na.rm=T)),
                keyby=.(pid, category_query)
    ]
output_wide = output %>% dcast(pid + category + created_at + deadline ~ category_query, 
    value.var=c('bscore_nostopwords', 'bscore_nostopwords_title', 'mni_k100_weighted_normalized', 'fasttext_cluster_cos_dist_title', 'fasttext_cluster_cos_dist_desc'), 
    verbose=T)

# clean variable names
nm = names(output_wide)
setnames(output_wide, nm, str_replace(tolower(nm), ' ', '_'))
output_wide[1]

pid,category,created_at,deadline,bscore_nostopwords_accessories,bscore_nostopwords_product_design,bscore_nostopwords_title_accessories,bscore_nostopwords_title_product_design,mni_k100_weighted_normalized_accessories,mni_k100_weighted_normalized_product_design,fasttext_cluster_cos_dist_title_accessories,fasttext_cluster_cos_dist_title_product_design,fasttext_cluster_cos_dist_desc_accessories,fasttext_cluster_cos_dist_desc_product_design
<int>,<chr>,<dttm>,<dttm>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
160274,Accessories,2015-05-18 20:04:17,2015-06-14 19:00:00,836.0214,,12.17446,,0.5224166,,0.4599312,,0.3220672,


In [22]:
write_dta(output_wide, '~/OneDrive/Construal/data/sharing/within-cross-category-concreteness.dta')