# Text Concreteness (R)

In [2]:
suppressMessages({
    library(spacyr)
    library(quanteda)
})

WORK_DIR = '/home/yu/OneDrive/Construal'
setwd(WORK_DIR)

## Create `dfm`

In [4]:
spacy_initialize(model='en_core_web_sm')

pjson = read_feather('data/v1/pjson.feather') |> setDT()
pjson = pjson[category %in% c('Product Design', 'Accessories')]

spaCy is already initialized



NULL

In [7]:
# create corpus for "project description" and "title"
corpus_desc = pjson[, .(pid, project_desc)] %>%
    corpus(docid_field='pid', text_field='project_desc')
corpus_title = pjson[, .(pid, title)] %>%
    corpus(docid_field='pid', text_field='title')

In [8]:
# tokenize with spacy (2m 46s)
# the results is a data.frame
# 1) keep both "raw" and "lemma" tokens 
# 2) tokens are case-sensitive

tokens_as_df_desc = corpus_desc %>% spacy_parse(pos=F, entity=F)
tokens_as_df_title = corpus_title %>% spacy_parse(pos=F, entity=F)

In [9]:
# convert `tokens_as_df` to quanteda `tokens` object
# 1) we use the lemmatized tokens, because the lookup table is also lemmatized 
# 2) tokens are case-sensitive

tokens_as_qeda_desc = tokens_as_df_desc %>% as.tokens(use_lemma=T)
tokens_as_qeda_title = tokens_as_df_title %>% as.tokens(use_lemma=T)

tokens_as_qeda_title[1]

Tokens consisting of 1 document.
1000117510 :
[1] "Kawaii"  "Animals" "in"      "Mugs"    "Hard"    "Enamel"  "Pins"   


In [10]:
# get the number of tokens of each doc
ntoken_corpus_desc = ntoken(tokens_as_qeda_desc)
ntoken_corpus_desc = data.table(pid=names(ntoken_corpus_desc), ntoken=ntoken_corpus_desc)

ntoken_corpus_title = ntoken(tokens_as_qeda_title)
ntoken_corpus_title = data.table(pid=names(ntoken_corpus_title), ntoken_title=ntoken_corpus_title)

ntoken_corpus_title[1]

pid,ntoken_title
<chr>,<int>
1000117510,7


In [11]:
# Convert tokens to dfm
tokens_to_dfm <- function(tokens_as_qeda, startpos=1, endpos=-1) {
    # select tokens
    tokens = tokens_select(tokens_as_qeda, startpos=startpos, endpos=endpos)
    
    # create ngram
    tokens_ngram = tokens %>%
        tokens_ngrams(n=1:2, concatenator = " ")
    
    # create dfm
    cs_dfm = tokens_ngram %>% dfm(tolower=T, stem=F)
}

cs_dfm_desc = tokens_to_dfm(tokens_as_qeda_desc)
cs_dfm_first200_desc = tokens_to_dfm(tokens_as_qeda_desc, endpos=200)
cs_dfm_title = tokens_to_dfm(tokens_as_qeda_title)

“'stem' is deprecated; use dfm_wordstem() instead”
“'stem' is deprecated; use dfm_wordstem() instead”
“'stem' is deprecated; use dfm_wordstem() instead”


## Compute B-score

In [6]:
# get stopwords from nltk (Python code)
import nltk
import os
import pandas as pd
from utilpy import sv, ld
from nltk.corpus import stopwords
nltk.data.path.append('/home/yu/LocalData/nltk-data')
os.chdir('/home/yu/OneDrive/Construal')

stopwords = set(stopwords.words('english'))
df = pd.DataFrame({'word':list(stopwords)})
sv('df', svname='nltk_stopwords')

Saved as "nltk_stopwords.pkl" (17.0 B) (<1s) (2022-01-12 3:43 PM)


In [4]:
# ------------ Create bscore dict ----------------
suppressMessages(library(utilr))
WORK_DIR = '/home/yu/OneDrive/Construal'
setwd(WORK_DIR)

# load stop word list
ld(nltk_stopwords, force=T)
nltk_stopwords = nltk_stopwords[,word]

# read raw bscore
bscore_dt = fread('/home/yu/OneDrive/Construal/data/concreteness score.csv')[, .(word=str_trim(Word), score=Conc.M)]

# create TWO bscore, one has stopwords, one doesn't 
bscore = bscore_dt$score
names(bscore) = bscore_dt$word

bscore_nostopwords = bscore[!(names(bscore)%in%nltk_stopwords)]

sprintf('%s%% words are stopwords', (1-length(bscore_nostopwords)/length(bscore))*100%>%round(2)) %>% cat()

"nltk_stopwords.feather" (2.5 KB) loaded (0.02 secs) (2022-01-13 1:04 PM)
0.310356910447018% words are stopwords

In [26]:
# ------------ Create bscore from dtm (for desc) ----------------

ld(ntoken_corpus_desc)
ld(ntoken_corpus_title)
ld(cs_dfm_desc, force=T)
ld(cs_dfm_first200_desc, force=T)
ld(cs_dfm_title, force=T)
ld(bscore_nostopwords, force=T)


dfm_to_bscore <- function(cs_dfm, bscore_dict, type_name='') {
    ntoken_name = str_c('ntoken_bscore', type_name)
    ntoken_unique_name = str_c('ntoken_unique', type_name)
    ntoken_bscore_unique_name = str_c('ntoken_bscore_unique', type_name)
    bscore_name = str_c('bscore', type_name)
    
    output_name = c('pid', bscore_name)

    ntoken_unique = ntype(cs_dfm)

    dfm_bscore = dfm_match(cs_dfm, names(bscore_dict))
    ntoken_bscore = ntoken(dfm_bscore)
    ntoken_bscore_unique = ntype(dfm_bscore) 
    ntoken_bscore_dt = data.table(pid=names(ntoken_bscore))
    ntoken_bscore_dt = ntoken_bscore_dt[, (ntoken_name) := ntoken_bscore
        ][, (ntoken_unique_name) := ntoken_unique
        ][, (ntoken_bscore_unique_name) := ntoken_bscore_unique]
    
    
    dfm_bscore_weighted = dfm_weight(dfm_bscore, weights=bscore_dict)
    dfm_bscore_weighted = convert(dfm_bscore_weighted, 'data.frame',
                                  docid_field='pid'
                                 ) %>% as.data.table()
    
    bscore_by_pid = dfm_bscore_weighted[, (bscore_name) := rowSums(.SD),
                                        .SDcols=is.numeric
        ][, ..output_name]
    
    bscore = bscore_by_pid[ntoken_bscore_dt, on=.(pid)]
}

bscore_bypid_desc = dfm_to_bscore(cs_dfm_desc, bscore)

bscore_bypid_nostopwords_desc = dfm_to_bscore(
    cs_dfm_desc, bscore_nostopwords, 
    type_name='_nostopwords')

bscore_bypid_firstn_desc = dfm_to_bscore(
    cs_dfm_first200_desc, bscore, 
    type_name='_first200')

bscore_bypid_firstn_nostopwords_desc = dfm_to_bscore(
    cs_dfm_first200_desc,
    bscore_nostopwords,
    type_name='_first200_nostopwords')

bscore_bypid_title = dfm_to_bscore(
    cs_dfm_title,
    bscore)

bscore_bypid_nostopwords_title = dfm_to_bscore(
    cs_dfm_title,
    bscore_nostopwords,
    type_name='_nostopwords')


ntoken_corpus_desc (607.9 KB) already loaded, will NOT load again! (0 secs) (2022-01-12 4:38 PM)
ntoken_corpus_title (562.3 KB) already loaded, will NOT load again! (0 secs) (2022-01-12 4:38 PM)
"cs_dfm_desc.rds" (119.1 MB) loaded (3.55 secs) (2022-01-12 4:38 PM)
"cs_dfm_first200_desc.rds" (43.9 MB) loaded (1.4 secs) (2022-01-12 4:38 PM)
"cs_dfm_title.rds" (3.1 MB) loaded (0.15 secs) (2022-01-12 4:38 PM)
"bscore_nostopwords.rds" (232.4 KB) loaded (0.03 secs) (2022-01-12 4:38 PM)


In [9]:
# ------------ Create bscore from dtm (for title) ----------------
ld(cs_dfm_title, force=T)
ld(bscore_nostopwords, force=T)
ld(ntoken_corpus_title)

dfm_to_bscore <- function(cs_dfm, bscore_dict, type_name='') {
    ntoken_name = str_c('ntoken_bscore', type_name)
    ntoken_unique_name = str_c('ntoken_unique', type_name)
    ntoken_bscore_unique_name = str_c('ntoken_bscore_unique', type_name)
    bscore_name = str_c('bscore', type_name)
    
    output_name = c('pid', bscore_name)

    ntoken_unique = ntype(cs_dfm)

    dfm_bscore = dfm_match(cs_dfm, names(bscore_dict))
    ntoken_bscore = ntoken(dfm_bscore)
    ntoken_bscore_unique = ntype(dfm_bscore) 
    ntoken_bscore_dt = data.table(pid=names(ntoken_bscore))
    ntoken_bscore_dt = ntoken_bscore_dt[, (ntoken_name) := ntoken_bscore
        ][, (ntoken_unique_name) := ntoken_unique
        ][, (ntoken_bscore_unique_name) := ntoken_bscore_unique]
    
    
    dfm_bscore_weighted = dfm_weight(dfm_bscore, weights=bscore_dict)
    dfm_bscore_weighted = convert(dfm_bscore_weighted, 'data.frame',
                                  docid_field='pid'
                                 ) %>% as.data.table()
    
    bscore_by_pid = dfm_bscore_weighted[, (bscore_name) := rowSums(.SD),
                                        .SDcols=is.numeric
        ][, ..output_name]
    
    bscore = bscore_by_pid[ntoken_bscore_dt, on=.(pid)]
}

bscore_bypid_title = dfm_to_bscore(
    cs_dfm_title,
    bscore)

old_names = names(bscore_bypid_title)[-1]
new_names = str_c(old_names, '_title')
setnames(bscore_bypid_title, old_names, new_names)

bscore_bypid_nostopwords_title = dfm_to_bscore(
    cs_dfm_title,
    bscore_nostopwords,
    type_name='_nostopwords')
    
old_names = names(bscore_bypid_nostopwords_title)[-1]
new_names = str_c(old_names, '_title')
setnames(bscore_bypid_nostopwords_title, old_names, new_names)

"cs_dfm_title.rds" (3.1 MB) loaded (0.13 secs) (2022-01-13 1:06 PM)
"bscore_nostopwords.rds" (232.4 KB) loaded (0.02 secs) (2022-01-13 1:06 PM)
ntoken_corpus_title (562.4 KB) already loaded, will NOT load again! (0 secs) (2022-01-13 1:06 PM)


In [75]:
# --------------- combine all bscore datasets ----------------
bscore_bypid_final = bscore_bypid_desc[bscore_bypid_nostopwords_desc, on=.(pid)
    ][bscore_bypid_firstn_desc, on=.(pid)
    ][bscore_bypid_firstn_nostopwords_desc, on=.(pid)
    ][ntoken_corpus_desc, on=.(pid)
    ][bscore_bypid_nostopwords_title, on=.(pid)
    ][bscore_bypid_title, on=.(pid)
    ][ntoken_corpus_title, on=.(pid)
    ]


sv(bscore_bypid_final, 'bscore_bypid')
fwrite(bscore_bypid_final, './data/bscore_bypid.csv')

"bscore_bypid_final" saved as "bscore_bypid.feather" (3.2 MB) (0.01 secs, 2022-01-12 17:22:48)


# Word Freq (R)

In [12]:
suppressWarnings({
})

wdir = '/home/yu/OneDrive/Construal'
setwd(wdir)

NULL

### Compute the word frequency from the Kickstarer dataset

The result is in `data/v1/google_freqdict.feather`

### Compute the word frequency from the Kickstarer dataset

In [None]:
punct = c(',', '.', '-', '?', '!', '(', ')', '$', '/', ':', ' ', '"', intToUtf8(160))

kck_freqdict_withstopwords = tokens_as_df[, .(doc_id, word=tolower(token))
    ][!(word %in% punct)
    ][, .(n=.N), keyby=.(word)
    ][, ':='(freq_kck_withstop=n/max(n))
    ][order(-freq_kck_withstop), .(word, freq_kck_withstop)]

kck_freqdict_withoutstopwords = tokens_as_df[, .(doc_id, word=tolower(token))
    ][(!(word %in% punct)) & (!(word %in% nltk_stopwords$word))
    ][, .(n=.N), keyby=.(word)
    ][, ':='(freq_kck_withoutstop=n/max(n))
    ][order(-freq_kck_withoutstop), .(word, freq_kck_withoutstop)]

fwrite(kck_freqdict_withstopwords, 'data/Sharing/kck_freqdict_withstopwords.csv')
fwrite(kck_freqdict_withoutstopwords, 'data/Sharing/kck_freqdict_withoutstopwords.csv')

kck_freqdict = kck_freqdict_withoutstopwords[kck_freqdict_withstopwords, on=.(word)
    # ][, ':='(freq_kck_withoutstop=nafill(freq_kck_withoutstop, 'const', 0))
    ][order(-freq_kck_withstop)]

In [None]:
fwrite(kck_freqdict, '../data/word_freq_kick.csv')

In [None]:
sv(kck_freqdict)
fwrite(kck_freqdict, 'data/Sharing/kck_freqdict.csv')

### Merge Kickstar_freq with Google_freq

In [None]:
freq_dict = google_freqdict[kck_freqdict, on=.(word)]
freq_dict = freq_dict[order(-freq_google_withstop)
    ][, ':='(
      top_google_withoutstop=word %in% word[!is.na(freq_google_withoutstop)][1:5000],
      top_google_withstop=word %in% word[!is.na(freq_google_withstop)][1:5000]
      )
    ][order(-freq_kck_withstop)
    ][, ':='(
      top_kck_withoutstop=word %in% word[!is.na(freq_kck_withoutstop)][1:5000],
      top_kck_withstop=word %in% word[!is.na(freq_kck_withstop)][1:5000]
      )]
      
sv(freq_dict)

## Compute Title/Desc Freq Score

Given one title or proj_desc, calculate the freq score defined as: `mean(word_i)` where `word_i` is the freq of each word.

Return:
- `freq_title_withstop`: include stop words
- `freq_title_withoutstop`: exclude stop words
- `freq_desc_withoutstop`: exclude stop words
- `freq_desc_withoutstop`: exclude stop words

In [15]:
# load pjson
pjson = read_feather('data/v1/pjson.feather') |> setDT()

# load freq_dict (include both google freq and kickstarter freq)
freq_dict = read_feather('data/v1/freq_dict.feather') |> setDT()

In [None]:
punct = c(',', '.', '-', '?', '!', '(', ')', '$', '/', ':', ' ', '"', intToUtf8(160))

score = tokens_as_df[,.(doc_id, word=tolower(token))
    ][!(word %in% punct)
    ][freq_dict, 
      on=.(word), nomatch=NULL
    ][, .(score_google_all_withstop=sum(freq_google_withstop, na.rm=T),
          score_google_all_withoutstop=sum(freq_google_withoutstop, na.rm=T),
          score_google_top5000_withstop=sum(freq_google_withstop[top_google_withstop], na.rm=T),
          score_google_top5000_withoutstop=sum(freq_google_withoutstop[top_google_withstop], na.rm=T),
          score_kck_all_withstop=sum(freq_kck_withstop, na.rm=T),
          score_kck_all_withoutstop=sum(freq_kck_withoutstop, na.rm=T),
          score_kck_top5000_withstop=sum(freq_kck_withstop[top_google_withstop], na.rm=T),
          score_kck_top5000_withoutstop=sum(freq_kck_withoutstop[top_google_withstop], na.rm=T),
          n_words_withstop=.N,
          n_words_withoutstop=sum(!(word%in%(nltk_stopwords$word)))
        ),
      keyby=.(doc_id)]

setnames(score, 'doc_id', 'pid')

In [14]:
sv(score)
fwrite(score, 'data/sharing/text_freq_score.csv')

"score" saved as "score.feather" (3.3 MB) (0.01 secs, 2021-10-16 22:54:03)


In [None]:
score %>% names()