# Text Concreteness (R)

## Create `dfm`

In [None]:
library(spacyr)
library(quanteda)

spacy_initialize(model='en_core_web_lg',
                 save_profile = T)

ld(pjson, ldtype='feather')

In [17]:
# create corpus
corpus = pjson[, .(pid, project_desc)] %>%
    corpus(docid_field='pid', text_field='project_desc')

In [None]:
# tokenize with spacy
# the results is a data.frame
# 1) keep both "raw" and "lemma" tokens 
# 2) tokens are case-sensitive
tokens_as_df = corpus %>%
    spacy_parse(pos=F, entity=F)

sv(tokens_as_df)
sv()

In [144]:
# convert `tokens_as_df` to quanteda `tokens` object
# 1) we use the lemmatized tokens, because the lookup table is also lemmatized 
# 2) tokens are case-sensitive
tokens_as_qeda = tokens_as_df %>%
    as.tokens(use_lemma=T)

tokens_as_qeda[1]

sv(tokens_as_qeda)

Tokens consisting of 1 document.
1000064918 :
 [1] "the"     "Beard"   "be"      "a"       "comedy"  "base"    "comic"  
 [8] "about"   "an"      "average" "guy"     "that"   
[ ... and 108 more ]
-tokens_lemmatized- saved  (10.24 secs)


In [None]:
# get the number of tokens of each doc
ntoken_corpus = ntoken(tokens_as_qeda)
ntoken_corpus = data.table(pid=names(ntoken_corpus), ntoken=ntoken_corpus)
ntoken_corpus[1]
sv(ntoken_corpus)

In [33]:
# Convert tokens to dfm
ld(tokens_as_qeda, force=T)

tokens_to_dfm <- function(tokens_as_qeda, startpos=1, endpos=-1) {
    # select tokens
    tokens = tokens_select(tokens_as_qeda, startpos=startpos, endpos=endpos)
    
    # create ngram
    tokens_ngram = tokens %>%
        tokens_ngrams(n=1:2, concatenator = " ")
    
    # create dfm
    cs_dfm = tokens_ngram %>% dfm(tolower=T, stem=F)
}

cs_dfm = tokens_to_dfm(tokens_as_qeda)
cs_dfm_first200 = tokens_to_dfm(tokens_as_qeda, endpos=200)

# sv(cs_dfm)
sv(cs_dfm_first200)

tokens_as_qeda (47.5 MB) already loaded, will NOT load again! (0 secs) (2021-03-07 5:55 PM)
"cs_dfm_first200" saved as "cs_dfm_first200.rds" (43.1 MB) (7.55 secs, 2021-03-07 17:56:10)


## Compute B-score

In [4]:
# get stopwords from nltk (Python code)
import nltk
import pandas as pd
from nltk.corpus import stopwords

stopwords = set(stopwords.words('english'))
df = pd.DataFrame({'word':list(stopwords)})
sv('df', svname='nltk_stopwords')

"df" saved as "nltk_stopwords.feather" (2.5 KB) (<1s)


In [2]:
# ------------ Create bscore dict ----------------
# load stop word list
ld(nltk_stopwords, force=T)
nltk_stopwords = nltk_stopwords[,word]

# read raw bscore
bscore_dt = fread('data/concreteness score.csv')[, .(word=str_trim(Word), score=Conc.M)]

# create TWO bscore, one has stopwords, one doesn't 
bscore = bscore_dt$score
names(bscore) = bscore_dt$word

bscore_nostopwords = bscore[!(names(bscore)%in%nltk_stopwords)]

sprintf('%s%% words are stopwords', (1-length(bscore_nostopwords)/length(bscore))*100%>%round(2)) %>% cat()

"nltk_stopwords.feather" (2.5 KB) loaded (0 secs) (2021-03-07 6:00 PM)
"bscore_nostopwords" saved as "bscore_nostopwords.rds" (232.4 KB) (0.04 secs, 2021-03-07 18:00:19)
0.310356910447018% words are stopwords

In [72]:
# ------------ Create bscore from dtm ----------------

ld(ntoken_corpus, force=T)
ld(cs_dfm)
ld(cs_dfm_first200)
ld(bscore_nostopwords)


dfm_to_bscore <- function(cs_dfm, bscore_dict, type_name='') {
    ntoken_name = str_c('ntoken_bscore', type_name)
    bscore_name = str_c('bscore', type_name)
    
    output_name = c('pid', bscore_name)
    
    dfm_bscore = dfm_match(cs_dfm, names(bscore_dict))
    ntoken_bscore = ntoken(dfm_bscore)
    ntoken_bscore_dt = data.table(pid=names(ntoken_bscore))
    ntoken_bscore_dt[, (ntoken_name) := ntoken_bscore]
    
    
    dfm_bscore_weighted = dfm_weight(dfm_bscore, weights=bscore_dict)
    dfm_bscore_weighted = convert(dfm_bscore_weighted, 'data.frame',
                                  docid_field='pid'
                                 ) %>% as.data.table()
    
    bscore_by_pid = dfm_bscore_weighted[, (bscore_name) := rowSums(.SD),
                                        .SDcols=is.numeric
        ][, ..output_name]
    
    bscore = bscore_by_pid[ntoken_bscore_dt, on=.(pid)]
}

bscore_bypid = dfm_to_bscore(cs_dfm, bscore)
bscore_bypid_nostopwords = dfm_to_bscore(cs_dfm, bscore_nostopwords, 
                                         type_name='_nostopwords')
bscore_bypid_firstn = dfm_to_bscore(cs_dfm_first200, bscore,
                                    type_name='_first200')
bscore_bypid_firstn_nostopwords = dfm_to_bscore(cs_dfm_first200,
                                                bscore_nostopwords,
                                                type_name='_first200_nostopwords')

"ntoken_corpus.rds" (282.9 KB) loaded (0.02 secs) (2021-03-08 2:28 AM)
cs_dfm (116.8 MB) already loaded, will NOT load again! (0 secs) (2021-03-08 2:28 AM)
cs_dfm_first200 (43.1 MB) already loaded, will NOT load again! (0 secs) (2021-03-08 2:28 AM)
bscore_nostopwords (232.4 KB) already loaded, will NOT load again! (0 secs) (2021-03-08 2:28 AM)


In [73]:
bscore_bypid = bscore_bypid[bscore_bypid_nostopwords, on=.(pid)
    ][bscore_bypid_firstn, on=.(pid)
    ][bscore_bypid_firstn_nostopwords, on=.(pid)
    ][ntoken_corpus, on=.(pid)]

sv(bscore_bypid)

"bscore_bypid" saved as "bscore_bypid.feather" (1.7 MB) (0.01 secs, 2021-03-08 02:28:57)


In [74]:
names(bscore_bypid)

# Word Freq (R)

## Compute freq_dict

Compute the word frequency from the Google dataset

In [110]:
ld(tokens_as_df)
ld(nltk_stopwords)

google_freqdict_withstop = fread('data/freqdict.csv')

tokens_as_df (462.3 MB) already loaded, will NOT load again! (0 secs) (2021-04-06 7:35 PM)
nltk_stopwords (2.5 KB) already loaded, will NOT load again! (0 secs) (2021-04-06 7:35 PM)


In [135]:
google_freqdict = google_freqdict_withstop[
      !(word %in% nltk_stopwords$word),
    ][, .(word, freq_google_withoutstop=freq/max(freq))
    ][google_freqdict_withstop[, .(word, freq_google_withstop=freq)], 
      on=.(word)
    # ][, ':='(freq_google_withoutstop=nafill(freq_google_withoutstop, 'const', 0))
    ][order(-freq_google_withstop)]

google_freqdict[1:2]

word,freq_google_withoutstop,freq_google_withstop
<chr>,<dbl>,<dbl>
the,,1.0
of,,0.5684659


In [169]:
sv(google_freqdict)
fwrite(google_freqdict, 'data/Sharing/google_freqdict.csv')

"google_freqdict" saved as "google_freqdict.feather" (1.2 MB) (0.01 secs, 2021-04-06 23:43:00)


Compute the word frequency from the Kickstarer dataset

In [136]:
punct = c(',', '.', '-', '?', '!', '(', ')', '$', '/', ':', ' ', '"', intToUtf8(160))

kck_freqdict_withstopwords = tokens_as_df[, .(doc_id, word=tolower(token))
    ][!(word %in% punct)
    ][, .(n=.N), keyby=.(word)
    ][, ':='(freq_kck_withstop=n/max(n))
    ][order(-freq_kck_withstop), .(word, freq_kck_withstop)]

kck_freqdict_withoutstopwords = tokens_as_df[, .(doc_id, word=tolower(token))
    ][(!(word %in% punct)) & (!(word %in% nltk_stopwords$word))
    ][, .(n=.N), keyby=.(word)
    ][, ':='(freq_kck_withoutstop=n/max(n))
    ][order(-freq_kck_withoutstop), .(word, freq_kck_withoutstop)]

fwrite(kck_freqdict_withstopwords, 'data/Sharing/kck_freqdict_withstopwords.csv')
fwrite(kck_freqdict_withoutstopwords, 'data/Sharing/kck_freqdict_withoutstopwords.csv')

kck_freqdict = kck_freqdict_withoutstopwords[kck_freqdict_withstopwords, on=.(word)
    # ][, ':='(freq_kck_withoutstop=nafill(freq_kck_withoutstop, 'const', 0))
    ][order(-freq_kck_withstop)]

In [168]:
sv(kck_freqdict)
fwrite(kck_freqdict, 'data/Sharing/kck_freqdict.csv')

"kck_freqdict" saved as "kck_freqdict.feather" (9 MB) (0.35 secs, 2021-04-06 23:38:29)


In [None]:
kck_freqdict

Merge Kickstar_freq with Google_freq

In [153]:
freq_dict = google_freqdict[kck_freqdict, on=.(word)]
freq_dict = freq_dict[order(-freq_google_withstop)
    ][, ':='(
      top_google_withoutstop=word %in% word[!is.na(freq_google_withoutstop)][1:5000],
      top_google_withstop=word %in% word[!is.na(freq_google_withstop)][1:5000]
      )
    ][order(-freq_kck_withstop)
    ][, ':='(
      top_kck_withoutstop=word %in% word[!is.na(freq_kck_withoutstop)][1:5000],
      top_kck_withstop=word %in% word[!is.na(freq_kck_withstop)][1:5000]
      )]
      
sv(freq_dict)

"freq_dict" saved as "freq_dict.feather" (9.8 MB) (0.35 secs, 2021-04-06 20:09:14)


## Compute score

Compute the "frequency score"

- top 5000 words or all
- with/without stopwords
- based on kickstarter or LVIS

So there're 2 * 2 * 2=8 versions

In [155]:
ld(freq_dict, force=T)
freq_dict[1]

"freq_dict.feather" (9.8 MB) loaded (0.19 secs) (2021-04-06 8:09 PM)


word,freq_google_withoutstop,freq_google_withstop,freq_kck_withoutstop,freq_kck_withstop,top_google_withoutstop,top_google_withstop,top_kck_withoutstop,top_kck_withstop
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<lgl>,<lgl>,<lgl>
the,,1,,1,False,True,False,True


In [None]:
punct = c(',', '.', '-', '?', '!', '(', ')', '$', '/', ':', ' ', '"', intToUtf8(160))

score = tokens_as_df[,.(doc_id, word=tolower(token))
    ][!(word %in% punct)
    ][freq_dict, 
      on=.(word), nomatch=NULL
    ][, .(score_google_all_withstop=sum(freq_google_withstop, na.rm=T),
          score_google_all_withoutstop=sum(freq_google_withoutstop, na.rm=T),
          score_google_top5000_withstop=sum(freq_google_withstop[top_google_withstop], na.rm=T),
          score_google_top5000_withoutstop=sum(freq_google_withoutstop[top_google_withstop], na.rm=T),
          score_kck_all_withstop=sum(freq_kck_withstop, na.rm=T),
          score_kck_all_withoutstop=sum(freq_kck_withoutstop, na.rm=T),
          score_kck_top5000_withstop=sum(freq_kck_withstop[top_google_withstop], na.rm=T),
          score_kck_top5000_withoutstop=sum(freq_kck_withoutstop[top_google_withstop], na.rm=T),
          n_words_withstop=.N,
          n_words_withoutstop=sum(!(word%in%(nltk_stopwords$word)))
        ),
      keyby=.(doc_id)]

In [164]:
sv(score)
fwrite(score, 'data/Sharing/text_freq_score.csv')

"score" saved as "score.feather" (3.3 MB) (0.01 secs, 2021-04-06 23:32:40)


In [170]:
score

doc_id,score_google_all_withstop,score_google_all_withoutstop,score_google_top5000_withstop,score_google_top5000_withoutstop,score_kck_all_withstop,score_kck_all_withoutstop,score_kck_top5000_withstop,score_kck_top5000_withoutstop,n_words_withstop,n_words_withoutstop
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>
1000064918,16.564182,4.598660,16.561525,4.559029,19.778298,6.016741,19.759641,5.809680,112,56
1000081649,51.086338,17.597705,51.079939,17.502268,60.843685,19.767471,60.579193,16.831987,371,182
1000103948,112.149320,29.849668,112.132549,29.599549,124.743338,32.240279,124.365149,28.042923,728,371
1000117510,21.128929,10.256774,21.123576,10.176936,24.905090,14.126331,24.661630,11.424276,302,203
1000201265,129.155865,36.138959,129.125064,35.679576,140.784441,43.173963,140.345152,38.298483,861,540
1000234595,99.208358,29.004745,99.193653,28.785436,112.382945,30.236821,111.778967,23.622939,705,378
1000291263,61.218582,15.533275,61.210438,15.411811,70.420722,20.273904,70.020829,15.835664,419,207
1000335422,156.627975,61.134812,156.584620,60.488197,180.241515,73.519672,178.746441,56.926494,1254,730
1000426032,108.298016,34.991093,108.283498,34.774564,129.255283,35.537313,128.515083,27.322162,869,436
1000450327,65.615291,34.131709,65.603515,33.956070,78.465632,35.132424,77.754470,27.239544,601,361
