# Text Concreteness (R)

## Create `dfm`

In [None]:
library(spacyr)
library(quanteda)

# Start the spaCy backend with the large English model; `save_profile = TRUE`
# asks spacyr to remember this setting for future sessions.
spacy_initialize(model = 'en_core_web_lg',
                 save_profile = TRUE)

# `ld()` is a project helper defined outside this file; presumably it loads
# the `pjson` table from its feather file -- TODO confirm.
ld(pjson, ldtype = 'feather')

In [17]:
# Build a quanteda corpus from the project descriptions: `pid` becomes the
# document id and `project_desc` supplies the text.
corpus <- corpus(pjson[, .(pid, project_desc)],
                 docid_field = 'pid',
                 text_field = 'project_desc')

In [None]:
# Tokenize with spaCy.
# The result is a data.frame with one row per token:
# 1) both the "raw" and the "lemma" form of every token are kept
# 2) tokens are case-sensitive
# POS tagging and entity recognition are switched off; they are not used here.
tokens_as_df <- corpus %>%
    spacy_parse(pos = FALSE, entity = FALSE)

sv(tokens_as_df)
# NOTE(review): this argument-less call looks like a leftover; `sv()` is a
# project helper defined elsewhere, so it is kept as-is -- confirm and remove.
sv()

In [144]:
# Convert `tokens_as_df` to a quanteda `tokens` object.
# 1) the lemmatized tokens are used, because the lookup table is also lemmatized
# 2) tokens are case-sensitive
tokens_as_qeda <- tokens_as_df %>%
    as.tokens(use_lemma = TRUE)

# Peek at the first document as a sanity check.
tokens_as_qeda[1]

sv(tokens_as_qeda)

Tokens consisting of 1 document.
1000064918 :
 [1] "the"     "Beard"   "be"      "a"       "comedy"  "base"    "comic"  
 [8] "about"   "an"      "average" "guy"     "that"   
[ ... and 108 more ]
-tokens_lemmatized- saved  (10.24 secs)


In [None]:
# Token count per document, stored as a (pid, ntoken) table.
doc_lengths <- ntoken(tokens_as_qeda)
ntoken_corpus <- data.table(pid = names(doc_lengths), ntoken = doc_lengths)

# Show the first row, then persist the table.
ntoken_corpus[1]
sv(ntoken_corpus)

In [33]:
# Convert tokens to dfm
# (`ld()` is a project helper defined elsewhere; it is asked to load the
# saved `tokens_as_qeda` object.)
ld(tokens_as_qeda, force = TRUE)

#' Build a lower-cased unigram + bigram dfm from a quanteda tokens object.
#'
#' @param tokens_as_qeda A quanteda `tokens` object.
#' @param startpos First token position to keep in every document
#'   (passed to `tokens_select()`).
#' @param endpos Last token position to keep in every document; the default
#'   `-1` is passed through unchanged to `tokens_select()`.
#' @return A quanteda `dfm` whose features are 1- and 2-grams.
tokens_to_dfm <- function(tokens_as_qeda, startpos = 1, endpos = -1) {
    # Restrict every document to the requested token window.
    selected <- tokens_select(tokens_as_qeda,
                              startpos = startpos, endpos = endpos)

    # Unigrams and bigrams; bigram parts are joined with a single space
    # (presumably so two-word entries of the lookup table can match).
    ngrams <- tokens_ngrams(selected, n = 1:2, concatenator = " ")

    # `stem` is intentionally not passed: no stemming is the default, and the
    # argument is deprecated on dfm() in newer quanteda releases.
    # The dfm is the last expression so it is returned visibly.
    dfm(ngrams, tolower = TRUE)
}

# Two feature matrices: one over the full token window, one that stops at
# token position 200 of each document.
cs_dfm <- tokens_to_dfm(tokens_as_qeda)
cs_dfm_first200 <- tokens_to_dfm(tokens_as_qeda, startpos = 1, endpos = 200)

# sv(cs_dfm)
sv(cs_dfm_first200)

tokens_as_qeda (47.5 MB) already loaded, will NOT load again! (0 secs) (2021-03-07 5:55 PM)
"cs_dfm_first200" saved as "cs_dfm_first200.rds" (43.1 MB) (7.55 secs, 2021-03-07 17:56:10)


## Compute B-score

In [4]:
# get stopwords from nltk (Python code)
import nltk
import pandas as pd
from nltk.corpus import stopwords

# De-duplicate the English stopword list.
stopwords = set(stopwords.words('english'))
# Sort before building the frame: iterating a set of strings has no stable
# order across interpreter runs, which would make the saved file differ
# from run to run.
df = pd.DataFrame({'word': sorted(stopwords)})
# `sv` is a project helper defined elsewhere; it is given the variable name
# and the name to save under.
sv('df', svname='nltk_stopwords')

"df" saved as "nltk_stopwords.feather" (2.5 KB) (<1s)


In [2]:
# ------------ Create bscore dict ----------------
# load stop word list and keep just the `word` column as a character vector
ld(nltk_stopwords, force = TRUE)
nltk_stopwords <- nltk_stopwords[, word]

# read raw bscore: trimmed word plus its mean concreteness rating (Conc.M)
bscore_dt <- fread('data/concreteness score.csv')[
    , .(word = str_trim(Word), score = Conc.M)]

# create TWO bscore lookups (named numeric vectors: name = word, value = score),
# one has stopwords, one doesn't
bscore <- bscore_dt$score
names(bscore) <- bscore_dt$word

bscore_nostopwords <- bscore[!(names(bscore) %in% nltk_stopwords)]

# NOTE(review): a later cell calls ld(bscore_nostopwords), but nothing here
# saves it -- confirm whether an sv(bscore_nostopwords) call went missing.

# Share of dictionary words that are stopwords, in percent.
# round() must wrap the whole expression: `%>%` binds tighter than `*`, so
# `x * 100 %>% round(2)` only rounds the literal 100 and prints x unrounded.
stopword_pct <- round(
    (1 - length(bscore_nostopwords) / length(bscore)) * 100, 2)
cat(sprintf('%s%% words are stopwords', stopword_pct))

"nltk_stopwords.feather" (2.5 KB) loaded (0 secs) (2021-03-07 6:00 PM)
"bscore_nostopwords" saved as "bscore_nostopwords.rds" (232.4 KB) (0.04 secs, 2021-03-07 18:00:19)
0.310356910447018% words are stopwords

In [72]:
# ------------ Create bscore from dfm ----------------

# Load the inputs of the scoring step via the project helper `ld()`.
ld(ntoken_corpus, force = TRUE)
ld(cs_dfm)
ld(cs_dfm_first200)
ld(bscore_nostopwords)


#' Sum dictionary scores per document.
#'
#' @param cs_dfm A quanteda `dfm`; its document names are used as `pid`.
#' @param bscore_dict Named numeric vector; names are dictionary entries,
#'   values are their scores.
#' @param type_name Suffix appended to the two output column names.
#' @return A data.table with one row per document and the columns `pid`,
#'   `bscore<type_name>` (sum of count * score over the matched features) and
#'   `ntoken_bscore<type_name>` (number of matched feature occurrences).
dfm_to_bscore <- function(cs_dfm, bscore_dict, type_name = '') {
    ntoken_name <- str_c('ntoken_bscore', type_name)
    bscore_name <- str_c('bscore', type_name)

    # Keep exactly the features that appear in the dictionary.
    dfm_bscore <- dfm_match(cs_dfm, names(bscore_dict))
    ntoken_bscore <- ntoken(dfm_bscore)

    # Multiply every feature count by that feature's score.
    dfm_bscore_weighted <- dfm_weight(dfm_bscore, weights = bscore_dict)

    # Take row sums directly on the sparse matrix. Converting the dfm to a
    # dense data.frame first (one column per dictionary entry) only to sum it
    # costs a lot of memory and time on a large corpus.
    score_by_doc <- Matrix::rowSums(dfm_bscore_weighted)

    # Both vectors follow the dfm's row order, so they line up without a join.
    bscore_by_pid <- data.table(
        pid = names(ntoken_bscore),
        score = unname(score_by_doc),
        n_matched = unname(ntoken_bscore)
    )
    setnames(bscore_by_pid,
             c('score', 'n_matched'),
             c(bscore_name, ntoken_name))

    # Last expression is the table itself, so it is returned visibly.
    bscore_by_pid
}

# Score each dfm (full window / window ending at position 200) against each
# dictionary (with / without stopwords). The suffix keeps the four results'
# column names distinct.
bscore_bypid <- dfm_to_bscore(cs_dfm, bscore)
bscore_bypid_nostopwords <- dfm_to_bscore(
    cs_dfm, bscore_nostopwords,
    type_name = '_nostopwords'
)
bscore_bypid_firstn <- dfm_to_bscore(
    cs_dfm_first200, bscore,
    type_name = '_first200'
)
bscore_bypid_firstn_nostopwords <- dfm_to_bscore(
    cs_dfm_first200, bscore_nostopwords,
    type_name = '_first200_nostopwords'
)

"ntoken_corpus.rds" (282.9 KB) loaded (0.02 secs) (2021-03-08 2:28 AM)
cs_dfm (116.8 MB) already loaded, will NOT load again! (0 secs) (2021-03-08 2:28 AM)
cs_dfm_first200 (43.1 MB) already loaded, will NOT load again! (0 secs) (2021-03-08 2:28 AM)
bscore_nostopwords (232.4 KB) already loaded, will NOT load again! (0 secs) (2021-03-08 2:28 AM)


In [73]:
# Combine the four score tables and the per-document token counts into one
# table, joining on `pid` one table at a time.
bscore_bypid <- bscore_bypid[bscore_bypid_nostopwords, on = 'pid']
bscore_bypid <- bscore_bypid[bscore_bypid_firstn, on = 'pid']
bscore_bypid <- bscore_bypid[bscore_bypid_firstn_nostopwords, on = 'pid']
bscore_bypid <- bscore_bypid[ntoken_corpus, on = 'pid']

sv(bscore_bypid)

"bscore_bypid" saved as "bscore_bypid.feather" (1.7 MB) (0.01 secs, 2021-03-08 02:28:57)


In [74]:
# List the columns of the combined table.
colnames(bscore_bypid)

# Word Freq (Py)

In [35]:
# Load data

import datatable as dt
import pyarrow.feather as feather
from datatable import update, f, join, by

# Read only the two needed columns of the token table.
tokens_as_df = feather.read_feather(
    'data/tokens_as_df.feather',
    columns=['doc_id', 'token'],
)
# Add a lower-cased copy of each token, then move to a datatable Frame
# holding just the document id and that lower-cased word.
tokens_as_df = tokens_as_df.assign(word=tokens_as_df['token'].str.lower())
tokens_as_df = dt.Frame(tokens_as_df)[:, [f.doc_id, f.word]]

# Frequency lookup table; keying it on `word` is what lets join() use it.
freqdict = dt.fread('data/freqdict.csv')
freqdict.key = 'word'

In [36]:
# compute freq
# Step 1: attach each token's `freq` from the keyed lookup table.
tokens_with_freq = tokens_as_df[:, :, join(freqdict)]
# Step 2: per document, sum the frequencies and count the token rows.
result = tokens_with_freq[
    :,
    {'score': dt.sum(f.freq), 'n_words': dt.count()},
    by(f.doc_id),
]

In [40]:
# Persist the per-document scores first, then leave `result` as the cell's
# final expression: under default notebook settings only the last expression
# of a cell is rendered, so a bare `result` on an earlier line shows nothing.
feather.write_feather(result.to_pandas(), 'data/result.feather')
result

Unnamed: 0_level_0,doc_id,score,n_words
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪▪▪▪▪
0,1000064918,16.5642,120
1,1000081649,51.0863,407
2,1000103948,112.149,773
3,1000117510,21.1289,335
4,1000201265,129.156,1013
5,1000234595,99.2084,768
6,1000291263,61.2186,471
7,1000335422,156.628,1438
8,1000426032,108.298,959
9,1000450327,65.6153,668
