# Init

In [None]:
# if you've not installed my toolbox `utilr`, please install through:
# devtools::install_github('xiaomowu/utilr')
# then `library(utilr)`q
library(utilr)
library(jsonlite)
library(quanteda)

quanteda_options(threads = 32) # On your laptop, you probably want to set it to 4 or 8.

setwd('~/OneDrive/Construal')

# Parse JSON

## get `project_ids` and `project_dirs`

In [3]:
data_root_dir = './data/Kickstarter Data/'

project_ids = list.dirs(data_root_dir, full.names=F, recursive=F)

project_dirs = str_c(data_root_dir, project_ids, '/')

## parse JSON

In [None]:
pjson = vector(mode = "list", length=length(project_ids))

for (i in 1:length(project_ids)) {
    pid = project_ids[i]
    pdir = project_dirs[i]
    
    tryCatch({
        json_path = sprintf('%s/%s.json', pdir, pid)
        json = fromJSON(json_path)
        json$pledge_money = str_c(json$pledge_money, collapse=',')
        json$pledge_count = str_c(json$pledge_count, collapse=',')
        pjson[[pid]] = json
    }, error=function(cond) {
        message(sprintf('Error: %s', json_path))
        NULL
    })
}

pjson = rbindlist(pjson, use.names=T, idcol='pid') %>% unique()

In [11]:
write_feather(pjson, './data/pjson.feather', version = 2)

In [13]:
success_pct = round(pjson[, .N]/length(project_ids)*100, 2)
message(sprintf('%s%% projects have been successfully parsed.', success_pct))

99.97% projects have been successfully parsed.



In [9]:
pjson[1, .(pid, pledge_count)]

pid,pledge_count
<chr>,<chr>
1000064918,92400


# Parse HTML

## get `project_ids` and `project_dirs`

In [11]:
data_root_dir = './data/Kickstarter Data/'

project_ids = list.dirs(data_root_dir, full.names=F, recursive=F)

project_dirs = str_c(data_root_dir, project_ids, '/')

In [57]:
# risk
# the json file already captured the risk portion
risk = getNodeSet(parsed_html, '//div[@class="mb3 mb10-sm mb3 js-risks"]//p') %>%
    xmlValue() %>%
    str_c(collapse='\n')

# cat(sprintf('[risk]:\n%s\n', risk))
# cat('---------------------------\n')

In [295]:
# get parsed_html
pid = 1649873594
html_page = sprintf("C:/Users/rossz/OneDrive/Construal/data/Kickstarter Data/%s/%s.html", pid, pid)
parsed_html = htmlParse(html_page)


split_team_and_project <- function(pid,
    parsed_html, 
    bold_title_xpath,
    pos_team_xpath) {
    
    # possible team_titles
    team_titles = c("Who's on the team?", "The Team", "About Us", "About the Artist", "Meet the team", "Who we are", "Meet the creators", 
                    "Who's involved", "Who is involved") %>% tolower()
    
    # root node for proj_desc and team_desc
    root = xmlDoc(parsed_html['//div[@class="full-description js-full-description responsive-media formatted-lists"]'][[1]])
    
    # all the <p> under the root
    all_p = root['//p']
    
    # set default team_desc/proj_desc
    team_desc = NA
    proj_desc = getNodeSet(parsed_html, '//div[@class="full-description js-full-description responsive-media formatted-lists"]//text()') %>%
        xmlValue() %>%
        str_c(collapse='\n') %>%
        str_trim()
    success_counter = 0
    
    # print(proj_desc)
    
    # find out all bold titles
    # convert them to lower case
    bold_titles = getNodeSet(root, bold_title_xpath) %>%
        xmlValue() %>%
        str_trim() %>%
        tolower()
    
    # print(bold_titles)
    
    # loop over every possible team_title
    for (team_title in team_titles) {
        idx = match(team_title, bold_titles)
        
        # if successfully finds team_title, output BOTH team_description and project_descriptoin;
        # else, pased_html as project_description
        if (is.na(idx)) {
            next
        } else {
            success_counter = success_counter + 1
            
            team_title_next = bold_titles[idx+1]
            
#             print(team_title)
#             print(team_title_next)
            
            
           
            pos_team_start = getNodeSet(all_p, sprintf(pos_team_xpath, team_title))

            pos_team_end = getNodeSet(all_p, sprintf(pos_team_xpath, team_title_next))

            # extract team_description
            team_desc = getNodeSet(all_p, sprintf('//p[position()>%s and position()<%s]', pos_team_start, pos_team_end)) %>%
                xmlValue() %>%
                str_c(collapse='\n') %>%  
                str_trim()
            
#             print(pos_team_start)
#             print(pos_team_end)
            

            
            # extract project_description
            proj_desc = getNodeSet(all_p, sprintf('//p[position()<=%s or position()>=%s]', pos_team_start, pos_team_end)) %>%
                xmlValue() %>%
                str_c(collapse='\n') %>%
                str_trim()
        }
    }
    
    # log message if more than one team_description have been found
    if (success_counter>1) {
        message(sprintf('More than one team_description have been found (%s).', parsed_html))
    }
    return(list(pid=pid, team_desc=team_desc, proj_desc=proj_desc))
}

# h1 = split_team_and_project(
#     pid,
#     parsed_html,
#     bold_title_xpath='//div[@class="full-description js-full-description responsive-media formatted-lists"]//h1',
#     pos_team_xpath='count(//div[@class="full-description js-full-description responsive-media formatted-lists"]//h1[contains(lower-case(text()),"%s")]/preceding-sibling::p)+1')
# print(h1)

b = split_team_and_project(
    pid,
    parsed_html,
    bold_title_xpath='//p[count(./b)=1]//b',
    pos_team_xpath='count(//p[contains(lower-case(text()),"%s")]/preceding-sibling::p)+1')
# b

[1] "enamel pin product specs:" "about us:"                
[3] "shipping:"                


In [365]:
all_p[[1]]
cat('---------')
str(all_p[[1]])

<p>Floral Tiger is part of our Floral Endangered Animals pin series, featuring these majestic cats! Choose between Orange Bengal or White Tiger, or both!</p> 

---------Classes 'XMLInternalElementNode', 'XMLInternalNode', 'XMLAbstractNode' <externalptr> 


In [None]:
# get parsed_html
pid = 1649873594
html_page = sprintf("C:/Users/rossz/OneDrive/Construal/data/Kickstarter Data/%s/%s.html", pid, pid)
parsed_html = htmlParse(html_page)

root = parsed_html['//div[@class="full-description js-full-description responsive-media formatted-lists"]'][[1]]

root_children = xmlChildren(root) # I shouldn't use `xmlChildren` 
length(root_children)

# root_children[[2]] %>% class()
# root_children[[2]]
cat('------------\n')

getNodeSet(root, './/p[count(.//b)=1 and contains(.//b, "About")]/preceding::*')

In [None]:
root = xmlDoc(parsed_html['//div[@class="full-description js-full-description responsive-media formatted-lists"]'][[1]])

pos_team_start = getNodeSet(root, sprintf('count(//p[contains(lower-case(.//b/text()),"%s")]/preceding-sibling::*)+1', 'about'))
pos_team_start

names(root)

In [415]:
class(parsed_html['//div[@class="full-description js-full-description responsive-media formatted-lists"]'][1][[1]])

In [None]:
all_p = getNodeSet(parsed_html, '//div[@class="full-description js-full-description responsive-media formatted-lists"]//p')

getNodeSet(all_p[[1]], './/text()')

In [None]:
getNodeSet(parsed_html, 
           sprintf('//div[@class="full-description js-full-description responsive-media formatted-lists"]//p[contains(lower-case(./b/text()),"%s")]', 'shipping'))

# Text Concreteness

## Create `dfm`

In [None]:
library(spacyr)
library(quanteda)

spacy_initialize(model='en_core_web_lg',
                 save_profile = T)

ld(pjson, ldtype='feather')

In [17]:
# create corpus
corpus = pjson[, .(pid, project_desc)] %>%
    corpus(docid_field='pid', text_field='project_desc')

In [None]:
# tokenize with spacy
# the results is a data.frame
# 1) keep both "raw" and "lemma" tokens 
# 2) tokens are case-sensitive
tokens_as_df = corpus %>%
    spacy_parse(pos=F, entity=F)

sv(tokens_as_df)

In [144]:
# convert `tokens_as_df` to quanteda `tokens` object
# 1) we use the lemmatized tokens, because the lookup table is also lemmatized 
# 2) tokens are case-sensitive
tokens_as_qeda = tokens_as_df %>%
    as.tokens(use_lemma=T)

tokens_as_qeda[1]

sv(tokens_as_qeda)

Tokens consisting of 1 document.
1000064918 :
 [1] "the"     "Beard"   "be"      "a"       "comedy"  "base"    "comic"  
 [8] "about"   "an"      "average" "guy"     "that"   
[ ... and 108 more ]
-tokens_lemmatized- saved  (10.24 secs)


In [None]:
# get the number of tokens of each doc
ntoken_corpus = ntoken(tokens_as_qeda)
ntoken_corpus = data.table(pid=names(ntoken_corpus), ntoken=ntoken_corpus)
ntoken_corpus[1]
sv(ntoken_corpus)

In [33]:
# Convert tokens to dfm
ld(tokens_as_qeda, force=T)

tokens_to_dfm <- function(tokens_as_qeda, startpos=1, endpos=-1) {
    # select tokens
    tokens = tokens_select(tokens_as_qeda, startpos=startpos, endpos=endpos)
    
    # create ngram
    tokens_ngram = tokens %>%
        tokens_ngrams(n=1:2, concatenator = " ")
    
    # create dfm
    cs_dfm = tokens_ngram %>% dfm(tolower=T, stem=F)
}

cs_dfm = tokens_to_dfm(tokens_as_qeda)
cs_dfm_first200 = tokens_to_dfm(tokens_as_qeda, endpos=200)

# sv(cs_dfm)
sv(cs_dfm_first200)

tokens_as_qeda (47.5 MB) already loaded, will NOT load again! (0 secs) (2021-03-07 5:55 PM)
"cs_dfm_first200" saved as "cs_dfm_first200.rds" (43.1 MB) (7.55 secs, 2021-03-07 17:56:10)


## Compute B-score

In [4]:
# get stopwords from nltk (Python code)
import nltk
import pandas as pd
from nltk.corpus import stopwords

stopwords = set(stopwords.words('english'))
df = pd.DataFrame({'word':list(stopwords)})
sv('df', svname='nltk_stopwords')

"df" saved as "nltk_stopwords.feather" (2.5 KB) (<1s)


In [2]:
# ------------ Create bscore dict ----------------
# load stop word list
ld(nltk_stopwords, force=T)
nltk_stopwords = nltk_stopwords[,word]

# read raw bscore
bscore_dt = fread('data/concreteness score.csv')[, .(word=str_trim(Word), score=Conc.M)]

# create TWO bscore, one has stopwords, one doesn't 
bscore = bscore_dt$score
names(bscore) = bscore_dt$word

bscore_nostopwords = bscore[!(names(bscore)%in%nltk_stopwords)]

sprintf('%s%% words are stopwords', (1-length(bscore_nostopwords)/length(bscore))*100%>%round(2)) %>% cat()

"nltk_stopwords.feather" (2.5 KB) loaded (0 secs) (2021-03-07 6:00 PM)
"bscore_nostopwords" saved as "bscore_nostopwords.rds" (232.4 KB) (0.04 secs, 2021-03-07 18:00:19)
0.310356910447018% words are stopwords

In [72]:
# ------------ Create bscore from dtm ----------------

ld(ntoken_corpus, force=T)
ld(cs_dfm)
ld(cs_dfm_first200)
ld(bscore_nostopwords)


dfm_to_bscore <- function(cs_dfm, bscore_dict, type_name='') {
    ntoken_name = str_c('ntoken_bscore', type_name)
    bscore_name = str_c('bscore', type_name)
    
    output_name = c('pid', bscore_name)
    
    dfm_bscore = dfm_match(cs_dfm, names(bscore_dict))
    ntoken_bscore = ntoken(dfm_bscore)
    ntoken_bscore_dt = data.table(pid=names(ntoken_bscore))
    ntoken_bscore_dt[, (ntoken_name) := ntoken_bscore]
    
    
    dfm_bscore_weighted = dfm_weight(dfm_bscore, weights=bscore_dict)
    dfm_bscore_weighted = convert(dfm_bscore_weighted, 'data.frame',
                                  docid_field='pid'
                                 ) %>% as.data.table()
    
    bscore_by_pid = dfm_bscore_weighted[, (bscore_name) := rowSums(.SD),
                                        .SDcols=is.numeric
        ][, ..output_name]
    
    bscore = bscore_by_pid[ntoken_bscore_dt, on=.(pid)]
}

bscore_bypid = dfm_to_bscore(cs_dfm, bscore)
bscore_bypid_nostopwords = dfm_to_bscore(cs_dfm, bscore_nostopwords, 
                                         type_name='_nostopwords')
bscore_bypid_firstn = dfm_to_bscore(cs_dfm_first200, bscore,
                                    type_name='_first200')
bscore_bypid_firstn_nostopwords = dfm_to_bscore(cs_dfm_first200,
                                                bscore_nostopwords,
                                                type_name='_first200_nostopwords')

"ntoken_corpus.rds" (282.9 KB) loaded (0.02 secs) (2021-03-08 2:28 AM)
cs_dfm (116.8 MB) already loaded, will NOT load again! (0 secs) (2021-03-08 2:28 AM)
cs_dfm_first200 (43.1 MB) already loaded, will NOT load again! (0 secs) (2021-03-08 2:28 AM)
bscore_nostopwords (232.4 KB) already loaded, will NOT load again! (0 secs) (2021-03-08 2:28 AM)


In [73]:
bscore_bypid = bscore_bypid[bscore_bypid_nostopwords, on=.(pid)
    ][bscore_bypid_firstn, on=.(pid)
    ][bscore_bypid_firstn_nostopwords, on=.(pid)
    ][ntoken_corpus, on=.(pid)]

sv(bscore_bypid)

"bscore_bypid" saved as "bscore_bypid.feather" (1.7 MB) (0.01 secs, 2021-03-08 02:28:57)


In [74]:
names(bscore_bypid)

# Structure of News

In [48]:
word_weight_by_topic = fread('data/Structure of News/Word_Weights_By_Topic_Phi.csv')
scaled_word_weight_by_topic = fread('data/Structure of News/Scaled_Word_Weights_By_Topic_Phi_tilde.csv')

word_weight_by_topic[1:3, 1:5]
scaled_word_weight_by_topic[1:3, 1:5]

term,Natural disasters,Internet,Soft drinks,Mobile devices
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
economist estimate,8.806352e-07,1.49717e-06,2.3285e-06,2.045927e-06
corporate earn,2.641906e-06,1.49717e-06,1.16425e-06,1.022963e-06
earn rose,3.522541e-06,7.485848e-07,3.49275e-06,1.022963e-06


term,Natural disasters,Internet,Soft drinks,Mobile devices
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
economist estimate,0.2205428,0.37494525,0.58314039,0.51237388
corporate earn,0.120081,0.06804997,0.05291797,0.04649615
earn rose,0.1282354,0.02725166,0.12715091,0.03724021


In [8]:
x = double(3)
y = 3+x

ERROR: Error in parse(text = x, srcfile = src): <text>:1:14: unexpected input
1: x = double(3)
                 ^
