# Init

In [None]:
# If you have not installed my toolbox `utilr`, install it with:
# devtools::install_github('xiaomowu/utilr')
# and then load it with `library(utilr)`.
library(utilr)
library(jsonlite)
library(quanteda)

# Number of threads quanteda is allowed to use.
quanteda_options(threads = 32) # On your laptop, you probably want to set it to 4 or 8.

# Relative paths such as './data/...' used in this notebook are resolved against this directory.
setwd('~/OneDrive/Construal')

# Parse JSON

## get `project_ids` and `project_dirs`

In [None]:
# Root folder of the raw data; its immediate sub-directories are treated as projects.
data_root_dir = './data/Kickstarter Data/'

# Names (not full paths) of the immediate sub-directories, used as project ids.
# `FALSE` is spelled out because `F` is an ordinary variable that can be reassigned.
project_ids = list.dirs(data_root_dir, full.names=FALSE, recursive=FALSE)

# Directory of each project, with a trailing '/': '<root><project id>/'.
project_dirs = str_c(data_root_dir, project_ids, '/')

## parse JSON

In [None]:
# Parse '<project dir>/<pid>.json' for every project and stack the results into
# one table `pjson`, with the project id stored in column `pid`.
#
# A project whose JSON cannot be read or parsed is reported via `message()`
# (including the reason) and skipped, so a single bad file does not abort the run.

# One pre-allocated, named slot per project; slots of failed projects stay NULL.
pjson = vector(mode = "list", length = length(project_ids))
names(pjson) = project_ids

# `seq_along()` yields zero iterations when there are no projects
# (`1:length(x)` would iterate over c(1, 0)).
for (i in seq_along(project_ids)) {
    pid = project_ids[i]
    pdir = project_dirs[i]
    # built outside `tryCatch` so the error handler always reports the current file
    json_path = sprintf('%s/%s.json', pdir, pid)

    tryCatch({
        json = fromJSON(json_path)
        # collapse each pledge vector into a single comma-separated string
        json$pledge_money = str_c(json$pledge_money, collapse=',')
        json$pledge_count = str_c(json$pledge_count, collapse=',')
        # fill the pre-allocated slot by position instead of appending by name
        pjson[[i]] = json
    }, error=function(cond) {
        message(sprintf('Error: %s (%s)', json_path, conditionMessage(cond)))
        NULL
    })
}

# Drop the slots of failed projects, bind the rest (list names become `pid`)
# and remove duplicated rows.
pjson = rbindlist(Filter(Negate(is.null), pjson), use.names=TRUE, idcol='pid') %>% unique()

In [None]:
# Save the parsed table to './data/pjson.feather' (Feather version 2).
write_feather(pjson, './data/pjson.feather', version = 2)

In [None]:
# Rows in `pjson` relative to the number of project folders, as a percentage
# rounded to two decimals, reported on the message stream.
success_pct = round(nrow(pjson) / length(project_ids) * 100, digits = 2)
message(sprintf('%s%% projects have been successfully parsed.', success_pct))

In [None]:
# Show `pid` and the comma-collapsed `pledge_count` string of the first row.
pjson[1, .(pid, pledge_count)]

In [None]:
# Load `pjson` with utilr's `ld()` in feather mode.
# NOTE(review): presumably reads the feather file written earlier from a default data path — confirm against utilr.
ld(pjson, ldtype='feather')

In [None]:
# Rows whose category is 'Product Design', ordered by `pid`.
pjson[category=='Product Design'][order(pid)]

# Parse HTML

## get `project_ids` and `project_dirs`

In [None]:
# Root folder of the raw data; its immediate sub-directories are treated as projects.
data_root_dir = './data/Kickstarter Data/'

# Names (not full paths) of the immediate sub-directories, used as project ids.
# `FALSE` is spelled out because `F` is an ordinary variable that can be reassigned.
project_ids = list.dirs(data_root_dir, full.names=FALSE, recursive=FALSE)

# Directory of each project, with a trailing '/': '<root><project id>/'.
project_dirs = str_c(data_root_dir, project_ids, '/')

In [None]:
# Risk section of the project page.
# (the json file already captured the risk portion)
# Takes every <p> below the risks <div> of `parsed_html` and joins their text,
# one paragraph per line.
risk = str_c(
    xmlValue(getNodeSet(parsed_html, '//div[@class="mb3 mb10-sm mb3 js-risks"]//p')),
    collapse='\n'
)

# cat(sprintf('[risk]:\n%s\n', risk))
# cat('---------------------------\n')

In [None]:
# get parsed_html: parse the HTML page of one sample project
# NOTE(review): this is an absolute, machine-specific path, unlike the relative
# `data_root_dir` used in other cells.
pid = 1649873594
html_page = sprintf("C:/Users/rossz/OneDrive/Construal/data/Kickstarter Data/%s/%s.html", pid, pid)
parsed_html = htmlParse(html_page)


# Split the description of a project page into a team part and a project part.
#
# The function searches the titles selected by `bold_title_xpath` for one of a
# fixed set of "team" headings (e.g. "The Team", "About Us"). When a heading is
# found, the <p> elements positioned strictly between that heading and the next
# title become `team_desc`; the remaining <p> elements become `proj_desc`.
# When no heading is found, `team_desc` is NA and `proj_desc` is the complete
# text of the description <div>.
#
# Arguments:
#   pid              project id; copied into the result and used in log messages
#   parsed_html      parsed HTML document of the project page
#   bold_title_xpath XPath selecting the candidate title nodes, evaluated on the
#                    description <div>
#   pos_team_xpath   XPath template containing one `%s` that receives a
#                    lower-cased title; it must evaluate to a number, which is
#                    used as the <p> position of that title
#
# Returns:
#   list(pid = , team_desc = , proj_desc = )
split_team_and_project <- function(pid,
    parsed_html, 
    bold_title_xpath,
    pos_team_xpath) {

    desc_xpath = '//div[@class="full-description js-full-description responsive-media formatted-lists"]'

    # possible team_titles (compared in lower case)
    team_titles = c("Who's on the team?", "The Team", "About Us", "About the Artist", "Meet the team", "Who we are", "Meet the creators", 
                    "Who's involved", "Who is involved") %>% tolower()

    # root document for proj_desc and team_desc: built from the description <div>
    root = xmlDoc(parsed_html[desc_xpath][[1]])

    # all the <p> under the root; only their number is needed below
    all_p = root['//p']

    # defaults used when no team title is found
    team_desc = NA
    proj_desc = getNodeSet(parsed_html, str_c(desc_xpath, '//text()')) %>%
        xmlValue() %>%
        str_c(collapse='\n') %>%
        str_trim()
    success_counter = 0

    # all bold titles, trimmed and converted to lower case
    bold_titles = getNodeSet(root, bold_title_xpath) %>%
        xmlValue() %>%
        str_trim() %>%
        tolower()

    # loop over every possible team_title; if several match, the last match wins
    for (team_title in team_titles) {
        idx = match(team_title, bold_titles)

        # title not present on this page: keep the current team_desc/proj_desc
        if (is.na(idx)) {
            next
        }
        success_counter = success_counter + 1

        # The XPath expressions are absolute, so they are evaluated against the
        # document `root` (not against the list of <p> nodes).
        pos_team_start = getNodeSet(root, sprintf(pos_team_xpath, team_title))

        if (idx < length(bold_titles)) {
            # the team section ends where the next title starts
            team_title_next = bold_titles[idx+1]
            pos_team_end = getNodeSet(root, sprintf(pos_team_xpath, team_title_next))
        } else {
            # The team title is the last title: the section runs to the end.
            # (Looking up `bold_titles[idx+1]` here would search for a literal
            # "NA" heading.)
            pos_team_end = length(all_p) + 1
        }

        # extract team_description: paragraphs strictly between the two titles
        team_desc = getNodeSet(root, sprintf('//p[position()>%s and position()<%s]', pos_team_start, pos_team_end)) %>%
            xmlValue() %>%
            str_c(collapse='\n') %>%
            str_trim()

        # extract project_description: every other paragraph
        proj_desc = getNodeSet(root, sprintf('//p[position()<=%s or position()>=%s]', pos_team_start, pos_team_end)) %>%
            xmlValue() %>%
            str_c(collapse='\n') %>%
            str_trim()
    }

    # log message if more than one team_description has been found;
    # the page is identified by its pid (a document object cannot be formatted with %s)
    if (success_counter>1) {
        message(sprintf('More than one team_description have been found (pid: %s).', pid))
    }
    return(list(pid=pid, team_desc=team_desc, proj_desc=proj_desc))
}

# h1 = split_team_and_project(
#     pid,
#     parsed_html,
#     bold_title_xpath='//div[@class="full-description js-full-description responsive-media formatted-lists"]//h1',
#     pos_team_xpath='count(//div[@class="full-description js-full-description responsive-media formatted-lists"]//h1[contains(lower-case(text()),"%s")]/preceding-sibling::p)+1')
# print(h1)

# Split the sample page using <b> titles (a <p> that contains exactly one <b>).
# XPath 1.0, which is what libxml2 / the XML package evaluates, has no
# lower-case() function; translate() with the two alphabets is the substitute.
# NOTE(review): `text()` only covers text directly inside the <p>, not text
# inside its <b> child — confirm this matches where the titles actually live.
b = split_team_and_project(
    pid,
    parsed_html,
    bold_title_xpath='//p[count(./b)=1]//b',
    pos_team_xpath='count(//p[contains(translate(text(),"ABCDEFGHIJKLMNOPQRSTUVWXYZ","abcdefghijklmnopqrstuvwxyz"),"%s")]/preceding-sibling::p)+1')
# b

In [None]:
# Print the first element of `all_p`, then its internal structure.
# NOTE: needs a global `all_p`; in this notebook it is assigned in a later cell.
all_p[[1]]
cat('---------')
str(all_p[[1]])

In [None]:
# get parsed_html: parse the HTML page of one sample project
pid = 1649873594
html_page = sprintf("C:/Users/rossz/OneDrive/Construal/data/Kickstarter Data/%s/%s.html", pid, pid)
parsed_html = htmlParse(html_page)

# first node matching the full-description <div>
root = parsed_html['//div[@class="full-description js-full-description responsive-media formatted-lists"]'][[1]]

# direct children of that <div>, and how many there are
root_children = xmlChildren(root) # I shouldn't use `xmlChildren` 
length(root_children)

# root_children[[2]] %>% class()
# root_children[[2]]
cat('------------\n')

# For each <p> under `root` that has exactly one <b> descendant whose text
# contains "About": all elements that come before it in document order.
getNodeSet(root, './/p[count(.//b)=1 and contains(.//b, "About")]/preceding::*')

In [None]:
# Stand-alone document built from the full-description <div>.
root = xmlDoc(parsed_html['//div[@class="full-description js-full-description responsive-media formatted-lists"]'][[1]])

# 1 + number of preceding siblings of the <p> whose <b> text contains "about"
# (case-insensitive). XPath 1.0, which libxml2 evaluates, has no lower-case();
# translate() with the two alphabets does the lower-casing instead.
pos_team_start = getNodeSet(root, sprintf('count(//p[contains(translate(.//b/text(),"ABCDEFGHIJKLMNOPQRSTUVWXYZ","abcdefghijklmnopqrstuvwxyz"),"%s")]/preceding-sibling::*)+1', 'about'))
pos_team_start

names(root)

In [None]:
# Class of the first node matched by the full-description XPath.
class(parsed_html['//div[@class="full-description js-full-description responsive-media formatted-lists"]'][1][[1]])

In [None]:
# Every <p> below the full-description <div>.
all_p = getNodeSet(parsed_html, '//div[@class="full-description js-full-description responsive-media formatted-lists"]//p')

# All text nodes inside the first <p> (at any depth).
getNodeSet(all_p[[1]], './/text()')

In [None]:
# <p> elements of the description whose direct <b> child text contains
# "shipping" (case-insensitive). XPath 1.0, which libxml2 evaluates, has no
# lower-case(); translate() with the two alphabets does the lower-casing instead.
getNodeSet(parsed_html, 
           sprintf('//div[@class="full-description js-full-description responsive-media formatted-lists"]//p[contains(translate(./b/text(),"ABCDEFGHIJKLMNOPQRSTUVWXYZ","abcdefghijklmnopqrstuvwxyz"),"%s")]', 'shipping'))

# Test

In [20]:
# Load the three data sets used in the checks below from '../data' via utilr's `ld()`.
# `pjson` is requested in feather mode; the other two use ld()'s default type.
# NOTE(review): the printed output suggests ld() skips objects that are already loaded — confirm against utilr.
ld(pjson, path='../data', ldtype='feather')
ld(bscore_bypid, path='../data')
ld(score, path='../data')

pjson (167.5 MB) already loaded, will NOT load again! (0 secs) (2021-06-23 3:24 PM)
bscore_bypid (2.4 MB) already loaded, will NOT load again! (0 secs) (2021-06-23 3:24 PM)
"score.feather" (3.3 MB) loaded (0.02 secs) (2021-06-23 3:24 PM)


In [34]:
# diff = score_google_top5000_withoutstop - score_kck_top5000_withoutstop per document
# (doc_id renamed to pid), then keep only project '2002220833'.
score[, .(pid=doc_id, diff=score_google_top5000_withoutstop-score_kck_top5000_withoutstop)
    ][pid=='2002220833']

pid,diff
<chr>,<dbl>
2002220833,6.534847


In [49]:
# Ratio bscore_nostopwords / ntoken_bscore_nostopwords for project '384333840'.
bscore_bypid[pid=='384333840', .(bscore_nostopwords/ntoken_bscore_nostopwords)]

V1
<dbl>
3.003413


In [5]:
# Write `pid` and `project_desc` of the 'Product Design' rows to 'pjson.csv'
# (relative path, i.e. in the current working directory).
fwrite(pjson[category=='Product Design', .(pid, project_desc)], 'pjson.csv')