# Init

In [3]:
suppressMessages(suppressWarnings({
    library(stringr)
    library(RPostgres)
    library(ggplot2)
    library(dplyr)
    library(arrow)
    library(pryr)
    library(data.table)
    library(lubridate)
    library(hms)
    
    library(repr)
    options(repr.matrix.max.cols=500) # display more cols
}))

# quanteda_options(threads = 32) # On your laptop, you probably want to set it to 4 or 8.

setwd('~/OneDrive/Construal')

# Parse JSON

## get `project_ids` and `project_dirs`

In [2]:
# Root folder that holds one sub-directory per scraped project.
data_root_dir = './data/Kickstarter Data/'

# The immediate sub-directory names (not recursive, no leading path) are used
# as the project ids.
project_ids = list.dirs(
    path = data_root_dir,
    full.names = FALSE,
    recursive = FALSE
)

# Matching per-project directory paths, each ending in '/'.
project_dirs = str_c(data_root_dir, project_ids, '/')

## parse JSON

In [None]:
# Parse every project's `<pid>.json` and stack the records into one
# data.table, using the project id as the `pid` id column.
#
# Changes relative to the previous version of this cell:
#   * `seq_along()` replaces `1:length(project_ids)`, which iterates over
#     c(1, 0) when no project directory is found;
#   * the result list is pre-allocated WITH names and filled by position, so
#     it no longer carries a run of unnamed NULL slots followed by appended
#     named entries;
#   * the underlying error message is reported in addition to the file path.
pjson = vector(mode = "list", length=length(project_ids))
names(pjson) = project_ids

for (i in seq_along(project_ids)) {
    pid = project_ids[i]
    pdir = project_dirs[i]
    json_path = sprintf('%s/%s.json', pdir, pid)

    parsed = tryCatch({
        json = fromJSON(json_path)
        # collapse the pledge vectors into one comma-separated string each so
        # that the record has a single value per field
        json$pledge_money = str_c(json$pledge_money, collapse=',')
        json$pledge_count = str_c(json$pledge_count, collapse=',')
        json
    }, error=function(cond) {
        message(sprintf('Error: %s', json_path))
        message(conditionMessage(cond))
        NULL
    })

    # `pjson[[i]] = NULL` would delete the slot and shift later entries, so
    # failed projects are simply left as the pre-allocated NULL
    if (!is.null(parsed)) {
        pjson[[i]] = parsed
    }
}

# NULL entries (failed projects) are skipped by rbindlist
pjson = rbindlist(pjson, use.names=TRUE, idcol='pid') %>% unique()

In [None]:
# Persist the parsed table as a Feather (version 2) file.
write_feather(pjson, './data/pjson.feather', version = 2)

In [None]:
# Number of rows in `pjson` relative to the number of project directories,
# expressed as a percentage rounded to two decimals.
success_pct = round(pjson[, .N]/length(project_ids)*100, 2)
message(sprintf('%s%% projects have been successfully parsed.', success_pct))

In [None]:
# Show the id and the collapsed pledge counts of the first row.
pjson[1, .(pid, pledge_count)]

In [None]:
# NOTE(review): `ld` is not defined in this part of the file; presumably it
# comes from the `utilr` package attached further down and reloads
# `pjson` from its feather file - confirm.
ld(pjson, ldtype='feather')

In [None]:
# Rows whose category is 'Product Design', ordered by project id.
pjson[category=='Product Design'][order(pid)]

# Parse HTML (deprecated)

## get `project_ids` and `project_dirs`

In [None]:
# Root folder that holds one sub-directory per scraped project.
data_root_dir = './data/Kickstarter Data/'

# The immediate sub-directory names (not recursive, no leading path) are used
# as the project ids.
project_ids = list.dirs(
    path = data_root_dir,
    full.names = FALSE,
    recursive = FALSE
)

# Matching per-project directory paths, each ending in '/'.
project_dirs = str_c(data_root_dir, project_ids, '/')

In [None]:
# risk
# (the json file already contains the risk section; kept for reference only)
# Text of every <p> under the risks <div>, joined with newlines.
risk = str_c(
    xmlValue(
        getNodeSet(parsed_html, '//div[@class="mb3 mb10-sm mb3 js-risks"]//p')
    ),
    collapse='\n'
)

# cat(sprintf('[risk]:\n%s\n', risk))
# cat('---------------------------\n')

In [None]:
# get parsed_html
# Load the HTML page of one sample project for interactive development.
# NOTE(review): the path is an absolute, user-specific Windows path, whereas
# the rest of the notebook works relative to the directory set with setwd();
# adjust it before running on another machine.
pid = 1649873594
html_page = sprintf("C:/Users/rossz/OneDrive/Construal/data/Kickstarter Data/%s/%s.html", pid, pid)
parsed_html = htmlParse(html_page)


#' Split a project page into a team description and a project description.
#'
#' Collects the titles selected by `bold_title_xpath` inside the project's
#' full-description <div> and looks for one that equals a known "team" title.
#' When one is found, the <p> elements positioned between that title and the
#' following title become `team_desc`; the remaining <p> elements become
#' `proj_desc`. When none is found, `team_desc` stays NA and `proj_desc` is all
#' text under the description <div>.
#'
#' Fixes relative to the previous version:
#'   * the "more than one team description" message reports `pid`; it used to
#'     pass the parsed document itself to `sprintf('%s')`;
#'   * a team title that is the LAST title no longer formats `NA` into the
#'     XPath: the team section then runs to the last <p>;
#'   * XPath queries are evaluated against the description document (`root`)
#'     instead of the node-set list `all_p`.
#'
#' @param pid Project id. Returned unchanged and used in log messages.
#' @param parsed_html Parsed HTML document of the project page.
#' @param bold_title_xpath XPath selecting the candidate title nodes, evaluated
#'   against the description <div>.
#' @param pos_team_xpath `sprintf` template with a single `%s` slot that
#'   receives a lower-cased title; the resulting XPath must evaluate to a
#'   number (the position of that title among the <p> elements).
#' @return A list with elements `pid`, `team_desc` and `proj_desc`.
split_team_and_project <- function(pid,
    parsed_html, 
    bold_title_xpath,
    pos_team_xpath) {
    
    # XPath of the <div> that holds the whole project description
    desc_div_xpath = '//div[@class="full-description js-full-description responsive-media formatted-lists"]'
    
    # possible team_titles (compared in lower case)
    team_titles = c("Who's on the team?", "The Team", "About Us", "About the Artist", "Meet the team", "Who we are", "Meet the creators", 
                    "Who's involved", "Who is involved") %>% tolower()
    
    # stand-alone document rooted at the description <div>, so that `//...`
    # queries below only see the description
    root = xmlDoc(parsed_html[desc_div_xpath][[1]])
    
    # number of <p> elements under the root
    n_p = length(root['//p'])
    
    # defaults used when no team title is found
    team_desc = NA
    proj_desc = getNodeSet(parsed_html, str_c(desc_div_xpath, '//text()')) %>%
        xmlValue() %>%
        str_c(collapse='\n') %>%
        str_trim()
    success_counter = 0
    
    # all candidate titles, trimmed and lower-cased
    bold_titles = getNodeSet(root, bold_title_xpath) %>%
        xmlValue() %>%
        str_trim() %>%
        tolower()
    
    # try every possible team_title; the last match wins
    for (team_title in team_titles) {
        idx = match(team_title, bold_titles)
        
        # not present on this page -> try the next candidate
        if (is.na(idx)) {
            next
        }
        success_counter = success_counter + 1
        
        # title that follows the team title; NA when the team title is last
        team_title_next = bold_titles[idx+1]
        
        pos_team_start = getNodeSet(root, sprintf(pos_team_xpath, team_title))
        
        if (is.na(team_title_next)) {
            # no following title: the team section extends to the last <p>
            pos_team_end = n_p + 1
        } else {
            pos_team_end = getNodeSet(root, sprintf(pos_team_xpath, team_title_next))
        }
        
        # team description: <p> strictly between the two title positions
        team_desc = getNodeSet(root, sprintf('//p[position()>%s and position()<%s]', pos_team_start, pos_team_end)) %>%
            xmlValue() %>%
            str_c(collapse='\n') %>%
            str_trim()
        
        # project description: every other <p>
        proj_desc = getNodeSet(root, sprintf('//p[position()<=%s or position()>=%s]', pos_team_start, pos_team_end)) %>%
            xmlValue() %>%
            str_c(collapse='\n') %>%
            str_trim()
    }
    
    # log message if more than one team_description has been found
    if (success_counter>1) {
        message(sprintf('More than one team_description have been found (%s).', pid))
    }
    return(list(pid=pid, team_desc=team_desc, proj_desc=proj_desc))
}

# NOTE: `lower-case()` is an XPath 2.0 function. The XML package evaluates
# XPath through libxml2, which implements XPath 1.0, so case folding is done
# with `translate()` instead.

# Variant for pages whose section titles are <h1> elements (kept for reference):
# h1 = split_team_and_project(
#     pid,
#     parsed_html,
#     bold_title_xpath='//div[@class="full-description js-full-description responsive-media formatted-lists"]//h1',
#     pos_team_xpath='count(//div[@class="full-description js-full-description responsive-media formatted-lists"]//h1[contains(translate(text(),"ABCDEFGHIJKLMNOPQRSTUVWXYZ","abcdefghijklmnopqrstuvwxyz"),"%s")]/preceding-sibling::p)+1')
# print(h1)

# Variant for pages whose section titles are a single <b> inside a <p>.
b = split_team_and_project(
    pid,
    parsed_html,
    bold_title_xpath='//p[count(./b)=1]//b',
    pos_team_xpath='count(//p[contains(translate(text(),"ABCDEFGHIJKLMNOPQRSTUVWXYZ","abcdefghijklmnopqrstuvwxyz"),"%s")]/preceding-sibling::p)+1')
# b

In [None]:
# Scratch cells used while exploring the page structure.
# NOTE(review): `all_p` is only assigned at top level in a later cell, so this
# cell depends on the order in which the cells were executed.
all_p[[1]]
cat('---------')
str(all_p[[1]])

In [None]:
# get parsed_html
pid = 1649873594
html_page = sprintf("C:/Users/rossz/OneDrive/Construal/data/Kickstarter Data/%s/%s.html", pid, pid)
parsed_html = htmlParse(html_page)

# first node matching the full-description <div>
root = parsed_html['//div[@class="full-description js-full-description responsive-media formatted-lists"]'][[1]]

root_children = xmlChildren(root) # I shouldn't use `xmlChildren` 
length(root_children)

# root_children[[2]] %>% class()
# root_children[[2]]
cat('------------\n')

# everything that precedes the <p> holding exactly one <b> containing "About"
getNodeSet(root, './/p[count(.//b)=1 and contains(.//b, "About")]/preceding::*')

In [None]:
# re-root the description <div> as its own document
root = xmlDoc(parsed_html['//div[@class="full-description js-full-description responsive-media formatted-lists"]'][[1]])

# NOTE(review): `lower-case()` is an XPath 2.0 function - confirm that the
# XPath engine in use supports it before relying on this query.
pos_team_start = getNodeSet(root, sprintf('count(//p[contains(lower-case(.//b/text()),"%s")]/preceding-sibling::*)+1', 'about'))
pos_team_start

names(root)

In [None]:
# class of the first node matched by the description <div> query
class(parsed_html['//div[@class="full-description js-full-description responsive-media formatted-lists"]'][1][[1]])

In [None]:
# every <p> under the description <div>
all_p = getNodeSet(parsed_html, '//div[@class="full-description js-full-description responsive-media formatted-lists"]//p')

# text nodes below the first <p>
getNodeSet(all_p[[1]], './/text()')

In [None]:
# <p> elements whose <b> text contains "shipping"
# NOTE(review): uses `lower-case()` as well - same XPath-version caveat.
getNodeSet(parsed_html, 
           sprintf('//div[@class="full-description js-full-description responsive-media formatted-lists"]//p[contains(lower-case(./b/text()),"%s")]', 'shipping'))

# Image Label distribution

## LVIS distribution (Py)

In [None]:
# Directory that is expected to contain `lvis_v1_train.json`.
LVIS_DATA_DIR = '/home/yu/Data/LVIS'

import json

# NOTE(review): `dt` and `f` are not imported in the visible code -
# presumably the `datatable` package and its column selector; confirm.
# Build a frame from the 'categories' entry of the LVIS training annotations.
with open(f'{LVIS_DATA_DIR}/lvis_v1_train.json') as ff:
    lvis_dist = dt.Frame(json.load(ff)['categories'])

# presumably renames column 'def' to 'definition' - verify against the
# frame library's documentation
lvis_dist.names = {'def': 'definition'}
# keep these six columns, in this order
lvis_dist = lvis_dist[:, 
      [f.id, f.name, f.definition, f.instance_count, f.image_count, f.frequency]]

# NOTE(review): `sv` and `fwrite` are not defined in the visible code; they
# look like project helpers that save the frame - confirm.
sv('lvis_dist')
fwrite(lvis_dist, 'object_detect_lvis_distribution.csv')

## kick distribution (R)

In [2]:
ld(lvis_dist, force=T) # dist of LVIS
ld(df_objdet) # object detection results

"lvis_dist.feather" (80.4 KB) loaded (0.03 secs) (2021-04-21 1:09 AM)
"df_objdet.feather" (73.1 MB) loaded (0.61 secs) (2021-04-21 1:09 AM)


In [34]:
# Share of each label among the detections with prob >= 0.5.
kick = df_objdet[prob >= 0.5, .(kick_freq = .N), keyby = .(label_id)]
kick = kick[, .(label_id, kick_freq = kick_freq / sum(kick_freq))]

# LVIS instance counts for the labels that also occur in `kick` (inner join),
# rescaled so that they sum to one over those labels.
dist = lvis_dist[, .(label_id = id, lvis_freq = instance_count)]
dist = dist[kick, on = .(label_id), nomatch = NULL]
dist = dist[, lvis_freq := lvis_freq / sum(lvis_freq)]

# +1 / 0 / -1: whether the label is more, equally, or less frequent in the
# Kickstarter detections than in LVIS.
dist = dist[, is_kick_more := sign(kick_freq - lvis_freq)]

In [38]:
# plot_ly(dist, x=~label_id, y=~lvis_freq, type='bar', name='LVIS') %>%
#     add_trace(y=~kick_freq, name='Kickstart') %>%
#     plotly::layout(barmode='group')

# Per-project summary of how common the detected labels are.
#
# For every project (`pid`), using detections with prob >= 0.5 joined to the
# label frequencies in `dist`:
#   n_labels / n_instances : distinct labels / total detections
#   kick_freq, lvis_freq   : instance-weighted sums of the label frequencies
#   *_norm                 : the same sums divided by n_instances
#   abs_freq_diff          : |kick_freq - lvis_freq|
#   sign_freq_diff         : sign of the instance-weighted sum of is_kick_more
#
# Fix: the previous version assigned the aggregated sums to locals named
# `kick_freq` / `lvis_freq`, which shadowed the columns of the same name.
# `abs_freq_diff` was then computed from those scalars rather than from the
# columns and came out multiplied by n_instances. The aggregates now use
# distinct local names.
kick_dist = df_objdet[prob>=0.5, .(inst_count=.N), keyby=.(pid, label_id)
    ][dist, on=.(label_id), nomatch=NULL
    ][, {
      n_labels = uniqueN(label_id)
      n_instances = sum(inst_count)

      # instance-weighted sums of the per-label frequencies
      kick_freq_sum = sum(kick_freq*inst_count)
      lvis_freq_sum = sum(lvis_freq*inst_count)

      list(n_labels=n_labels, n_instances=n_instances,
           kick_freq=kick_freq_sum,
           kick_freq_norm=kick_freq_sum/n_instances,
           lvis_freq=lvis_freq_sum,
           lvis_freq_norm=lvis_freq_sum/n_instances,
           abs_freq_diff=abs(kick_freq_sum-lvis_freq_sum),
           sign_freq_diff=sign(sum(is_kick_more*inst_count)))
      },
      keyby=.(pid)]

# Merge all datasets (text, image)

> Note:
>
> 1. The order of this variable dictionary is the same as the order in which the variables appear in the dataset.
> 2. Variables are grouped as TEXT, IMAGE, and TEXT-IMAGE FIT. In the following section, "start" means the first variable of a group; "end" means the last.

- `pid`: project id. Primary key of the dataset.

- TEXT:
    - start: 'bscore'; end: 'objectivity'
    - ntoken: number of tokens (words) in "project_desc."
    - ntoken_bscore_[unique/first200]_[nostopwords]: the number of tokens that fall into the "B-score" (Brysbaert et al., 2014) dictionary. [unique/first200] indicates whether we consider [the whole document] or [the first 200 words]. [nostopwords] indicates whether stop words are excluded.
    - bscore_[unique/first200]_[nostopwords]: the concreteness score (Brysbaert et al., 2014) of the [whole document/first 200 words]. [nostopwords] indicates whether stop words are excluded.
    - [...]_title: this variable is calculated for the title 
    - fog_score: the FOG index. Proxy for readability
    - n_spelling_errors: N of spelling errors.
    - n_grammar_errors: N of grammar errors
    - sentiment_label: 'POSITIVE' or 'NEGATIVE'
    - sentiment_score: the score (probability) of sentiment_label. In range of [0,1]
    - objectivity: objectivity score, i.e., the number of sentences that are classified as "objective." Please note almost all (>99%) sentences are classified as objective!


- IMAGE:
    - start: 'entropy_weighted_unnormalized'; end: 'n_happy_faces'
    - entropy_weighted_[un]normalized: entropy-based image concreteness score.
    - mni_k[N]_weighted_[un]normalized: MNI-based concreteness score
    - n_faces: N of faces
    - n_happy_faces: N of happy faces (<= n_faces)


- TEXT-IMAGE FIT
    - start: 'glove_cluster_cos_dist'
    - end: 'charngram_shortest_linf_dist'
    - [glove/fasttext/charngram]_[cluster/shortest]_l[1/2/inf]_dist. Text-image fit score. [glove/fasttext/charngram] indicates the pretrained embedding used.
[cluster/shortest] indicates whether it's distance between the "clusters centrality" or "closest points;" l[1/2/inf] indicates whether it's L1/L2/L-infinity distance.

- META data
    - start: 'title'
    - end: 'state_at_scraping'

In [1]:
# Attach the `utilr` helpers quietly.
# NOTE(review): `ld()` and `sv()` used below are presumably provided by
# `utilr`; the logged output of this cell shows `ld` loading `.feather`
# files - confirm against the package.
suppressMessages(library(utilr))
WORK_DIR = '/home/yu/OneDrive/Construal'
# NOTE(review): SHARE_DIR is not referenced again in the code visible here;
# the CSV paths below are spelled out relative to WORK_DIR instead.
SHARE_DIR = '/home/yu/OneDrive/Construal/data/sharing'
setwd(WORK_DIR)

# ---------------------
# load datasets
# ---------------------

# text
ld(bscore_bypid)
ld(out_fog)
ld(out_grammar)
ld(out_sentiment)
ld(out_objectivity)

# image
ld(pid_weighted_mni_entropy)
ld(out_faces)

# text-image fit
# one CSV per embedding (glove / fasttext / charngram) and per text source
# (title / desc)
dist_glove_title = fread('data/sharing/dist_glove_title.csv')
dist_fasttext_title = fread('data/sharing/dist_fasttext_title.csv')
dist_charngram_title = fread('data/sharing/dist_charngram_title.csv')
dist_glove_desc = fread('data/sharing/dist_glove_desc.csv')
dist_fasttext_desc = fread('data/sharing/dist_fasttext_desc.csv')
dist_charngram_desc = fread('data/sharing/dist_charngram_desc.csv')

# pjson
ld(pjson, ldtype='feather')

# ---------------------
# merge datasets
# ---------------------

# TEXT concreteness
# Inner-join the five text tables on `pid`, one table at a time.
text = bscore_bypid[out_fog, on = .(pid), nomatch = NULL]
text = text[out_grammar, on = .(pid), nomatch = NULL]
text = text[out_sentiment, on = .(pid), nomatch = NULL]
text = text[out_objectivity, on = .(pid), nomatch = NULL]

# Give the generic sentiment columns self-describing names.
setnames(
    text,
    old = c('label', 'score'),
    new = c('sentiment_label', 'sentiment_score')
)


# IMAGE concreteness
# Inner join of the image-concreteness scores with the face counts.
image = pid_weighted_mni_entropy[out_faces, on = .(pid), nomatch = NULL]

# TEXT-IMAGE fit
#' Add a prefix (and optionally a suffix) to every column name except `pid`.
#'
#' New names have the form `<prefix>_<name>` or `<prefix>_<name>_<suffix>`.
#' The table is renamed in place with `setnames()`.
#'
#' Fix: the key column is now excluded by exact name. The previous
#' `str_detect(col_names, 'pid')` was a substring match and also skipped any
#' column whose name merely contained "pid".
#'
#' @param dt A data.table; its columns are renamed by reference.
#' @param prefix String placed in front of each column name.
#' @param suffix Optional string appended to each column name.
#' @return `dt`, with the renamed columns.
add_var_prefix <- function(dt, prefix, suffix=NULL) {
    col_names = names(dt)
    # leave only the key column `pid` untouched
    col_names = col_names[col_names != 'pid']
    setnames(dt, col_names, str_c(prefix, col_names, suffix, sep='_'))
    dt
}
# Tag every distance column with its embedding (prefix) and text source
# (suffix) so that the six tables can be joined without name clashes.
dist_glove_title = add_var_prefix(dist_glove_title, prefix='glove', suffix='title')
dist_fasttext_title = add_var_prefix(dist_fasttext_title, prefix='fasttext', suffix='title')
dist_charngram_title = add_var_prefix(dist_charngram_title, prefix='charngram', suffix='title')

dist_glove_desc = add_var_prefix(dist_glove_desc, prefix='glove', suffix='desc')
dist_fasttext_desc = add_var_prefix(dist_fasttext_desc, prefix='fasttext', suffix='desc')
dist_charngram_desc = add_var_prefix(dist_charngram_desc, prefix='charngram', suffix='desc')

# Inner-join the six tables on `pid`, left to right.
fit = Reduce(
    function(lhs, rhs) lhs[rhs, on=.(pid), nomatch=NULL],
    list(
        dist_glove_title,
        dist_fasttext_title,
        dist_charngram_title,
        dist_glove_desc,
        dist_fasttext_desc,
        dist_charngram_desc
    )
)


# final dataset!!!
# Inner-join text, image, fit and project metadata on `pid`.
final_dataset = text[image, on=.(pid), nomatch=NULL]

# `pid` in `fit` is converted to character (in place) before joining
fit = fit[, ':='(pid=as.character(pid))]
final_dataset = final_dataset[fit, on=.(pid), nomatch=NULL]
final_dataset = final_dataset[pjson, on=.(pid), nomatch=NULL]

# save through the project helper and as CSV for sharing
sv(final_dataset)
fwrite(final_dataset, 'data/sharing/final_dataset.csv')

names(final_dataset)


Yu's data science toolbox loaded! 
"bscore_bypid.feather" (3.2 MB) loaded (0.04 secs) (2022-01-15 10:10 PM)
"out_fog.feather" (815.5 KB) loaded (0.01 secs) (2022-01-15 10:10 PM)
"out_grammar.feather" (619.2 KB) loaded (0.01 secs) (2022-01-15 10:10 PM)
"out_sentiment.feather" (935.2 KB) loaded (0.01 secs) (2022-01-15 10:10 PM)
"out_objectivity.feather" (502 KB) loaded (0.01 secs) (2022-01-15 10:10 PM)
"pid_weighted_mni_entropy.feather" (339.8 KB) loaded (0 secs) (2022-01-15 10:10 PM)
"out_faces.feather" (47.5 KB) loaded (0 secs) (2022-01-15 10:10 PM)
"pjson.feather" (167.5 MB) loaded (0.49 secs) (2022-01-15 10:10 PM)
"final_dataset" saved as "final_dataset.feather" (10.9 MB) (0.05 secs, 2022-01-15 22:10:29)


# Trash 

In [9]:
final = ld('final_dataset', folder='data')

"final_dataset.feather" (10.9 MB) loaded (0.06 secs) (2022-08-11 3:06 AM)


In [18]:
# Export the 100 projects with the lowest and the 100 with the highest
# title-based fasttext cluster cosine distance.
#
# Fix: `(.N-100):.N` selected 101 rows at the high end and produced invalid
# or duplicated indices for tables with fewer than 200 rows. head()/tail() on
# the row index take at most 100 rows from each end, and unique() removes any
# overlap.
final[order(fasttext_cluster_cos_dist_title)
    ][unique(c(head(seq_len(.N), 100), tail(seq_len(.N), 100))),
      .(pid, fasttext_cluster_cos_dist_title)
    ] %>% fwrite('data/sharing/high-low-fit.csv')

In [19]:
# Export the 100 projects with the highest and the 100 with the lowest
# `mni_k100_weighted_normalized`.
#
# Fix: `(.N-100):.N` selected 101 rows at the low end and produced invalid
# or duplicated indices for tables with fewer than 200 rows. head()/tail() on
# the row index take at most 100 rows from each end, and unique() removes any
# overlap.
final[order(-mni_k100_weighted_normalized), 
      .(pid, mni_k100_weighted_normalized)
    ][unique(c(head(seq_len(.N), 100), tail(seq_len(.N), 100)))
    ] %>% fwrite('data/sharing/high-low-mni.csv')