# ISBN Clustering

In [None]:
library(data.table)
library(tidyverse)

In [None]:
# Set the `repr.plot.height` option to 5 for the rest of the session.
options(repr.plot.height = 5)

In [None]:
# Open the PostgreSQL connection used by every query in this notebook.
# Only the database name is given here; no host or credentials are passed.
db = DBI::dbConnect(
    drv = RPostgreSQL::PostgreSQL(),
    dbname = 'openlib'
)

In [None]:
# Numeric offsets, one per kind of identifier.  In this notebook only `rec`
# is used: it is added to the LOC record ids before they are combined with
# the OpenLibrary links.
numspaces = list(
    work    = 1e8,
    edition = 2e8,
    rec     = 3e8,
    isbn    = 9e8
)

## Clustering Algorithm

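We cluster ISBNs by building the bipartite graph of ISBNs and records (an ISBN is linked to every record it appears in) and computing the transitive closure of each ISBN, i.e. its connected component.  Each connected component becomes a cluster with a single ‘book’ ID.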

In [None]:
# Emit a progress message prefixed with the seconds elapsed since `start`.
#
# start: the time the surrounding operation began (as returned by Sys.time()).
# ...:   message pieces, passed on to message() after the timestamp tag.
xmsg = function(start, ...) {
    elapsed = Sys.time() - start
    secs = as.double(elapsed, units="secs")
    # Fixed-width tag so that successive messages line up.
    message(sprintf("[%8.2f] ", secs), ...)
}

In [None]:
# Compile the Fortran clustering routine into a shared library and load it.
# The build's exit status is checked so that a failed compile stops here
# instead of loading a stale (or missing) library, and the library extension
# comes from the platform (`.so`, `.dll`, ...) rather than being hard-coded.
build_status = tools::Rcmd(c("SHLIB", "fcluster.f95"))
if (!identical(as.integer(build_status), 0L)) {
    stop("R CMD SHLIB failed for fcluster.f95 (exit status ", build_status, ")")
}
dyn.load(paste0("fcluster", .Platform$dynlib.ext))
# Cluster ISBNs that are connected through shared records.
#
# isbn_rec_tbl: a data frame with columns `isbn_id` and `record`, one row per
#               (ISBN, record) link.  Record ids must fit in an R integer,
#               since they are passed to Fortran as the initial labels.
#
# Returns a tibble with one row per distinct `isbn_id` and the `cluster`
# label assigned to it by the compiled `compute_clusters` routine.
#
# NOTE(review): `compute_clusters` lives in fcluster.f95 and is not visible
# here; it presumably propagates labels along the edges until nothing
# changes and reports the number of passes in `iter` -- confirm there.
cluster_isbns = function(isbn_rec_tbl) {
    stopifnot(all(c("isbn_id", "record") %in% names(isbn_rec_tbl)))
    start = Sys.time()
    xmsg(start, "initializing clusters")
    # Each ISBN starts out labelled with its smallest record id; `ino` is a
    # dense 1-based index for the Fortran side.  seq_len() (not 1:n()) keeps
    # this well-defined when the input has no rows.
    clusters = isbn_rec_tbl %>%
        group_by(isbn_id) %>%
        summarize(cluster=min(record)) %>%
        ungroup() %>%
        mutate(cluster = as.integer(cluster), ino = seq_len(n()))
    xmsg(start, "populating edges")
    intbl = isbn_rec_tbl %>%
        inner_join(select(clusters, isbn_id, ino), by="isbn_id")
    # Self-join on the record id: every pair of ISBNs that appear on the same
    # record becomes an edge (in both directions, self-pairs included).
    edges = intbl %>%
        select(left_isbn = ino, record) %>%
        inner_join(intbl %>% select(right_isbn = ino, record), by="record") %>%
        select(left_isbn, right_isbn) %>%
        distinct()
    xmsg(start, "clustering ", nrow(clusters), " isbns with ", 
         nrow(edges), " edges (", length(unique(clusters$cluster)), " initial clusters)")    
    res = .Fortran("compute_clusters", 
                   nc=nrow(clusters), clusters=clusters$cluster, 
                   ne=nrow(edges), ls=edges$left_isbn, rs=edges$right_isbn,
                   iter=0L)
    rcs = res$clusters
    xmsg(start, "built ", length(unique(rcs)), " clusters in ", res$iter, " iterations")
    tibble(isbn_id=clusters$isbn_id, cluster=rcs)
}

## Library of Congress

In [None]:
# Fetch the Library of Congress (ISBN, record) links into memory, exposing
# `rec_id` under the name `record` that cluster_isbns() expects.
loc_rec_isbns = db %>%
    tbl('loc_rec_isbn') %>%
    select(isbn_id, record=rec_id) %>%
    collect()
print(loc_rec_isbns)

In [None]:
# Cluster the ISBNs using the Library of Congress links only.
loc_clusters = loc_rec_isbns %>% cluster_isbns()

How big are our largest clusters?

In [None]:
# Number of ISBNs in each LOC cluster; then show the largest clusters.
cluster_sizes = loc_clusters %>%
    group_by(cluster) %>%
    summarize(nisbns=n()) %>%
    ungroup()
cluster_sizes %>%
    arrange(desc(nisbns)) %>%
    head()

In [None]:
# Histogram of LOC cluster sizes, one bar per exact size.
ggplot(cluster_sizes, aes(x=nisbns)) +
    geom_histogram(binwidth=1)

In [None]:
# Save the LOC cluster assignments as a header-less two-column CSV.
loc_clusters %>%
    select(isbn_id, cluster) %>%
    write_csv('data/loc-clusters.csv', col_names = FALSE)

## OpenLibrary

In [None]:
# Fetch the OpenLibrary (ISBN, book code) links into memory, exposing
# `book_code` under the name `record` that cluster_isbns() expects.
ol_rec_edges = db %>%
    tbl('ol_isbn_link') %>%
    select(isbn_id, record=book_code) %>%
    collect()

In [None]:
# Show the OpenLibrary links as a sanity check.
print(ol_rec_edges)

In [None]:
# Cluster the ISBNs using the OpenLibrary links only.
ol_clusters = ol_rec_edges %>% cluster_isbns()

In [None]:
# Number of ISBNs in each OpenLibrary cluster; then show the largest ones.
ol_cluster_sizes = ol_clusters %>%
    group_by(cluster) %>%
    summarize(nisbns=n()) %>%
    ungroup()
ol_cluster_sizes %>%
    arrange(desc(nisbns)) %>%
    head()

In [None]:
# Bucket each cluster by the order of magnitude of its size (the integer part
# of log10), then count the multi-ISBN clusters that fall in each bucket.
ol_cluster_sizes = mutate(ol_cluster_sizes, bucket = as.integer(log10(nisbns)))
ol_cluster_buckets = ol_cluster_sizes %>%
    filter(nisbns > 1) %>%
    group_by(bucket) %>%
    summarize(n=n())
ol_cluster_buckets

In [None]:
# Bar chart: number of OpenLibrary clusters per size bucket.
ggplot(ol_cluster_buckets, aes(x=bucket, y=n)) +
    geom_bar(stat='identity')

In [None]:
# Save the OpenLibrary cluster assignments as a header-less two-column CSV.
ol_clusters %>%
    select(isbn_id, cluster) %>%
    write_csv('data/ol-clusters.csv', col_names = FALSE)

## Integrated Clusters

In [None]:
# Combine both link sets.  The LOC record ids are shifted by `numspaces$rec`
# first; the OpenLibrary rows are appended unchanged.
all_isbn_recs = loc_rec_isbns %>%
    mutate(record = record + numspaces$rec) %>%
    bind_rows(ol_rec_edges)

In [None]:
# Show the combined LOC + OpenLibrary links as a sanity check.
print(all_isbn_recs)

In [None]:
# Cluster the ISBNs using the combined LOC + OpenLibrary links.
int_clusters = all_isbn_recs %>% cluster_isbns()

In [None]:
# Number of ISBNs in each integrated cluster; then show the largest ones.
int_cluster_sizes = int_clusters %>%
    group_by(cluster) %>%
    summarize(nisbns=n()) %>%
    ungroup()
int_cluster_sizes %>%
    arrange(desc(nisbns)) %>%
    head()

In [None]:
# Bucket each cluster by the order of magnitude of its size (the integer part
# of log10), then count the multi-ISBN clusters that fall in each bucket.
int_cluster_sizes = mutate(int_cluster_sizes, bucket = as.integer(log10(nisbns)))
int_cluster_buckets = int_cluster_sizes %>%
    filter(nisbns > 1) %>%
    group_by(bucket) %>%
    summarize(n=n())
int_cluster_buckets

In [None]:
# Bar chart: number of integrated clusters per size bucket.
ggplot(int_cluster_buckets, aes(x=bucket, y=n)) +
    geom_bar(stat='identity')

In [None]:
# Save the integrated cluster assignments as a header-less two-column CSV.
int_clusters %>%
    select(isbn_id, cluster) %>%
    write_csv('data/isbn-clusters.csv', col_names=FALSE)

### Load ISBN info to work on checking clusters

In [None]:
# Pull the whole `isbn_id` table into memory and show it.
isbn_ids = collect(tbl(db, 'isbn_id'))
print(isbn_ids)

How many ISBNs are in LOC but not OL?

In [None]:
# ISBNs that were clustered from the LOC links but do not occur in the
# OpenLibrary clusters.  The join key is given explicitly so that the match
# is on `isbn_id` only and dplyr does not have to guess (and announce) it.
missing_isbns = loc_clusters %>%
    anti_join(select(ol_clusters, isbn_id), by="isbn_id")
nrow(missing_isbns)