In [19]:
import json

# Load the cleaned book records. Later cells iterate this as a dict and read
# a "ddn" field from each value.
# Decode as UTF-8 explicitly: without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text.
with open("books_cleaned.json", "r", encoding="utf-8") as f:
	books_obj = json.load(f)

Here, we find all the DDN classes we have, and all the books (their titles, actually) belonging to them.

One subtlety is that a book with the DDN 317 belongs to all of the classes 300, 310, and 317.

We only consider the three digits before the decimal point, so 317.134 belongs only to 300, 310, and 317.

In [20]:
def extract_categories(ddn) -> list:
	"""Return the sorted, zero-padded class codes that a DDN string falls under.

	Only the integer part before the first "." is used. That integer is
	rounded down to the nearest 1, 10 and 100, and each result is formatted
	to at least three digits, so "317.134" gives ["300", "310", "317"].
	Levels that round down to the same value are reported once, e.g.
	"006.3814" gives ["000", "006"].
	"""
	whole_part = int(ddn.split(".")[0])

	# A set comprehension collapses levels that round down to the same code.
	codes = {
		f"{whole_part - whole_part % step:03}"
		for step in (1, 10, 100)
	}

	return sorted(codes)

# Demo: 6 rounds down to 6, 0 and 0 at the units/tens/hundreds levels, so only two distinct codes come back.
extract_categories("006.3814")

['000', '006']

In [21]:
# Map each category code to the list of books_obj keys filed under it.
# A book is listed under every code extract_categories() returns for its "ddn".
cat_to_ids = {}

# `book_id` rather than `id`: at notebook scope a loop variable named `id`
# would shadow the builtin for every later cell.
for book_id, book in books_obj.items():
	for cat in extract_categories(book["ddn"]):
		# setdefault creates the list the first time a category is seen.
		cat_to_ids.setdefault(cat, []).append(book_id)


In [49]:
# Every category code present in cat_to_ids, in ascending string order.
all_cats = list(cat_to_ids)
all_cats.sort()

print(len(all_cats))

541


In [54]:
# Smallest number of books a category needs in order to be kept.
MIN_BOOKS_PER_CATEGORY = 3

# Keep only the categories that have at least MIN_BOOKS_PER_CATEGORY books.
filtered_cat_to_ids = {
	cat: ids
	for cat, ids in cat_to_ids.items()
	if len(ids) >= MIN_BOOKS_PER_CATEGORY
}
len(filtered_cat_to_ids)

346

In [55]:
# (category code, number of books) pairs, ordered by category code.
# Keys are unique, so sorting the keys first yields the same order as sorting the pairs.
cat_counts = [
	(cat, len(filtered_cat_to_ids[cat]))
	for cat in sorted(filtered_cat_to_ids)
]
cat_counts

[('000', 138),
 ('001', 20),
 ('003', 5),
 ('004', 17),
 ('005', 29),
 ('006', 8),
 ('010', 8),
 ('016', 7),
 ('020', 4),
 ('030', 4),
 ('050', 9),
 ('060', 6),
 ('069', 6),
 ('070', 16),
 ('080', 10),
 ('081', 4),
 ('082', 4),
 ('100', 658),
 ('108', 3),
 ('109', 3),
 ('110', 36),
 ('111', 22),
 ('120', 66),
 ('121', 33),
 ('126', 7),
 ('128', 22),
 ('130', 25),
 ('131', 15),
 ('133', 8),
 ('140', 42),
 ('141', 5),
 ('142', 9),
 ('144', 7),
 ('146', 4),
 ('149', 16),
 ('150', 131),
 ('152', 16),
 ('153', 23),
 ('155', 26),
 ('158', 21),
 ('160', 14),
 ('170', 71),
 ('171', 12),
 ('172', 14),
 ('174', 9),
 ('179', 5),
 ('180', 114),
 ('181', 86),
 ('184', 11),
 ('189', 4),
 ('190', 138),
 ('191', 15),
 ('192', 24),
 ('193', 52),
 ('194', 25),
 ('197', 3),
 ('200', 431),
 ('201', 6),
 ('210', 13),
 ('211', 6),
 ('220', 5),
 ('230', 15),
 ('232', 3),
 ('234', 3),
 ('240', 5),
 ('248', 5),
 ('260', 15),
 ('261', 10),
 ('266', 4),
 ('270', 13),
 ('275', 5),
 ('280', 6),
 ('289', 4),
 ('290

In [56]:
import pickle

# Write the filtered category -> book-key mapping to disk as a pickle file.
with open("cat_to_ids.pkl", "wb") as pkl_out:
	pickle.dump(filtered_cat_to_ids, pkl_out)