In [30]:
import pandas as pd
import numpy as np
import pickle
import csv

%load_ext autoreload
%autoreload 2

# Load the term -> frequency table from term_freqs.csv (two columns: term, freq).
# The file is opened with newline="" as the csv module requires, so that
# newlines embedded inside quoted fields are parsed correctly.
term_freqs = {}
with open("term_freqs.csv", "r", newline="") as f:
	for row in csv.reader(f):
		# csv.reader yields an empty list for blank lines; skip them rather
		# than failing on the two-column unpack below.
		if not row:
			continue

		term, freq = row
		term_freqs[term] = float(freq)

# Sizes K for which pickled artefacts named "<scheme>_trunc_<K>.pkl" are loaded below.
Ks = [8, 16, 32, 65, 130, 260, 520, 521]

# One slot per K for each scheme (freq / norm / bin); filled in by the loop below.
freq_truncs = dict.fromkeys(Ks)
norm_truncs = dict.fromkeys(Ks)
bin_truncs = dict.fromkeys(Ks)

# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open("cat_to_ids.pkl", "rb") as cat_file:
	cat_to_ids = pickle.load(cat_file)

# Category keys in sorted order, as an array so it can be indexed by an index array.
sorted_cats = np.array(sorted(cat_to_ids.keys()))

for k in Ks:
	# Same load order as before for each K: freq, then norm, then bin.
	targets = (
		(f"freq_trunc_{k}.pkl", freq_truncs),
		(f"norm_trunc_{k}.pkl", norm_truncs),
		(f"bin_trunc_{k}.pkl", bin_truncs),
	)

	for path, store in targets:
		with open(path, "rb") as trunc_file:
			store[k] = pickle.load(trunc_file)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
import json

# thanks, claude
# Parsed contents of ddc.json.
with open("ddc.json", "r") as ddc_file:
	ddc_mapping = json.load(ddc_file)

# Parsed contents of books_cleaned.json; later cells iterate it as a dict of
# book records and read each record's "title" and "ddn" fields.
with open("books_cleaned.json", "r") as books_file:
	books_obj = json.load(books_file)

In [26]:
from helpers import embed, doc_query, get_doc_indices_by_distance

def query_to_cats(query, svd, normalised=True):
	"""Return the entries of `sorted_cats` ordered for a text query.

	The query is embedded with `embed` (using the module-level `term_freqs`),
	passed through `doc_query` together with `svd`, and the result of
	`get_doc_indices_by_distance` is used to index `sorted_cats`.

	Args:
		query: Text to look up (callers in this notebook pass a book title).
		svd: One of the loaded `*_truncs[K]` objects; forwarded to the helpers.
		normalised: Forwarded unchanged to `embed`.

	Returns:
		`sorted_cats` indexed by the helper's index array; presumably nearest
		category first, judging by the helper's name -- confirm in `helpers`.
	"""
	query_vec = doc_query(embed(query, term_freqs, normalised), svd)
	ranking = get_doc_indices_by_distance(query_vec, svd)
	return sorted_cats[ranking]

In [49]:
def ddn_error(pred, gt):
	"""Score how far a predicted class number is from the ground truth.

	Compares the two numbers at hundreds, tens and units precision (floor
	division by 100, 10, 1) and stops at the first level where they differ.

	Args:
		pred: Predicted class number (integer).
		gt: Ground-truth class number (integer).

	Returns:
		3 minus the number of leading levels that agree: 0 for an exact match,
		1 if only the units differ, 2 if the tens differ, 3 if the hundreds differ.
	"""
	matched = 0

	for div in (100, 10, 1):
		if pred // div != gt // div:
			break

		matched += 1

	return 3 - matched

ddn_error(320, 320)

0

In [53]:
# Number of book records; used as the denominator for the per-K average errors.
book_count = len(books_obj)
book_count

8535

In [54]:
from tqdm import tqdm

# Running totals of ddn_error per K, one accumulator dict per scheme.
bin_sums = dict.fromkeys(Ks, 0)
freq_sums = dict.fromkeys(Ks, 0)
norm_sums = dict.fromkeys(Ks, 0)

# (loaded artefacts, accumulator) pairs, evaluated per book in this order:
# freq, then bin, then norm.
# NOTE(review): query_to_cats is called with its default normalised=True for
# all three schemes -- confirm that is intended for the freq and bin variants.
schemes = (
	(freq_truncs, freq_sums),
	(bin_truncs, bin_sums),
	(norm_truncs, norm_sums),
)

for K in Ks:
	for book in tqdm(books_obj.values()):
		title = book["title"]
		# Ground truth is the integer part of the book's "ddn" string.
		gt = int(book["ddn"].split(".")[0])

		for truncs, sums in schemes:
			# Score only the top-ranked category for this title.
			pred = int(query_to_cats(title, truncs[K])[0])
			sums[K] += ddn_error(pred, gt)

100%|██████████| 8535/8535 [00:37<00:00, 230.56it/s]
100%|██████████| 8535/8535 [00:41<00:00, 204.71it/s]
100%|██████████| 8535/8535 [00:53<00:00, 158.76it/s]
100%|██████████| 8535/8535 [01:34<00:00, 90.74it/s] 
100%|██████████| 8535/8535 [02:42<00:00, 52.64it/s]
100%|██████████| 8535/8535 [04:25<00:00, 32.15it/s]
100%|██████████| 8535/8535 [07:29<00:00, 18.99it/s]
100%|██████████| 8535/8535 [02:08<00:00, 66.40it/s]


In [55]:

# Mean ddn_error per book for each K, using the freq_* artefacts.
freq_avgs = {K: total / book_count for K, total in freq_sums.items()}
freq_avgs

{8: 1.833743409490334,
 16: 1.5757469244288225,
 32: 1.4679554774458115,
 65: 1.3052138254247216,
 130: 1.3330990041007615,
 260: 1.552548330404218,
 520: 1.960398359695372,
 521: 2.518804920913884}

In [56]:
# Mean ddn_error per book for each K, using the norm_* artefacts.
norm_avgs = {K: total / book_count for K, total in norm_sums.items()}
norm_avgs

{8: 1.9361452841241944,
 16: 1.7288810779144699,
 32: 1.535676625659051,
 65: 1.2787346221441125,
 130: 1.0924428822495607,
 260: 1.0678383128295255,
 520: 1.547510251903925,
 521: 1.9022847100175746}

In [57]:
# Mean ddn_error per book for each K, using the bin_* artefacts.
bin_avgs = {K: total / book_count for K, total in bin_sums.items()}
bin_avgs

{8: 1.9220855301698887,
 16: 1.7342706502636205,
 32: 1.4481546572934973,
 65: 1.1669595782073814,
 130: 1.1103690685413006,
 260: 1.2856473345049795,
 520: 1.6515524311657879,
 521: 2.1161101347393085}