In [78]:
import pandas as pd
import numpy as np
import pickle
import csv

%load_ext autoreload
%autoreload 2

# Term -> frequency table read from term_freqs.csv; every non-blank row must
# be exactly a (term, freq) pair, with freq parseable as a float.
term_freqs = {}
# newline="" is what the csv module asks for, so that it (not the text layer)
# interprets line endings, including newlines inside quoted fields.
with open("term_freqs.csv", "r", newline="") as f:
	reader = csv.reader(f)

	for row in reader:
		# csv.reader yields [] for a blank line; skip it instead of failing
		# on the two-value unpack below.
		if not row:
			continue
		term, freq = row
		term_freqs[term] = float(freq)

# The K values for which freq/norm/bin truncation pickles are loaded further down.
Ks = [8, 16, 32, 65, 130, 260, 520, 521]

# One independent placeholder dict per truncation flavour, keyed by K and
# holding None until the corresponding pickle is loaded.
freq_truncs = dict.fromkeys(Ks)
norm_truncs = dict.fromkeys(Ks)
bin_truncs = dict.fromkeys(Ks)

# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only point this at a cat_to_ids.pkl that you produced yourself.
with open("cat_to_ids.pkl", "rb") as f:
	cat_to_ids = pickle.load(f)

# Sorted array of the mapping's keys; query_to_cats indexes into this array.
sorted_cats = np.array(sorted(cat_to_ids.keys()))

# Fill the three per-K dicts from their pickles. For every k the files are
# read in the order freq, norm, bin.
for k in Ks:
	sources = (
		(f"freq_trunc_{k}.pkl", freq_truncs),
		(f"norm_trunc_{k}.pkl", norm_truncs),
		(f"bin_trunc_{k}.pkl", bin_truncs),
	)
	for path, store in sources:
		with open(path, "rb") as f:
			store[k] = pickle.load(f)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [79]:
import json

# thanks, claude
# Lookup table used by query_to_cat_names to turn a category key into the
# name it displays.
# JSON text is UTF-8 by specification; naming the encoding keeps the load from
# depending on the platform's default locale encoding.
with open("ddc.json", "r", encoding="utf-8") as f:
	ddc_mapping = json.load(f)

In [133]:
from helpers import embed, doc_query, get_doc_indices_by_distance

def query_to_cats(query, svd, normalised=True):
	"""Return the entries of ``sorted_cats`` selected for ``query`` against ``svd``.

	The query is passed through ``embed`` (with the module-level ``term_freqs``
	and the ``normalised`` flag), then through ``doc_query`` together with
	``svd``; whatever ``get_doc_indices_by_distance`` returns for that result is
	used to index ``sorted_cats``.

	NOTE(review): presumably the indices come back ordered nearest-first, so the
	result is a ranking of categories -- confirm against ``helpers``.
	"""
	embedding = embed(query, term_freqs, normalised)
	projected = doc_query(embedding, svd)
	ranking = get_doc_indices_by_distance(projected, svd)
	return sorted_cats[ranking]

def query_to_cat_names(query, svd, normalised=True):
	"""Like ``query_to_cats``, but with every category looked up in ``ddc_mapping``.

	Returns a list in the same order as ``query_to_cats`` yields its categories.
	Raises ``KeyError`` if a category has no entry in ``ddc_mapping``.
	"""
	return [ddc_mapping[cat] for cat in query_to_cats(query, svd, normalised)]

In [140]:
# Key into bin_truncs / freq_truncs / norm_truncs used by the query cells that
# follow; it has to be one of the values in Ks.
K = 32

In [141]:
q = "linear algebra"
# First five category names for q from bin_truncs, freq_truncs and norm_truncs,
# in that order.
# NOTE(review): all three calls rely on the default normalised=True -- confirm
# that is intended for bin_truncs and freq_truncs as well.
tuple(
	query_to_cat_names(q, truncs[K])[:5]
	for truncs in (bin_truncs, freq_truncs, norm_truncs)
)

(['Analysis',
  'Algebra & number theory',
  'Mathematics',
  'Probabilities & applied mathematics',
  'History of Southeast Asia'],
 ['Algebra & number theory',
  'Social processes',
  'Philosophy & theory',
  'General management',
  'Economics of land & energy'],
 ['Factors affecting social behavior',
  'Social interaction',
  'East Indo-European & Celtic languages',
  'Labor economics',
  'Algebra & number theory'])

In [142]:
q = "ambedkar"
# First five category names for q from bin_truncs, freq_truncs and norm_truncs,
# in that order.
tuple(
	query_to_cat_names(q, truncs[K])[:5]
	for truncs in (bin_truncs, freq_truncs, norm_truncs)
)

(['Social processes',
  'Culture & institutions',
  'Political science',
  'Social groups',
  'History of South Asia'],
 ['Social groups',
  'Culture & institutions',
  'Social processes',
  'Social interaction',
  'Communities'],
 ['Social groups',
  'Culture & institutions',
  'Social processes',
  'Political science',
  'History of South Asia'])

In [143]:
q = "textile industry"
# First five category names for q from bin_truncs, freq_truncs and norm_truncs,
# in that order.
tuple(
	query_to_cat_names(q, truncs[K])[:5]
	for truncs in (bin_truncs, freq_truncs, norm_truncs)
)

(['Production',
  'History of Southeast Asia',
  'Labor economics',
  'International commerce',
  'General management'],
 ['International commerce',
  'Production',
  'Systems of governments & states',
  'International migration & colonization',
  'International law'],
 ['Production',
  'World history',
  'Factors affecting social behavior',
  'Labor economics',
  'Economics of land & energy'])

In [144]:
q = "machine learning"
# First five category names for q from bin_truncs, freq_truncs and norm_truncs,
# in that order.
tuple(
	query_to_cat_names(q, truncs[K])[:5]
	for truncs in (bin_truncs, freq_truncs, norm_truncs)
)

(['Military science',
  'Chinese & East Asian philosophy',
  'Communities',
  'Sociology & anthropology',
  'Social sciences'],
 ['Military science',
  'Financial economics',
  'Probabilities & applied mathematics',
  'Constitutional & administrative law',
  'Mathematics'],
 ['Military science',
  'Chinese & East Asian philosophy',
  'Sociology & anthropology',
  'Economics',
  'History of Central Asia'])

In [145]:
q = "social media"
# First five category names for q from bin_truncs, freq_truncs and norm_truncs,
# in that order.
tuple(
	query_to_cat_names(q, truncs[K])[:5]
	for truncs in (bin_truncs, freq_truncs, norm_truncs)
)

(['Social interaction',
  'Social processes',
  'Production',
  'Communities',
  'Financial economics'],
 ['Social processes',
  'Social interaction',
  'Psychology',
  'Diseases',
  'Constitutional & administrative law'],
 ['Social interaction',
  'Algebra & number theory',
  'Social processes',
  'English drama',
  'Production'])

In [146]:
q = "first past the post"
# First five category names for q from bin_truncs, freq_truncs and norm_truncs,
# in that order.
tuple(
	query_to_cat_names(q, truncs[K])[:5]
	for truncs in (bin_truncs, freq_truncs, norm_truncs)
)

(['Political science',
  'Social groups',
  'Production',
  'English fiction',
  'History of Europe'],
 ['English fiction',
  'History of Europe',
  'Psychology',
  'Political science',
  'Social groups'],
 ['Political science',
  'Production',
  'Social groups',
  'English fiction',
  'World history'])

In [147]:
q = "marx"
# First five category names for q from bin_truncs, freq_truncs and norm_truncs,
# in that order.
tuple(
	query_to_cat_names(q, truncs[K])[:5]
	for truncs in (bin_truncs, freq_truncs, norm_truncs)
)

(['Socialism & related systems',
  'Sociology & anthropology',
  'Culture & institutions',
  'Political science',
  'Modern Western philosophy in Germany & Austria'],
 ['Socialism & related systems',
  'East & Southeast Asian literatures',
  'Philosophy & theory',
  'Social processes',
  'Modern Western philosophy in Germany & Austria'],
 ['Socialism & related systems',
  'Political science',
  'Sociology & anthropology',
  'Geography of Asia',
  'International migration & colonization'])

In [148]:
# Switch the key into bin_truncs / freq_truncs / norm_truncs for the query
# cells that follow; it has to be one of the values in Ks.
K = 130

In [149]:
q = "linear algebra"
# First five category names for q from bin_truncs, freq_truncs and norm_truncs,
# in that order.
tuple(
	query_to_cat_names(q, truncs[K])[:5]
	for truncs in (bin_truncs, freq_truncs, norm_truncs)
)

(['Algebra & number theory',
  'Analysis',
  'Probabilities & applied mathematics',
  'Other Germanic literatures',
  'English essays'],
 ['Algebra & number theory',
  'Differential & developmental psychology',
  'Modern Western philosophy in British Isles',
  'Civil & political rights',
  'Applied physics'],
 ['Algebra & number theory',
  'Probabilities & applied mathematics',
  'Analysis',
  'Indoor games & amusements',
  'Topology'])

In [150]:
q = "ambedkar"
# First five category names for q from bin_truncs, freq_truncs and norm_truncs,
# in that order.
tuple(
	query_to_cat_names(q, truncs[K])[:5]
	for truncs in (bin_truncs, freq_truncs, norm_truncs)
)

(['Civil & political rights',
  'Social processes',
  'Culture & institutions',
  'Political science',
  'Social groups'],
 ['Civil & political rights',
  'Social processes',
  'Culture & institutions',
  'Social groups',
  'Human physiology'],
 ['Civil & political rights',
  'Social groups',
  'Culture & institutions',
  'Social processes',
  'Political science'])

In [151]:
q = "textile industry"
# First five category names for q from bin_truncs, freq_truncs and norm_truncs,
# in that order.
tuple(
	query_to_cat_names(q, truncs[K])[:5]
	for truncs in (bin_truncs, freq_truncs, norm_truncs)
)

(['International commerce',
  'Production',
  'Education',
  'Public finance',
  'International law'],
 ['Public finance',
  'International commerce',
  'Education',
  'Philosophy & theory of history',
  'Production'],
 ['Textile arts',
  'International commerce',
  'Production',
  'Philosophy & theory of fine & decorative arts',
  'International law'])

In [152]:
q = "machine learning"
# First five category names for q from bin_truncs, freq_truncs and norm_truncs,
# in that order.
tuple(
	query_to_cat_names(q, truncs[K])[:5]
	for truncs in (bin_truncs, freq_truncs, norm_truncs)
)

(['Military science',
  'Probabilities & applied mathematics',
  'Schools & their activities',
  'Philosophy & theory of science',
  'Philosophy & theory of language'],
 ['Schools & their activities',
  'Differential & developmental psychology',
  'Philosophy & theory of language',
  'Data processing & computer science',
  'History of the ancient world'],
 ['Military science',
  'Schools & their activities',
  'Philosophy & theory of language',
  'Probabilities & applied mathematics',
  'Mathematics'])

In [153]:
q = "social media"
# First five category names for q from bin_truncs, freq_truncs and norm_truncs,
# in that order.
tuple(
	query_to_cat_names(q, truncs[K])[:5]
	for truncs in (bin_truncs, freq_truncs, norm_truncs)
)

(['News media, journalism & publishing',
  'Constitutional & administrative law',
  'Social interaction',
  'Social processes',
  'Occupational & professional ethics'],
 ['Social interaction',
  'News media, journalism & publishing',
  'Advertising & public relations',
  'History of Germany',
  'Constitutional & administrative law'],
 ['Social interaction',
  'News media, journalism & publishing',
  'Constitutional & administrative law',
  'Social processes',
  'Production'])

In [154]:
q = "first past the post"
# First five category names for q from bin_truncs, freq_truncs and norm_truncs,
# in that order.
tuple(
	query_to_cat_names(q, truncs[K])[:5]
	for truncs in (bin_truncs, freq_truncs, norm_truncs)
)

(['Mental processes & intelligence',
  'Education',
  'International migration & colonization',
  'The political process',
  'Applied physics'],
 ['Mental processes & intelligence',
  'International migration & colonization',
  'The political process',
  'Applied physics',
  'Education'],
 ['Mental processes & intelligence',
  'International migration & colonization',
  'The political process',
  'Political science',
  'History of France'])

In [155]:
q = "marx"
# First five category names for q from bin_truncs, freq_truncs and norm_truncs,
# in that order.
tuple(
	query_to_cat_names(q, truncs[K])[:5]
	for truncs in (bin_truncs, freq_truncs, norm_truncs)
)

(['Historical & geographic treatments of sociology',
  'Philosophy & theory',
  'Modern Western philosophy in Germany & Austria',
  'Socialism & related systems',
  'German fiction'],
 ['Socialism & related systems',
  'Historical & geographic treatments of sociology',
  'Modern Western philosophy in Germany & Austria',
  'Philosophy & theory',
  'Altaic, Ural-Altaic, Dravidian literatures'],
 ['Socialism & related systems',
  'Modern Western philosophy in Germany & Austria',
  'Political science',
  'Sociology & anthropology',
  'Culture & institutions'])