We now create our term-document matrix, as required for LSI. Here, our terms are simply all the english terms appearing in book titles (lemmatized), and DDN categories are the documents. Our term-doc matrix then has the form:

$$(A)_{i, j} = f(i, j)$$

Where $f(i, j)$ is the frequency of the $i^{th}$ term in the combined titles of all the books in the $j^{th}$ DDN category. For now, let's imagine that this is the raw frequency.

In [90]:
import json
import csv
import nltk

books_obj = {}
with open("books_cleaned.json", "r") as f:
	books_obj = json.load(f)

nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\santr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Collecting all terms

In [91]:
from collections import Counter

tokenizer = nltk.tokenize.TreebankWordTokenizer()
lemmatizer = nltk.WordNetLemmatizer()

def get_term_freqs(title):
	toks = [lemmatizer.lemmatize(tok).lower() for tok in tokenizer.tokenize(title) if tok.isalpha()]
	return Counter(toks)

get_term_freqs("rethinking knowledges within higher education :Adorno and social justice")

Counter({'rethinking': 1,
         'knowledge': 1,
         'within': 1,
         'higher': 1,
         'education': 1,
         'adorno': 1,
         'and': 1,
         'social': 1,
         'justice': 1})

In [92]:
ids_to_term_freqs = {}
term_freqs = Counter()

for id, book in books_obj.items():
	freqs = get_term_freqs(book["title"])

	ids_to_term_freqs[id] = freqs
	term_freqs += freqs

In [93]:
len(term_freqs)

10115

In [98]:
sorted_term_freqs = sorted(list(term_freqs.items()))

with open("term_freqs.csv", "w", newline = "") as f:
	writer = csv.writer(f)

	writer.writerows(sorted_term_freqs)

In [95]:
import pickle

with open("ids_to_term_freqs.pkl", "wb") as f:
	pickle.dump(ids_to_term_freqs, f)

### Setting up the termdoc matrix

In [121]:
import pandas as pd
import numpy as np

import pickle
import csv

term_indices = {}
cat_to_ids = {}
ids_to_term_freqs = {}

with open("term_freqs.csv", "r") as f:
	reader = csv.reader(f)

	for term, _ in reader:
		term_indices[term] = len(term_indices)

print(term_indices)
term_count = len(term_indices)

with open("cat_to_ids.pkl", "rb") as f:
	cat_to_ids = pickle.load(f)

sorted_cats = sorted(list(cat_to_ids.keys()))
cat_count = len(sorted_cats)

with open("ids_to_term_freqs.pkl", "rb") as f:
	ids_to_term_freqs = pickle.load(f)

{'a': 0, 'aayurved': 1, 'abandoned': 2, 'abanindranath': 3, 'abbe': 4, 'abbey': 5, 'abd': 6, 'abdoumaliq': 7, 'abhidharmakosa': 8, 'abhiyatri': 9, 'abid': 10, 'abigail': 11, 'abisaab': 12, 'able': 13, 'abnormal': 14, 'abolition': 15, 'abolitionist': 16, 'aboriginal': 17, 'aborigine': 18, 'about': 19, 'abouzeid': 20, 'above': 21, 'abraham': 22, 'abrahams': 23, 'abroad': 24, 'absence': 25, 'absent': 26, 'absolutism': 27, 'absolutist': 28, 'abstract': 29, 'abundance': 30, 'abuse': 31, 'abyss': 32, 'academic': 33, 'accelerating': 34, 'acceleration': 35, 'acceptance': 36, 'access': 37, 'accessing': 38, 'accession': 39, 'accident': 40, 'accidental': 41, 'accommodation': 42, 'accompany': 43, 'accomplice': 44, 'according': 45, 'account': 46, 'accounting': 47, 'accounts': 48, 'accumulation': 49, 'accuse': 50, 'acer': 51, 'achaemenid': 52, 'acharnians': 53, 'achen': 54, 'achieve': 55, 'achievement': 56, 'achieving': 57, 'achintya': 58, 'achyut': 59, 'acquaintances': 60, 'acquiring': 61, 'acquisi

In [222]:
freq_termdoc = np.zeros((term_count, cat_count))

for i in range(cat_count):
	cat = sorted_cats[i]

	embedding = np.zeros(term_count)

	for id in cat_to_ids[cat]:
		for term, freq in ids_to_term_freqs[id].items():
			embedding[term_indices[term]] += freq

	freq_termdoc[:, i] = embedding

freq_termdoc

array([[24.,  3.,  1., ...,  2.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [223]:
binary_termdoc = (freq_termdoc > 0) * 1.
binary_termdoc.astype(np.float64)

array([[1., 1., 1., ..., 1., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [224]:
with open("frequency_termdoc.pkl", "wb") as f:
	pickle.dump(freq_termdoc, f)

with open("binary_termdoc.pkl", "wb") as f:
	pickle.dump(binary_termdoc, f)