We now create our term-document matrix, as required for LSI. Here, our terms are all the (lemmatized) English words appearing in book titles, and the DDN categories are the documents. Our term-document matrix then has the form:

$$(A)_{i, j} = f(i, j)$$

where $f(i, j)$ is the frequency of the $i^{th}$ term in the combined titles of all the books in the $j^{th}$ DDN category. For now, we take this to be the raw frequency, i.e. the number of occurrences.

In [84]:
import json
import csv
import nltk

%load_ext autoreload
%autoreload 2

# Read the cleaned book records; a later cell iterates this as book id -> record.
books_obj = {}
with open("books_cleaned.json", "r") as books_file:
	books_obj = json.loads(books_file.read())

# Fetch the WordNet corpus; the call's return value is shown as the cell output.
nltk.download("wordnet")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\santr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Collecting all terms

In [85]:
from collections import Counter
from helpers import get_term_freqs

# Sanity check on one raw title: display the term counts the helper returns for it.
sample_title = "rethinking knowledges within higher education :Adorno and social justice"
get_term_freqs(sample_title)

Counter({'rethinking': 1,
         'knowledge': 1,
         'within': 1,
         'higher': 1,
         'education': 1,
         'adorno': 1,
         'social': 1,
         'justice': 1})

In [86]:
# Per-book term counts (keyed by book id) and the corpus-wide totals.
ids_to_term_freqs = {}
term_freqs = Counter()

# `book_id` rather than `id`, so the builtin id() is not shadowed in the kernel namespace.
for book_id, book in books_obj.items():
	freqs = get_term_freqs(book["title"])

	ids_to_term_freqs[book_id] = freqs
	# Counter.update adds the counts in place and only touches this title's terms;
	# `+=` additionally rescans the whole accumulated counter on every iteration
	# to drop non-positive counts, which makes the loop needlessly quadratic.
	term_freqs.update(freqs)

In [87]:
# Vocabulary size: the number of distinct keys accumulated in term_freqs.
len(term_freqs)

10103

In [88]:
# (term, count) pairs from most to least frequent; ties keep the counter's iteration order.
sorted_by_freq = term_freqs.most_common()
[term for term, _count in sorted_by_freq]

['india',
 'history',
 'world',
 'indian',
 'china',
 'life',
 'theory',
 'politics',
 'modern',
 'introduction',
 'war',
 'new',
 'political',
 'social',
 'study',
 'story',
 'asia',
 'philosophy',
 'edited',
 'how',
 'culture',
 'power',
 'essay',
 'south',
 'science',
 'society',
 'century',
 'development',
 'chinese',
 'art',
 'people',
 'early',
 'economic',
 'revolution',
 'state',
 'it',
 'global',
 'time',
 'analysis',
 'empire',
 'writing',
 'american',
 'other',
 'literature',
 'economy',
 'guide',
 'contemporary',
 'work',
 'making',
 'novel',
 'great',
 'at',
 'british',
 'short',
 'democracy',
 'cultural',
 'international',
 'policy',
 'man',
 'economics',
 'language',
 'age',
 'human',
 'religion',
 'english',
 'critical',
 'perspective',
 'psychology',
 'is',
 'pakistan',
 'woman',
 'nation',
 'ancient',
 'year',
 'handbook',
 'america',
 'colonial',
 'first',
 'selected',
 'book',
 'europe',
 'thought',
 'law',
 'education',
 'future',
 'through',
 'my',
 'change',
 'ou

In [89]:
# (term, count) rows in ascending term order; the row order is what a later cell
# uses to assign each term its index.
sorted_term_freqs = sorted(term_freqs.items())

with open("term_freqs.csv", "w", newline = "") as csv_file:
	csv.writer(csv_file).writerows(sorted_term_freqs)

In [90]:
import pickle

# Save the per-book term counts so they can be reloaded without recomputing them.
with open("ids_to_term_freqs.pkl", "wb") as pickle_file:
	pickle.dump(ids_to_term_freqs, pickle_file)

### Setting up the termdoc matrix

In [91]:
import pandas as pd
import numpy as np

import pickle
import csv

# term -> row index, category -> book ids, and book id -> term counts.
term_indices = {}
cat_to_ids = {}
ids_to_term_freqs = {}

# newline="" is how the csv module asks for its input files to be opened.
with open("term_freqs.csv", "r", newline="") as f:
	reader = csv.reader(f)

	# Each row is (term, count); the count column is not needed here. It is bound
	# to `_freq` rather than `_`, because a top-level `_` in IPython overrides the
	# last-output shortcut for the rest of the session.
	for term, _freq in reader:
		# Terms are indexed in file order: the n-th distinct term gets index n.
		term_indices[term] = len(term_indices)

# NOTE(review): this prints the entire mapping; consider a short preview instead.
print(term_indices)
term_count = len(term_indices)

# NOTE(review): pickle.load can execute arbitrary code; only load files this project wrote.
with open("cat_to_ids.pkl", "rb") as f:
	cat_to_ids = pickle.load(f)

# A fixed (sorted) category order, used later as the column order of the matrix.
sorted_cats = sorted(list(cat_to_ids.keys()))
cat_count = len(sorted_cats)

with open("ids_to_term_freqs.pkl", "rb") as f:
	ids_to_term_freqs = pickle.load(f)

{'aayurved': 0, 'abandoned': 1, 'abanindranath': 2, 'abbe': 3, 'abbey': 4, 'abd': 5, 'abdoumaliq': 6, 'abhidharmakosa': 7, 'abhiyatri': 8, 'abid': 9, 'abigail': 10, 'abisaab': 11, 'able': 12, 'abnormal': 13, 'abolition': 14, 'abolitionist': 15, 'aboriginal': 16, 'aborigine': 17, 'about': 18, 'abouzeid': 19, 'above': 20, 'abraham': 21, 'abrahams': 22, 'abroad': 23, 'absence': 24, 'absent': 25, 'absolutism': 26, 'absolutist': 27, 'abstract': 28, 'abundance': 29, 'abuse': 30, 'abyss': 31, 'academic': 32, 'accelerating': 33, 'acceleration': 34, 'acceptance': 35, 'access': 36, 'accessing': 37, 'accession': 38, 'accident': 39, 'accidental': 40, 'accommodation': 41, 'accompany': 42, 'accomplice': 43, 'according': 44, 'account': 45, 'accounting': 46, 'accounts': 47, 'accumulation': 48, 'accuse': 49, 'acer': 50, 'achaemenid': 51, 'acharnians': 52, 'achen': 53, 'achieve': 54, 'achievement': 55, 'achieving': 56, 'achintya': 57, 'achyut': 58, 'acquaintances': 59, 'acquiring': 60, 'acquisition': 61

In [92]:
# One row per term, one column per category (columns follow sorted_cats order).
freq_termdoc = np.zeros((term_count, cat_count))

for col, cat in enumerate(sorted_cats):
	# `book_id` rather than `id`, so the builtin id() is not shadowed.
	for book_id in cat_to_ids[cat]:
		# Add this book's term counts straight into its category's column.
		for term, freq in ids_to_term_freqs[book_id].items():
			freq_termdoc[term_indices[term], col] += freq

freq_termdoc

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [93]:
# Total count of each term across all categories (row sums).
freqs = np.sum(freq_termdoc, axis = 1)

# Divide every row by its total so that each term's row sums to 1. A row that is
# all zeros (e.g. a term that only occurs in books absent from every category)
# would otherwise give 0/0 = NaN with a RuntimeWarning; such rows are left as zeros.
row_totals = freqs[:, None]
norm_termdoc = np.divide(
	freq_termdoc,
	row_totals,
	out = np.zeros_like(freq_termdoc, dtype = np.float64),
	where = row_totals != 0,
)

In [94]:
# 1.0 where the term occurs at least once in the category, 0.0 elsewhere.
binary_termdoc = (freq_termdoc > 0).astype(np.float64)
binary_termdoc

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [95]:
# Persist each weighting of the term-document matrix under its own file name.
matrices_by_path = {
	"frequency_termdoc.pkl": freq_termdoc,
	"binary_termdoc.pkl": binary_termdoc,
	"norm_termdoc.pkl": norm_termdoc,
}

for out_path, matrix in matrices_by_path.items():
	with open(out_path, "wb") as out_file:
		pickle.dump(matrix, out_file)