In [139]:
import pandas as pd
import numpy as np
import pickle
import csv

%load_ext autoreload
%autoreload 2

def _load_pickle(path):
	"""Return the object unpickled from the file at `path`."""
	# NOTE(review): pickle.load can execute arbitrary code while loading, so
	# these files must be trusted artefacts (presumably produced by an earlier
	# step of this project -- confirm before running on files from elsewhere).
	with open(path, "rb") as f:
		return pickle.load(f)

# Raw-frequency term-document matrix; its shape defines M and N below.
freq_termdoc = _load_pickle("frequency_termdoc.pkl")

# term -> integer frequency, one "term,freq" row per line.
# The csv module documentation requires newline="" on files handed to
# csv.reader, otherwise newlines embedded in quoted fields are misread.
with open("term_freqs.csv", "r", newline="") as f:
	term_freqs = {term: int(freq) for term, freq in csv.reader(f)}

cat_to_ids = _load_pickle("cat_to_ids.pkl")

# Binary and normalised variants of the term-document matrix (per file names).
bin_termdoc = _load_pickle("binary_termdoc.pkl")
norm_termdoc = _load_pickle("norm_termdoc.pkl")

# presumably M = number of terms (rows) and N = number of documents (columns)
# -- TODO confirm against the code that built the matrices.
M, N = freq_termdoc.shape
M, N

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


(10103, 521)

### SVD Generation

In [140]:
# Reduced SVD (full_matrices=False) of each term-document variant, computed
# in the same order as before: frequency, binary, normalised.
freq_svd, bin_svd, norm_svd = [
	np.linalg.svd(matrix, full_matrices=False)
	for matrix in (freq_termdoc, bin_termdoc, norm_termdoc)
]

In [141]:
# Persist each decomposition to its own pickle file.
for _svd_path, _svd_result in (
	("freq_termdoc_svd.pkl", freq_svd),
	("bin_termdoc_svd.pkl", bin_svd),
	("norm_termdoc_svd.pkl", norm_svd),
):
	with open(_svd_path, "wb") as _out:
		pickle.dump(_svd_result, _out)

### Loading & Plotting

In [142]:
# Read the three pickled decompositions back in (frequency, binary, normalised).
_loaded_svds = []
for _svd_path in ("freq_termdoc_svd.pkl", "bin_termdoc_svd.pkl", "norm_termdoc_svd.pkl"):
	with open(_svd_path, "rb") as _src:
		_loaded_svds.append(pickle.load(_src))

freq_svd, bin_svd, norm_svd = _loaded_svds

In [143]:
from helpers import rank_drop_indices

# Pass the singular values of the frequency SVD to the project helper.
# NOTE(review): presumably this reports the indices at which the singular-value
# spectrum drops sharply -- confirm against helpers.rank_drop_indices.
rank_drop_indices(freq_svd.S)

array([519])

In [144]:
# Same helper, applied to the singular values of the normalised SVD.
rank_drop_indices(norm_svd.S)

array([519])

In [145]:
# Same helper, applied to the singular values of the binary SVD.
rank_drop_indices(bin_svd.S)

array([519])

In [146]:
import plotly.express as px

def plot_term_space(svd, a=0, b=1):
	"""Scatter-plot every term on two singular-value-scaled components.

	Parameters
	----------
	svd : object indexable with the keys "U" and "S"
		Read as ``svd["U"][:, k]`` and ``svd["S"][k]``.
	a, b : int
		Indices of the components drawn on the x and y axes.

	Point labels are the keys of the module-level ``term_freqs`` in insertion
	order; this assumes that order matches the row order of ``svd["U"]``
	-- TODO confirm against the code that built the matrices.
	"""
	fig = px.scatter(
		x=svd["U"][:, a] * svd["S"][a],
		y=svd["U"][:, b] * svd["S"][b],
		text=list(term_freqs.keys()),
	)
	# Name the plotted components so the figure stands on its own when skimmed.
	fig.update_layout(
		title=f"Term space: component {a} vs component {b}",
		xaxis_title=f"U[:, {a}] * S[{a}]",
		yaxis_title=f"U[:, {b}] * S[{b}]",
	)
	fig.show()

def plot_doc_space(svd, a=0, b=1):
	"""Scatter-plot every document on two singular-value-scaled components.

	Parameters
	----------
	svd : object indexable with the keys "Vh" and "S"
		Read as ``svd["Vh"][k, :]`` and ``svd["S"][k]``.
	a, b : int
		Indices of the components drawn on the x and y axes.

	Point labels are the sorted keys of the module-level ``cat_to_ids``; this
	assumes the sorted order matches the column order of ``svd["Vh"]``
	-- TODO confirm against the code that built the matrices.
	"""
	fig = px.scatter(
		x=svd["Vh"][a, :] * svd["S"][a],
		y=svd["Vh"][b, :] * svd["S"][b],
		text=sorted(cat_to_ids.keys()),
	)
	# Name the plotted components so the figure stands on its own when skimmed.
	fig.update_layout(
		title=f"Document space: component {a} vs component {b}",
		xaxis_title=f"Vh[{a}, :] * S[{a}]",
		yaxis_title=f"Vh[{b}, :] * S[{b}]",
	)
	fig.show()

### Frequency Based TermDoc

The singular values are fairly similar in magnitude until the last few, so a natural choice would be $K = N - i$ for some small $i$. That, however, would barely reduce the rank (i.e. no compression of "semantic meaning" to deal with polysemy and synonymy), so I pick an arbitrary reduction, $K = N // 16$, and take the best rank-$K$ approximation to $A$ by truncating the SVD to its first $K$ terms.

In [147]:
# Truncation rank: one sixteenth of N (floor division), chosen arbitrarily.
K = N // 16
K

32

In [148]:
from helpers import truncate_svd

# Truncate the frequency SVD with the project helper, passing K as the rank.
# NOTE(review): the result is read with the "S" key below, whereas the full SVD
# is read via attributes (.S) -- confirm truncate_svd's return type.
freq_trunc = truncate_svd(freq_svd, K)

# Sanity check: count of strictly positive singular values after truncation.
(freq_trunc["S"] > 0).sum()

np.int64(32)

In [149]:
# Terms from the truncated frequency SVD on the default components (a=0, b=1).
plot_term_space(freq_trunc)

In [150]:
# Documents from the truncated frequency SVD on the default components (a=0, b=1).
plot_doc_space(freq_trunc)

In [151]:
# Save the truncated frequency decomposition to disk.
with open("freq_trunc.pkl", "wb") as out_file:
	pickle.dump(freq_trunc, file=out_file)

### Normalised TermDoc

Same procedure as for the frequency-based matrix: truncate the SVD to $K = N // 16$ components.

In [152]:
# Same truncation rank as in the frequency section (N // 16), recomputed here.
K = N // 16
K

32

In [153]:
# Truncate the normalised SVD with the project helper, passing K as the rank.
norm_trunc = truncate_svd(norm_svd, K)

# Sanity check: count of strictly positive singular values after truncation.
(norm_trunc["S"] > 0).sum()

np.int64(32)

In [154]:
# Terms from the truncated normalised SVD on the default components (a=0, b=1).
plot_term_space(norm_trunc)

In [155]:
# Documents from the truncated normalised SVD on the default components (a=0, b=1).
plot_doc_space(norm_trunc)

In [156]:
# Save the truncated normalised decomposition to disk.
with open("norm_trunc.pkl", "wb") as out_file:
	pickle.dump(norm_trunc, file=out_file)

### Binary TermDoc

Same procedure as for the frequency-based matrix: truncate the SVD to $K = N // 16$ components.

In [157]:
# Same truncation rank as in the frequency section (N // 16), recomputed here.
K = N // 16
K

32

In [158]:
# Truncate the binary SVD with the project helper, passing K as the rank.
bin_trunc = truncate_svd(bin_svd, K)

# Sanity check: count of strictly positive singular values after truncation.
(bin_trunc["S"] > 0).sum()

np.int64(32)

In [159]:
# Terms from the truncated binary SVD on the default components (a=0, b=1).
plot_term_space(bin_trunc)

In [160]:
# Documents from the truncated binary SVD on the default components (a=0, b=1).
plot_doc_space(bin_trunc)

In [161]:
# Save the truncated binary decomposition to disk.
with open("bin_trunc.pkl", "wb") as out_file:
	pickle.dump(bin_trunc, file=out_file)