In [1]:
import pandas as pd
import numpy as np
import pickle
import csv

%load_ext autoreload
%autoreload 2

def _load_pickle(path):
	"""Return the object stored in the pickle file at ``path``.

	NOTE: ``pickle.load`` can execute arbitrary code, so this must only be
	pointed at files produced by this project's own pipeline.
	"""
	with open(path, "rb") as f:
		return pickle.load(f)

# The three term-document matrix variants and the category mapping.
freq_termdoc = _load_pickle("frequency_termdoc.pkl")
cat_to_ids = _load_pickle("cat_to_ids.pkl")
bin_termdoc = _load_pickle("binary_termdoc.pkl")
norm_termdoc = _load_pickle("norm_termdoc.pkl")

# term -> integer frequency, in the order the rows appear in the CSV.
term_freqs = {}
# newline="" is what the csv module asks for, so that it (not the text layer)
# handles line endings and newlines embedded in quoted fields.
with open("term_freqs.csv", "r", newline="") as f:
	for row in csv.reader(f):
		# csv.reader yields [] for blank lines; skip them instead of
		# failing on the two-value unpack below.
		if not row:
			continue
		term, freq = row
		term_freqs[term] = int(freq)

# Dimensions of the frequency matrix, displayed as the cell output.
M, N = freq_termdoc.shape
M, N

(10103, 521)

### SVD Generation

In [2]:
# Thin SVD (full_matrices=False) of each term-document variant, evaluated in
# the order frequency, binary, normalised.
freq_svd, bin_svd, norm_svd = (
	np.linalg.svd(termdoc, full_matrices=False)
	for termdoc in (freq_termdoc, bin_termdoc, norm_termdoc)
)

# Shapes of the three factors of the frequency decomposition.
freq_svd.U.shape, freq_svd.S.shape, freq_svd.Vh.shape

((10103, 521), (521,), (521, 521))

In [3]:
# Persist each decomposition to its own pickle file.
for svd_path, svd_result in (
	("freq_termdoc_svd.pkl", freq_svd),
	("bin_termdoc_svd.pkl", bin_svd),
	("norm_termdoc_svd.pkl", norm_svd),
):
	with open(svd_path, "wb") as f:
		pickle.dump(svd_result, f)

### Loading & Plotting

In [4]:
# Read the three pickled decompositions back, in the order
# frequency, binary, normalised.
loaded_svds = []
for svd_path in ("freq_termdoc_svd.pkl", "bin_termdoc_svd.pkl", "norm_termdoc_svd.pkl"):
	with open(svd_path, "rb") as f:
		loaded_svds.append(pickle.load(f))

freq_svd, bin_svd, norm_svd = loaded_svds

In [5]:
# First entry and last ten entries of the frequency matrix's singular values.
# np.linalg.svd returns them in descending order, so this shows the largest
# value next to the ten smallest.
freq_svd.S[0], freq_svd.S[-10:]

(np.float64(277.76468701758984),
 array([1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 9.99550860e-01,
        9.09783759e-01, 8.82445132e-01, 8.78410569e-01, 8.15545374e-01,
        7.32179115e-01, 1.23346057e-13]))

In [6]:
# First entry and last ten entries of the normalised matrix's singular values.
# np.linalg.svd returns them in descending order, so this shows the largest
# value next to the ten smallest.
norm_svd.S[0], norm_svd.S[-10:]

(np.float64(20.281099870796524),
 array([8.53412975e-02, 7.21586991e-02, 7.15620045e-02, 6.56573025e-02,
        6.27162366e-02, 4.79656917e-02, 3.06044837e-02, 2.75136884e-02,
        2.65705158e-02, 9.00661762e-15]))

In [7]:
# First entry and last ten entries of the binary matrix's singular values.
# np.linalg.svd returns them in descending order, so this shows the largest
# value next to the ten smallest.
bin_svd.S[0], bin_svd.S[-10:]

(np.float64(60.8847596295952),
 array([1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        9.98291054e-01, 8.97522237e-01, 8.89976191e-01, 8.37791016e-01,
        7.62934966e-01, 2.46848957e-16]))

In [8]:
# Ratio of the first to the last singular value of the binary matrix; with
# np.linalg.svd's descending ordering this is largest / smallest, i.e. the
# 2-norm condition number.
bin_svd.S[0] / bin_svd.S[-1]

np.float64(2.4664783055092896e+17)

In [9]:
from helpers import rank_drop_indices

# rank_drop_indices comes from the project-local helpers module (not shown
# here). Presumably it reports the positions where the singular values fall
# off relative to the 1.0 argument -- TODO confirm against helpers.py.
rank_drop_indices(freq_svd.S, 1.0)

array([519])

In [10]:
# Run the project helper on the normalised matrix's singular values with the
# 1.0 argument (helper semantics live in helpers.py -- not shown here).
rank_drop_indices(norm_svd.S, 1.0)

array([519])

In [11]:
# Run the project helper on the binary matrix's singular values with the
# 1.0 argument (helper semantics live in helpers.py -- not shown here).
rank_drop_indices(bin_svd.S, 1.0)

array([519])

In [38]:
import kaleido
import plotly.express as px

def plot_term_space(svd, a=0, b=1):
	"""Scatter every term in the plane of two scaled left singular vectors.

	Args:
		svd: Mapping with a 2-D "U" entry and a 1-D "S" entry, as indexed
			below. Rows of "U" are assumed to follow the key order of the
			module-level ``term_freqs`` dict -- TODO confirm.
		a: Column of "U" (and entry of "S") plotted on the x-axis.
		b: Column of "U" (and entry of "S") plotted on the y-axis.

	Returns:
		The plotly figure, which has also been displayed via ``fig.show()``.
	"""
	fig = px.scatter(
		x=svd["U"][:, a] * svd["S"][a],
		y=svd["U"][:, b] * svd["S"][b],
		text=list(term_freqs.keys()),
		# Without explicit labels plotly titles the axes just "x" and "y",
		# which makes the exported image unreadable on its own.
		labels={"x": f"U[:, {a}] * S[{a}]", "y": f"U[:, {b}] * S[{b}]"},
		title=f"Term space (components {a} and {b})",
	)
	fig.show()

	return fig

def plot_doc_space(svd, a=0, b=1):
	"""Scatter every document in the plane of two scaled right singular vectors.

	Args:
		svd: Mapping with a 2-D "Vh" entry and a 1-D "S" entry, as indexed
			below. Columns of "Vh" are assumed to follow the sorted keys of
			the module-level ``cat_to_ids`` dict -- TODO confirm.
		a: Row of "Vh" (and entry of "S") plotted on the x-axis.
		b: Row of "Vh" (and entry of "S") plotted on the y-axis.

	Returns:
		The plotly figure, which has also been displayed via ``fig.show()``.
	"""
	fig = px.scatter(
		x=svd["Vh"][a, :] * svd["S"][a],
		y=svd["Vh"][b, :] * svd["S"][b],
		text=sorted(cat_to_ids.keys()),
		# Without explicit labels plotly titles the axes just "x" and "y".
		labels={"x": f"Vh[{a}, :] * S[{a}]", "y": f"Vh[{b}, :] * S[{b}]"},
		title=f"Document space (components {a} and {b})",
	)
	fig.show()

	return fig

### Frequency Based TermDoc

The singular values are fairly consistent in magnitude until the last few, which suggests $K = N - 1$. That choice, however, gives almost no reduction in rank (i.e. no compression of "semantic meaning" to deal with polysemy and synonymy), so I pick an arbitrary shrink, $K = \lfloor N / c \rfloor$ for some constant $c$, and take the best rank-$K$ approximation to $A$ by truncating the SVD to $K$ terms. (Note: the cell below currently sets $K = 520 = N - 1$.)

In [34]:
# Number of SVD terms passed to truncate_svd in the cells below. With the
# N = 521 columns reported by the shape check in the first cell, this is N - 1.
K = 520
K

520

In [35]:
from helpers import truncate_svd

# Truncate the frequency SVD to K terms with the project helper.
freq_trunc = truncate_svd(freq_svd, K)

# How many singular values are strictly positive after truncation.
np.sum(freq_trunc["S"] > 0)

np.int64(520)

In [39]:
# Draw the frequency-based term space, then export the figure as a PNG.
fig = plot_term_space(freq_trunc)
fig.write_image("freq_term_space.png", width=1000)

In [37]:
# Trailing semicolon: plot_doc_space already calls fig.show(), so suppress the
# automatic display of the returned figure to avoid rendering the plot twice.
plot_doc_space(freq_trunc);

In [28]:
# Persist the truncated frequency SVD; the file name records the K in use.
freq_trunc_path = f"freq_trunc_{K}.pkl"
with open(freq_trunc_path, mode="wb") as pkl_file:
	pickle.dump(freq_trunc, pkl_file)

### Normalised TermDoc

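Same procedure as for the frequency-based matrix: truncate the SVD to $K$ terms, then plot the term and document spaces.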

In [29]:
# Truncate the normalised SVD to K terms with the project helper.
norm_trunc = truncate_svd(norm_svd, K)

# How many singular values are strictly positive after truncation.
np.sum(norm_trunc["S"] > 0)

np.int64(520)

In [30]:
# Draw the normalised term space, then export the figure as a PNG.
fig = plot_term_space(norm_trunc)
fig.write_image("norm_term_space.png", width=1000)

In [31]:
# Trailing semicolon: plot_doc_space already calls fig.show(), so suppress the
# automatic display of the returned figure to avoid rendering the plot twice.
plot_doc_space(norm_trunc);

In [140]:
# Persist the truncated normalised SVD; the file name records the K in use.
norm_trunc_path = f"norm_trunc_{K}.pkl"
with open(norm_trunc_path, mode="wb") as pkl_file:
	pickle.dump(norm_trunc, pkl_file)

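### Binary TermDoc

Same procedure as for the frequency-based matrix: truncate the SVD to $K$ terms, then plot the term and document spaces.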

In [22]:
# Truncate the binary SVD to K terms with the project helper.
bin_trunc = truncate_svd(bin_svd, K)

# How many singular values are strictly positive after truncation.
np.sum(bin_trunc["S"] > 0)

np.int64(520)

In [32]:
# Draw the binary term space, then export the figure as a PNG.
fig = plot_term_space(bin_trunc)
fig.write_image("bin_term_space.png", width=1000)

In [143]:
# Trailing semicolon: plot_doc_space already calls fig.show(), so suppress the
# automatic display of the returned figure to avoid rendering the plot twice.
plot_doc_space(bin_trunc);

In [144]:
# Persist the truncated binary SVD; the file name records the K in use.
bin_trunc_path = f"bin_trunc_{K}.pkl"
with open(bin_trunc_path, mode="wb") as pkl_file:
	pickle.dump(bin_trunc, pkl_file)