In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.sparse import coo_matrix
from scipy.sparse import dok_matrix
import scipy.sparse.linalg as ssl
import scipy.sparse as scs
from tqdm import tqdm

In [2]:
%load_ext cython

First we'll load in the data. Using the Jeapardy data as it's small.

In [3]:
data = pd.read_csv('./data/JEOPARDY_CSV.csv')
data = data[:1000]
data.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [29]:
data.values[0]

array([4680, '2004-12-31', 'Jeopardy!', 'HISTORY', '$200',
       "For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory",
       'Copernicus'], dtype=object)

In [4]:
m = len(data)

In [5]:
%%cython

import numpy as np
cimport numpy as np
import scipy.sparse as scs
from scipy.sparse import dok_matrix

alphabet = 'abcdefghijklmnopqrstuvwxyz'

def unique_words(list sentences):
    cdef dict words = {}
    cdef int n = len(sentences)
    cdef int i, j
    for i in range(n):
        sent_list = [w.lower() for w in sentences[i].split(' ')]
        clean_sent_list = []
        for j in range(len(sent_list)):
            newword = ''
            for char in sent_list[j]:
                if char in alphabet:
                    newword += char
            clean_sent_list.append(newword)
        for word in clean_sent_list:
            if word != '':
                try:
                    words[word] += 1
                except KeyError:
                    words[word] = 1
    wordlist = sorted(words.keys())
    return wordlist, len(wordlist), words

# Use tf-idf
# https://en.wikipedia.org/wiki/Tf%E2%80%93idf
def populate_doc_matrix(docmatrix, wordlist, word_freq, np.ndarray data):
    cdef int n = len(data)   # number of documents
    cdef int i, j, k, m
    # construct word index first
    # This tells us (for any word) what index it is in in document
    print('Constructing Word Reference')
    wordref = {}
    for i in range(len(wordlist)):
        wordref[wordlist[i]] = i
    # Now populate sparse matrix
    print('Populating Sparse Matrix')
    for i in range(n):
        for j in range(2):
            words = [w.lower() for w in data[i, j].split(' ') if w != '']
            m = len(words)
            for k in range(m):
                word = words[k]
                cword = ''
                for char in word:
                    if char in alphabet:
                        cword += char
                if cword != '':
                    docmatrix[i, wordref[cword]] += 1
    # finish weighting
    print('Weighting Matrix')
    m, n = docmatrix.shape
    weighted_docmatrix = dok_matrix((m, n), dtype=float)
    for i in range(n):
        weighted_docmatrix[:, i] = docmatrix[:, i] * np.log(m / word_freq[wordlist[i]])
    return weighted_docmatrix, wordref

In [6]:
words, n, wordfreq = unique_words(list(np.concatenate((data[[' Question']].values[:, 0],
                                  data[[' Answer']].values[:, 0]))))

In [15]:
print('{} Documents (m) by {} Unique Words (n)\n\nTop 100 Most Frequent Words:{}'.format(
        m, n, ','.join([tup[0] for tup in sorted(wordfreq.items(), key=lambda tup: -tup[1])[:100]])))

1000 Documents (m) by 5244 Unique Words (n)

Top 100 Most Frequent Words:the,this,of,a,in,to,for,is,on,was,its,from,as,with,that,an,his,you,these,he,by,it,at,first,one,name,or,city,and,named,state,i,s,are,john,man,country,us,who,have,be,your,has,word,like,new,her,not,seen,called,when,hrefhttpwwwjarchivecommediadjjpg,had,out,were,here,about,can,clue,known,all,show,she,war,but,years,th,if,which,crew,make,now,film,made,wrote,series,may,type,island,more,used,area,than,began,queen,most,also,book,some,term,became,flag,said,part,river,youre,little,george,whose,him


In [8]:
docmatrix = dok_matrix((m, n), dtype=float)   # m-docs, n-unique words

In [9]:
ndocterm, wordref = populate_doc_matrix(docmatrix, words, wordfreq,
                                data[[' Question', ' Answer']].values)

Constructing Word Reference
Populating Sparse Matrix
Weighting Matrix


In [10]:
ndocterm

<1000x5244 sparse matrix of type '<class 'numpy.float64'>'
	with 14157 stored elements in Dictionary Of Keys format>

In [38]:
u, s, vt = ssl.svds(ndocterm.T, k=20)
u.shape, s.shape, vt.shape

((5244, 20), (20,), (20, 1000))

In [37]:
np.save('umatrix.npy', u)
np.save('smatrix.npy', s)
np.save('vtmatrix.npy', vt)

Now that we have our $k$th-order decomposition, let's query the word "Species".

In [187]:
wordref['species']

4384

In [34]:
!ls

data	  satrix.npy   Testing.html   umatrix.npy
proposal  smatrix.npy  Testing.ipynb  vtmatrix.npy


In [35]:
np.load('./umatrix.npy')

array([[-0.05452245, -0.00216872,  0.00831265, ...,  0.02053064,
         0.01959931, -0.03883994],
       [ 0.02404747, -0.01370522,  0.02219454, ...,  0.00463428,
         0.00127721, -0.02248268],
       [ 0.01949402,  0.00652029,  0.00995533, ..., -0.01139821,
         0.01296431, -0.02839199],
       ..., 
       [ 0.04080266, -0.00356032, -0.01054592, ..., -0.00705405,
         0.00661091, -0.02353832],
       [-0.01053368,  0.04073003, -0.01327792, ...,  0.02682542,
         0.00499936, -0.02426648],
       [ 0.00029609, -0.01917808, -0.02569479, ..., -0.00053521,
         0.03182219, -0.03529649]])