In [1]:
import warnings
warnings.filterwarnings('ignore')
import re
import numpy as np

In [2]:
# Read the whole text8 corpus into memory and lowercase it. The context
# manager closes the file handle as soon as the read finishes instead of
# leaving it open for the lifetime of the kernel.
with open('../DataSets/text8','r') as corpusFile:
    doc = corpusFile.read().lower()

In [3]:
len(doc)

100000000

In [4]:
# Tokenize by splitting on runs of non-word characters. The pattern is a raw
# string so that `\W` reaches the regex engine untouched; in a plain string
# literal it is an invalid escape sequence that recent Python versions warn about.
# NOTE(review): if the text starts or ends with a non-word character,
# re.split yields an empty-string token at that end — confirm this is acceptable.
corpus_list = re.split(r'\W+', doc)

In [5]:
len(corpus_list)

17005208

In [6]:
from collections import Counter

# Tokens that occur fewer than cutOffValue times are dropped from the corpus.
cutOffValue = 100
# Counter tallies every token in a single pass (keys keep first-seen order)
# and, unlike a defaultdict, does not insert a zero entry when an unseen key
# is looked up later.
frequency = Counter(corpus_list)
processedCorpus_list = [token for token in corpus_list
                        if frequency[token] >= cutOffValue]

In [7]:
len(processedCorpus_list)

15471435

In [8]:
len(corpus_list)-len(processedCorpus_list)

1533773

In [9]:
len(frequency)

253855

In [10]:
# Parallel arrays built from the frequency table: allWords[i] was counted
# allCounts[i] times (a dict iterates its keys and values in the same order).
allWords = np.array([*frequency])
allCounts = np.array([*frequency.values()])

In [11]:
len(allWords)

253855

In [12]:
# Boolean mask of the words that meet the frequency cutoff; applying the same
# mask to both arrays keeps each word aligned with its count.
isFrequent = allCounts >= cutOffValue
vocab = allWords[isFrequent]
wordCounts = allCounts[isFrequent]

In [13]:
len(vocab)

11815

In [14]:
from scipy.sparse import lil_matrix

In [15]:
def computeWordContextMatrix(corpus_list,vocab=None,windowSize=2):
    """Count how often each word occurs within a window around each other word.

    Parameters
    ----------
    corpus_list : sequence of str
        The tokenized corpus, in reading order.
    vocab : sequence of str, optional
        Words that define the rows/columns of the matrix, in index order.
        When omitted, the sorted set of distinct tokens in corpus_list is used.
    windowSize : int, default 2
        Number of neighbouring tokens on each side that count as context.

    Returns
    -------
    M : numpy.ndarray of shape (len(vocab), len(vocab))
        M[i, j] is the number of times word j appeared in the context window
        of an occurrence of word i.
    W2I : dict
        Maps each vocabulary word to its row/column index.
    I2W : dict
        Maps each index back to its vocabulary word.

    Raises
    ------
    KeyError
        If corpus_list contains a token that is not in vocab.
    """
    if vocab is None:
        # Previously this referenced the misspelled name `cospus_list`, so the
        # default-vocabulary path always failed with a NameError.
        vocab = sorted(set(corpus_list))
    numWords = len(vocab)
    # Dense numWords x numWords float matrix. For a vocabulary too large for
    # that, a scipy.sparse.lil_matrix((numWords,numWords)) can be substituted
    # here and filled in pieces.
    M = np.zeros((numWords,numWords))
    W2I = dict(zip(vocab,np.arange(numWords)))
    I2W = dict(zip(np.arange(numWords),vocab))
    # Translate tokens to indices once, so the counting loop works on integers
    # and does not build a new context list for every position.
    tokenIds = [W2I[token] for token in corpus_list]
    docLen = len(tokenIds)
    for curIdx,currentWordIdx in enumerate(tokenIds):
        # Clip the window at both ends of the corpus.
        left = max(curIdx-windowSize,0)
        right = min(curIdx+windowSize+1,docLen)
        for contextPos in range(left,right):
            if contextPos != curIdx:  # a word is not its own context
                M[currentWordIdx,tokenIds[contextPos]] += 1
    return M,W2I,I2W

In [16]:
# Build the co-occurrence counts over the frequency-filtered corpus, using the
# function's default context window of 2 tokens on each side.
M,W2I,I2W = computeWordContextMatrix(processedCorpus_list,vocab)

In [17]:
M.shape

(11815, 11815)

In [18]:
# Sanity check: mapping a word to its index and back must return the same
# word. The round trip goes through W2I rather than a hard-coded index, which
# would silently point at a different word if the vocabulary order changed.
word = 'good'
print(W2I[word],I2W[W2I[word]])

190 good


In [19]:
# Row of M for 'good': its co-occurrence count with every vocabulary word.
v = M[W2I['good'],:]
print(v)

[  0.   0. 214. ...   0.   0.   0.]


In [20]:
v.shape

(11815,)

In [21]:
def pmi(M, positive=True):
    """Convert a co-occurrence count matrix into pointwise mutual information.

    Each cell becomes log(observed / expected), where the expected count is
    row_total * col_total / grand_total.

    Parameters
    ----------
    M : array-like of shape (n_rows, n_cols)
        Co-occurrence counts. The input is not modified.
    positive : bool, default True
        When True, negative scores are clipped to 0 (positive PMI).

    Returns
    -------
    numpy.ndarray
        Matrix of the same shape as M holding the (P)PMI scores. Cells whose
        score is undefined — zero observed count, or a row/column with a zero
        total — are set to 0.
    """
    col_totals = np.sum(M,axis=0)
    row_totals = np.sum(M,axis=1)
    total = col_totals.sum()
    # A zero count gives log(0) = -inf, and a zero row/column total gives
    # 0/0 = NaN; both are expected here, so silence the warnings and clean
    # the values up below.
    with np.errstate(divide='ignore', invalid='ignore'):
        expected = np.outer(row_totals, col_totals) / total
        scores = np.log(M / expected)
    # np.isinf alone would leave NaN cells in the result; isfinite covers both.
    scores[~np.isfinite(scores)] = 0.0
    if positive:
        scores[scores < 0] = 0.0
    return scores

In [22]:
# NOTE: this rebinds M from raw co-occurrence counts to PMI scores, so
# re-running this cell applies pmi() to already-transformed values. Re-run the
# cell that builds M before running this one again.
M = pmi(M)

In [23]:
M[W2I['good'],:]

array([0.       , 0.       , 0.4605286, ..., 0.       , 0.       ,
       0.       ])

# Latent Semantic Analysis (LSA)

In [24]:
from sklearn.decomposition import TruncatedSVD, PCA, IncrementalPCA # IncrementalPCA is used when the matrix is very-very large

In [25]:
# Reduce each word vector to 100 dimensions. The default solver is randomized,
# so the seed is fixed to make the reduced vectors reproducible across runs.
transformer = TruncatedSVD(n_components=100, random_state=0)

In [26]:
# Fit the SVD on M and get back one reduced vector per row (word) of M.
M_reduced = transformer.fit_transform(M)

In [27]:
# Compare the matrix shape before and after the reduction.
print(M.shape)
print(M_reduced.shape)

(11815, 11815)
(11815, 100)


In [28]:
M_reduced[W2I['good'],:]

array([ 2.35845596e+01, -3.84230766e+00, -3.72714416e+00, -4.18267271e+00,
       -4.50620746e+00, -7.88498605e+00, -1.00250428e+00, -3.38177731e+00,
        5.66646779e-02, -2.34174101e+00,  7.71492772e+00,  2.24916639e+00,
       -2.86843204e+00,  3.30230636e+00,  5.32936647e-01, -1.46204547e-01,
       -6.91937925e+00,  4.46866966e+00, -8.75815601e-01, -3.49978788e-01,
        1.34576279e+00,  7.95122650e-01,  1.87877530e+00,  3.97482641e+00,
       -2.33570176e+00,  8.38519708e-01, -1.33045643e+00,  2.25237190e+00,
        1.38415064e+00,  1.27790100e-01, -4.07878609e+00,  3.86462373e+00,
       -4.65722705e-01,  4.52661183e+00,  7.57365387e-01,  1.40180331e+00,
        1.09452951e+00, -3.19721482e+00,  1.76780719e+00, -2.49646052e+00,
        3.53053455e-01, -1.86064764e+00, -1.42075634e+00,  2.66949691e+00,
       -1.08486799e+00, -1.78399429e+00,  1.51149971e+00,  1.76061359e+00,
       -2.95793700e-01,  1.30829525e+00,  3.59065938e+00, -1.09109433e+00,
       -4.65868452e-02,  

# Word Semantics

## Cosine Similarity

In [29]:
def getNorms(E):
    """Return the Euclidean (L2) norm of every row of E.

    E is either a single 1-D vector, which is treated as a one-row matrix,
    or a 2-D matrix holding one vector per row. The result is a 1-D array
    with one norm per row.
    """
    rows = E if E.ndim != 1 else E[np.newaxis,:]
    squaredLengths = (rows**2).sum(axis=1)
    return squaredLengths**0.5