In [None]:
!pip install memory_profiler
!pip install line_profiler

In [2]:
import numpy as np
import math
from scipy.sparse import csr_matrix, csc_matrix
from scipy.sparse.linalg import svds, eigs
from matplotlib import pyplot as plt
from sklearn.manifold import TSNE

In [3]:
# Load the IPython profiling extensions; memory_profiler supplies the %memit
# magic used in the benchmark cell further down.
%load_ext memory_profiler
%load_ext line_profiler
# Input files (paths presumably point at a mounted Google Drive in Colab — adjust as needed).
# f1: text corpus that is tokenised on whitespace and counted line by line.
# f2: evaluation file; each line is read as "<word1> <word2> <numeric score>".
f1 = "/content/drive/MyDrive/enwiki8.txt"
f2 = "/content/drive/MyDrive/wordsim353_human_scores.txt"

In [4]:
def parseUniqueWordsWcount(filename):
  """Count the occurrences of every whitespace-separated token in a file.

  Args:
    filename: path of the text file to read (read fully into memory).

  Returns:
    dict mapping each token to the number of times it appears; keys are in
    first-seen order.
  """
  with open(filename, "r") as corpus:
    tokens = corpus.read().split()
    counts = {}
    for token in tokens:
      # .get supplies 0 the first time a token is seen
      counts[token] = counts.get(token, 0) + 1
    return counts

In [5]:
def parseUniqueWords(filename):
  """Collect the distinct tokens of a file that do not contain a ".".

  The "." filter is what drops decimal scores such as "7.35"; a token without
  a dot (e.g. an integer score like "10") is kept as if it were a word.

  Args:
    filename: path of the text file to read.

  Returns:
    list of the surviving tokens, each once, in first-seen order.
  """
  with open(filename, "r") as file2:
    tokens = file2.read().split()
  # dict.fromkeys de-duplicates in one pass while keeping first-seen order,
  # avoiding a linear list scan for every token.
  return list(dict.fromkeys(token for token in tokens if "." not in token))

In [6]:
def genSparseData(filename, uniqword):
  """Build a sparse word-by-line count matrix from a text file.

  Entry (r, c) is the number of times uniqword[r] occurs (as a
  whitespace-separated token) on line c of the file.

  Args:
    filename: path of the text file; every line becomes one column.
    uniqword: list of vocabulary words; position in the list is the row index.
      If a word is listed more than once, its first position is used.

  Returns:
    scipy.sparse.csr_matrix of dtype float with shape
    (len(uniqword), number_of_lines). The shape is set explicitly so that
    rows/columns without any count (unused trailing words, empty trailing
    lines) are still present and row indices stay aligned with uniqword.
  """
  with open(filename, "r") as file1:
    doculines = file1.readlines()
  # One-off word -> row lookup; setdefault keeps the first index, matching
  # what list.index would return.
  word_row = {}
  for pos, word in enumerate(uniqword):
    word_row.setdefault(word, pos)
  rows = []
  cols = []
  vals = []
  for i, line in enumerate(doculines):
    # Count only the vocabulary words that actually occur on this line
    line_counts = {}
    for word in line.split():
      row = word_row.get(word)
      if row is not None:
        line_counts[row] = line_counts.get(row, 0) + 1
    # Progress check since this takes the longest parsing through the file
    if i % 50000 == 0:
      print(i)
    rows.extend(line_counts.keys())
    cols.extend([i] * len(line_counts))
    vals.extend(line_counts.values())
  # conversion to csr sparse matrix format
  datmat = csr_matrix((vals, (rows, cols)),
                      shape=(len(uniqword), len(doculines)), dtype=float)
  return datmat

In [7]:
def libSVD(spmat ,kval):
  """Truncated SVD of a matrix via scipy.sparse.linalg.svds.

  Args:
    spmat: matrix to decompose (passed straight to svds).
    kval: number of singular triplets to compute.

  Returns:
    The 3-tuple produced by svds: left singular vectors, singular values and
    the third factor exactly as svds returns it.
  """
  return svds(spmat, k=kval)

In [8]:
def matrix_factor(x, displacement, kval, max_iter=50, seed=None):
  """Factorise x as u.T @ v by alternating regularised least-squares solves.

  Starting from a random v, each iteration solves
    (v v^T + displacement*I) u = v x^T   and then
    (u u^T + displacement*I) v = u x.
  Separately, the kval eigenvalues of x @ x.T found by eigs are returned on
  the diagonal of a matrix.

  Args:
    x: 2-D matrix (sparse or dense) of shape (m, n).
    displacement: regularisation added to the diagonal of each kval x kval system.
    kval: number of factors (rows of u and v) and of eigenvalues.
    max_iter: number of alternating updates (default 50, the previous fixed value).
    seed: optional seed for the random start of v. None (default) draws from
      NumPy's global random state, exactly as before.

  Returns:
    (u, evalmat, v): u has shape (kval, m), v has shape (kval, n), and evalmat
    is a kval x kval diagonal matrix of the eigenvalues returned by eigs
    (complex dtype, as eigs produces).

  Raises:
    ValueError: if max_iter is smaller than 1.
  """
  if max_iter < 1:
    raise ValueError("max_iter must be at least 1")
  # A private RandomState makes a run repeatable without touching global state
  rand_source = np.random if seed is None else np.random.RandomState(seed)
  v = rand_source.rand(kval, x.shape[1])
  ridge = displacement * np.identity(kval)
  u = None
  for i in range(max_iter):
    # solve() applies the inverse without forming it explicitly
    u = np.linalg.solve(v @ v.T + ridge, v @ x.T)
    v = np.linalg.solve(u @ u.T + ridge, u @ x)
    # iteration progress check
    if i % 10 == 0:
      print(i)
  # NOTE(review): these eigenvalues are computed independently of u and v, so
  # their order is not tied to the order of the factor rows.
  eigvals, _ = eigs(x @ x.T, k=kval)
  evalmat = np.diag(eigvals)
  return u, evalmat, v

In [21]:
def correlationScores(filename, testerr, allwords):
  """Pair human similarity scores with cosine similarities of word vectors.

  Every non-blank line of the file is read as "<word1> <word2> <score>".
  For each line the rows of testerr belonging to the two words are compared
  with the cosine similarity (real part only, so complex vectors are accepted).

  Args:
    filename: path of the scored word-pair file.
    testerr: 2-D array whose row i is the vector of allwords[i].
    allwords: list of words; position gives the row in testerr (first
      occurrence wins if a word is repeated).

  Returns:
    (humanscores, coscore): two 1-D numpy arrays of equal length, in file order.

  Raises:
    ValueError: if a word in the file is not contained in allwords.
  """
  # Build the word -> row lookup once instead of scanning the list per word
  row_of = {}
  for pos, word in enumerate(allwords):
    row_of.setdefault(word, pos)
  humanscores = []
  coscore = []
  with open(filename, "r") as file2:
    for line in file2:
      fields = line.split()
      if not fields:
        # blank (e.g. trailing) lines carry no pair
        continue
      try:
        first_row = row_of[fields[0]]
        second_row = row_of[fields[1]]
      except KeyError as missing:
        raise ValueError(
            "word {!r} is not in allwords".format(missing.args[0])) from None
      fvec = np.asarray(testerr[first_row]).ravel()
      svec = np.asarray(testerr[second_row]).ravel()
      # plain (unconjugated) dot product divided by the two vector lengths
      cosine = np.dot(fvec, svec) / (np.linalg.norm(fvec) * np.linalg.norm(svec))
      humanscores.append(float(fields[2]))
      coscore.append(cosine.real)
  humanscores = np.array(humanscores)
  coscore = np.array(coscore)
  return humanscores, coscore

In [10]:
def PCC(x, y):
  """Pearson correlation coefficient of two equally long 1-D sequences.

  Args:
    x: first sequence of numbers.
    y: second sequence of numbers.

  Returns:
    The correlation coefficient as a numpy float (nan if either input has
    zero variance).
  """
  # Take covariance and both variances from the same np.cov matrix so that
  # numerator and denominator share one normalisation. Mixing np.cov
  # (default ddof=1) with np.std (default ddof=0) inflates the result by
  # n/(n-1) and can push it outside [-1, 1].
  cov = np.cov(x, y)
  score = cov[0][1] / np.sqrt(cov[0][0] * cov[1][1])
  return score

In [11]:
def PPMI(smat):
  """Positive pointwise mutual information weighting of a count matrix.

  For every non-zero entry f_ij the value
    max(0, log(f_ij * total / (rowsum_i * colsum_j)))
  is computed, where total is the sum of all entries.

  Args:
    smat: 2-D matrix of counts (scipy sparse, any format). It is not modified.

  Returns:
    scipy.sparse.csr_matrix (float64) of the same shape holding only the
    strictly positive PMI values.
  """
  # All sums
  total = float(smat.sum())
  sumr = np.asarray(smat.sum(axis=1), dtype=np.float64).ravel()
  sumc = np.asarray(smat.sum(axis=0), dtype=np.float64).ravel()
  # used values: read the stored entries from a private COO copy instead of
  # fancy-indexing the sparse matrix element by element
  coo = csr_matrix(smat).tocoo(copy=True)
  coo.sum_duplicates()
  keep = coo.data != 0
  ii = coo.row[keep]
  jj = coo.col[keep]
  fij = coo.data[keep].astype(np.float64)
  # ppmi value
  pmi = np.log(fij * total / (sumr[ii] * sumc[jj]))
  ppmi = np.maximum(0, pmi)
  # reshape to original matrix
  newmat = csr_matrix((ppmi, (ii, jj)), shape=smat.shape, dtype=np.float64)
  # purge 0s
  newmat.eliminate_zeros()
  return newmat

In [12]:
# Token -> frequency over the whole corpus file
uniword = parseUniqueWordsWcount(f1)
# Keep the 10000 most frequent tokens; the stable sort leaves equally
# frequent tokens in first-seen order.
toptk = [
    token
    for token, _ in sorted(uniword.items(), key=lambda item: item[1], reverse=True)[:10000]
]

In [13]:
# Words from the evaluation file, so each of them gets a row in the matrix
extrawords = parseUniqueWords(f2)
# Final vocabulary: the frequent tokens followed by every evaluation word
# that is not among them yet (each added once, in file order).
_known = set(toptk)
allwords = toptk + [word for word in dict.fromkeys(extrawords) if word not in _known]

In [None]:
# Word-by-line count matrix: one row per entry of allwords, one column per corpus line
smatrix = genSparseData(filename=f1, uniqword=allwords)

In [15]:
# Convert the count matrix to CSC before weighting (original rationale: it has
# more columns than rows). tocsc() returns a new matrix; smatrix stays as built.
tempmat = smatrix.tocsc()
# PPMI-weighted version of the counts; PPMI hands back a CSR matrix of the same shape.
jmatrix = PPMI(tempmat)

In [None]:
# Choose the matrix to factorise: smatrix is the original count matrix,
# jmatrix is the PPMI-weighted matrix.
usemat = smatrix
# For each rank k, time (%time) and memory-profile (%memit) the library SVD
# (results l*<k>) and the iterative matrix_factor routine with displacement
# 0.1 (results s*<k>).
k = 20
%time %memit lu20, le20, lv20 = libSVD(usemat,k)
%time %memit su20, se20, sv20 = matrix_factor(usemat, 0.1, k)
k = 50
%time %memit lu50, le50, lv50 = libSVD(usemat,k)
%time %memit su50, se50, sv50 = matrix_factor(usemat, 0.1, k) 
k = 100
%time %memit lu100, le100, lv100 = libSVD(usemat,k)
%time %memit su100, se100, sv100 = matrix_factor(usemat, 0.1, k)  

In [None]:
# Word vectors from matrix_factor at k=100: su100.T has one row per word, and
# each column is scaled by the matching diagonal entry of se100.
# NOTE(review): se100 holds eigenvalues of x @ x.T computed separately from the
# iterative factors, so pairing column i with eigenvalue i looks arbitrary —
# confirm this weighting is intended.
wordvec = su100.T @ se100
# Cosine similarity of every scored word pair next to its human score
humanscores, coscores = correlationScores(f2, wordvec, allwords)
# Pearson correlation between the human scores and the cosine similarities
tester = PCC(humanscores, coscores)
print(tester)

In [None]:
# Same evaluation for the library SVD at k=100: the word vectors are the rows
# of lu100 with each column scaled by the matching singular value in le100.
# Overwrites humanscores, coscores and tester from the previous cell.
humanscores,coscores = correlationScores(f2, lu100 @ np.diag(le100), allwords)
tester = PCC(humanscores, coscores)
print(tester)

In [19]:
# 2-D t-SNE embedding (fixed random_state) of the first 300 rows of usemat
tsne = TSNE(n_components=2, random_state=0, method="exact")
o = usemat[:300]
# Transpose so that tsne_results[0] / tsne_results[1] are the two coordinate rows
tsne_results = tsne.fit_transform(o).T

In [None]:
# Scatter plot of the 2-D embedding, one point per word row
fig, ax = plt.subplots(figsize=(12,12))
ax.scatter(tsne_results[0], tsne_results[1])
# Label each point with the word whose row it represents
for i, txt in enumerate(allwords[:300]):
    ax.annotate(txt, (tsne_results[0][i], tsne_results[1][i]))
# Title and axis labels so the figure can be read on its own
ax.set_title("t-SNE projection of the first 300 word rows")
ax.set_xlabel("t-SNE dimension 1")
ax.set_ylabel("t-SNE dimension 2");