In [1]:
#!pip install transformers
import torch
from transformers import BertTokenizer,BertModel, BertConfig
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np 
import scipy as sp 
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest
from scipy.sparse import lil_matrix
from scipy.sparse import coo_matrix


In [2]:
from evaluate import *
from process import *

In [3]:
# Fit a CountVectorizer on the corpus file. Iterating the file handle yields one
# line at a time, so each line of the file is treated as one document.
# The `with` block guarantees the handle is closed once fitting is done
# (the original left it open for the lifetime of the kernel).
vectorizer = CountVectorizer()
with open("data/brown.txt") as f:
    vectorizer.fit(f)
# Mapping token -> integer index, reused below as row/column index.
vocab = vectorizer.vocabulary_
# Callable that splits a string into tokens using the vectorizer's token pattern.
tokenizer = vectorizer.build_tokenizer()

In [112]:
def file_to_words(filename):
    """Read a text file and return its contents as a list of lines.

    Despite the name, no word splitting happens here: every element of the
    returned list is one raw line of the file, including its trailing newline
    (if it has one).

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    list of str
        The lines of the file, in order. Empty list for an empty file.
    """
    with open(filename, "r") as source:
        # Iterating a text file yields exactly what readlines() would return.
        return list(source)
# Raw lines of the corpus file; each list element is one line of data/brown.txt.
train_lines = file_to_words("data/brown.txt")

In [115]:
def cooc_matrix(train_lines, context_window, vocab, tokenizer):
    """Count word/context co-occurrences within a symmetric window.

    For every token in every line, each token at distance 1..context_window to
    its left or right (within the same line) is counted once as a context of
    that token. Tokens are lower-cased before being looked up in ``vocab``.

    Parameters
    ----------
    train_lines : sequence of str
        Text lines; windows never extend across line boundaries.
    context_window : int
        Number of neighbouring tokens considered on each side.
    vocab : dict
        Maps lower-cased token -> integer index used as row/column.
    tokenizer : callable
        Takes a line and returns its list of tokens.

    Returns
    -------
    scipy.sparse.lil_matrix
        Matrix of shape (len(vocab), len(vocab)); entry (w, c) is the number of
        times c appeared in the window around w. When ``train_lines`` is empty
        an empty list is returned instead (kept for backward compatibility).

    Raises
    ------
    KeyError
        If a lower-cased token is not a key of ``vocab``.
    """
    if len(train_lines) == 0:
        return []
    r = len(vocab.keys())
    # Accumulate counts in plain dicts (row index -> {column index -> count}).
    # This gives O(1) updates instead of the linear list scans needed when
    # poking entries directly into the LIL row lists.
    counts = {}
    for i, raw_line in enumerate(train_lines):
        line = tokenizer(raw_line)
        print(f"Generating cooc_matrix on line {i+1}: {round(i/len(train_lines)*100,1)} percent complete", end="\r")
        # Resolve every token to its vocabulary index once per line.
        indices = [vocab[token.lower()] for token in line]
        n_tokens = len(indices)
        for j, word_index in enumerate(indices):
            row_counts = counts.setdefault(word_index, {})
            first = max(0, j - context_window)
            stop = min(n_tokens, j + context_window + 1)
            for pos in range(first, stop):
                if pos != j:
                    context_index = indices[pos]
                    row_counts[context_index] = row_counts.get(context_index, 0) + 1
    D = lil_matrix((r, r))
    # LIL format requires the column indices of each row to be sorted; fill the
    # (initially empty) row lists in sorted order so the matrix stays valid for
    # indexing and other sparse operations.
    for word_index, row_counts in counts.items():
        columns = sorted(row_counts)
        D.rows[word_index].extend(columns)
        D.data[word_index].extend(float(row_counts[c]) for c in columns)
    return D

In [116]:
# Co-occurrence counts over the corpus lines, using a window of 2 tokens on each side.
D = cooc_matrix(train_lines,2,vocab, tokenizer)



In [117]:
# Per-row totals of the co-occurrence matrix as a 2-D column array
# (D_to_M reads them as D_sum[i][0]).
D_sum = np.asarray(D.sum(axis=1))
# Grand total of all co-occurrence counts.
nD = D.sum()

In [118]:
def D_to_M(D,D_sum,nD):
    """Convert a sparse co-occurrence count matrix into a sparse PMI matrix.

    Every stored entry (w, c) of ``D`` is replaced, in a copy, by

        log(D[w, c] * nD / (D_sum[w] * D_sum[c]))

    Entries that are not stored in ``D`` stay unstored (implicitly zero).

    Parameters
    ----------
    D : scipy.sparse.lil_matrix
        Co-occurrence counts. Not modified.
    D_sum : 2-D array-like
        Row totals of ``D``; read as ``D_sum[i][0]``.
    nD : number
        Sum of all entries of ``D``.

    Returns
    -------
    scipy.sparse.lil_matrix
        Copy of ``D`` with the stored values replaced by their PMI.

    Raises
    ------
    AssertionError
        If a stored count is zero (its logarithm would be -inf).
    """
    print("creating PMI as lil_matrix")
    M = D.copy()
    rows = M.rows
    data = M.data
    for i in range(len(rows)):
        print(f"Generating PMI: {round(i/len(rows)*100,1)} percent complete", end="\r")
        columns = rows[i]
        values = data[i]
        word_total = D_sum[i][0]
        for j, context in enumerate(columns):
            # context = vocabulary index of c, i = vocabulary index of w
            assert values[j] != 0
            # A PMI of exactly 0 (observed == expected) is a legitimate result,
            # so the value is not asserted to be non-zero after the log.
            values[j] = np.log(values[j] * nD / word_total / D_sum[context][0])
    print("PMI created")
    return M
    

In [119]:
# Turn the raw co-occurrence counts into PMI values (sparse, same shape as D).
M = D_to_M(D,D_sum,nD)

creating PMI as lil_matrix
PMI created


In [120]:
def M_to_svd(M,dimension,random_state=None):
    """Compute a rank-``dimension`` truncated SVD  M ~= U @ diag(S) @ V.

    Parameters
    ----------
    M : array-like or sparse matrix, shape (n_rows, n_cols)
        Matrix to factorise.
    dimension : int
        Number of singular values/vectors to keep.
    random_state : int, RandomState instance or None, optional
        Passed to TruncatedSVD. Its default solver is randomized, so pass a
        fixed value to get reproducible factors. Default None keeps the
        previous (unseeded) behaviour.

    Returns
    -------
    U : ndarray, shape (n_rows, dimension)
        Left singular vectors.
    S : ndarray, shape (dimension,)
        Singular values.
    V : ndarray, shape (dimension, n_cols)
        Right singular vectors, already transposed (rows are components).
    """
    svd = TruncatedSVD(n_components=dimension, random_state=random_state)
    # fit_transform returns the projected data, i.e. U * S, not U itself.
    projected = svd.fit_transform(M)
    S = svd.singular_values_
    V = svd.components_
    # Divide the singular values back out to recover U; columns belonging to a
    # zero singular value are left as zeros instead of producing inf/nan.
    U = np.divide(projected, S, out=np.zeros_like(projected), where=S != 0)
    return U,S,V

In [140]:
# Factorise the PMI matrix, keeping 50 components.
u, s_diag, vt = M_to_svd(M, 50)
# Square matrix with the singular values on the diagonal and zeros elsewhere.
s = np.diag(s_diag)
print(u.shape, s.shape, vt.shape)

(42396, 50) (50, 50) (50, 42396)


In [142]:
# Word vectors: scale each column of u by the square root of its singular value.
W = u @ np.sqrt(s)
print(W.shape)

(42396, 50)


In [143]:
# Map every vocabulary word to its row of W (vocab values are row indices).
embedding = {}
for word, row_index in vocab.items():
    embedding[word] = W[row_index]

In [146]:
# Write the embeddings as text, one word per line: "<word> <v1> <v2> ...".
# The `with` block closes the file even if a write fails part-way through.
with open("svd_50_2.txt","w", encoding='utf8') as f:
    for key, value in embedding.items():
        # Join the components with single spaces (no trailing space).
        values = " ".join(f"{component}" for component in value)
        f.write("{0} {1}\n".format(str(key), values))
    print(f.name)

svd_50_2.txt


In [149]:
# Sanity check of the tokenizer built from the default CountVectorizer:
# single-character tokens such as the initials "S" and "J" are dropped,
# so only 'Perelman' survives (see the output below).
tokenizer("S. J. Perelman")

['Perelman']