In [1]:
#!pip install transformers
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np 
import scipy as sp 
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest
from scipy.sparse import lil_matrix
import pandas as pd 

In [2]:
from evaluate import *
from process import *

In [14]:
# f = open("data/brown.txt")
# vectorizer = CountVectorizer()
# vectorizer.fit(f)
# vocab = vectorizer.vocabulary_
# tokenizer = vectorizer.build_tokenizer()

In [15]:
def file_to_words(filename):
    with open (filename, "r") as myfile:
        lines=myfile.readlines()
        return lines
# train_lines = file_to_words("data/brown.txt")

In [16]:
def cooc_matrix(train_lines, context_window, vocab, tokenizer):
    if len(train_lines) == 0:
        return []
    r = len(vocab.keys())
    D = lil_matrix((r,r))
    for i in range(len(train_lines)):
        line = tokenizer(train_lines[i])
        print(f"Generating cooc_matrix on line {i+1}: {round(i/len(train_lines)*100,1)} percent complete", end="\r")
        for j in range(len(line)):
            word = line[j].lower()
            word_index = vocab[word]
            for k in range(1,context_window+1):
                if j+k < len(line):
                    context_index = vocab[line[j+k].lower()]
                    if not context_index in D.rows[word_index]:
                        D.rows[word_index].append(context_index)
                        D.data[word_index].append(1)
                    else:
                        D.data[word_index][D.rows[word_index].index(context_index)] += 1
                if j-k >= 0:
                    context_index = vocab[line[j-k].lower()]
                    if not context_index in D.rows[word_index]:
                        D.rows[word_index].append(context_index)
                        D.data[word_index].append(1)
                    else:
                        D.data[word_index][D.rows[word_index].index(context_index)] += 1
    return D

In [17]:
# D = cooc_matrix(train_lines,2,vocab, tokenizer)

In [18]:
# D_sum = D.sum(1).A
# nD = D.sum()

In [19]:
def D_to_M(D,D_sum,nD):
    n = len(D_sum)
    print("creating PMI as lil_matrix")
    M = D.copy()
    rows = M.rows
    data = M.data
    for i in range(len(rows)):
        print(f"Generating PMI: {round(i/len(rows)*100,1)} percent complete", end="\r")
        row = rows[i]
        for j in range(len(row)):
            # row[j] = index in the vocabulary dictionary for c, i = index for w
            assert data[i][j] != 0
            data[i][j] = np.log(data[i][j] * nD / D_sum[i][0] / D_sum[row[j]][0])
            assert data[i][j] != 0
    print("PMI created")
    return M
    

In [20]:
# M = D_to_M(D,D_sum,nD)

In [21]:
def M_to_svd(M,dimension):
    svd = TruncatedSVD(n_components=dimension)
    U = svd.fit_transform(M)
    V = svd.components_
    S = svd.singular_values_ 
    return U,S,V

In [22]:
# u,s_diag,vt = M_to_svd(M,50)
# s = np.zeros((len(s_diag),len(s_diag)))
# np.fill_diagonal(s,s_diag)
# print(u.shape,s.shape,vt.shape)

In [23]:
# W = np.matmul(u,np.sqrt(s))
# print(W.shape)

In [24]:
def write_results(vocab,W,filename):
    embedding = {}
    for word in vocab.keys():
        embedding[word] = W[vocab[word]]
    f = open(filename,"w", encoding='utf8')
    for key, value in embedding.items():
        if not np.any(value):
            continue
        values = ""
        for i in range(len(value)):
            values += f"{value[i]} "
        values = values.strip()
        f.write("{0} {1}\n".format(str(key), values))
    print(f.name)
    f.close()

In [25]:
# tokenizer("S. J. Perelman")

In [26]:
train_path = "data/brown.txt"
def generate_all_embeddings(train_path):
    f = open(train_path)
    vectorizer = CountVectorizer()
    vectorizer.fit(f)
    vocab = vectorizer.vocabulary_
    tokenizer = vectorizer.build_tokenizer()
    train_lines = file_to_words(train_path)
    for context_window in [2,5,10]:
        D = cooc_matrix(train_lines,context_window,vocab, tokenizer)
        D_sum = D.sum(1).A
        nD = D.sum()
        M = D_to_M(D,D_sum,nD)

        for dimension in [50,100,300]:
            u,s_diag,vt = M_to_svd(M,dimension)
            s = np.zeros((len(s_diag),len(s_diag)))
            np.fill_diagonal(s,s_diag)
            W = np.matmul(u,np.sqrt(s))
            filename = f"svd_{dimension}_{context_window}.txt"
            write_results(vocab,W,filename)

In [27]:
generate_all_embeddings(train_path)

creating PMI as lil_matrix
PMI created
svd_50_2.txt
svd_100_2.txt
svd_300_2.txt
creating PMI as lil_matrix
PMI created
svd_50_5.txt
svd_100_5.txt
svd_300_5.txt
creating PMI as lil_matrix
PMI created
svd_50_10.txt
svd_100_10.txt
svd_300_10.txt


In [5]:
path = "svd_50_2.txt"
def evaluate(path):
    print('[evaluate] Loading model...')
    model = load_model(path)

    print('[evaluate] Collecting matrix...')
    matrix, vocab, indices = collect(model)

    print('[evaluate] WordSim353 correlation:')
    ws = eval_wordsim(model)
    print(ws)

    print('[evaluate] BATS accuracies:')
    bats = eval_bats(model, matrix, vocab, indices)
    print(bats)

    print('[evaluate] MSR accuracy:')
    msr = eval_msr(model)
    print(msr)
    return ws,bats,msr

In [3]:
table = pd.DataFrame()

In [6]:
for win in [2,5,10]:
    for dim in [50,100,300]:
        path = f"svd_{dim}_{win}.txt"
        ws,bats,msr = evaluate(path)
        row = {"Algorithm":"SVD", "Win.":win, "Dim.":dim, "N. s.":"-", "WordSim":ws[0]*100, "BATS Male-Female":np.round(bats["E10 [male - female]"],2), "BATS hypernym - misc": np.round(bats["L02 [hypernyms - misc]"],2) , "BATS total":np.round(bats["total"],2), "MSR":msr}
        table = table.append(pd.DataFrame(row,index=[0]), ignore_index = True)

[evaluate] Loading model...
[evaluate] Collecting matrix...
[evaluate] WordSim353 correlation:
SpearmanrResult(correlation=0.07384088022317557, pvalue=0.18490281768438876)
[evaluate] BATS accuracies:
{'inflectional_morphology': 0.012244897959183673, 'I01 [noun - plural_reg]': 0.04081632653061224, 'I02 [noun - plural_irreg]': 0.0, 'I03 [adj - comparative]': 0.0, 'I04 [adj - superlative]': 0.0, 'I05 [verb_inf - 3pSg]': 0.0, 'I06 [verb_inf - Ving]': 0.0, 'I07 [verb_inf - Ved]': 0.02040816326530612, 'I08 [verb_Ving - 3pSg]': 0.0, 'I09 [verb_Ving - Ved]': 0.0, 'I10 [verb_3pSg - Ved]': 0.061224489795918366, 'derivational_morphology': 0.0, 'D01 [noun+less_reg]': 0.0, 'D02 [un+adj_reg]': 0.0, 'D03 [adj+ly_reg]': 0.0, 'D04 [over+adj_reg]': 0.0, 'D05 [adj+ness_reg]': 0.0, 'D06 [re+verb_reg]': 0.0, 'D07 [verb+able_reg]': 0.0, 'D08 [verb+er_irreg]': 0.0, 'D09 [verb+tion_irreg]': 0.0, 'D10 [verb+ment_irreg]': 0.0, 'encyclopedic_semantics': 0.0471441961797639, 'E01 [country - capital]': 0.0370370370

In [7]:
table

Unnamed: 0,Algorithm,Win.,Dim.,N. s.,WordSim,BATS Male-Female,BATS hypernym - misc,BATS total,MSR
0,SVD,2,50,-,7.384088,0.06,0.0,0.02,0.652754
1,SVD,2,100,-,5.791511,0.21,0.08,0.04,0.644638
2,SVD,2,300,-,11.904982,0.15,0.0,0.03,0.646377
3,SVD,5,50,-,17.285479,0.06,0.05,0.03,0.662029
4,SVD,5,100,-,15.730036,0.12,0.08,0.03,0.663188
5,SVD,5,300,-,15.339866,0.21,0.03,0.03,0.652174
6,SVD,10,50,-,21.061826,0.09,0.08,0.03,0.664928
7,SVD,10,100,-,20.127318,0.21,0.0,0.03,0.666087
8,SVD,10,300,-,16.399344,0.18,0.03,0.03,0.662609


In [8]:
table.to_csv("svd_results.csv")

In [31]:
msr

NameError: name 'msr' is not defined