In [1]:
import os
import io
import json

import numpy as np

from scipy.stats import spearmanr
from gensim.models.keyedvectors import KeyedVectors

import Constant

# Path to the project's main folder (taken from the local Constant module)
MAIN_FOLDER = Constant.MAIN_FOLDER

# Path to the folder that holds the embedding files
EMBEDDING_FOLDER = Constant.EMBEDDING_FOLDER

# Names of the embedding files to evaluate.
# os.listdir() returns entries in arbitrary, platform-dependent order, so the
# listing is sorted to keep the evaluation order (and anything derived from
# the position of a file in this list) reproducible between runs and machines.
embedding_name_list = sorted(os.listdir(EMBEDDING_FOLDER))

print(">>> Embeddings a evaluar:")
for embedding in embedding_name_list:
    print("     > " + embedding)

def get_wordvector(file, cant=None):
    """Load one embedding file from EMBEDDING_FOLDER in word2vec format.

    Args:
        file: Name of the embedding file; it is joined onto EMBEDDING_FOLDER
            with the ``/`` operator.
        cant: Value forwarded as ``limit`` to
            ``KeyedVectors.load_word2vec_format``. Defaults to ``None``.

    Returns:
        The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    return KeyedVectors.load_word2vec_format(EMBEDDING_FOLDER / file, limit=cant)


D:\Documents\Memoria - Eval. Word Embeddings\Testing
D:\Documents\Memoria - Eval. Word Embeddings
D:\Documents\Memoria - Eval. Word Embeddings\Embeddings
D:\Documents\Memoria - Eval. Word Embeddings\Datasets
D:\Documents\Memoria - Eval. Word Embeddings\Resultados
D:\Documents\Memoria - Eval. Word Embeddings\TempResultados
>>> Embeddings a evaluar:
     > _fasttext_sbwc.vec
     > _fasttext_suc.vec
     > _fasttext_wiki.vec
     > _glove_sbwc.vec
     > _w2v_sbwc.txt


In [2]:
# Run the tests

import SimilarityTest

# Similarity test: evaluate every file listed in embedding_name_list
print(">>> Test de similaridad\n")
for embedding in embedding_name_list:
    # Name handed to the test: the file name up to its first dot
    embedding_label = embedding.partition('.')[0]
    print(">>> Testing ", embedding, "\n", sep="")

    # 10000 is the limit that get_wordvector forwards to the gensim loader
    print(">>> Cargando vectores...", end='')
    wordvector = get_wordvector(embedding, 10000)
    print("listo\n")

    result = SimilarityTest.similarityTest(wordvector, embedding_label)


>>> Test de similaridad

>>> Testing _fasttext_sbwc.vec

>>> Cargando vectores...listo

>>> Test individuales
>>> Archivo test: rg65_es.txt
     > Opening file: rg65_es.txt
>>> Not found words:63
    ['cordel', 'gallo', 'caldera', 'autógrafo', 'hechicero', 'loma', 'estufa', 'monje', 'manicomio', 'gallo', 'vaso', 'mago', 'cojín', 'joya', 'monje', 'esclavo', 'chaval', 'monje', 'oráculo', 'sabio', 'cojín', 'loma', 'chaval', 'hechicero', 'gallo', 'pájaro', 'caldera', 'grulla', 'gallo', 'loma', 'joya', 'mago', 'oráculo', 'grúa', 'chaval', 'sabio', 'hechicero', 'oráculo', 'sabio', 'pájaro', 'gallo', 'pájaro', 'grulla', 'monje', 'manicomio', 'caldera', 'estufa', 'mago', 'hechicero', 'loma', 'cordel', 'vaso', 'recipiente', 'siervo', 'esclavo', 'autógrafo', 'pollo', 'gallo', 'chaval', 'cojín', 'almohada', 'gema', 'joya']
     > Cantidad de pares no procesados: 44


>>> Archivo test: semeval17_es.txt
     > Opening file: semeval17_es.txt
>>> Not found words:309
    ['astronave', 'poliedro', 'mos

In [3]:
import AnalogyTest

# Analogy test: evaluate every file listed in embedding_name_list
print(">>> Test de analogias\n")
for embedding in embedding_name_list:
    # Name handed to the test: the file name up to its first dot
    embedding_label = embedding.partition('.')[0]
    print(">>> Testing ", embedding, "\n", sep="")

    # 10000 is the limit that get_wordvector forwards to the gensim loader
    print(">>> Cargando vectores...", end='')
    wordvector = get_wordvector(embedding, 10000)
    print("listo\n")

    AnalogyTest.analogyTest(wordvector, embedding_label, all_score=True)


>>> Test de analogias

>>> Testing _fasttext_sbwc.vec

>>> Cargando vectores...listo

>>> Testing : _español_D01 [prefijo_anti-].txt
>>> Testing : _español_D02 [prefijo_des-].txt
>>> Testing : _español_D03 [prefijo_in-].txt
>>> Testing : _español_D04 [sufijo_-able].txt
>>> Testing : _español_D05 [sufijo_-ción].txt
>>> Testing : _español_D06 [sufijo_-ísimo].txt
>>> Testing : _español_D07 [sufijo_-ita].txt
>>> Testing : _español_D08 [sufijo_-ito].txt
>>> Testing : _español_D09 [sufijo_-mente].txt
>>> Testing : _español_D10 [sufijo_-miento].txt
>>> Testing : _español_E01 [pais - capital].txt
>>> Testing : _español_E02 [pais - idioma].txt
>>> Testing : _español_E04 [nombre - nacionalidad].txt
>>> Testing : _español_E05 [nombre - ocupacion].txt
>>> Testing : _español_E10 [hombre - mujer].txt
>>> Testing : _español_E11 [pais - gentilicio].txt
>>> Testing : _español_I067 [gerund - parti].txt
>>> Testing : _español_I096 [inf - gerund].txt
>>> Testing : _español_I097 [inf - parti].txt
>>> Testi

In [2]:
import CrossMatchTest

# Cross-match test
# Build one [file name, method, corpus] entry per embedding file. The file
# name (without what follows its first dot) is split on "_", and fields 1 and
# 2 are used, e.g. "_fasttext_sbwc.vec" -> method "fasttext", corpus "sbwc".
pair_method_corpus = []
for file_name in embedding_name_list:
    name_fields = file_name.strip().partition('.')[0].split('_')
    pair_method_corpus.append([file_name, name_fields[1], name_fields[2]])

# Visit every unordered pair of embeddings once and collect the pairs that
# share a method and the pairs that share a corpus.
test_pairing_corpus = []
test_pairing_method = []
for position, (first_file, first_method, first_corpus) in enumerate(pair_method_corpus):
    for second_file, second_method, second_corpus in pair_method_corpus[position + 1:]:
        if first_method == second_method:
            test_pairing_method.append([first_file, second_file])

        if first_corpus == second_corpus:
            test_pairing_corpus.append([first_file, second_file])

# Run the cross-match test on each pair that shares a method
print("Method pairs:")
for pair in test_pairing_method:
    print(pair)
    first_file, second_file = pair

    # 1000 is the limit that get_wordvector forwards to the gensim loader.
    # NOTE(review): init_sims() presumably precomputes normalised vectors;
    # confirm against the installed gensim version.
    wordvector1 = get_wordvector(first_file, 1000)
    wordvector1.init_sims()
    wordvector2 = get_wordvector(second_file, 1000)
    wordvector2.init_sims()

    # NOTE(review): the meaning of the trailing numeric arguments (100, 2, 1)
    # is defined by CrossMatchTest and is not visible here.
    CrossMatchTest.crossMatchTest(wordvector1, first_file, wordvector2, second_file, 100, 2, 1)

    print("\n")

# Pairs that share a corpus are only listed; no test is run on them here
print("\nCorpus pairs:")
for pair in test_pairing_corpus:
    print(pair)
    

    


Method pairs:
['_fasttext_sbwc.vec', '_fasttext_suc.vec']
['ni', 'buenos']
['visto', 'sería']
[10000000, 0.1933175, 0.010025692, -0.025257824]
[0.1933175, 10000000, -0.034393698, 0.022242174]
[0.010025692, -0.034393698, 10000000, 0.13966084]
[-0.025257824, 0.022242174, 0.13966084, 10000000]
[0, 1, 2, 3]
[(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
1 - 2
0 - 3
p-value 0: 0


['_fasttext_sbwc.vec', '_fasttext_wiki.vec']
['serie', 'américa']
['lópez', 'martín']
[10000000, 0.23246072, 0.1073523, 0.12666698]
[0.23246072, 10000000, -0.045404542, 0.0046490286]
[0.1073523, -0.045404542, 10000000, 0.43719432]
[0.12666698, 0.0046490286, 0.43719432, 10000000]
[0, 1, 2, 3]
[(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
1 - 2
0 - 3
p-value 0: 0


['_fasttext_suc.vec', '_fasttext_wiki.vec']
['diferentes', 'punto']
['sobre', 'q']
[10000000, 0.15370934, 0.04975097, 0.04575902]
[0.15370934, 10000000, -0.008141271, 0.041690335]
[0.04975097, -0.008141271, 10000000, 0.12969647]
[0.04575902, 0.041690

In [3]:
import OutlierDetectionTest

# Outlier detection test: evaluate every file listed in embedding_name_list
print(">>> Test de outlier detection\n")
for embedding in embedding_name_list:
    # Name handed to the test: the file name up to its first dot
    embedding_label = embedding.partition('.')[0]
    print(">>> Testing ", embedding, "\n", sep="")

    # 10000 is the limit that get_wordvector forwards to the gensim loader
    print(">>> Cargando vectores...", end='')
    wordvector = get_wordvector(embedding, 10000)
    print("listo\n")

    OutlierDetectionTest.outlierDetectionTest(wordvector, embedding_label)
    

>>> Test de outlier detection

>>> Testing _fasttext_sbwc.vec

>>> Cargando vectores...listo

>>> Test 1 of 436
>>> Original set:
    ['esterházy', 'borjigin', 'bernoulli', 'strauss', 'qizilbash', 'amati']
    ['sinonimia', 'mario']
>>> In-vocabulary set:
    []
    ['mario']
Test set invalido, conjunto principal muy pequeño o conjunto outlier vacio

>>> Test 2 of 436
>>> Original set:
    ['makati', 'tacloban', 'zamboanga', 'baguio', 'cebú', 'davao', 'manila']
    ['kalayaan']
>>> In-vocabulary set:
    []
    []
Test set invalido, conjunto principal muy pequeño o conjunto outlier vacio

>>> Test 3 of 436
>>> Original set:
    ['trillian', 'kopete', 'irssi', 'pidgin', 'mirc', 'chatzilla']
    ['asterisk', 'iphoto', 'yacc']
>>> In-vocabulary set:
    []
    []
Test set invalido, conjunto principal muy pequeño o conjunto outlier vacio

>>> Test 4 of 436
>>> Original set:
    ['líridas', 'perseidas', 'oriónidas', 'leónidas', 'gemínidas', 'dracónidas', 'cuadrántidas']
    ['tormenta', 'di