In [1]:
import os
import io
import json

import numpy as np

from scipy.stats import spearmanr
from gensim.models.keyedvectors import KeyedVectors

import Constant

# Path a carpeta principal
MAIN_FOLDER = Constant.MAIN_FOLDER

# Path a carpeta con los embeddings
EMBEDDING_FOLDER = Constant.EMBEDDING_FOLDER

# Lista con los nombres de los archivos de los embeddings
embedding_name_list = os.listdir(EMBEDDING_FOLDER)

print(">>> Embeddings a evaluar:")
for embedding in embedding_name_list:
    print("  > " + embedding)

def get_wordvector(file, cant=None):
    wordvector_file = EMBEDDING_FOLDER / file

    return KeyedVectors.load_word2vec_format(wordvector_file, limit=cant)


D:\Documents\Memoria - Eval. Word Embeddings\Testing
D:\Documents\Memoria - Eval. Word Embeddings
D:\Documents\Memoria - Eval. Word Embeddings\Embeddings
D:\Documents\Memoria - Eval. Word Embeddings\Datasets
D:\Documents\Memoria - Eval. Word Embeddings\Resultados
D:\Documents\Memoria - Eval. Word Embeddings\TempResultados
>>> Embeddings a evaluar:
  > _fasttext_sbwc.vec
  > _fasttext_suc.vec
  > _fasttext_wiki.vec
  > _glove_sbwc.vec
  > _w2v_sbwc.txt


In [3]:
# Realizacion de test

import SimilarityTest

# Test de similaridad
print(">>> Test de similaridad\n")
for embedding in embedding_name_list:
    print(">>> Testing " + embedding + "\n")
    
    print(">>> Cargando vectores...", end='')
    wordvector = get_wordvector(embedding, 10000)
    wordvector_name = embedding.split('.')[0]
    print("listo\n")
    
    result = SimilarityTest.similarityTest(wordvector, wordvector_name)


>>> Test de similaridad

>>> Testing _fasttext_sbwc.vec

>>> Cargando vectores...listo

>>> Test individuales
>>> Archivo test: rg65_es.txt
>>> Opening file rg65_es.txt
>>> Not found words:63
    ['cordel', 'gallo', 'caldera', 'autógrafo', 'hechicero', 'loma', 'estufa', 'monje', 'manicomio', 'gallo', 'vaso', 'mago', 'cojín', 'joya', 'monje', 'esclavo', 'chaval', 'monje', 'oráculo', 'sabio', 'cojín', 'loma', 'chaval', 'hechicero', 'gallo', 'pájaro', 'caldera', 'grulla', 'gallo', 'loma', 'joya', 'mago', 'oráculo', 'grúa', 'chaval', 'sabio', 'hechicero', 'oráculo', 'sabio', 'pájaro', 'gallo', 'pájaro', 'grulla', 'monje', 'manicomio', 'caldera', 'estufa', 'mago', 'hechicero', 'loma', 'cordel', 'vaso', 'recipiente', 'siervo', 'esclavo', 'autógrafo', 'pollo', 'gallo', 'chaval', 'cojín', 'almohada', 'gema', 'joya']
     > Cantidad de pares no procesados: 44


>>> Archivo test: semeval17_es.txt
>>> Opening file semeval17_es.txt
>>> Not found words:309
    ['astronave', 'poliedro', 'mosaico', '

In [3]:
import AnalogyTest

# Test de analogias
print(">>> Test de analogias")
for embedding in embedding_name_list:
    print("\n>>> Testing " + embedding + "\n")
    
    print(">>> Cargando vectores...", end='')
    wordvector = get_wordvector(embedding, 10000)
    wordvector_name = embedding.split('.')[0]
    print(" listo\n")
    
    AnalogyTest.analogyTest(wordvector, wordvector_name, all_score=True)


>>> Test de analogias

>>> Testing _fasttext_sbwc.vec

>>> Cargando vectores...listo

>>> Testing : _español_D01 [prefijo_anti-].txt
>>> Testing : _español_D02 [prefijo_des-].txt
>>> Testing : _español_D03 [prefijo_in-].txt
>>> Testing : _español_D04 [sufijo_-able].txt
>>> Testing : _español_D05 [sufijo_-ción].txt
>>> Testing : _español_D06 [sufijo_-ísimo].txt
>>> Testing : _español_D07 [sufijo_-ita].txt
>>> Testing : _español_D08 [sufijo_-ito].txt
>>> Testing : _español_D09 [sufijo_-mente].txt
>>> Testing : _español_D10 [sufijo_-miento].txt
>>> Testing : _español_E01 [pais - capital].txt
>>> Testing : _español_E02 [pais - idioma].txt
>>> Testing : _español_E04 [nombre - nacionalidad].txt
>>> Testing : _español_E05 [nombre - ocupacion].txt
>>> Testing : _español_E10 [hombre - mujer].txt
>>> Testing : _español_E11 [pais - gentilicio].txt
>>> Testing : _español_I067 [gerund - parti].txt
>>> Testing : _español_I096 [inf - gerund].txt
>>> Testing : _español_I097 [inf - parti].txt
>>> Testi

In [None]:
import CrossMatchTest

# Test de cross match
pair_method_corpus = []
for embedding in embedding_name_list:
    _embedding = embedding.strip().split('.')[0]
    _embedding = _embedding.split('_')
    
    pair_method_corpus.append([embedding, _embedding[1], _embedding[2]])
    
test_pairing_corpus = []
test_pairing_method = []
for i in range(len(pair_method_corpus)):
    for j in range(len(pair_method_corpus)):
        if j <= i:
            continue
            
        if pair_method_corpus[i][1] == pair_method_corpus[j][1]:
            test_pairing_method.append([pair_method_corpus[i][0], pair_method_corpus[j][0]])
            
        if pair_method_corpus[i][2] == pair_method_corpus[j][2]:
            test_pairing_corpus.append([pair_method_corpus[i][0], pair_method_corpus[j][0]])

print("Cross-match test:")

"""
Paper original utiliza:
repetition = 500
sample_size = 100000
sub_sample_size = 200
"""
repetition = 1
sample_size = 100000
sub_sample_size = 200
F_constant = CrossMatchTest.getFConstant(sub_sample_size)

for pair in test_pairing_method + test_pairing_corpus:
    print(pair)
    
    print("Cargando embedding " + pair[0], end='...')
    wordvector1 = get_wordvector(pair[0], 500000)
    wordvector1_name = pair[0]
    wordvector1.init_sims()
    print("listo")
    
    print("Cargando embedding " + pair[1], end='...')
    wordvector2 = get_wordvector(pair[1], 500000)
    wordvector2_name = pair[1]
    wordvector2.init_sims()
    print("listo")
    
    CrossMatchTest.crossMatchTest(wordvector1, pair[0], wordvector2, pair[1], sample_size, sub_sample_size, repetition, F_constant)
    
    print("\n")


Cross-match test:
['_fasttext_sbwc.vec', '_fasttext_suc.vec']


In [3]:
import OutlierDetectionTest

# Test de outlier detection
print(">>> Test de outlier detection\n")
for embedding in embedding_name_list:
    print(">>> Testing " + embedding + "\n")
    
    print(">>> Cargando vectores...", end='')
    wordvector = get_wordvector(embedding, 10000)
    wordvector_name = embedding.split('.')[0]
    print("listo\n")
    
    OutlierDetectionTest.outlierDetectionTest(wordvector, wordvector_name)

>>> Test de outlier detection

>>> Testing _fasttext_sbwc.vec

>>> Cargando vectores...listo

>>> Test 1 of 436
>>> Original set:
    ['esterházy', 'borjigin', 'bernoulli', 'strauss', 'qizilbash', 'amati']
    ['sinonimia', 'mario']
>>> In-vocabulary set:
    []
    ['mario']
Test set invalido, conjunto principal muy pequeño o conjunto outlier vacio

>>> Test 2 of 436
>>> Original set:
    ['makati', 'tacloban', 'zamboanga', 'baguio', 'cebú', 'davao', 'manila']
    ['kalayaan']
>>> In-vocabulary set:
    []
    []
Test set invalido, conjunto principal muy pequeño o conjunto outlier vacio

>>> Test 3 of 436
>>> Original set:
    ['trillian', 'kopete', 'irssi', 'pidgin', 'mirc', 'chatzilla']
    ['asterisk', 'iphoto', 'yacc']
>>> In-vocabulary set:
    []
    []
Test set invalido, conjunto principal muy pequeño o conjunto outlier vacio

>>> Test 4 of 436
>>> Original set:
    ['líridas', 'perseidas', 'oriónidas', 'leónidas', 'gemínidas', 'dracónidas', 'cuadrántidas']
    ['tormenta', 'di

In [2]:
import ConstitucionUtil

# Evaluacion extrinseca
gob_concept_data, open_concept_with_gob_concept, open_concept_as_new_concept, open_concept_as_other, open_concept_nondescript = ConstitucionUtil.getDataset()

['topic', 'is_open_concept', 'original_constitutional_concept', 'constitutional_concept', 'argument', 'argument_mode']
complete data: 205357
gob data: 183342
1, count: 37
2, count: 44
3, count: 12
4, count: 21
cantidad conceptos: 114
 1 Amistad cívica
 1 Autonomía / Libertad
 1 Bien Común / Comunidad
 1 Ciudadanía
 1 Democracia
 1 Desarrollo
 1 Descentralización
 1 Dignidad
 1 Diversidad
 1 Emprendimiento libre
 1 Equidad de género
 1 Estado de Derecho
 1 Estado laico
 1 Identidad cultural
 1 Igualdad
 1 Inclusión
 1 Innovación / Creatividad
 1 Integración
 1 Justicia
 1 Multiculturalidad
 1 Participación
 1 Patriotismo
 1 Paz / Convivencia pacífica
 1 Pluralismo
 1 Plurinacionalismo
 1 Probidad
 1 República
 1 Respeto
 1 Respeto / Conservación de la naturaleza o medio ambiente
 1 Responsabilidad
 1 Seguridad
 1 Soberanía
 1 Solidaridad
 1 Subsidiaridad
 1 Tolerancia
 1 Transparencia y publicidad
 1 Unidad
 2 A huelga
 2 A la educación
 2 A la honra / Al honor
 2 A la identidad cultura

In [21]:
# Task A



In [10]:
# Task B

import re

wordvector = get_wordvector(embedding_name_list[0])
print("listo")

listo


In [12]:
# Calculo de vector promedio para conceptos del gobierno
print("Calculo de vector promedio para conceptos del gobierno")
concept_vectors = {}
concept_list_by_topic = {}
cant = 0
for topic in gob_concept_data.keys():
    if not topic in concept_vectors.keys():
        concept_vectors[topic] = np.array([])
        
    if not topic in concept_list_by_topic.keys():
        concept_list_by_topic[topic] = []
        
    print(topic + ") cantidad de conceptos " + str(len(gob_concept_data[topic].keys())))
    
    for concept in gob_concept_data[topic].keys():
        print("  " + concept + ": " + str(len(gob_concept_data[topic][concept])))
        cant += len(gob_concept_data[topic][concept])
            
        all_words = re.sub('[^0-9a-zA-Záéíóú]+', ' ', concept.lower())
        mean_vector = ConstitucionUtil.getMeanVector(all_words, wordvector)
        if mean_vector.size == 0:
            print("    cero size vector")
            continue
        
        
        if concept_vectors[topic].size == 0:
            concept_vectors[topic] = mean_vector
        else:
            concept_vectors[topic] = np.vstack((concept_vectors[topic], mean_vector))
            
        print(mean_vector[:5])
        

for topic in concept_vectors.keys():
    print(topic, end=' ')
    print(concept_vectors[topic].shape)
    print(concept_vectors[topic])
    
print(cant)


# Calculo de vector promedio para conceptos abiertos y evaluacion de similaridad
print("Calculo de vector promedio para conceptos abiertos y evaluacion de similaridad")
for topic in open_concept_with_gob_concept.keys():
    print(topic)
    
    for concept in open_concept_with_gob_concept[topic].keys():
        print("  " + concept)
        
        all_original_concept = ""
        all_argument = ""
        
        for arg in open_concept_with_gob_concept[topic][concept]:
            original_concept, argument, argument_mode = arg
            
            original_concept = re.sub('[^0-9a-zA-Záéíóú]+', ' ', original_concept.lower())
            argument = re.sub('[^0-9a-zA-Záéíóú]+', ' ', argument.lower())
            
            print("    " + original_concept)
            print("    " + argument)
            print("    " + argument_mode)
            
            all_original_concept = all_original_concept + " " + original_concept
            all_argument = all_argument + " " + argument
            
        
        

Calculo de vector promedio para conceptos del gobierno
1) cantidad de conceptos 37
  Justicia: 4199
[ 0.5888   -0.49338   0.42296  -0.40135   0.093935]
  Democracia: 4037
[ 0.12414  -0.38491  -0.11367   0.032327 -0.3044  ]
  Respeto / Conservación de la naturaleza o medio ambiente: 3930
[ 0.10508725 -0.26593536 -0.01044863 -0.09110688  0.0455045 ]
  Autonomía / Libertad: 1862
[ 0.039505 -0.195895  0.00792  -0.22511  -0.173525]
  Estado de Derecho: 1608
[ 0.00579533 -0.27809033 -0.11649767 -0.10569633  0.05256867]
  Igualdad: 3960
[ 0.03711   -0.32813   -0.087445  -0.38216   -0.0049224]
  Participación: 1253
[-0.017586  0.019099  0.21074  -0.20178   0.1249  ]
  Unidad: 124
[-0.055166 -0.49314   0.015381 -0.077278  0.023912]
  Respeto: 2643
[ 0.15794  -0.43184  -0.37764  -0.11921   0.013678]
  Descentralización: 3213
[-0.12168  -0.054841  0.34909   0.20129  -0.061754]
  Identidad cultural: 587
[ 0.186971   0.003652   0.0758215 -0.0906165  0.1217925]
  Estado laico: 1587
[ 0.0477955 -0.51

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

w = "Identidad cultural"
w = re.sub('[^0-9a-zA-Záéíóú]+', ' ', w.lower())
print(w)

mean_vector = ConstitucionUtil.getMeanVector(all_words, wordvector)
print(mean_vector)

print(wordvector["justicia"])

identidad cultural
[-1.13068007e-01 -2.09641784e-01  1.38618005e-02  4.99185994e-02
 -5.07276040e-03  1.48699939e-01 -1.83658600e-01  1.15651205e-01
 -1.65459991e-01  4.79028001e-02 -2.33515337e-01 -3.04633975e-02
  4.71453965e-02  1.93538032e-02  1.41333610e-01  1.01976059e-01
  1.43928006e-01  1.83187008e-01 -1.21099958e-02 -2.37280607e-01
 -1.13470599e-01 -1.12639606e-01 -1.16323397e-01 -4.04377952e-02
  5.75807914e-02 -2.54254013e-01  2.68996805e-01  3.35266208e-03
 -9.37020034e-02  8.44514519e-02  1.21254012e-01 -7.29095936e-02
 -2.17474818e-01  1.22853599e-01 -6.48937002e-02 -9.24280062e-02
  8.01400107e-04 -1.29500404e-01 -1.20561801e-01  1.15683198e-01
  1.00042403e-01  4.31943797e-02 -1.76595405e-01 -9.83397942e-03
  5.91407940e-02  1.13249496e-01  6.72699958e-02 -1.23226598e-01
  1.82327196e-01  8.81083980e-02 -3.41850035e-02  9.74552035e-02
 -1.03685997e-01  8.71667042e-02 -5.71133979e-02 -2.13336617e-01
 -1.83484599e-01 -9.83394012e-02  1.22291192e-01  1.18529974e-02
 -4.82

In [None]:
# Task C

