In [140]:
from os import path
from itertools import chain, combinations
from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr
from pymorphy2 import MorphAnalyzer
from pandas import DataFrame, read_csv
from collections import defaultdict
from russian_tagsets import converters
from re import sub

import numpy as np

# Shared pymorphy2 analyzer; used below to look up the part of speech of a word.
m = MorphAnalyzer()
# Converter from the 'opencorpora-int' tagset to 'ud14' (presumably Universal
# Dependencies 1.4 -- confirm against russian_tagsets); applied to pymorphy2 POS tags.
conv = converters.converter('opencorpora-int', 'ud14')

## Supplementary functions

In [76]:
def get_word_vector(word_, model, add_pos_tag=True):
    """Return the embedding of a single word or of a whitespace-separated phrase.

    Parameters
    ----------
    word_ : str
        A word, or several words separated by whitespace.
    model : mapping-like
        Object supporting ``model[key]`` (raising ``KeyError`` for unknown keys)
        and exposing ``vector_size``.
    add_pos_tag : bool, default True
        When true, every token is passed through ``add_pos_tag_to_word`` before
        the lookup.

    Returns
    -------
    numpy.ndarray
        * one token: the model's vector, or an all-zero vector if the key is unknown;
        * several tokens: the sum of the known tokens' vectors divided by the
          total number of tokens (unknown tokens therefore count as zero vectors);
        * no tokens (empty or whitespace-only input): an all-zero vector.
    """
    def lookup(token):
        # Returns None instead of raising so callers can decide how to treat
        # out-of-vocabulary tokens.
        key = add_pos_tag_to_word(token) if add_pos_tag else token
        try:
            return model[key]
        except KeyError:
            return None

    # split() without arguments collapses runs of whitespace and ignores
    # leading/trailing whitespace, so stray spaces no longer create empty
    # "tokens" that inflate the divisor below.
    tokens = word_.split()

    if not tokens:
        return np.zeros(shape=model.vector_size)

    if len(tokens) == 1:
        vector = lookup(tokens[0])
        if vector is None:
            return np.zeros(shape=model.vector_size)
        return vector

    total = np.zeros(shape=model.vector_size)
    for token in tokens:
        vector = lookup(token)
        if vector is not None:
            total = np.add(total, vector)
    return total / len(tokens)

In [146]:
def add_pos_tag_to_word(word_):
    """Build a ``<word>_<tag>`` key from a raw word.

    Every run of non-word characters is stripped from ``word_``; the cleaned
    word is analysed with the module-level analyzer ``m`` and the POS of its
    first parse is converted with ``conv``. Only the part of the converted tag
    before the first space is used as the suffix.

    NOTE(review): ``tag.POS`` may be ``None`` for tokens without a part of
    speech -- confirm how ``conv`` behaves in that case.
    """
    cleaned = sub(r'\W+', '', word_)
    first_parse = m.parse(cleaned)[0]
    converted_tag = conv(first_parse.tag.POS)
    pos_label = converted_tag.split(' ')[0]
    return '{}_{}'.format(cleaned, pos_label)

## Load data

In [15]:
# Location of the pre-trained embeddings: <...>/MODELS/<architecture>/<language>,
# expressed relative to the current working directory.
language = 'russian'
architecture = 'Word2Vec'
path_to_models = path.join('..', '..', '..', 'monolang', 'MODELS', architecture, language)
# File names (inside `path_to_models`) of the models to load.
models_names = ['ruwikiruscorpora-superbigrams_skipgram_300_2_2018.vec']

In [18]:
# Load every configured model, keyed by its file name.
models = {}

for model_name in models_names:
    model_path = path.join(path_to_models, model_name)
    models[model_name] = KeyedVectors.load_word2vec_format(model_path)

In [156]:
def get_word_lists_for_lexical_field(field_name='size_adj.csv', column_from_which_words_start = 3):
    """Read a lexical-field table and collect, per word column, its microframes.

    The CSV at ``../data/<field_name>`` is loaded and missing cells are replaced
    by 0. Columns from position ``column_from_which_words_start`` onwards are
    treated as word columns. For each row, the row's ``microframe`` value is
    appended to the list of every word column whose cell is truthy.

    Returns a plain ``dict`` mapping column name -> list of microframes. Columns
    with no truthy cell are absent; keys appear in the order in which they are
    first hit while scanning the table row by row.
    """
    table = read_csv(path.join('..', 'data', field_name)).fillna(0)
    word_columns = table.columns[column_from_which_words_start:]
    microframes_by_word = {}
    for _, record in table.iterrows():
        for word_column in word_columns:
            if not record[word_column]:
                continue
            microframes_by_word.setdefault(word_column, []).append(record['microframe'])
    return microframes_by_word

## Evaluation on typology 1

In [148]:
def get_vectors_distance(word1, word2, model, metric='cosine'):
    """Compare the embeddings of two words.

    Despite the name, for ``metric='cosine'`` the returned value is the cosine
    *similarity*: ``1 - scipy.spatial.distance.cosine(v1, v2)``, where the
    vectors come from ``get_word_vector``.

    NOTE(review): cosine distance is undefined for an all-zero vector, so the
    result is likely NaN when either word has no embedding -- confirm.

    Raises
    ------
    ValueError
        If ``metric`` is not supported (previously ``None`` was returned
        silently, which only surfaced later as a confusing downstream error).
    """
    if metric != 'cosine':
        raise ValueError('Unsupported metric: {!r}'.format(metric))
    return 1 - cosine(get_word_vector(word1, model), get_word_vector(word2, model))

In [149]:
def jaccard_distance(wordlist1, wordlist2):
    """Return |A & B| / |A | B| for the two collections viewed as sets.

    Note that this is the Jaccard similarity (1.0 for identical sets, 0.0 for
    disjoint ones), not ``1 - similarity``. Duplicates are ignored. Raises
    ``ZeroDivisionError`` when both inputs are empty.
    """
    first, second = set(wordlist1), set(wordlist2)
    shared = first.intersection(second)
    combined = first.union(second)
    return float(len(shared)) / len(combined)

def sorensen_dice_distance(wordlist1, wordlist2):
    """Return the Sorensen-Dice coefficient ``2 * |A & B| / (|A| + |B|)``.

    Two input forms are supported:

    * two numeric/boolean ``numpy.ndarray`` membership masks -- the original
      formula is applied element-wise (``logical_and`` and ``sum``);
    * any other iterables of hashable items (e.g. lists of words, which is what
      ``get_wordlist_distance`` passes) -- the coefficient is computed on sets,
      mirroring ``jaccard_distance``.

    Like ``jaccard_distance``, this is a similarity (1.0 for identical inputs).
    In the set form, two empty inputs raise ``ZeroDivisionError``.
    """
    def is_mask(values):
        # Only arrays the mask formula is meaningful for: bool / int / uint / float.
        return isinstance(values, np.ndarray) and values.dtype.kind in 'biuf'

    if is_mask(wordlist1) and is_mask(wordlist2):
        intersection = np.logical_and(wordlist1, wordlist2)
        return 2. * intersection.sum() / (wordlist1.sum() + wordlist2.sum())

    # Plain word lists have no .sum(); compare them as sets instead.
    wordset1 = set(wordlist1)
    wordset2 = set(wordlist2)
    return 2. * len(wordset1 & wordset2) / (len(wordset1) + len(wordset2))

def get_wordlist_distance(wordlist1, wordlist2, metric='jaccard'):
    """Compare two word lists with the selected overlap measure.

    ``metric`` is ``'jaccard'`` (default) or ``'sorensen'``; the call is
    forwarded to ``jaccard_distance`` or ``sorensen_dice_distance``.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names (previously ``None``
        was returned silently).
    """
    if metric == 'jaccard':
        return jaccard_distance(wordlist1, wordlist2)
    if metric == 'sorensen':
        return sorensen_dice_distance(wordlist1, wordlist2)
    raise ValueError('Unsupported metric: {!r}'.format(metric))

In [150]:
def evaluate_on_typology_1(model):
    """Score every unordered pair of field words with two measures.

    The word lists come from ``get_word_lists_for_lexical_field()`` with its
    default arguments. For each pair of keys the result holds a dict with:

    * ``'pair'``     -- the 2-tuple of words,
    * ``'vector'``   -- ``get_vectors_distance`` of the two words under ``model``,
    * ``'wordlist'`` -- ``get_wordlist_distance`` of their word lists.

    Returns the list of these dicts, in ``itertools.combinations`` order.
    """
    field_words = get_word_lists_for_lexical_field()
    scored_pairs = []
    for first, second in combinations(field_words.keys(), 2):
        vector_score = get_vectors_distance(first, second, model)
        overlap_score = get_wordlist_distance(field_words[first], field_words[second])
        scored_pairs.append({
            'pair' : (first, second),
            'vector' : vector_score,
            'wordlist' : overlap_score,
        })
    return scored_pairs

In [158]:
# Evaluate the first configured model. Indexing through `models_names` keeps the
# key in sync with what was actually loaded; the previous hard-coded key
# 'ruwikiruscorpora-s.vec' is not in `models_names` and raised KeyError on a fresh run.
q = DataFrame(evaluate_on_typology_1(models[models_names[0]]))

In [161]:
# Spearman rank correlation between the embedding-based score and the
# word-list overlap score over all word pairs.
spearmanr(q['vector'], q['wordlist'])

SpearmanrResult(correlation=0.29283678863284196, pvalue=0.009273244509341683)