In [1]:
import pandas as pd
import numpy as np
import os
from semspaces.space import SemanticSpace
import json
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def cross_mapping(train_form_matrix, train_semantic_space, test_form_matrix, target_vocab):

    """
    :param train_form_matrix:       NumPy 2d array
    :param train_semantic_space:    NumPy 2d array
    :param test_form_matrix:        NumPy 2d array
    :return:                        NumPy 2d array
    """

    subset_transform = np.dot(np.linalg.pinv(train_form_matrix), train_semantic_space)
    estimated_semantic_space = np.dot(test_form_matrix, subset_transform)
    
    #print(estimated_semantic_space)
    
    space_dict = dict()
    
    for i in range(len(estimated_semantic_space)):
        for word in target_vocab[:3]:
            space_dict[word] = estimated_semantic_space[i].reshape(1,-1)

    return space_dict

In [3]:
ldl_dir = "C:/Users/wgvan/Documents/Universiteit_Nijmegen/jaar2/thesis/Project_Code/LDL_output_dir"

embedding_space_file = pd.read_csv("D:/UniversiteitNijmegen/Thesis/Test/Processed/Word2Vec/W2VChildDirected24.txt", sep = " ", header = None, skiprows = 1)

new_header = embedding_space_file.iloc[:, 0]
embedding_space_file = embedding_space_file.iloc[:, 1:]
embedding_space_file.index = new_header
embedding_space_file = embedding_space_file.sort_index(inplace=True)
print(embedding_space_file)
#full_space_array = embedding_space_file.to_numpy()

form_space_file = pd.read_csv("D:/UniversiteitNijmegen/Thesis/Test/Processed/FormEmbeddings/FormVectorChildDirected24.txt", sep = " ", header = None, skiprows = 2)
new_header2 = form_space_file.iloc[:, 0]
form_space_file = form_space_file.iloc[:, 1:]
form_space_file.index = new_header2
form_space_file = form_space_file.sort_index(inplace=True)
print(form_space_file)
#full_form_array = embedding_space_file.to_numpy()


None
None


In [88]:
embedding_space_file = "D:/UniversiteitNijmegen/Thesis/Test/Processed/Word2Vec/W2VChildDirected30.txt"
reference_space_file = "D:/UniversiteitNijmegen/Thesis/Test/Processed/Word2Vec/W2VChildDirected24.txt"
embedding_form_file = "D:/UniversiteitNijmegen/Thesis/Test/Processed/FormEmbeddings/FormVectorChildDirected30.txt"
reference_form_file = "D:/UniversiteitNijmegen/Thesis/Test/Processed/FormEmbeddings/FormVectorChildDirected24.txt"
child_produced_space_file = "D:/UniversiteitNijmegen/Thesis/Test/Processed/Word2Vec/W2VChildProduced24.txt"
wordcount_file = "D:/UniversiteitNijmegen/Thesis/Test/Processed/WordCounts/WordCountChildDirected30.json"
wordcount_file_reference = "D:/UniversiteitNijmegen/Thesis/Test/Processed/WordCounts/WordCountChildDirected24.json"

ldl_dir = "C:/Users/wgvan/Documents/Universiteit_Nijmegen/jaar2/thesis/Project_Code/LDL_output_dir"

embedding_space = SemanticSpace.from_csv(embedding_space_file, prenorm=True)
w2v_words = embedding_space.included_words()

reference_space = SemanticSpace.from_csv(reference_space_file, prenorm = True)
w2v_reference = reference_space.included_words()

embedding_form = SemanticSpace.from_csv(embedding_form_file, prenorm=True)
reference_form = SemanticSpace.from_csv(reference_form_file, prenorm=True)

child_produced_space = SemanticSpace.from_csv(child_produced_space_file, prenorm = True)
w2v_child_produced = child_produced_space.included_words()

wordcount = json.load(open(wordcount_file))
reference_wordcount = json.load(open(wordcount_file_reference))

In [99]:
full_space_array = reference_space.get_vector(list(w2v_reference)[0])
full_form_array = reference_form.get_vector(list(w2v_reference)[0])

for word in list(w2v_reference)[1:5]:
    new_space_vec = reference_space.get_vector(word)
    full_space_array = np.vstack((full_space_array, new_space_vec))
    new_form_vec = reference_form.get_vector(word)
    full_form_array = np.vstack((full_form_array, new_form_vec))

precursor_target_vocab = set(w2v_reference.symmetric_difference(w2v_words))
target_vocab = []
for word in precursor_target_vocab:
    try:
        if wordcount[word] >= 2:
            target_vocab.append(word)
    except KeyError:
        continue

target_form_array = embedding_form.get_vector(target_vocab[0])

for word in target_vocab[1:3]:
    new_form_vec = embedding_form.get_vector(word)
    target_form_array = np.vstack((target_form_array, new_form_vec))
print(target_form_array)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [148]:
target_space_dict = cross_mapping(full_form_array, full_space_array, target_form_array, target_vocab)
print(target_space_dict)
print(embedding_space.get_vector('you'))

{"panda's": array([[ 0.00452933, -0.00965967,  0.01262758,  0.02043455, -0.01208168,
         0.01450322,  0.00295767, -0.00259183, -0.02559093,  0.0017455 ,
        -0.01160489, -0.00434837,  0.02658522,  0.01660433,  0.00504672,
        -0.01243212,  0.00226519,  0.00128567,  0.01116104, -0.0017521 ,
         0.02071842,  0.00448997, -0.01242436,  0.02674672, -0.0168687 ,
         0.01654318,  0.00431544,  0.00429872, -0.00602618,  0.00825719,
         0.00017892,  0.01954423, -0.00758757,  0.01066629,  0.02206939,
         0.00057512, -0.00065231,  0.00404098,  0.02943714,  0.01803327,
        -0.02059295,  0.00022073,  0.05524384, -0.02240871,  0.00821656,
         0.00433515, -0.0059867 ,  0.00755121, -0.03812767,  0.01968912,
         0.013939  , -0.00893375,  0.000723  ,  0.02039473,  0.00149666,
         0.01775048,  0.00082164,  0.01343502,  0.03438625,  0.00598109,
         0.01551123,  0.00072966, -0.01242917, -0.00332576,  0.01371611,
         0.00877199, -0.01854672,  0.00

In [149]:
for word in target_vocab[:3]:
    sim = abs(cosine_similarity(target_space_dict[word], embedding_space.get_vector(word)))
    print(sim)

[[0.23327349]]
[[0.38080014]]
[[0.22614341]]
