In [1]:
import numpy as np
from fasttext import FastVector

def normalized(a, axis=-1, order=2):
    """Scale `a` to unit `order`-norm along `axis`.

    Slices whose norm is zero are divided by 1 and therefore come back
    unchanged. The norms go through ``np.atleast_1d`` before being
    broadcast, so a 1-D input is returned with shape ``(1, n)``.
    """
    norms = np.atleast_1d(np.linalg.norm(a, order, axis))
    # Replace zero norms by 1 so all-zero slices do not divide by zero.
    safe_norms = np.where(norms == 0, 1, norms)
    return a / np.expand_dims(safe_norms, axis)

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """Collect paired word vectors for every usable bilingual entry.

    An entry ``(source, target)`` is used only when `source` is found in
    `source_dictionary` and `target` in `target_dictionary`.

    Returns:
        Two numpy arrays (source vectors, target vectors) whose rows are
        paired and follow the order of `bilingual_dictionary`.
    """
    usable_pairs = [
        (src_word, tgt_word)
        for (src_word, tgt_word) in bilingual_dictionary
        if src_word in source_dictionary and tgt_word in target_dictionary
    ]
    source_rows = [source_dictionary[src_word] for src_word, _ in usable_pairs]
    target_rows = [target_dictionary[tgt_word] for _, tgt_word in usable_pairs]
    return np.array(source_rows), np.array(target_rows)


def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """Run the SVD step used to align a source space to a target space.

    Both matrices are numpy arrays of shape
    (dictionary_length, embedding_dimension) holding paired word vectors
    from the bilingual dictionary.

    Args:
        source_matrix: vectors from the source space, one per row.
        target_matrix: the paired vectors from the target space.
        normalize_vectors: when true, both matrices are passed through
            `normalized` before the SVD.

    Returns:
        ``(U, V)``: the first and third outputs of ``np.linalg.svd``
        applied to ``source_matrix.T @ target_matrix``. Callers in this
        notebook multiply them (``np.matmul(U, V)``) to get the mapping.
    """
    if normalize_vectors:
        source_matrix, target_matrix = normalized(source_matrix), normalized(target_matrix)

    cross_product = source_matrix.T @ target_matrix
    left, _singular_values, right = np.linalg.svd(cross_product)
    return (left, right)

In [None]:
import string
# str.translate table that deletes every ASCII punctuation character.
translate_table = {ord(symbol): None for symbol in string.punctuation}

In [None]:
from janome.tokenizer import Tokenizer
jap_t = Tokenizer()

In [None]:
import jieba

In [None]:
from pythainlp import word_tokenize

In [None]:
def create_chinese_embeddings(content, dictionary, transform):
    """Embed Chinese sentences as normalized sums of transformed word vectors.

    Spaces are removed from each sentence, which is then segmented with
    ``jieba.tokenize``; the first element of every token tuple is used as
    the word. Each word is looked up in `dictionary` (lower-cased; if that
    fails the lookup is retried with ASCII punctuation stripped),
    multiplied by `transform` and added to the sentence vector. Words that
    cannot be embedded are reported with ``print`` and skipped.

    Args:
        content: iterable of sentence strings.
        dictionary: word -> vector lookup supporting ``dictionary[word]``.
        transform: 2-D alignment matrix applied as ``vector @ transform``.

    Returns:
        Array with one row per sentence. The row width is taken from the
        last axis of `transform` (300 for the matrices used in this
        notebook). The number of sentences is printed as a side effect.
    """
    embedding_dim = np.shape(transform)[-1]
    vectors = []
    for sentence in content:
        sentence = sentence.replace(" ", "")
        words = [token[0] for token in jieba.tokenize(sentence)]
        sentence_vec = np.zeros(embedding_dim)
        for word in words:
            try:
                try:
                    vect = dictionary[word.lower()]
                except Exception:
                    # Retry without punctuation. `Exception` (not a bare
                    # except) so Ctrl-C still interrupts a long run.
                    vect = dictionary[word.translate(translate_table).lower()]
                sentence_vec += np.matmul(vect, transform)
            except Exception as e:
                # Best effort: report the word that failed and carry on.
                print(e)
                continue
        try:
            sentence_vec = normalized(sentence_vec).reshape(embedding_dim)
        except Exception as e:
            # Keep the un-normalized sum if normalization fails.
            print(e)
        vectors.append(sentence_vec)

    print(len(vectors))
    return np.array(vectors)

In [None]:
def create_thai_embeddings(content, dictionary, transform):
    """Embed Thai sentences as normalized sums of transformed word vectors.

    Each sentence is segmented with ``word_tokenize``. Every word is looked
    up in `dictionary` (lower-cased; if that fails the lookup is retried
    with ASCII punctuation stripped), multiplied by `transform` and added
    to the sentence vector. Words that cannot be embedded are reported
    with ``print`` and skipped.

    Args:
        content: iterable of sentence strings.
        dictionary: word -> vector lookup supporting ``dictionary[word]``.
        transform: 2-D alignment matrix applied as ``vector @ transform``.

    Returns:
        Array with one row per sentence. The row width is taken from the
        last axis of `transform` (300 for the matrices used in this
        notebook). The number of sentences is printed as a side effect.
    """
    embedding_dim = np.shape(transform)[-1]
    vectors = []
    for sentence in content:
        words = word_tokenize(sentence)
        sentence_vec = np.zeros(embedding_dim)
        for word in words:
            try:
                try:
                    vect = dictionary[word.lower()]
                except Exception:
                    # Retry without punctuation. `Exception` (not a bare
                    # except) so Ctrl-C still interrupts a long run.
                    vect = dictionary[word.translate(translate_table).lower()]
                sentence_vec += np.matmul(vect, transform)
            except Exception as e:
                # Best effort: report the word that failed and carry on.
                print(e)
                continue
        try:
            sentence_vec = normalized(sentence_vec).reshape(embedding_dim)
        except Exception as e:
            # Keep the un-normalized sum if normalization fails.
            print(e)
        vectors.append(sentence_vec)

    print(len(vectors))
    return np.array(vectors)

In [None]:
def create_japanese_embeddings(content, dictionary, transform):
    """Embed Japanese sentences as normalized sums of transformed word vectors.

    Each sentence is segmented with the module-level tokenizer `jap_t`;
    the ``surface`` attribute of every token is used as the word. Every
    word is looked up in `dictionary` (lower-cased; if that fails the
    lookup is retried with ASCII punctuation stripped), multiplied by
    `transform` and added to the sentence vector. Words that cannot be
    embedded are reported with ``print`` and skipped.

    Args:
        content: iterable of sentence strings.
        dictionary: word -> vector lookup supporting ``dictionary[word]``.
        transform: 2-D alignment matrix applied as ``vector @ transform``.

    Returns:
        Array with one row per sentence. The row width is taken from the
        last axis of `transform` (300 for the matrices used in this
        notebook). The number of sentences is printed as a side effect.
    """
    embedding_dim = np.shape(transform)[-1]
    vectors = []
    for sentence in content:
        words = [token.surface for token in jap_t.tokenize(sentence)]
        sentence_vec = np.zeros(embedding_dim)
        for word in words:
            try:
                try:
                    vect = dictionary[word.lower()]
                except Exception:
                    # Retry without punctuation. `Exception` (not a bare
                    # except) so Ctrl-C still interrupts a long run.
                    vect = dictionary[word.translate(translate_table).lower()]
                sentence_vec += np.matmul(vect, transform)
            except Exception as e:
                # Best effort: report the word that failed and carry on.
                print(e)
                continue
        try:
            sentence_vec = normalized(sentence_vec).reshape(embedding_dim)
        except Exception as e:
            # Report instead of silently swallowing, as the sibling
            # functions do; the un-normalized sum is kept.
            print(e)
        vectors.append(sentence_vec)

    print(len(vectors))
    return np.array(vectors)

In [None]:
def create_sentence_embeddings(content, dictionary, transform, language=None):
    """Embed sentences as normalized sums of transformed word vectors.

    Languages that need a dedicated tokenizer are delegated:
    ``'ja'`` -> `create_japanese_embeddings`, ``'th'`` ->
    `create_thai_embeddings`, ``'zh'`` -> `create_chinese_embeddings`.
    Any other value (including the default ``None``) splits each sentence
    on single spaces.

    Every word is looked up in `dictionary` (lower-cased; if that fails
    the lookup is retried with ASCII punctuation stripped), multiplied by
    `transform` and added to the sentence vector. Words that cannot be
    embedded are reported with ``print`` and skipped.

    Args:
        content: iterable of sentence strings.
        dictionary: word -> vector lookup supporting ``dictionary[word]``.
        transform: 2-D alignment matrix applied as ``vector @ transform``.
        language: language code selecting the tokenizer (optional).

    Returns:
        Array with one row per sentence. The row width is taken from the
        last axis of `transform` (300 for the matrices used in this
        notebook). The number of sentences is printed as a side effect.
    """
    if language == 'ja':
        return create_japanese_embeddings(content, dictionary, transform)
    if language == 'th':
        return create_thai_embeddings(content, dictionary, transform)
    if language == 'zh':
        # Chinese is not space-delimited, so it needs the jieba-based path.
        return create_chinese_embeddings(content, dictionary, transform)

    embedding_dim = np.shape(transform)[-1]
    vectors = []
    for sentence in content:
        words = sentence.split(" ")
        sentence_vec = np.zeros(embedding_dim)
        for word in words:
            try:
                try:
                    vect = dictionary[word.lower()]
                except Exception:
                    # Retry without punctuation. `Exception` (not a bare
                    # except) so Ctrl-C still interrupts a long run.
                    vect = dictionary[word.translate(translate_table).lower()]
                sentence_vec += np.matmul(vect, transform)
            except Exception as e:
                # Best effort: report the word that failed and carry on.
                print(e)
                continue
        try:
            sentence_vec = normalized(sentence_vec).reshape(embedding_dim)
        except Exception as e:
            # Keep the un-normalized sum if normalization fails.
            print(e)
        vectors.append(sentence_vec)

    print(len(vectors))
    return np.array(vectors)

In [None]:
import os
def extract_content(language, path="/ais/clspace5/u/vkpriya/muse/fastText_multilingual/data/aligned/"):
    """Read ``<language>.txt`` from every sub-directory of `path`.

    Args:
        language: file stem to read, e.g. ``"French"``.
        path: directory whose entries each contain one ``<language>.txt``.
            The default is the location originally hard-coded here.

    Returns:
        All lines of all files (whitespace-stripped) in one list. Entries
        are visited in the order ``os.listdir`` returns them. The total
        number of lines is printed as a side effect.
    """
    all_content = []
    for entry in os.listdir(path):
        file_path = os.path.join(path, entry, language + ".txt")
        # Explicit UTF-8 so decoding does not depend on the machine locale.
        with open(file_path, "r", encoding="utf-8") as f:
            all_content.extend(line.strip() for line in f)
    print(len(all_content))
    return all_content

In [None]:
fr_dictionary = FastVector(vector_file='../MUSE/data/wiki.fr.vec')
en_dictionary = FastVector(vector_file='../MUSE/data/wiki.en.vec')
de_dictionary = FastVector(vector_file='../MUSE/data/wiki.de.vec')
es_dictionary = FastVector(vector_file='../MUSE/data/wiki.es.vec')
hu_dictionary = FastVector(vector_file='../MUSE/data/wiki.hu.vec')
tr_dictionary = FastVector(vector_file='../MUSE/data/wiki.tr.vec')
fi_dictionary = FastVector(vector_file='../MUSE/data/wiki.fi.vec')

#fr_vector = fr_dictionary["chat"]
#ru_vector = ru_dictionary["кот"]
#print(FastVector.cosine_similarity(fr_vector, ru_vector))

In [None]:
ru_dictionary = FastVector(vector_file='../MUSE/data/wiki.ru.vec')
pt_dictionary = FastVector(vector_file='../MUSE/data/wiki.pt.vec')
pl_dictionary = FastVector(vector_file='../MUSE/data/wiki.pl.vec')

In [None]:
it_dictionary = FastVector(vector_file='../MUSE/data/wiki.it.vec')

In [None]:
bg_dictionary = FastVector(vector_file='../MUSE/data/wiki.bg.vec')
ja_dictionary = FastVector(vector_file='../MUSE/data/wiki.ja.vec')
th_dictionary = FastVector(vector_file='../MUSE/data/wiki.th.vec')
zh_dictionary = FastVector(vector_file='../MUSE/data/wiki.zh.vec')

In [None]:
en_words = set(en_dictionary.word2id.keys())
fr_words = set(fr_dictionary.word2id.keys())
de_words = set(de_dictionary.word2id.keys())
es_words = set(es_dictionary.word2id.keys())
#overlap = list(ru_words & fr_words)
#bilingual_dictionary = [(entry, entry) for entry in overlap]

In [None]:
hu_words = set(hu_dictionary.word2id.keys())
fi_words = set(fi_dictionary.word2id.keys())
tr_words = set(tr_dictionary.word2id.keys())
ru_words = set(ru_dictionary.word2id.keys())
pt_words = set(pt_dictionary.word2id.keys())
pl_words = set(pl_dictionary.word2id.keys())

In [None]:
it_words = set(it_dictionary.word2id.keys())

In [None]:
bg_words = set(bg_dictionary.word2id.keys())
zh_words = set(zh_dictionary.word2id.keys())
th_words = set(th_dictionary.word2id.keys())
ja_words = set(ja_dictionary.word2id.keys())

In [None]:
fr_tran = np.loadtxt("alignment_matrices/fr.txt")
de_tran = np.loadtxt("alignment_matrices/de.txt")
es_tran = np.loadtxt("alignment_matrices/es.txt")
hu_tran = np.loadtxt("alignment_matrices/hu.txt")
tr_tran = np.loadtxt("alignment_matrices/tr.txt")
fi_tran = np.loadtxt("alignment_matrices/fi.txt")

In [None]:
ru_tran = np.loadtxt("alignment_matrices/ru.txt")
pl_tran = np.loadtxt("alignment_matrices/pl.txt")
pt_tran = np.loadtxt("alignment_matrices/pt.txt")

In [None]:
it_tran = np.loadtxt("alignment_matrices/it.txt")

In [None]:
zh_tran = np.loadtxt("alignment_matrices/zh.txt")
bg_tran = np.loadtxt("alignment_matrices/bg.txt")
ja_tran = np.loadtxt("alignment_matrices/ja.txt")
th_tran = np.loadtxt("alignment_matrices/th.txt")

In [None]:
# Sanity check: cosine similarity between the aligned German "die" and the
# aligned French "les".
de_vector = de_dictionary["die"]
fr_vector = fr_dictionary["les"]
# The German alignment matrix is `de_tran`; `ge_tran` is not assigned in any
# cell above.
print(FastVector.cosine_similarity(np.matmul(de_vector, de_tran), np.matmul(fr_vector, fr_tran)))

In [None]:
import csv
def read_bible_embeddings(lang):
    """Load precomputed sentence embeddings from ``<lang>_sent_embeddings.csv``.

    The second column of every CSV row is expected to hold a
    whitespace-separated vector, optionally wrapped in square brackets.

    Args:
        lang: prefix of the CSV file name.

    Returns:
        Array with one parsed vector per CSV row. The number of rows is
        printed as a side effect.
    """
    file_name = lang + "_sent_embeddings.csv"
    sent_matrix = []
    # newline="" is what the csv module asks for, so that line breaks
    # inside quoted fields are handled by the reader itself.
    with open(file_name, "r", newline="") as f:
        for row in csv.reader(f):
            array_string = row[1].replace("[", "").replace("]", "")
            sent_matrix.append(np.fromstring(array_string, sep=' '))

    sent_matrix = np.array(sent_matrix)
    print(len(sent_matrix))
    return sent_matrix

In [None]:
french_bible = extract_content("French")
german_bible = extract_content("German")
spanish_bible = extract_content("Spanish")
english_bible = extract_content("English")
hungarian_bible = extract_content("Hungarian")
finnish_bible = extract_content("Finnish")
turkish_bible = extract_content("Turkish")

In [None]:
polish_bible = extract_content("Polish")
portuguese_bible = extract_content("Portuguese")
russian_bible = extract_content("Russian")

In [None]:
italian_bible = extract_content("Italian")

In [None]:
# Load the aligned sentences for the remaining languages.
bulgarian_bible = extract_content("Bulgarian")
chinese_bible = extract_content("Chinese")
thai_bible = extract_content("Thai")
# NOTE(review): this reads the "English" file into `japanese_bible` — looks
# like a copy-paste slip; confirm whether "Japanese" was intended.
japanese_bible = extract_content("English")

In [None]:
# Sentence embeddings for each language after applying its alignment matrix.
fr_en = create_sentence_embeddings(french_bible, fr_dictionary, fr_tran,'fr')
# German uses `de_tran`; `ge_tran` is not assigned in any cell above.
de_en = create_sentence_embeddings(german_bible, de_dictionary, de_tran,'de')
es_en = create_sentence_embeddings(spanish_bible, es_dictionary, es_tran,'es')
# English embeddings are read from a precomputed CSV instead of being rebuilt.
en_en = read_bible_embeddings("English")
hu_en = create_sentence_embeddings(hungarian_bible, hu_dictionary, hu_tran,'hu')
tr_en = create_sentence_embeddings(turkish_bible, tr_dictionary, tr_tran,'tr')
fi_en = create_sentence_embeddings(finnish_bible, fi_dictionary, fi_tran,'fi')

In [None]:
ru_en = create_sentence_embeddings(russian_bible, ru_dictionary, ru_tran,'ru')
pt_en = create_sentence_embeddings(portuguese_bible, pt_dictionary, pt_tran,'pt')
pl_en = create_sentence_embeddings(polish_bible, pl_dictionary, pl_tran,'pl')

In [None]:
it_en = create_sentence_embeddings(italian_bible, it_dictionary, it_tran, 'it')

In [None]:
i = np.identity(300)

In [None]:
"罣" in zh_dictionary

In [None]:
#bg_en = create_sentence_embeddings(bulgarian_bible, bg_dictionary, bg_tran, 'bg')
#ja_en = create_sentence_embeddings(japanese_bible, ja_dictionary, ja_tran, 'ja')
# th_en is needed by the `li` list and the Thai similarity cells further
# down, so it has to be computed on a fresh kernel.
th_en = create_sentence_embeddings(thai_bible, th_dictionary, th_tran, 'th')
# Pass the real language code ('zh'), as the chinese_orig_embeds call does.
zh_en = create_sentence_embeddings(chinese_bible, zh_dictionary, zh_tran, 'zh')

In [None]:
FastVector.cosine_similarity(zh_en[100].reshape(300), normalized(zh_en[100]).reshape(300))

In [None]:
np.shape(normalized(zh_en[100]))

In [None]:
chinese_orig_embeds = create_sentence_embeddings(chinese_bible, zh_dictionary, np.identity(300), 'zh')

In [None]:
# German sentence embeddings are stored in `de_en`; `ge_en` is never assigned.
de_en.shape

In [None]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.preprocessing` is loaded. This still binds the name `sklearn`.
import sklearn.preprocessing
min_max_scaler = sklearn.preprocessing.MinMaxScaler()

In [None]:
# Names of the per-language sentence-embedding variables compared below.
li = [
    'fr_en', 'de_en', 'es_en', 'en_en', 'hu_en', 'tr_en', 'fi_en',
    'pt_en', 'ru_en', 'pl_en', 'it_en', 'zh_en', 'th_en',
]

In [None]:
# For every embedding set, compute the cosine similarity between each
# sentence and the next one, store it as `<xx>_inter` (NaNs replaced via
# np.nan_to_num) and print the mean.
for name in li:
    embeds = vars()[name]
    consecutive = []
    for current, following in zip(embeds[:-1], embeds[1:]):
        consecutive.append(FastVector.cosine_similarity(current, following))
    inter = np.nan_to_num(np.array(consecutive))
    vars()[name[:2] + "_inter"] = inter
    print(name)
    print(np.mean(inter))
    

In [None]:
#li = ['fr_en','de_en','es_en','en_en','hu_en','tr_en','fi_en', 'pt_en', 'ru_en', 'pl_en','it_en','bg_en','th_en']
# For every ordered pair of distinct embedding sets, compute the per-sentence
# cosine similarity and store the list as `<xx>_<yy>_` (e.g. `hu_en_`).
for first_name in li:
    for second_name in li:
        if first_name != second_name:
            first_embeds = vars()[first_name]
            second_embeds = vars()[second_name]

            pair_key = first_name[:2] + "_" + second_name[:2] + "_"
            print(pair_key)

            pair_sims = []
            for row in range(len(first_embeds)):
                pair_sims.append(FastVector.cosine_similarity(first_embeds[row], second_embeds[row]))
            vars()[pair_key] = pair_sims
            print(np.nanmean(pair_sims), np.nanmax(pair_sims))

In [None]:
normalized([1,2,3,4])

In [None]:
# Count the sentences whose Hungarian-English similarity (`hu_en_`) exceeds
# the Hungarian-Turkish one (`hu_tr_`), collecting the margins in `diff`.
# NOTE(review): the loop length comes from `fr_en` although only the hu_*
# lists are indexed (the old comment here said "french") — confirm.
count = 0
diff = []
for i in range(len(fr_en)):
    if hu_en_[i]>hu_tr_[i]:
        diff.append(hu_en_[i]-hu_tr_[i])
        count+=1

In [None]:
# Fraction of sentences counted above. Divide by the number of sentences,
# not by the last loop index `i` (which is one less).
count/len(fr_en)

In [None]:
np.nanmin(diff)

In [None]:
language_codes = ['de','en','es','fi', 'fr','hu','tr','ru','pl','pt','it','th','zh']

In [None]:
# For each language code, gather its similarity lists against every other
# language (in language_codes order) into `<lc>_sim_matrix`.
for lc in language_codes:
    others = [code for code in language_codes if code != lc]
    print(others)
    rows = []
    for other in others:
        rows.append(vars()[lc + "_" + other + "_"])
    vars()[lc + "_sim_matrix"] = rows

In [None]:
sim_matrix = np.array(th_sim_matrix)
sim_matrix.shape

In [None]:
sim_matrix[:,0]

In [None]:
# For every sentence, record which language has the highest similarity in
# `sim_matrix`. The row labels (all codes except 'th') do not change between
# iterations, so they are built once instead of once per sentence.
modify = [x for x in language_codes if x!='th']
max_lang = []
for i in range(len(french_bible)):
    max_lang.append(modify[np.argmax(sim_matrix[:,i])])

In [None]:
len(max_lang)

In [None]:
from collections import Counter
def Most_Common(lst):
    """Return ``(item, count)`` pairs for `lst`, most frequent first."""
    return Counter(lst).most_common()

In [None]:
thai_counts = Most_Common(max_lang)

In [None]:
languages = ["english","german","spanish","french","turkish","hungarian","finnish","russian","polish","portuguese", "italian","chinese","thai"]


In [None]:
# Print the `<language>_counts` variable for every language name.
# NOTE(review): only `thai_counts` is assigned in the cells above; the other
# `<language>_counts` names presumably came from earlier kernel state —
# confirm, since the lookup fails for any name that is missing.
for language in languages:
    print(language)
    var = language+"_counts"
    print(vars() [var])

In [None]:
# Test on dictionaries first to see quality of alignment 

In [None]:
languages

In [None]:
# Only add 'zh' when it is missing: the list defined above already contains
# it, and an unconditional append adds another copy on every run of this cell.
if 'zh' not in language_codes:
    language_codes.append('zh')

In [None]:
# Word-level check of each alignment against English: for every pair in the
# <lc>-en dictionary file, compare the transformed source vector with the
# English vector and print mean / max / min cosine similarity.
for lc in language_codes:
    if lc == 'en':
        continue
    file = "../MUSE/data/crosslingual/dictionaries/" + lc + "-en.txt"
    with open(file, "r") as f:
        entries = [line.strip() for line in f.readlines()]

    # Split on spaces, unless the first space-separated field contains a
    # tab, in which case the entry is split on tabs instead.
    word_dict = []
    for entry in entries:
        fields = entry.split(" ")
        if "\t" in fields[0]:
            fields = entry.split("\t")
        word_dict.append(tuple(fields))

    transform = vars()[lc + "_tran"]
    d = vars()[lc + "_dictionary"]
    average_similarity = []
    for pair in word_dict:
        aligned_source = np.matmul(d[pair[0]], transform)
        average_similarity.append(FastVector.cosine_similarity(aligned_source, en_dictionary[pair[1]]))
    print(lc)
    print(np.mean(average_similarity), np.max(average_similarity), np.min(average_similarity))



In [None]:
    # Read the ru-en dictionary file into `ru_dict` (tuples of fields) and
    # collect the second field of every entry in `eng1`.
    file = "../MUSE/data/crosslingual/dictionaries/ru-en.txt"
    eng1 = []
    with open(file, "r") as f:
        content = f.readlines()
    content = [x.strip() for x in content]
    ru_dict = []
    for item in content:
        # Split on spaces; fall back to tabs when the first field still
        # contains a tab.
        pair = tuple(item.split(" "))
        if "\t" in pair[0]:
            pair = tuple(item.split("\t"))
        ru_dict.append(pair)
        eng1.append(pair[1])
        
    # Same procedure for the pl-en file, filling `pl_dict` and `eng2`.
    file = "../MUSE/data/crosslingual/dictionaries/pl-en.txt"
    eng2 = []
    with open(file, "r") as f:
        content = f.readlines()
    content = [x.strip() for x in content]
    pl_dict = []
    for item in content:
        pair = tuple(item.split(" "))
        if "\t" in pair[0]:
            pair = tuple(item.split("\t"))
        pl_dict.append(pair)
        eng2.append(pair[1])
    '''
    file = "../MUSE/data/crosslingual/dictionaries/fi-en.txt"
    eng3 = []
    with open(file, "r") as f:
        content = f.readlines()
    content = [x.strip() for x in content]
    fi_dict = []
    for item in content:
        pair = tuple(item.split(" "))
        if "\t" in pair[0]:
            pair = tuple(item.split("\t"))
        fi_dict.append(pair)
        eng3.append(pair[1])
    
    '''

In [None]:
common_eng = list(set(eng1)&(set(eng2)))

In [None]:
# Keep the words of `common_eng` that are present in both the Russian and
# the Polish vector dictionaries, and collect their aligned vectors.
shared_words = [w for w in common_eng if w in ru_dictionary and w in pl_dictionary]
common_ru = [np.matmul(ru_dictionary[w], ru_tran) for w in shared_words]
common_pl = [np.matmul(pl_dictionary[w], pl_tran) for w in shared_words]


In [None]:
len(common_ru)

In [None]:
# Mean cosine similarity between the paired Polish and Russian vectors.
s = [
    FastVector.cosine_similarity(pl_vec, ru_vec)
    for pl_vec, ru_vec in zip(common_pl, common_ru)
]
print(np.mean(s))

## Common words are not a good measure of alignment accuracy

# Between European languages

In [None]:
# For every ordered pair of distinct languages, read the <lc1>-<lc2>
# dictionary file and print the average cosine similarity between the two
# transformed word vectors of each entry.
# NOTE(review): `european` is not used anywhere in this cell — the loops run
# over `languages` instead; confirm which set was intended.
european = ['pt', 'es', 'fr', 'ge','it']
for language1 in languages:
    for language2 in languages:
        # NOTE(review): codes are taken by position in `language_codes`; this
        # assumes `languages` and `language_codes` are index-aligned. The
        # lists defined in earlier cells do not appear to line up
        # ('english' is first in one, 'de' in the other) — confirm.
        lc1 = language_codes[languages.index(language1)]
        lc2 = language_codes[languages.index(language2)]
        if lc1==lc2:
            continue
       
        file = "../MUSE/data/crosslingual/dictionaries/"+lc1+"-"+lc2+".txt"
        with open(file, "r") as f:
            content = f.readlines()
        content = [x.strip() for x in content]
        word_dict = []
        for item in content:
            # Split on spaces; fall back to tabs when the first field still
            # contains a tab.
            pair = tuple(item.split(" "))
            if "\t" in pair[0]:
                pair = tuple(item.split("\t"))
            word_dict.append(pair)
        average_similarity = []
        count = 0
        # NOTE(review): the `<language>_it5_tran` matrices are only assigned
        # in a later cell of this notebook, so this cell depends on
        # out-of-order execution.
        transform1 = vars() [language1+"_it5_tran"]
        transform2 = vars() [language2+"_it5_tran"]
        for pair in word_dict:
            dic = lc1+"_dictionary"
            d1 = vars() [dic]
            dic = lc2+"_dictionary"
            d2 = vars() [dic]
            source_vector = d1[pair[0]]
            target_vector = d2[pair[1]]
            #word = en_dictionary.translate_nearest_neighbour(german_vector)
            #print(pair[1], word)
            #if word == pair[1]:
            #    count+=1
            similarity = FastVector.cosine_similarity(np.matmul(source_vector, transform1), np.matmul(target_vector, transform2))
            average_similarity.append(similarity)
        print(lc1, lc2)
        print(np.sum(average_similarity)/len(word_dict))


In [None]:
#align using sentence embeddings and SVD, check dictionary performance

In [None]:
languages = ['german',
 'spanish',
 'french',
 'turkish',
 'hungarian',
 'finnish',
 'russian',
 'polish',
 'portuguese',
 'italian', 'bulgarian', 'thai']

In [None]:
language_codes = ['de','es','fr','tr','hu','fi','ru','pl','pt','it','bg','th']

In [None]:
english_embeddings.shape

In [None]:
    # Learn one alignment matrix per language from sentence embeddings:
    # embed the text with an identity transform, run the SVD step against
    # the English embeddings, and store U @ V as `<language>_eng_tran`.
    for language in languages:
        lc = language_codes[languages.index(language)]
        bible = vars()[language + "_bible"]
        dic = vars()[lc + "_dictionary"]
        source_embeddings = create_sentence_embeddings(bible, dic, np.identity(300), lc)

        (U, V) = learn_transformation(source_embeddings, english_embeddings, normalize_vectors=True)
        vars()[language + "_eng_tran"] = np.matmul(U, V)

In [None]:
german_eng_tran.shape

In [None]:
g = create_sentence_embeddings(german_bible, de_dictionary, german_eng_tran, 'de')

In [None]:
# Per-sentence cosine similarity between the re-aligned German embeddings
# `g` and the English embeddings `en_en`.
s = [FastVector.cosine_similarity(g[i], en_en[i]) for i in range(len(english_bible))]

In [None]:
np.nanmean(s)

In [None]:
# Word-level check of the sentence-based alignments: for every pair in the
# <lc>-en dictionary file, compare the source vector transformed with
# `<language>_eng_tran` against the English vector and print
# mean / max / min cosine similarity.
for lc in language_codes:
    if lc == 'en':
        continue
    file = "../MUSE/data/crosslingual/dictionaries/" + lc + "-en.txt"
    with open(file, "r") as f:
        entries = [line.strip() for line in f.readlines()]

    # Split on spaces, unless the first space-separated field contains a
    # tab, in which case the entry is split on tabs instead.
    word_dict = []
    for entry in entries:
        fields = entry.split(" ")
        if "\t" in fields[0]:
            fields = entry.split("\t")
        word_dict.append(tuple(fields))

    transform = vars()[languages[language_codes.index(lc)] + "_eng_tran"]
    d = vars()[lc + "_dictionary"]
    average_similarity = []
    for pair in word_dict:
        aligned_source = np.matmul(d[pair[0]], transform)
        average_similarity.append(FastVector.cosine_similarity(aligned_source, en_dictionary[pair[1]]))
    print(lc)
    print(np.mean(average_similarity), np.max(average_similarity), np.min(average_similarity))



In [None]:
#new matrix = average of all vectors 
#realign. 
#Compare word sim scores

In [None]:
languages = ['german',
 'spanish',
 'french',
 'portuguese',
 'italian']

In [None]:
language_codes = ['de', 'es', 'fr','pt','it']

In [None]:
FastVector.cosine_similarity(de_en[20], english_orig_embeds[20])

In [None]:
#new sentence embeddings
for language in languages:
    lc = language_codes[languages.index(language)]
    dic = vars() [lc+"_dictionary"]
    content = vars() [language+"_bible"]
    trans = vars() [language+"_it4_tran"]
    vars() [language+"_it5_embs"] = create_sentence_embeddings(content, dic, trans, lc)
    

In [None]:
FastVector.cosine_similarity(german_it2_embs[20], english_orig_embeds[20])

In [None]:
# Average the `<language>_it5_embs` embeddings of all languages, sentence by
# sentence; the result is the target space for the next re-alignment.
new_average = []

for row in range(len(english_embeddings)):
    total = np.zeros(300)
    for lang in languages:
        lang_embs = vars()[lang + "_it5_embs"]
        total = total + lang_embs[row]
    new_average.append(total / len(languages))



In [None]:
new_average = np.array(new_average)

In [None]:
new_average.shape

In [None]:
#realign
for language in languages:
        source_embeddings = vars() [language+"_orig_embeds"]
        #russian_embeddings = read_bible_embeddings("Russian")
        #mat_file = "alignment_matrices/"+language_code[languages.index(language)]+".txt"
        (U, V) = learn_transformation(source_embeddings, new_average, normalize_vectors=True)
        var = language+"_it5_tran"
        vars() [var] = np.matmul(U,V)

In [None]:
#word similarity
for language in languages:
    lc = language_codes[languages.index(language)]
    if language == "english":
        continue
    file = "../MUSE/data/crosslingual/dictionaries/"+lc+"-en.txt"
    with open(file, "r") as f:
        content = f.readlines()
    content = [x.strip() for x in content]
    word_dict = []
    for item in content:
        pair = tuple(item.split(" "))
        if "\t" in pair[0]:
            pair = tuple(item.split("\t"))
        word_dict.append(pair)
    average_similarity = []
    count = 0
    transform1 = vars() [language+"_it2_tran"]
    #transform2 = vars() ["english_it2_tran"]
    for pair in word_dict:
        dic = lc+"_dictionary"
        d = vars() [dic]
        source_vector = d[pair[0]]
        source_vector = np.matmul(source_vector, transform1)
        #source_vector = np.matmul(source_vector, transform1b)
        #word = en_dictionary.translate_nearest_neighbour(german_vector)
        #print(pair[1], word)
        #if word == pair[1]:
        #    count+=1
        similarity = FastVector.cosine_similarity(source_vector,en_dictionary[pair[1]])
        average_similarity.append(similarity)
    print(lc)
    print(np.mean(average_similarity), np.max(average_similarity), np.min(average_similarity))



# Align to different pivot spaces

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Five-fold splitter over sentence indices (KFold is imported in the cell above).
kf = KFold(n_splits=5)

In [None]:
for train_indices, test_indices in kf.split(chinese_bible):
    print(len(test_indices), len(train_indices))

In [None]:
print(languages)

In [None]:
language_codes = ['en','de','es','fr','tr','hu','fi','ru','pl','pt','it','zh','th']

In [None]:
# Map each language name to the code at the same position in
# `language_codes` (the two lists are assumed to be index-aligned).
l_lc_dict = {name: language_codes[position] for position, name in enumerate(languages)}

In [None]:
# Align all to Russian. Does Polish have the highest similarity?
# `french_bible` is a plain list (extract_content returns a list), so it
# cannot be indexed with the index array produced by kf.split; pick the
# test sentences one by one instead.
[french_bible[idx] for idx in test_indices]

In [None]:
    # Five-fold evaluation per language: learn the alignment to English on
    # the training sentences, embed the held-out sentences with it, and
    # store the per-fold mean cosine similarity in `<language>_kfold_avg`.
    for language in languages:
        fold_scores = []
        for train_indices, test_indices in kf.split(hungarian_bible):
            lc = l_lc_dict[language]
            dic = vars()[lc + "_dictionary"]
            train_source = vars()[language + "_orig_embeds"][train_indices]
            train_target = english_orig_embeds[train_indices]
            (U, V) = learn_transformation(train_source, train_target, normalize_vectors=True)
            transform = np.matmul(U, V)

            # The text is a plain list, so held-out sentences are picked
            # one index at a time.
            test_bible = vars()[language + "_bible"]
            held_out = []
            for sentence_index in test_indices:
                held_out.append(test_bible[sentence_index])
            test_target = english_orig_embeds[test_indices]
            aligned_source = create_sentence_embeddings(held_out, dic, transform, lc)

            similarities = []
            for k in range(len(test_indices)):
                similarities.append(FastVector.cosine_similarity(aligned_source[k], test_target[k]))
            fold_scores.append(np.nanmean(similarities))
        vars()[language + "_kfold_avg"] = fold_scores

In [None]:
for language in languages:
    print(language)
    print(np.mean(vars() [language+"_kfold_avg"]))

## TSNE Plots

In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np
import re
import nltk

from gensim.models import word2vec

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [None]:
def tsne_plot(matrix1, matrix2):
    """Fit one t-SNE model on both sets of vectors and draw a labelled scatter plot.

    Rows of `matrix1` are labelled "0", "1", ...; rows of `matrix2` are
    labelled "0'", "1'", ... so that paired rows can be matched by number.

    Args:
        matrix1: iterable of vectors (first set).
        matrix2: iterable of vectors (second set), projected together with
            the first set.

    Returns:
        None. A 16x16 inch matplotlib figure is created as a side effect.

    NOTE(review): newer scikit-learn releases rename TSNE's ``n_iter`` to
    ``max_iter`` — confirm against the installed version.
    """
    tokens = []
    labels = []
    for i, e in enumerate(matrix1):
        tokens.append(e)
        labels.append(str(i))
    for i, e in enumerate(matrix2):
        tokens.append(e)
        labels.append(str(i) + "'")

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    # One scatter call per point, as before, so every point is drawn with
    # its own entry from the colour cycle.
    for label, value in zip(labels, new_values):
        x, y = value[0], value[1]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

In [None]:

tsne_plot(hu_en[:100], en_en[:100])

In [None]:
print(language_codes)
print(languages)

In [None]:
# For every language, embed its sentences with the stored alignment matrix
# and compute the per-sentence cosine similarity to the English embeddings.
# Results are kept as `<lang>_sentence_embs` and `<lang>_sentence_sim`.
for lang in languages:
    lc = language_codes[languages.index(lang)]
    if lc == "ge":
        lc = "de"
    dic = vars() [lc+"_dictionary"]
    content = vars() [lang+"_bible"]
    # Compare against the English *embeddings* (`en_en`); `english_bible`
    # holds raw sentence strings and cannot go into cosine_similarity.
    english_embeds = en_en
    # NOTE(review): matrices are loaded under code-based names (`de_tran`,
    # ...) in the cells above; confirm that `<lang>_tran` exists for these
    # language names.
    transform = vars() [lang+"_tran"]
    # Pass the language code so the matching tokenizer is selected (the
    # argument was missing from this call).
    source_embeds = create_sentence_embeddings(content, dic, transform, lc)
    vars() [lang+"_sentence_embs"] = source_embeds
    print(lang)
    s = []
    for i in range(len(english_embeds)):
        s.append(FastVector.cosine_similarity(source_embeds[i], english_embeds[i]))
    #print(np.nanmean(s), np.nanmax(s), np.nanmin(s))
    vars() [lang+"_sentence_sim"] = s

In [None]:
for lang in languages:
    var = vars() [lang+"_sentence_sim"]
    print(lang)
    print(np.nanmean(var), np.nanmax(var), np.nanmin(var))

In [None]:
#tsne the sentence embeddings

In [None]:
# Per-sentence cosine similarity between the Russian and Polish embeddings.
s = [
    FastVector.cosine_similarity(russian_sentence_embs[i], polish_sentence_embs[i])
    for i in range(len(russian_sentence_embs))
]

In [None]:
s[0]

In [None]:
# Number of sentences where the Russian-Polish similarity in `s` is larger
# than the Polish-English similarity.
c = sum(
    1
    for i in range(len(russian_sentence_embs))
    if s[i] > polish_sentence_sim[i]
)