# create monolingual sentence embeddings 

In [1]:

#import
import csv, gensim, random

In [2]:
import numpy as np
from fasttext import FastVector

# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a, axis=-1, order=2):
    """Utility function to normalize the rows of a numpy array."""
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
    l2[l2==0] = 1
    return a / np.expand_dims(l2, axis)

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Source and target dictionaries are the FastVector objects of
    source/target languages. bilingual_dictionary is a list of 
    translation pair tuples [(source_word, target_word), ...].
    """
    source_matrix = []
    target_matrix = []

    for (source, target) in bilingual_dictionary:
        if source in source_dictionary and target in target_dictionary:
            source_matrix.append(source_dictionary[source])
            target_matrix.append(target_dictionary[target])

    # return training matrices
    return np.array(source_matrix), np.array(target_matrix)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Source and target matrices are numpy arrays, shape
    (dictionary_length, embedding_dimension). These contain paired
    word vectors from the bilingual dictionary.
    """
    # optionally normalize the training vectors
    if normalize_vectors:
        source_matrix = normalized(source_matrix)
        target_matrix = normalized(target_matrix)

    # perform the SVD
    product = np.matmul(source_matrix.transpose(), target_matrix)
    U, s, V = np.linalg.svd(product)

    # return orthogonal transformation which aligns source language to the target
    return (U, V)

In [3]:
def read_bible_embeddings(lang):
    sent_matrix = []
    file_name = lang+"_sent_embeddings.csv"
    with open(file_name,"r") as f:
        reader = csv.reader(f)
        for row in reader:
            array_string = row[1]
            array_string = array_string.replace("[","")
            array_string = array_string.replace("]","")
            array = np.fromstring(array_string, sep = ' ')
            sent_matrix.append(array)

    sent_matrix = np.array(sent_matrix)
    print(len(sent_matrix))
    return sent_matrix

In [4]:
lines = 26006
index_list = np.arange(lines)
random.shuffle(index_list)
l = int(0.8*lines)
train_indices = index_list[:l]
test_indices = index_list[l:]


In [5]:

def get_u_v(source, target):
    source_matrix = read_bible_embeddings(source)
    target_matrix = read_bible_embeddings(target)
    #(train_indices, test_indices) = get_train_test(source_matrix)
    source_train = source_matrix[train_indices]
    target_train = target_matrix[train_indices]
    (U, V) = learn_transformation(source_train, target_train, normalize_vectors=True)
    print("get UV done")
    return (U,V)

In [6]:
languages = ['German', 'Spanish','Finnish','French','Hungarian','Lithuanian','Russian','Turkish','Vietnamese','Chinese', 'Thai']
language_code = ['de','es','fi','fr','hu','lt','ru','tr','vi','zh','th']

In [7]:
languages.index("German")

0

In [8]:
def align_common_subspace(source, target):
    (U,V) = get_u_v(source, target)
    if source=="English":
        source_fasttext_file = "/ais/clspace5/u/vkpriya/muse/MUSE/data/wiki.en.bin"
    else:
        source_fasttext_file = "../MUSE/data/cc."+language_code[languages.index(source)]+".300.bin"
    if target == "English":
        target_fasttext_file = "/ais/clspace5/u/vkpriya/muse/MUSE/data/wiki.en.bin"
    else:
        target_fasttext_file = "../MUSE/data/cc."+language_code[languages.index(target)]+".300.bin"
    source_ft_vectors = gensim.models.fasttext.FastText.load_fasttext_format(source_fasttext_file)
    target_ft_vectors = gensim.models.fasttext.FastText.load_fasttext_format(target_fasttext_file)
    
    source_ft_list = []
    target_ft_list = []
    source_vocab = []
    target_vocab = []
    
    for i,k in enumerate(source_ft_vectors.wv.index2word):
        source_ft_list.append(source_ft_vectors.wv.get_vector(k))
        source_vocab.append(k)
    for i,k in enumerate(target_ft_vectors.wv.index2word):
        target_ft_list.append(target_ft_vectors.wv.get_vector(k))
        target_vocab.append(k)
    source_ft_list = np.array(source_ft_list)
    target_ft_list = np.array(target_ft_list)
    
    
    #get u and v
    #(U,V) = get_u_v(source, target)
    
    source_aligned_vectors = np.matmul(source_ft_list,U)
    target_aligned_vectors = np.matmul(target_ft_list, np.transpose(V))
    
    return (source_aligned_vectors, target_aligned_vectors, source_vocab, target_vocab)
    

In [46]:
def align_english_subspace(source):
    target="English"
    (U,V) = get_u_v(source, target)
    
    if source=="English":
        source_fasttext_file = "/ais/clspace5/u/vkpriya/muse/MUSE/data/wiki.en.bin"
    else:
        source_fasttext_file = "../MUSE/data/cc."+language_code[languages.index(source)]+".300.bin"
    
    target_fasttext_file = "/ais/clspace5/u/vkpriya/muse/MUSE/data/wiki.en.bin"
    
    source_ft_vectors = gensim.models.fasttext.FastText.load_fasttext_format(source_fasttext_file)
    target_ft_vectors = gensim.models.fasttext.FastText.load_fasttext_format(target_fasttext_file)
    print("vec files loaded")
    source_ft_list = []
    target_ft_list = []
    source_vocab = []
    target_vocab = []
    
    for i,k in enumerate(source_ft_vectors.wv.index2word):
        source_ft_list.append(source_ft_vectors.wv.get_vector(k))
        source_vocab.append(k)
    for i,k in enumerate(target_ft_vectors.wv.index2word):
        target_ft_list.append(target_ft_vectors.wv.get_vector(k))
        target_vocab.append(k)
    source_ft_list = np.array(source_ft_list)
    target_ft_list = np.array(target_ft_list)
    
    print("arrays created")
    
    
    #get u and v
    #(U,V) = get_u_v(source, target)
    transform = np.matmul(U,V)
    source_aligned_vectors = np.matmul(source_ft_list,transform)
    print("multiplying")
    #target_aligned_vectors = np.matmul(target_ft_list, np.transpose(V))
    
    return (source_aligned_vectors, target_ft_vectors, source_vocab, target_vocab)
    

In [87]:
def create_sentence_embeddings(content, embeddings, vocab):
    id = 0
    vectors = []
    for sentence in content:
        sentence = sentence.translate(translate_table)
        words = sentence.split(" ")
        #for chinese: no spaces, use each character in string
        sentence_vec = np.zeros(300)
        for word in words:
        #for word in sentence:
            try:
                word_id = vocab.index(word)
                vect = embeddings[word_id]
                sentence_vec+=vect
            except Exception as e:
                print(e)
                continue
        try: 
            sentence_vec = sentence_vec/np.linalg.norm(sentence_vec)
        except Exception as e:
            print(e)
            pass
        #writer.writerow([id, sentence_vec])
        vectors.append(sentence_vec)
        id+=1
        print(id)
    return np.array(vectors)

In [42]:
def create_sentence_embeddings_thai(content, embeddings, vocab):
    id = 0
    vectors = []
    for sentence in content:
        sentence = sentence.translate(translate_table)
        #words = sentence.split(" ")
        #for chinese: no spaces, use each character in string
        sentence_vec = np.zeros(300)
        #for word in words:
        for word in sentence:
            try:
                word_id = vocab.index(word)
                vect = embeddings[word_id]
                sentence_vec+=vect
            except:
                continue
        try: 
            sentence_vec = sentence_vec/np.linalg.norm(sentence_vec)
        except:
            pass
        #writer.writerow([id, sentence_vec])
        vectors.append(sentence_vec)
        id+=1
        print(id)
    return np.array(vectors)

In [8]:
def extract_bible_content(language):
    path = "/ais/clspace5/u/vkpriya/muse/fastText_multilingual/data/aligned/"
    all_content = []
    dirs = os.listdir(path)
    file_name = "/"+language+".txt"
    for dir in dirs:
            file = path + dir + file_name
            with open(file,"r") as f:
                content = f.readlines()
            content = [x.strip() for x in content]
            all_content.extend(content)
    print(len(all_content))
    return all_content

In [17]:
#def write_vec_to_file(vector_list, file_name):
#need vocab.   

In [71]:
(tu_hu_al, hu_tu_al, tu_vocab, hu_vocab) = align_common_subspace("Turkish", "Hungarian")

26006
26006
get UV done


In [72]:
tu_hu_al.shape

(2000000, 300)

In [92]:
hu_vocab[21000:21010]

['1818',
 'múltú',
 'tetszenek',
 'szakirodalomban',
 'Nyíregyházán',
 'Sikerült',
 'mérnökök',
 'Policy',
 'lekötött',
 'társadalmi-gazdasági']

In [75]:
FastVector.cosine_similarity(tu_hu_al[0], hu_tu_al[0])

0.3375962604554288

# Load test datasets

In [9]:
import os
import string
translate_table = dict((ord(char), None) for char in string.punctuation)

In [65]:
# load test data
with open("data/fr-ru/french.txt","r") as f:
    source_ubuntu = f.readlines()
source_ubuntu = [x.strip() for x in source_ubuntu]

In [66]:
with open("data/fr-ru/russian.txt","r") as f:
    target_ubuntu = f.readlines()
target_ubuntu = [x.strip() for x in target_ubuntu]

In [67]:
print(len(source_ubuntu), len(target_ubuntu))

232 232


In [57]:
source_bible = extract_bible_content("French")

26006


In [58]:
target_bible = extract_bible_content("Russian")

26006


In [59]:
source_bible_test = []
target_bible_test = []
for i in test_indices:
    source_bible_test.append(source_bible[i])
    target_bible_test.append(target_bible[i])

In [60]:
source_bible_test[10]

'Fils de l`homme, j`ai rompu le bras de Pharaon, roi d`Égypte; Et voici, on ne l`a point pansé pour le guérir, On ne l`a point enveloppé d`un bandage Pour le lier et le raffermir, Afin qu`il puisse manier l`épée.'

In [61]:
target_ubuntu[10]

'Предоставляет интерфейс D-Bus для опроса и изменения информации об учётных данных пользователей.'


# Use fastext vectors and then transform

In [62]:
U,V = get_u_v("French", "Russian")

26006
26006
get UV done


In [63]:
source_fasttext_file = "/ais/clspace5/u/vkpriya/muse/MUSE/data/cc.fr.300.bin"
source_ft_vectors = gensim.models.fasttext.FastText.load_fasttext_format(source_fasttext_file)

In [20]:
#new create sentence embeddings
def create_sentence_embeddings_new(content, ft_model, t_vec):
    id = 0
    vectors = []
    for sentence in content:
        sentence = sentence.translate(translate_table)
        words = sentence.split(" ")
        #for chinese: no spaces, use each character in string
        sentence_vec = np.zeros(300)
        #print(words)
        for word in words:
        #for word in sentence:
            try:
                original = ft_model.wv.get_vector(word)
                vect = np.matmul(original, t_vec)
                sentence_vec+=vect
            except Exception as e:
                print(e)
                continue
        try: 
            sentence_vec = sentence_vec/np.linalg.norm(sentence_vec)
        except Exception as e:
            print(e)
            pass
        #writer.writerow([id, sentence_vec])
        vectors.append(sentence_vec)
        id+=1
        print(id)
    return np.array(vectors)

In [21]:
target_fasttext_file = "/ais/clspace5/u/vkpriya/muse/MUSE/data/cc.ru.300.bin"
target_ft_vectors = gensim.models.fasttext.FastText.load_fasttext_format(target_fasttext_file)

In [68]:
source_ub_em = create_sentence_embeddings_new(source_ubuntu, source_ft_vectors, U)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
'all ngrams for word s\xa0 absent from model'
'all ngrams for word «\xa0s\xa0»\xa0 absent from model'
21
'all ngrams for word s\xa0 absent from model'
'all ngrams for word «\xa0s\xa0»\xa0 absent from model'
22
'all ngrams for word s\xa0 absent from model'
'all ngrams for word «\xa0s\xa0»\xa0 absent from model'
23
'all ngrams for word s\xa0 absent from model'
'all ngrams for word «\xa0s\xa0»\xa0 absent from model'
24
'all ngrams for word s\xa0 absent from model'
'all ngrams for word «\xa0s\xa0»\xa0 absent from model'
25
'all ngrams for word s\xa0 absent from model'
'all ngrams for word «\xa0s\xa0»\xa0 absent from model'
26
'all ngrams for word s\xa0 absent from model'
'all ngrams for word «\xa0s\xa0»\xa0 absent from model'
27
'all ngrams for word s\xa0 absent from model'
'all ngrams for word «\xa0s\xa0» absent from model'
28
'all ngrams for word s\xa0 absent from model'
'all ngrams for word lACL absent from model'
'all ngrams for word «



In [69]:
target_ub_em  = create_sentence_embeddings_new(target_ubuntu, target_ft_vectors, np.transpose(V))

1
2
3
4
5
6
7
8
9
10
'all ngrams for word DBus absent from model'
11
12
13
'all ngrams for word dacl absent from model'
14
'all ngrams for word dacl absent from model'
15
16
17
18
19
'all ngrams for word tне absent from model'
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
'all ngrams for word  absent from model'
40
41
42
43
44
45
46
47
48
49
50
51
52
'all ngrams for word  absent from model'
53
'all ngrams for word  absent from model'
54
'all ngrams for word  absent from model'
55
56
'all ngrams for word  absent from model'
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
'all ngrams for word «s» absent from model'
132
133
'all ngrams for word «s» absent from model'
134
'all ngrams for word «s» absent from model'
135
'all ngrams for word  absent from model'
136
'all ng

In [70]:
source_b_test = create_sentence_embeddings_new(source_bible_test, source_ft_vectors, U)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
'all ngrams for word  absent from model'
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
'all ngrams for word dAï absent from model'
151
152
153
154
155
156
157
158
159
160
161
162
'all ngrams for word  absent from model'
163
164
'all ngrams for word  absent from model'
165
166
167
168
169
170
171
172
173
174
175
176
'all ngrams for word  absent from model'
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225


1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
'all ngrams for word quÉli absent from model'
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
'all ngrams for word Naie absent from model'
1833
1834
1835
'all ngrams for word  absent from model'
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
'all ngrams for word Lâme absent from model'
1853
1854
1855
1856
1857
1858
185

3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
'all ngrams for word dAsa absent from model'
3374
'all ngrams for word dAsa absent from model'
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
'all ngrams for word dAmtsi absent from model'
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
351

4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
'all ngrams for word  absent from model'
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
'all ngrams for word Lâme absent from model'
5004
5005
5006
5007
5008
5009
'all ngrams for word dÉla absent from model'
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
'all ngrams for word Eljoénaï absent from model'
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
'all ngrams for word  absent from model'
5042
'all ngrams for word Puva absent from model'
5043
5044
5045
5046
'all ngrams for word  absent from model'
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
'all ngrams for wo

In [71]:
target_b_test = create_sentence_embeddings_new(target_bible_test, target_ft_vectors, np.transpose(V))

1
2
3
4
5
6
7
8
'all ngrams for word „лев absent from model'
9
'all ngrams for word  absent from model'
10
11
12
'all ngrams for word  absent from model'
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
'all ngrams for word „лев absent from model'
35
36
37
38
'all ngrams for word  absent from model'
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
'all ngrams for word  absent from model'
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
'all ngrams for word  absent from model'
79
80
81
'all ngrams for word  absent from model'
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
'all ngrams for word  absent from model'
104
105
106
107
108
'all ngrams for word  absent from model'
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
'all ngrams for word  absent from model'
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
'all ngrams for word 

1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
'all ngrams for word  absent from model'
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
'all ngrams for word  absent from model'
1206
1207
'all ngrams for word  absent from model'
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
'all ngrams for word „так absent from model'
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
'all ngrams for word „дай absent from model'
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
'all ngrams for word  absent from model'
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1

2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
'all ngrams for word  absent from model'
'all ngrams for word  absent from model'
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
'all ngrams for word нихне absent from model'
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
'all ngrams for word Кеиль absent from model'
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
'all ngrams for word „не absent from model'
2463
2464
2465
2466
2467
2468
'all ngrams for word  absent from model'
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
'all ngrams for word  absent from model'
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498


3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
'all ngrams for word  absent from model'
3551
3552
3553
'all ngrams for word  absent from model'
3554
3555
3556
3557
3558
'all ngrams for word Цору absent from model'
'all ngrams for word ихс absent from model'
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
'all ngrams for word  absent from model'
3614
'all ngrams for word  absent from model'
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
'all ngrams for word  absent from model'
'all ngrams for word  absent from model'
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
'all ngrams for word  absent from model'
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664

4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
'all ngrams for word  absent from model'
4751
'all ngrams for word „так absent from model'
4752
4753
4754
4755
4756
4757
4758
4759
'all ngrams for word  absent from model'
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
'all ngrams for word  absent from model'
4777
4778
4779
4780
4781
4782
'all ngrams for word „я absent from model'
4783
4784
4785
4786
4787
'all ngrams for word  absent from model'
4788
4789
4790
4791
4792
'all ngrams for word  absent from model'
4793
4794
4795
4796
4797
4798
4799
4800
'all ngrams for word  absent from model'
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
'all ngrams for word Моцу absent from model'
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
'all ngrams for word  absent from model'
4829
4830
4831
4832
'all ngrams for word  absent from model'
4833
4834
4835
4836
4837

In [30]:
print("hello")

hello


In [72]:
#sanity check
c1 = source_ft_vectors.wv.get_vector(",")
c2 = target_ft_vectors.wv.get_vector(",")
#c3vec = english_ft_vectors.wv.get_vector(",")

c1vec = np.matmul(c1, U)
c2vec = np.matmul(c2,np.transpose(V))

print(FastVector.cosine_similarity(c1vec, c2vec))

0.5934354663377269


In [73]:
similarities_1 = []

for i in range(len(source_ub_em)):
    
    try:
        similarities_1.append(FastVector.cosine_similarity(source_ub_em[i], target_ub_em[i]))
    except Exception as e:
        print(e)
        
for i in range(len(target_b_test)):
    try:
        similarities_1.append(FastVector.cosine_similarity(source_b_test[i], target_b_test[i]))
    except Exception as e:
        print(e)


In [75]:
np.nanmean(similarities_1[:len(source_ub_em)])

0.46381362678813465

In [522]:
source_bible_test[2561]

'És a mikor eljött az esteli áldozás ideje, oda lépett Illés próféta, és monda: Óh Uram, Ábrahámnak, Izsáknak  és Izráelnek Istene, hadd ismerjék meg e mai napon, hogy te vagy az Isten az Izráelben, és hogy én a te szolgád vagyok, és hogy  mindezeket a te parancsolatodból cselekedtem.'

In [40]:
print("hello")

hello


# Align to English

In [76]:
U1,V1 = get_u_v("French", "English")

26006
26006
get UV done


In [77]:
U2,V2 = get_u_v("Russian", "English")

26006
26006
get UV done


In [78]:
transform1 = np.matmul(U1, V1)
transform2 = np.matmul(U2, V2)

In [34]:
english_fasttext_file = "/ais/clspace5/u/vkpriya/muse/MUSE/data/wiki.en.bin"
english_ft_vectors = gensim.models.fasttext.FastText.load_fasttext_format(english_fasttext_file)

In [79]:
source_ub_em = create_sentence_embeddings_new(source_ubuntu, source_ft_vectors, transform1)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
'all ngrams for word s\xa0 absent from model'
'all ngrams for word «\xa0s\xa0»\xa0 absent from model'
21
'all ngrams for word s\xa0 absent from model'
'all ngrams for word «\xa0s\xa0»\xa0 absent from model'
22
'all ngrams for word s\xa0 absent from model'
'all ngrams for word «\xa0s\xa0»\xa0 absent from model'
23
'all ngrams for word s\xa0 absent from model'
'all ngrams for word «\xa0s\xa0»\xa0 absent from model'
24
'all ngrams for word s\xa0 absent from model'
'all ngrams for word «\xa0s\xa0»\xa0 absent from model'
25
'all ngrams for word s\xa0 absent from model'
'all ngrams for word «\xa0s\xa0»\xa0 absent from model'
26
'all ngrams for word s\xa0 absent from model'
'all ngrams for word «\xa0s\xa0»\xa0 absent from model'
27
'all ngrams for word s\xa0 absent from model'
'all ngrams for word «\xa0s\xa0» absent from model'
28
'all ngrams for word s\xa0 absent from model'
'all ngrams for word lACL absent from model'
'all ngrams for word «



In [80]:
target_ub_em  =create_sentence_embeddings_new(target_ubuntu, target_ft_vectors, transform2)

1
2
3
4
5
6
7
8
9
10
'all ngrams for word DBus absent from model'
11
12
13
'all ngrams for word dacl absent from model'
14
'all ngrams for word dacl absent from model'
15
16
17
18
19
'all ngrams for word tне absent from model'
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
'all ngrams for word  absent from model'
40
41
42
43
44
45
46
47
48
49
50
51
52
'all ngrams for word  absent from model'
53
'all ngrams for word  absent from model'
54
'all ngrams for word  absent from model'
55
56
'all ngrams for word  absent from model'
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
'all ngrams for word «s» absent from model'
132
133
'all ngrams for word «s» absent from model'
134
'all ngrams for word «s» absent from model'
135
'all ngrams for word  absent from model'
136
'all ng

In [81]:
source_b_test_em = create_sentence_embeddings_new(source_bible_test, source_ft_vectors, transform1)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
'all ngrams for word  absent from model'
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
'all ngrams for word dAï absent from model'
151
152
153
154
155
156
157
158
159
160
161
162
'all ngrams for word  absent from model'
163
164
'all ngrams for word  absent from model'
165
166
167
168
169
170
171
172
173
174
175
176
'all ngrams for word  absent from model'
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225


1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
'all ngrams for word quÉli absent from model'
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
'all ngrams for word Naie absent from model'
1833
1834
1835
'all ngrams for word  absent from model'
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
'all ngrams for word Lâme absent from model'
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
195

3215
3216
'all ngrams for word  absent from model'
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
'all ngrams for word dAsa absent from model'
3374
'all ngrams for word dAsa absent from model'
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388

4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
'all ngrams for word  absent from model'
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
'all ngrams for word Lâme absent from model'
5004
5005
5006
5007
5008
5009
'all ngrams for word dÉla absent from model'
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
'all ngrams for wor

In [82]:
target_b_test_em = create_sentence_embeddings_new(target_bible_test, target_ft_vectors, transform2)

1
2
3
4
5
6
7
8
'all ngrams for word „лев absent from model'
9
'all ngrams for word  absent from model'
10
11
12
'all ngrams for word  absent from model'
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
'all ngrams for word „лев absent from model'
35
36
37
38
'all ngrams for word  absent from model'
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
'all ngrams for word  absent from model'
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
'all ngrams for word  absent from model'
79
80
81
'all ngrams for word  absent from model'
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
'all ngrams for word  absent from model'
104
105
106
107
108
'all ngrams for word  absent from model'
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
'all ngrams for word  absent from model'
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
'all ngrams for word 

1354
1355
1356
1357
1358
1359
'all ngrams for word моии absent from model'
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
'all ngrams for word  absent from model'
1370
1371
1372
1373
1374
1375
1376
1377
1378
'all ngrams for word заНим absent from model'
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
'all ngrams for word  absent from model'
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
'all ngrams for word Озию absent from model'
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
'all ngrams for word  absent from model'
1498
1499
'all ngrams

2433
2434
2435
2436
2437
2438
2439
'all ngrams for word нихне absent from model'
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
'all ngrams for word Кеиль absent from model'
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
'all ngrams for word „не absent from model'
2463
2464
2465
2466
2467
2468
'all ngrams for word  absent from model'
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
'all ngrams for word  absent from model'
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
'all ngrams for word  absent from model'
2500
2501
2502
'all ngrams for word  absent from model'
2503
2504
2505
2506
2507
2508
'all ngrams for word „да absent from model'
2509
2510
2511
2512
2513
2514
2515
2516
2517
'all ngrams for word „нет absent from model'
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
'all ngrams

3550
'all ngrams for word  absent from model'
3551
3552
3553
'all ngrams for word  absent from model'
3554
3555
3556
3557
3558
'all ngrams for word Цору absent from model'
'all ngrams for word ихс absent from model'
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
'all ngrams for word  absent from model'
3614
'all ngrams for word  absent from model'
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
'all ngrams for word  absent from model'
'all ngrams for word  absent from model'
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
'all ngrams for word  absent from model'
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
'all ngrams for word Ятб

4686
4687
'all ngrams for word  absent from model'
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
'all ngrams for word  absent from model'
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
'all ngrams for word  absent from model'
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
'all ngrams for word  absent from model'
4751
'all ngrams for word „так absent from model'
4752
4753
4754
4755
4756
4757
4758
4759
'all ngrams for word  absent from model'
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
'all ngrams for word  absent from model'
4777
4778
4779
4780
4781
4782
'all ngrams for word „я absent from model'
4783
4784
4785
4786
4787
'all ngrams for word  absent from model'
4788
4789
4790
4791
4792
'all ngrams for word  absent from model'
4793
4794
4795
4796
4797
4798
4799
4800
'all ngra

In [86]:
#sanity check
c1 = source_ft_vectors.wv.get_vector(",")
c2 = target_ft_vectors.wv.get_vector(",")
c3vec = english_ft_vectors.wv.get_vector(",")

c1vec = np.matmul(c1, transform1)
c2vec = np.matmul(c2,transform2)

print(FastVector.cosine_similarity(c1vec, c2vec))

0.5719628294003795


In [87]:
similarities_2 = []

for i in range(len(source_ub_em)):
    try:
        similarities_2.append(FastVector.cosine_similarity(source_ub_em[i], target_ub_em[i]))
    except Exception as e:
        print(e)

for i in range(len(target_b_test_em)):
    try:
        similarities_2.append(FastVector.cosine_similarity(source_b_test_em[i], target_b_test_em[i]))
    except Exception as e:
        print(e)

In [89]:
np.nanmean(similarities_2[:len(source_ubuntu)])

0.44552527362337185

In [90]:
len(similarities_2)

5434

In [100]:
count = 0
for i in range(len(source_ub_em)):
    if similarities_1[i]>similarities_2[i]:
        count+=1

In [101]:
count/i

0.7012987012987013

In [93]:
for i in range(len(similarities_1)):
    print(similarities_1[i],similarities_2[i])

0.374212500762391 0.33289655127809903
0.5049989334408241 0.5051445188370358
0.43637459190085726 0.40713030425959373
0.36284700011665744 0.341260746256284
0.5217315768012055 0.48804883039517183
0.6143185412571146 0.6277389142317765
0.6697421435219086 0.6650349259665991
0.679637064552443 0.6740662989584371
0.3630040163076218 0.36095206550497405
0.3542240694289055 0.34287782081831464
0.6825170442150864 0.6621273379910899
0.049954273097082245 0.11863494064957285
0.09968219861826413 0.09840890682059274
0.1516409860319256 0.16753359177191618
0.20393798796873164 0.19586569756538444
0.1562924089517077 0.12728180699934188
0.18358853333506706 0.1639130472704392
0.2358632922156237 0.17330445344012527
0.28308964472843096 0.27973732956233865
0.189364994701671 0.19172138215018214
0.41599273627627514 0.37273741427042906
0.526298002449268 0.4975119456968666
0.40753955622929017 0.34216110481028367
0.4825271018563078 0.403662067082988
0.560722903653885 0.49818976348912297
0.5076650139820397 0.4295075253

0.8457783756329063 0.8302121447518971
0.8748233282063065 0.8667694239134025
0.7916314971431163 0.7863575383819206
0.8030956077719357 0.7968163353994641
0.8905477000304464 0.8884757001720862
0.8329753776453406 0.8210592873767539
0.8768267101627737 0.8811734957700106
0.7282895425433348 0.7324759729741207
0.8159817245134491 0.8168876223055376
0.9035027425909639 0.889283834127611
0.8867153789684485 0.8794516248151109
0.9070705735155861 0.9015096533762847
0.8803893194192027 0.8825320178296148
0.8913312592892273 0.8920273967600623
0.8199455857287508 0.8014778960004577
0.8989587933375701 0.888875784914991
0.6359504335269767 0.6193247245701222
0.8348302912365744 0.8337079383092281
0.9008858500479461 0.8909866260255253
0.8528425617823976 0.840315847112828
0.892128821790718 0.8889686293280269
0.904928717721343 0.8801678419601745
0.7437989621610964 0.7485412009696742
0.9209536778888503 0.9220679145205717
0.833434192217298 0.8226806912373551
0.8878099116717149 0.8927392123163576
0.8092689393652579

0.8165248149352605 0.8021351745195955
0.831704147103207 0.8185782280446045
0.8588764244868456 0.8475192170254806
0.8348073875826543 0.8391342710173697
0.8532413865210849 0.8534523867214288
0.8245013151099893 0.8367234217651726
0.8531453443534198 0.851829476319193
0.7958919135020499 0.7866447539160221
0.8807072611282916 0.8753778518445925
0.8196143733213729 0.8189022995036769
0.869310819438438 0.8632956658460778
0.8864738621171948 0.8962826514756486
0.8702475878596616 0.8742744758648937
0.8441366313715484 0.8267045988939098
0.9162714814523055 0.9137234286191004
0.8855610196511936 0.8865279451456942
0.7033921758139495 0.6892490088077206
0.7571910605021321 0.7446675896525436
0.797093331307103 0.7986020490274198
0.7119451155430753 0.721302226962206
0.7210216245057626 0.733169447983982
0.8826402035486483 0.8781909881365196
0.9112453298843265 0.9071860060436506
0.915837778463973 0.9149111349377692
0.8529655055822868 0.8464765015913304
0.9310249518543543 0.9227570763056934
0.9131684527950055 

In [451]:
count/i

0.6851173810036614

In [106]:
np.nanmean(np.array(similarities_1[:len(source_ubuntu)])-np.array(similarities_2[:len(source_ubuntu)]))

0.018288353164762836

In [98]:
source_ubuntu[4] #HAHAHHAHAHAHAHAA

"Il est nécessaire de s'authentifier pour modifier des données utilisateur"

In [99]:
target_ubuntu[4]

'Для изменения пользовательских данных требуется аутентификация'