# Interesting properties of embeddings

* We will look at some of the properties w2v embeddings have
* Test them ourselves in code

In [137]:
#in case you forgot, this is how one can load the embeddings

from gensim.models import KeyedVectors

# load embedding model from a file
# binary: True if saved as binary file (.bin), False if saved as text file (.vectors or .txt for example)
# limit: How many words to read from the model
model_english=KeyedVectors.load_word2vec_format("data/gigaword-and-wikipedia.bin", binary=True, limit=100000)
model_finnish=KeyedVectors.load_word2vec_format("data/pb34_wf_200_v2_skgram.bin", binary=True, limit=100000)

In [138]:
print("Most similar words for 'locomotive':")
print(model_english.most_similar("locomotive",topn=10))
print()
print("Most similar words for 'veturi':")
print(model_finnish.most_similar("veturi",topn=10))
print()

Most similar words for 'locomotive':
[('locomotives', 0.9040292501449585), ('railcar', 0.8419736623764038), ('railcars', 0.806695282459259), ('locos', 0.7832419872283936), ('steam', 0.7655093669891357), ('bogie', 0.764204740524292), ('shunting', 0.7400067448616028), ('steam-powered', 0.7359079122543335), ('carriages', 0.7352656126022339), ('diesel-electric', 0.7294440269470215)]

Most similar words for 'veturi':
[('vaunu', 0.7221510410308838), ('veturin', 0.6715301275253296), ('raide', 0.6576318144798279), ('juna', 0.6463427543640137), ('kuorma-auto', 0.6372989416122437), ('perävaunu', 0.6346830725669861), ('veturina', 0.623757004737854), ('traktori', 0.6218146085739136), ('linja-auto', 0.6097179651260376), ('kiskoilla', 0.6076352596282959)]



# Mapping spaces

* One of the more famous properties of the embeddings
* Learn a **linear** mapping from one language to another
* Can we replicate this?
* Learn a network with a single dense output layer
* English vector in -- Finnish vector out

## Training data

* Need English-Finnish pairs of words to train on
* ...google translate, maybe?
* Googling around finds this https://github.com/ssut/py-googletrans
* ...unofficial API, will get your IP banned if overused, so let's be careful!
* official API needs registration, etc.

`pip3 install --user googletrans`

In [139]:
from googletrans import Translator
translator=Translator()
translations=translator.translate(["locomotive","milk"],src="en",dest="fi")
for t in translations:
    print("origin=",t.origin,"text=",t.text)


origin= locomotive text= veturi
origin= milk text= maito


* Seems to work fine!
* Let's grab some translations
* The docs say "max 16000 characters per request"
* We need to translate some hundreds of words at a time

In [140]:
print("English vocab",len(model_english.vocab))
print(model_english.vocab.__class__)
print(model_english.vocab["car"])
#We need a list, in order of frequency
words=sorted(model_english.vocab.items(),key=lambda word_dim:word_dim[1].count,reverse=True)
print(words[:5])
words_freq_sorted=[w for w,_ in words]
print("Freq sorted",words_freq_sorted[:5])

English vocab 100000
<class 'dict'>
Vocab(count:99365, index:635)
[('</s>', <gensim.models.keyedvectors.Vocab object at 0x7f0cb02be860>), (',', <gensim.models.keyedvectors.Vocab object at 0x7f0cb02bef98>), ('the', <gensim.models.keyedvectors.Vocab object at 0x7f0cb02be320>), ('.', <gensim.models.keyedvectors.Vocab object at 0x7f0cb02be780>), ('of', <gensim.models.keyedvectors.Vocab object at 0x7f0cb02beef0>)]
Freq sorted ['</s>', ',', 'the', '.', 'of']


Now we have things like `</s>` and `.` in the vocabulary, those we don't want to translate

In [141]:
import re
english_word_re=re.compile("^[a-zA-Z]+$") #about as stupid simplification as you can get!
final_word_list=[]
for w in words_freq_sorted:
    if english_word_re.match(w):
        final_word_list.append(w)
print(final_word_list[:20])

['the', 'of', 'to', 'and', 'in', 'a', 'for', 'The', 'is', 'that', 'was', 'on', 'with', 'said', 'as', 'by', 'at', 'from', 'he', 'his']


In [142]:
import re

#same thing as above, nicely packed into a function
def clean_vocab(gensim_model,regexp):
    words=sorted(gensim_model.vocab.items(),key=lambda word_dim:word_dim[1].count,reverse=True)
    words_freq_sorted=[w for w,_ in words]
    word_re=re.compile(regexp)
    final_word_list=[]
    for w in words_freq_sorted:
        if word_re.match(w):
            final_word_list.append(w)
    return final_word_list

finnish_vocab=clean_vocab(model_finnish,'^[a-zA-ZäöåÖÄÅ]+$')
english_vocab=clean_vocab(model_english,'^[a-zA-Z]+$')
print("Final Finnish",finnish_vocab[:15],"...",finnish_vocab[2000:2015])
print("Final English",english_vocab[:15],"...",english_vocab[2000:2015])


Final Finnish ['ja', 'on', 'ei', 'että', 'se', 'oli', 'mutta', 'tai', 'kun', 'myös', 'ovat', 'ole', 'niin', 'jos', 'kuin'] ... ['seurakunnan', 'selvä', 'tulleet', 'seuraavaan', 'sijasta', 'kuollut', 'I', 'asioihin', 'loput', 'luona', 'talven', 'per', 'ihanan', 'palvelua', 'tietokoneen']
Final English ['the', 'of', 'to', 'and', 'in', 'a', 'for', 'The', 'is', 'that', 'was', 'on', 'with', 'said', 'as'] ... ['Greece', 'rally', 'democracy', 'revenue', 'add', 'criticism', 'offices', 'Hussein', 'kids', 'relief', 'promised', 'advance', 'talking', 'boost', 'dispute']


In [42]:
#Little test
import time
def translate(words,src,dest,batch_size=1000):
    result=[] #[("dog","koira"),....]
    translator=Translator()
    for idx in range(0,len(words),batch_size):
        batch=words[idx:idx+batch_size]
        try:
            translations=translator.translate(batch,src=src,dest=dest)
            for t in translations:
                result.append((t.origin,t.text))
            time.sleep(0.2) #sleep between batches
            print(src,"->",dest,"batch at",idx,"....OK")
        except: #we end here, if the lines between try ... except throw an error
            print(src,"->",dest,"batch at",idx,"....FAILED")
            time.sleep(61) #sleep a little longer so Google is not angry
            print(src,"->",dest,"...RESTARTING")
            
    return result

x=translate(english_vocab[:50],"en","fi",20) # a small test

en -> fi batch at 0 ....OK
en -> fi batch at 20 ....OK
en -> fi batch at 40 ....OK


In [33]:
print(x)

[('the', ''), ('of', 'of'), ('to', 'että'), ('and', 'ja'), ('in', 'sisään'), ('a', ''), ('for', 'varten'), ('The', ''), ('is', 'on'), ('that', 'että'), ('was', 'oli'), ('on', 'päällä'), ('with', 'kanssa'), ('said', 'sanoi'), ('as', 'kuten'), ('by', 'mennessä'), ('at', 'at'), ('from', 'alkaen'), ('he', 'hän'), ('his', 'hänen'), ('it', 'se'), ('be', 'olla'), ('are', 'olemme'), ('an', ''), ('has', 'on'), ('have', 'omistaa'), ('were', 'olivat'), ('not', 'ei'), ('who', 'Kuka'), ('had', 'oli'), ('which', 'joka'), ('will', 'tahtoa'), ('or', 'tai'), ('their', 'heidän'), ('but', 'mutta'), ('its', 'sen'), ('In', 'Sisään'), ('this', 'Tämä'), ('they', 'ne'), ('been', 'ollut'), ('I', 'minä'), ('also', 'myös'), ('would', 'olisi'), ('one', 'yksi'), ('He', 'Hän'), ('after', 'jälkeen'), ('more', 'lisää'), ('two', 'kaksi'), ('first', 'ensimmäinen'), ('about', 'noin')]


* looks okay
* let's run this and save the result for later use, so we don't get banned

In [43]:
import json
en_fi=translate(english_vocab,"en","fi",batch_size=150)
with open("en_fi_transl.json","wt") as f:
    json.dump(en_fi,f)
fi_en=translate(finnish_vocab,"fi","en",batch_size=150)
with open("fi_en_transl.json","wt") as f:
    json.dump(fi_en,f)


en -> fi batch at 0 ....OK
en -> fi batch at 150 ....OK
en -> fi batch at 300 ....OK
en -> fi batch at 450 ....OK
en -> fi batch at 600 ....OK
en -> fi batch at 750 ....OK
en -> fi batch at 900 ....OK
en -> fi batch at 1050 ....FAILED
en -> fi ...RESTARTING
en -> fi batch at 1200 ....FAILED
en -> fi ...RESTARTING
en -> fi batch at 1350 ....FAILED


KeyboardInterrupt: 

* well, we got banned :D
* Let's just translate as text files, in the google translate interface
* (quality time manually feeding these into Google translate --- I could have used the official API :)
* ...but now it's done, so who cares

In [46]:
#dump 10K words at a time into a file, which can be fed to google translate
def build_files(words,fname,batch_size):
    for idx in range(0,len(words),batch_size):
        batch=words[idx:idx+batch_size]
        with open("trdata/{}_batch_{}.txt".format(fname,idx),"wt") as f:
            print("\n".join(batch),file=f)

build_files(english_vocab,"en-fi-source",10000)
build_files(finnish_vocab,"fi-en-source",10000)


* I built manually four files like this:

In [9]:
%%bash

ls trdata/fien_* trdata/enfi_*
wc -l trdata/fien_* trdata/enfi_*
echo "FI -> EN"
paste trdata/fien_source_all.txt trdata/fien_target_all.txt  | head -n 10
echo "EN -> FI"
paste trdata/enfi_source_all.txt trdata/enfi_target_all.txt  | head -n 10



trdata/enfi_source_all.txt
trdata/enfi_target_all.txt
trdata/fien_source_all.txt
trdata/fien_target_all.txt
  95275 trdata/fien_source_all.txt
  95275 trdata/fien_target_all.txt
  83618 trdata/enfi_source_all.txt
  83618 trdata/enfi_target_all.txt
 357786 total
FI -> EN
ja	and
on	is
ei	No
että	that
se	it
oli	was
mutta	but
tai	or
kun	when
myös	also
EN -> FI
the	
of	of
to	että
and	ja
in	sisään
a	
for	varten
The	
is	on
that	että


* Read in and filter
* To make sure we get high-quality stuff, we will look for same pairs in fin-eng and eng-fin direction
* That way we will also make sure our translations are among the top 100K words in both languages

In [4]:
fien=[] #list of (fin,eng) pairs obtained from the fin -> eng direction
enfi=[] #list of (fin,eng) pairs, this time obtained from  the eng->fin direction
with open("trdata/fien_source_all.txt") as fi_file, open("trdata/fien_target_all.txt") as en_file:
    for fi,en in zip(fi_file,en_file):
        fi=fi.strip()
        en=en.strip()
        if fi and en:
            fien.append((fi,en))

with open("trdata/enfi_target_all.txt") as fi_file, open("trdata/enfi_source_all.txt") as en_file:
    for fi,en in zip(fi_file,en_file):
        fi=fi.strip()
        en=en.strip()
        if fi and en:
            enfi.append((fi,en))

fien_set=set(fien)
enfi_set=set(enfi)
common=fien_set&enfi_set #keep only pairs which are shared
print("Len fien",len(fien_set))
print("Len enfi",len(enfi_set))
print("Len common",len(common))
print(list(common)[:300])

Len fien 95275
Len enfi 83610
Len common 7100
[('vuokrata', 'rent'), ('Jenkins', 'Jenkins'), ('Roses', 'Roses'), ('Beth', 'Beth'), ('TR', 'TR'), ('taantuma', 'downturn'), ('maskotti', 'mascot'), ('Antonin', 'Antonin'), ('kroonisesti', 'chronically'), ('kompromisseja', 'compromises'), ('tunnollisesti', 'scrupulously'), ('tuntea', 'feel'), ('Linus', 'Linus'), ('innoissaan', 'excited'), ('Napoli', 'Naples'), ('haitata', 'hinder'), ('koskematon', 'pristine'), ('Fiona', 'Fiona'), ('muistio', 'memorandum'), ('muslimit', 'Muslims'), ('safari', 'safari'), ('kiima', 'rut'), ('lopulta', 'eventually'), ('nämä', 'these'), ('mekanismit', 'mechanisms'), ('matkamuistoja', 'souvenirs'), ('Nigerian', 'Nigerian'), ('sari', 'sari'), ('petojen', 'beasts'), ('kutistuu', 'shrinking'), ('Camilla', 'Camilla'), ('Lajunen', 'Lajunen'), ('Joe', 'Joe'), ('tuote', 'product'), ('juhlapäivät', 'holidays'), ('XP', 'XP'), ('hyppii', 'hopping'), ('opaskirja', 'guidebook'), ('Dalai', 'Dalai'), ('Melissa', 'Melissa'), ('

* ouch - we lost most of the stuff, but such is life
* what we got looks good, though :)
* Let us yet filter away pairs like Ivan - Ivan


In [111]:
#Making sure all we found is in the top 100K - just crosschecking really
print(len(set(finnish_vocab)&set(fi for fi,en in common)))
print(len(set(english_vocab)&set(en for fi,en in common)))

#Making sure all words are there exactly once - no risk of mixing train and validation
print(len(set(fi for fi,en in common)))
print(len(set(en for fi,en in common)))
print("...all these four numbers should be the same")

7100
7100
7100
7100
...all these four numbers should be the same


In [143]:
import random
pairs=[(fi,en) for fi,en in common if fi!=en] #Only keep pairs where source does not equal target
print("Left with",len(pairs),"after removing identical pairs")
random.shuffle(pairs) #always, always make sure to shuffle!

print("Shuffled pairs",pairs[:20])

#Now we need to grab the vectors for the words in question
en_indices=[model_english.vocab[en].index for fi,en in pairs] #English
fi_indices=[model_finnish.vocab[fi].index for fi,en in pairs] #Finnish
print("Indices:",en_indices[:10],fi_indices[:10])
#...and the vectors are hidden in the models
print("English model.vectors shape:",model_english.vectors.shape)
print("Finnish model.vectors shape:",model_finnish.vectors.shape)
en_vectors=model_english.vectors[en_indices] #Selects the rows in just the correct order
fi_vectors=model_finnish.vectors[fi_indices] #Selects the rows in just the correct order
print("English selected vectors shape:",en_vectors.shape)
print("Finnish selected vectors shape:",fi_vectors.shape)





Left with 4624 after removing identical pairs
Shuffled pairs [('tutkijat', 'scientists'), ('vahtikoira', 'watchdog'), ('ärsyttää', 'annoy'), ('taksit', 'taxis'), ('vampyyrit', 'vampires'), ('kuningatar', 'queen'), ('suoritettavan', 'executable'), ('tuoksu', 'scent'), ('kiekko', 'puck'), ('taajuus', 'frequency'), ('kulttuurisesti', 'culturally'), ('akku', 'battery'), ('kitaristi', 'guitarist'), ('jäsenyys', 'membership'), ('kuva', 'picture'), ('perinteisesti', 'traditionally'), ('opastus', 'guidance'), ('että', 'that'), ('hamstrata', 'hoard'), ('jäljittää', 'trace')]
Indices: [2552, 7487, 56671, 20734, 30688, 6869, 64895, 23148, 12150, 7147] [3011, 97132, 4169, 44914, 43348, 11172, 53182, 4434, 11044, 31671]
English model.vectors shape: (100000, 200)
Finnish model.vectors shape: (100000, 200)
English selected vectors shape: (4624, 200)
Finnish selected vectors shape: (4624, 200)


* Now `en_vectors` is vectors for the 4624 English words in our translation pairs
* `fi_vectors` is same for Finnish
* ...our training data is done - we have the pairs of input--desired output

## Learning transformation from English to Finnish

* 200-dim vector in, 200-dim vector out
* Loss needs to be different, this is not classification!
* `mse` stands for mean square error

In [144]:
import tensorflow as tf
### Only needed for me, not to block the whole GPU, you don't need this stuff
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
set_session(tf.Session(config=config))
### ---end of weird stuff

from keras.models import Model
from keras.layers import Input, Dense



inp=Input(shape=(en_vectors.shape[1],)) #input is 200-dim
outp=Dense(fi_vectors.shape[1])(inp) #Simple linear transformation of the input

model=Model(inputs=[inp], outputs=[outp])
model.summary()

model.compile(optimizer="adam",loss="mse")
hist=model.fit(en_vectors,fi_vectors,batch_size=100,verbose=1,epochs=30,validation_split=0.1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_47 (InputLayer)        (None, 200)               0         
_________________________________________________________________
dense_74 (Dense)             (None, 200)               40200     
Total params: 40,200
Trainable params: 40,200
Non-trainable params: 0
_________________________________________________________________
Train on 4161 samples, validate on 463 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


## Checking for accuracy

In [145]:
val_en,val_fi,_=hist.validation_data #This we saw before - the validation data
predicted_fi=model.predict(val_en) #Transform the English vectors in the validation data
for en,fi,pred_fi in list(zip(val_en,val_fi,predicted_fi))[:30]:
    print(model_english.similar_by_vector(en,topn=1)) #This is the original English word
    print(model_finnish.similar_by_vector(fi,topn=1)) #This is the target Finnish word
    print(model_finnish.similar_by_vector(pred_fi,topn=5)) # Top five closest hits to the transformed vector
    print("\n")
    

[('small', 1.0)]
[('pieni', 1.0)]
[('pieni', 0.7597441673278809), ('suurehko', 0.745261549949646), ('pienehkö', 0.7438690662384033), ('pikkuruinen', 0.7236558198928833), ('isohko', 0.7211355566978455)]


[('charmer', 1.0)]
[('hurmuri', 1.0)]
[('herttainen', 0.7314914464950562), ('hymyilevä', 0.7270362377166748), ('hyväsydäminen', 0.7208757400512695), ('hurmuri', 0.719153881072998), ('veijari', 0.7173742055892944)]


[('bachelor', 1.0)]
[('poikamies', 1.0)]
[('akateeminen', 0.6675968170166016), ('psykologia', 0.6520403027534485), ('tutkinto', 0.6477102041244507), ('koulutusohjelma', 0.6299420595169067), ('ylioppilas', 0.6283469200134277)]


[('assignment', 0.9999999403953552)]
[('toimeksianto', 0.9999999403953552)]
[('työtehtävä', 0.6690058708190918), ('toimeksianto', 0.6338319778442383), ('aloitus', 0.6132286787033081), ('työ', 0.60988450050354), ('haastattelu', 0.6040139198303223)]


[('settle', 1.0)]
[('asettua', 1.0)]
[('sovitella', 0.6602680683135986), ('neuvotella', 0.634847164154

## Evaluating more formally

* Eyeballing the data is a moving target
* Ideally, we'd have a more solid metric
* Let us try top-1, top-5, and top-10 for the proportion of words which got the correct translation among top-N candidates

In [146]:
def eval(src_model,tgt_model,src_vecs,tgt_vecs,predicted_vecs):
    top1,top5,top10,total=0,0,0,0
    for src_v,tgt_v,pred_v in zip(src_vecs,tgt_vecs,predicted_vecs):
        src_word=src_model.similar_by_vector(src_v)[0][0]
        tgt_word=tgt_model.similar_by_vector(tgt_v)[0][0]
        hits=list(w for w,sim in tgt_model.similar_by_vector(pred_v,topn=10))
        total+=1
        if tgt_word==hits[0]:
            top1+=1
        if tgt_word in hits[:5]:
            top5+=1
        if tgt_word in hits[:10]:
            top10+=1
    print("Top1",top1/total*100,"percent correct")
    print("Top5",top5/total*100,"percent correct")
    print("Top10",top10/total*100,"percent correct")
eval(model_english,model_finnish,val_en,val_fi,predicted_fi)

Top1 33.477321814254864 percent correct
Top5 50.75593952483801 percent correct
Top10 57.45140388768899 percent correct


## Conclusion

* We have seen the vectors have interesting properties
* In particular, spaces can be mapped onto each other
* We have seen how this can be achieved with a simple linear transformation
* Optimal transformation has a closed-form solution, but we were lazy and trained it with Keras quite successfully
* This demonstrates how Keras can be used also for more generic tasks

In [147]:
# Extra stuff - a function to query the translations, so we can play around
def top_n(word,source_model,target_model,transformation_model,topn=5):
    try:
        source_idx=source_model.vocab[word].index
    except:
        print("Cannot retrieve vector for",word)
        return None
    mapped=transformation_model.predict(source_model.vectors[source_idx,:].reshape(1,-1))
    return target_model.similar_by_vector(mapped[0])
    
seen_words=set(en for fi,en in common) #These words were seen during training or validation
while True:
    wrd=input("word> ")
    if wrd=="end":
        break
    if wrd in seen_words:
        print("    WARNING: this word was seen during training")
    hits=top_n(wrd,model_english,model_finnish,model)
    for word,sim in hits:
        print("  ",word,"  ",sim)
    print()

word> dog
   kissa    0.8225976824760437
   kani    0.819080114364624
   aasi    0.7754292488098145
   kisu    0.7641241550445557
   elukka    0.7640793323516846
   koira    0.763005256652832
   katti    0.7530404329299927
   marsu    0.7496534585952759
   apina    0.7409812808036804
   hamsteri    0.7383363842964172

word> end
