# Tradução utilizando SVD

## Bibliotecas

In [35]:
import numpy as np
import pandas as pd

from gensim.models import KeyedVectors

from collections import Counter

## Dados

### Word2Vec - FastText

- English: https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
- Portuguese: https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.vec.gz

In [3]:
def read_vec_file(file_name):
    '''
    Read file in .vec format
    '''
    return KeyedVectors.load_word2vec_format(
        "Fasttext/" + file_name, 
        binary = False,
        encoding ='unicode_escape',
        unicode_errors = 'replace'
    )

In [4]:
ENGLISH_MODEL = read_vec_file('cc.en.300.vec')
PORTUGUESE_MODEL = read_vec_file('cc.pt.300.vec')

### Dataset

Amazon MASSIVE: https://github.com/alexa/massive

In [5]:
def read_amazon_corpus(corpus):
    '''
    Read file .jsonl from Amazon MASSIVE dataset
    '''
    return pd.read_json(
        "Amazon_Massive/" + corpus, 
        lines = True
    )['utt']

In [6]:
ENGLISH_CORPUS = read_amazon_corpus('en-US.jsonl')
PORTUGUESE_CORPUS = read_amazon_corpus('pt-PT.jsonl')

## Tradução

### Pareamento dos dois idiomas

In [7]:
EN_PT_DF = pd.merge(ENGLISH_CORPUS, PORTUGUESE_CORPUS, right_index = True, left_index = True)
EN_PT_TUPLES = [tuple(x) for x in EN_PT_DF.values]

In [29]:
EN_LIST, PT_LIST = [], []
for tuple_value in EN_PT_TUPLES:
    EN_LIST.append(tuple_value[0].split(" "))
    PT_LIST.append(tuple_value[1].split(" "))

In [37]:
def unique(language_list):
    aux_list = []
    for sentence in language_list:
        for word in sentence:
            aux_list.append(word)
    
    unique_list = []
    for word in Counter(aux_list):
        unique_list.append(word)

    return unique_list

In [39]:
EN_UNIQUE_WORDS = unique(EN_LIST)
PT_UNIQUE_WORDS = unique(PT_LIST)

### Carregamento de palavras para treinamento

In [75]:
ENGLISH = []
for w in EN_UNIQUE_WORDS:
    try:
        ENGLISH.append(ENGLISH_MODEL[w])
    except KeyError:
        continue

In [76]:
PORTUGUESE = []
for w in PT_UNIQUE_WORDS:
    try:
        PORTUGUESE.append(PORTUGUESE_MODEL[w])
    except KeyError:
        continue

### Cálculo da matriz de tradução utilizando SVD

In [77]:
size = min(len(ENGLISH), len(PORTUGUESE))
U, Sig, Vt = np.linalg.svd(np.transpose(ENGLISH[:size]) @ PORTUGUESE[:size])
translator = np.transpose(Vt) @ np.transpose(U)

### Experimentos

#### Português -> Inglês

In [78]:
ENGLISH_MODEL.most_similar(translator @ PORTUGUESE_MODEL['sapato'])

[('excelled', 0.23170699179172516),
 ('scragged', 0.22841233015060425),
 ('amplifications', 0.2248193770647049),
 ('under-played', 0.2230806052684784),
 ('excels', 0.21914346516132355),
 ('Volcani', 0.21758601069450378),
 ('over-shadowed', 0.2172958105802536),
 ('InformationServing', 0.21379493176937103),
 ('Loathed', 0.2137456089258194),
 ('alliums', 0.2133820503950119)]

#### Inglês -> Português

In [79]:
PORTUGUESE_MODEL.most_similar(translator @ ENGLISH_MODEL['shoe'])

[('deparamos', 0.3502688705921173),
 ('depara', 0.34006789326667786),
 ('thinwhitestripes', 0.33692777156829834),
 ('MOTORCICLISMOCAVALOSESPORTSDESPORTOS', 0.33510300517082214),
 ('tubarÃ£o', 0.33243897557258606),
 ('EspartilhosLegs', 0.33198824524879456),
 ('machuca', 0.32853350043296814),
 ('arranja', 0.32645148038864136),
 ('deparam', 0.3246495723724365),
 ('engatam', 0.3234668970108032)]