## **Libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re
import ast
import nltk
import pickle

import warnings
warnings.filterwarnings("ignore")

## **Utility Functions and Classes**

In [None]:
def load_TextFileasDict(file_path):
  """
  DESCR: Load a whitespace-separated English/French word-pair file into a dictionary.

  file_path: path to a text file with one pair per line: the English word in the
             first column and the French word in the second column.

  Return: a dictionary with the English word as key and the French word as value,
          e.g. 'yes' : 'oui'. When an English word appears on several lines, the
          last line wins.
  """
  word_pairs = pd.read_csv(
      file_path,
      sep=r"\s+",             # raw string: "\s" in a plain string literal is an invalid escape
      header=None,
      dtype=str,              # keep every token as text, even if it looks numeric
      keep_default_na=False,  # "null", "nan", "NA", ... are vocabulary here, not missing values
  )

  # Column 0 holds the English words, column 1 the French ones.
  return dict(zip(word_pairs[0], word_pairs[1]))

In [None]:
#Function to extract the training embeddings
def TrainVector(WordDict, eng_embeddings, fren_embeddings):
  """
  DESCR: This function gets the embedding vectors of the english/french word pairs
         for which both words have an embedding.

  WordDict: a dictionary with the english word as key and the french word as value
  eng_embeddings: mapping from english word to its embedding vector
  fren_embeddings: mapping from french word to its embedding vector

  Return: x - selected english word embeddings, one row per retained pair
          y - selected french word embeddings, row-aligned with x
  """
  English_TrainEmbed = {}
  French_TrainEmbed = {}

  # Single pass over the pairs. The previous version rebuilt list(keys()) and
  # list(values()) for every element, which made the selection quadratic.
  for index, (english, french) in enumerate(WordDict.items()):
    if french in fren_embeddings and english in eng_embeddings:
      # The pair's position in WordDict is used as the column label.
      English_TrainEmbed[index] = eng_embeddings[english]
      French_TrainEmbed[index] = fren_embeddings[french]

  # Each retained pair is one DataFrame column; transposing yields one row per pair.
  x = pd.DataFrame(English_TrainEmbed).T.values
  y = pd.DataFrame(French_TrainEmbed).T.values

  return x, y

In [None]:
def get_matrices(en_fr, eng_embeddings, fren_embeddings):
    """
    Build row-aligned English and French embedding matrices from a bilingual dictionary.

    Input:
        en_fr: English to French dictionary
        eng_embeddings: English words to their corresponding word embeddings.
        fren_embeddings: French words to their corresponding word embeddings.
    Output:
        X: a matrix whose rows are the English embeddings.
        Y: a matrix whose rows are the French embeddings; row i of Y belongs to
           the translation of the word behind row i of X.

    Pairs for which either word has no embedding are skipped.
    """
    X_l = []
    Y_l = []

    for en_word, fr_word in en_fr.items():
        # Dictionary membership is already a hash lookup, so the vocabularies
        # do not need to be copied into sets first.
        if fr_word in fren_embeddings and en_word in eng_embeddings:
            X_l.append(eng_embeddings[en_word])
            Y_l.append(fren_embeddings[fr_word])

    # Stack the collected vectors: one row per retained pair.
    X = np.array(X_l)
    Y = np.array(Y_l)

    return X, Y

In [None]:
def loss_func(Transform_Matrix, Xtrain_embed, Ytrain_embed):
  """
  DESCR: Mean squared reconstruction error of the linear mapping.

  Transform_Matrix: matrix applied on the right of the english vectors to map them
                    onto the french vectors
  Xtrain_embed: selected english word embeddings, one per row
  Ytrain_embed: selected french word embeddings, one per row

  Return: the sum of the squared entries of (XR - Y), divided by the number of rows of X
  """
  # Residual between the projected english vectors and their french targets.
  residual = np.dot(Xtrain_embed, Transform_Matrix) - Ytrain_embed
  n_rows = len(Xtrain_embed)

  return np.square(residual).sum() / n_rows


In [None]:
def gradient_func(Transform_Matrix, Xtrain_embed, Ytrain_embed):
  """
  DESCR: Gradient of the mean squared reconstruction error with respect to the
         transformation matrix.

  Transform_Matrix: matrix applied on the right of the english vectors to map them
                    onto the french vectors
  Xtrain_embed: selected english word embeddings, one per row
  Ytrain_embed: selected french word embeddings, one per row

  Return: (2/m) * X.T (XR - Y), where m is the number of rows of X
  """
  n_rows = len(Xtrain_embed)
  residual = np.dot(Xtrain_embed, Transform_Matrix) - Ytrain_embed
  scale = 2 / n_rows

  return scale * np.dot(Xtrain_embed.T, residual)


In [None]:
def gradient_descent(X, Y, learning_rate = 0.01, epochs = 100, random_state = 0, verbose = None):
  """
  DESCR: This function fits the transformation matrix by gradient descent.

  X -  english train data word embeddings, one per row
  Y -  french train data word embeddings, one per row
  learning_rate: step size used when updating the transformation matrix
  epochs: number of update iterations
  random_state: seed for the random initial transformation matrix; the same seed
                gives the same starting point. None draws a fresh seed on every call.
  verbose: when set to a positive integer k, print the loss every k epochs.
           None (or 0) disables printing.

  Return: The transformation matrix, of shape (X.shape[1], X.shape[1])
  """
  # A private generator honours random_state without touching NumPy's global
  # random state (the argument used to be accepted but ignored).
  rng = np.random.RandomState(random_state)
  TransMatrix = rng.rand(X.shape[1], X.shape[1])

  for i in range(1, epochs + 1):
    # The loss is reported before the i-th update is applied.
    if (verbose and (i % verbose == 0)):
      print(f"Epoch {i} : Loss : {loss_func(TransMatrix, X, Y)} ")

    gradient = gradient_func(TransMatrix, X, Y)
    TransMatrix -= learning_rate * gradient

  return TransMatrix

In [None]:
def accuracy(y_pred, y_true):
  """
  DESCR: Fraction of positions at which the prediction equals the reference.

  y_pred: sequence of predicted values
  y_true: sequence of reference values, indexed in parallel with y_pred

  Return: number of matching positions divided by len(y_pred);
          0.0 when y_pred is empty (instead of dividing by zero).
  """
  total = len(y_pred)
  if total == 0:
    return 0.0

  # Local names avoid shadowing the built-in sum().
  correct = 0
  for i in range(total):
    if (y_pred[i] == y_true[i]):
      correct += 1

  return correct / total

In [None]:
def cosine_similarity(vector_1, vector_2):
  """
  DESCR: This function computes the cosine of the angle between two vectors

  vector_1: The first vector
  vector_2: The second vector

  Return: the cosine of the angle between vector_1 and vector_2, or 0.0 when
          either vector has zero norm (the angle is undefined there, and the
          plain formula would give 0/0 = NaN).
  """
  norm_product = np.linalg.norm(vector_1) * np.linalg.norm(vector_2)
  if norm_product == 0:
    # A zero vector has no direction; returning 0.0 keeps rankings NaN-free.
    return 0.0

  return np.dot(vector_1, vector_2) / norm_product

In [None]:
def Convert_to_French(english_word, transform_matrix, english_embeddings, french_embeddings, nearest_neigbors = 1):
  """
  DESCR: This function changes an english word to a french word

  english_word: This is the english word to be changed
  transform_matrix: matrix applied on the right of the english embedding to
                    project it next to the french embeddings
  english_embeddings: mapping from english word to its embedding vector
  french_embeddings: mapping from french word to its embedding vector
  nearest_neigbors: Number of french words that are similar to the english word to be returned

  Return: a list of (french_word, cosine_similarity) tuples, most similar first,
          holding at most nearest_neigbors entries. When english_word has no
          usable embedding the list is [("WordNotFound", 0)], so result[0][0]
          is always a whole word.
  """
  eng_word_embed = english_embeddings.get(english_word)

  # Decide "not found" from the lookup itself, not from the vector's component
  # sum: a real embedding may sum to zero. An all-zero vector has no direction
  # to compare, so it is still reported as not found.
  if eng_word_embed is None or not np.any(eng_word_embed):
    return [("WordNotFound", 0)]

  french_word_equiv = np.dot(eng_word_embed, transform_matrix)

  # Score every french word against the projected english vector.
  word_similarity = [
      (french_word, cosine_similarity(french_word_embed, french_word_equiv))
      for french_word, french_word_embed in french_embeddings.items()
  ]
  word_similarity.sort(key=lambda pair: pair[1], reverse=True)

  return word_similarity[:nearest_neigbors]

## **Dataset**

In [None]:
# NOTE: unpickling can execute arbitrary code; only load embedding files from a trusted source.
# The context managers close each file handle as soon as it has been read.
with open("data/fr_embeddings.p", "rb") as fr_file:
  french_embeddings = pickle.load(fr_file)
with open("data/en_embeddings.p", "rb") as en_file:
  english_embeddings = pickle.load(en_file)
 

In [None]:
# Parse the train and test word lists, in that order, into english -> french dictionaries.
WordDict_train, WordDict_test = (
    load_TextFileasDict(split_path)
    for split_path in ("data/en-fr.train.txt", "data/en-fr.test.txt")
)

In [None]:
# Report how many entries (distinct english keys) each split's dictionary holds.
print(f"The length of the train data english-french word dictionary: {len(WordDict_train)}")
print(f"The length of the test data english-french word dictionary: {len(WordDict_test)}")

The length of the train data english-french word dictionary: 5000
The length of the test data english-french word dictionary: 1500


## **Preprocessing**

### **Extracting the Training Matrices**

In [None]:
# Build the aligned training matrices; pairs where either word lacks an embedding are dropped.
X_train, Y_train = get_matrices(WordDict_train, english_embeddings, french_embeddings)

In [None]:
# Sanity check: both matrices must have the same number of rows (one row per retained pair).
X_train.shape, Y_train.shape

((4932, 300), (4932, 300))

### **Calculating the Transformation Matrix**

In [None]:
# Fit the english -> french transformation matrix; verbose = 100 prints the loss every 100 epochs.
# NOTE(review): `m` is read by the cells below, so renaming it requires updating them as well.
m = gradient_descent(X = X_train,Y = Y_train, learning_rate = 0.5, epochs = 1000, random_state = 20, verbose = 100)

Epoch 100 : Loss : 16.28496970266297 
Epoch 200 : Loss : 2.3811240429667477 
Epoch 300 : Loss : 0.9235554276962724 
Epoch 400 : Loss : 0.6565489131086197 
Epoch 500 : Loss : 0.5922218106932381 
Epoch 600 : Loss : 0.5738985726990419 
Epoch 700 : Loss : 0.5680201758028866 
Epoch 800 : Loss : 0.565963005683714 
Epoch 900 : Loss : 0.5651970749430207 
Epoch 1000 : Loss : 0.5648994072970945 


## **Word conversion**

In [None]:
# Show the 3 french words whose embeddings are most similar to the projected english word.
english_word = 'university'
print(Convert_to_French(english_word = english_word, transform_matrix = m, english_embeddings = english_embeddings ,
                        french_embeddings = french_embeddings, nearest_neigbors = 3))

[('université', 0.7852209345721833), ('universitaire', 0.7447127058746604), ('universités', 0.7340356792049815)]


In [None]:
# Show the 3 french words whose embeddings are most similar to the projected english word.
english_word = 'professor'
print(Convert_to_French(english_word = english_word, transform_matrix = m, english_embeddings = english_embeddings ,
                        french_embeddings = french_embeddings, nearest_neigbors = 3))

[('professeure', 0.7608659711755349), ('chercheur', 0.7166674788985664), ('université', 0.6532344159622178)]


In [None]:
# Show the 3 french words whose embeddings are most similar to the projected english word.
english_word = 'cat'
print(Convert_to_French(english_word = english_word, transform_matrix = m, english_embeddings = english_embeddings ,
                        french_embeddings = french_embeddings, nearest_neigbors = 3))

[('chat', 0.687437472381717), ('chats', 0.6848670868670468), ('chien', 0.6842748490688814)]


## **Model Evaluation**

In [None]:
# NOTE(review): the names are swapped relative to their contents. The keys of
# WordDict_test are the english source words, and Convert_to_French yields french
# candidates, so `english_words_pred` actually holds french predictions.
# [0][0] picks the word from the first entry of each result.
french_words_test = WordDict_test.keys()
english_words_pred = [Convert_to_French(word, m, english_embeddings , french_embeddings)[0][0] for word in french_words_test]

In [None]:
# The dictionary values are the reference french translations (despite the variable name);
# they are compared position by position with the predictions.
english_words_test = list(WordDict_test.values())
print(f"Model Accuracy : {accuracy(english_words_pred, english_words_test) * 100}%")

Model Accuracy : 41.8%
