#### About

> Cross Lingual Information Retrieval

- Cross lingual information retrieval(CLIR) is the process of retrieving information from a source lang and presenting it in a target language. This is often considered as a challenging task because it requires understanding the meaning of the source language and translating it accurately into the target language.

- Various approaches to it include SMT(Statistical machine translation), NMT(Neural machine translation)


Let's look at an example of it using europarl_raw between english and french texts


In [37]:
import os
import numpy as np
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

data_path = "/home/suraj/Downloads/"
en_file = os.path.join(data_path, "europarl-v7.fr-en.en")
fr_file = os.path.join(data_path, "europarl-v7.fr-en.fr")

# Load English and French texts
# processing only first 100 lines 
en_texts = []
with open(en_file, encoding="utf8") as f:
    for i, line in enumerate(f):
        if i >= 1000:
            break
        en_texts.append(line.strip())
fr_texts = []
with open(fr_file, encoding="utf8") as f:
    for i, line in enumerate(f):
        if i >= 1000:
            break
        fr_texts.append(line.strip())

# Create tokenizers for English and French
en_tokenizer = Tokenizer(num_words=10000)
en_tokenizer.fit_on_texts(en_texts)
fr_tokenizer = Tokenizer(num_words=10000)
fr_tokenizer.fit_on_texts(fr_texts)

# Convert texts to sequences of integers
en_sequences = en_tokenizer.texts_to_sequences(en_texts)
fr_sequences = fr_tokenizer.texts_to_sequences(fr_texts)

# Pad sequences to the same length
maxlen = 100
en_sequences = pad_sequences(en_sequences, maxlen=maxlen)
fr_sequences = pad_sequences(fr_sequences, maxlen=maxlen)


In [38]:
en_sequences[0],fr_sequences[0]

(array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0, 1833,    2,    1,
         403], dtype=int32),
 array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0, 

In [42]:
from keras.layers import Input, Embedding, LSTM
from keras.models import Model
fr_vocab_size = len(fr_tokenizer.word_index) + 1
en_vocab_size = len(en_tokenizer.word_index) + 1

# Define input sequence for the encoder
encoder_inputs = Input(shape=(maxlen,))

# Define embedding layer
en_embedding = Embedding(input_dim=en_vocab_size, output_dim=256, input_length=maxlen)

# Embed the input sequence
embedded_inputs = en_embedding(encoder_inputs)

# Define encoder LSTM layer
encoder_lstm = LSTM(256, return_state=True)

# Get encoder outputs and states
encoder_outputs, state_h, state_c = encoder_lstm(embedded_inputs)
encoder_states = [state_h, state_c]

# Define input sequence for the decoder
decoder_inputs = Input(shape=(None,))

# Define embedding layer for the decoder
fr_embedding = Embedding(input_dim=fr_vocab_size, output_dim=256)

# Embed the decoder input sequence
embedded_decoder_inputs = fr_embedding(decoder_inputs)

# Define decoder LSTM layer
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)

# Get decoder outputs and states
decoder_outputs, _, _ = decoder_lstm(embedded_decoder_inputs, initial_state=encoder_states)

# Define output layer for the decoder
fr_dense = Dense(fr_vocab_size, activation='softmax')
decoder_outputs = fr_dense(decoder_outputs)

# Define the model that maps inputs to decoder outputs
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

# Compile the model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

2023-05-12 16:59:26.022485: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-05-12 16:59:26.028796: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-05-12 16:59:26.031828: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [44]:
from keras.utils import to_categorical

# Convert French sequences to one-hot encoding
fr_onehot = np.zeros((len(fr_sequences), maxlen, fr_vocab_size), dtype=np.bool)
for i, seq in enumerate(fr_sequences):
    for j, index in enumerate(seq):
        fr_onehot[i,j,index]=1

model.fit([en_sequences, fr_sequences[:, :-1]], fr_onehot[:, 1:, :], batch_size=64, epochs=4, validation_split=0.2)




Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  fr_onehot = np.zeros((len(fr_sequences), maxlen, fr_vocab_size), dtype=np.bool)


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f7eced59d90>