In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import sklearn
import cv2
from sklearn.metrics import confusion_matrix, roc_curve
import seaborn as sns
import datetime
import pathlib
import io
import os
import re
import string
import time
from numpy import random
import gensim.downloader as api
from PIL import Image
import tensorflow_datasets as tfds
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# Data Preperation

In [2]:
!wget https://www.manythings.org/anki/fra-eng.zip

--2024-11-05 15:02:31--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7943074 (7.6M) [application/zip]
Saving to: ‘fra-eng.zip’


2024-11-05 15:02:33 (4.20 MB/s) - ‘fra-eng.zip’ saved [7943074/7943074]



In [3]:
!unzip "/content/fra-eng.zip" -d "/content/dataset/"

Archive:  /content/fra-eng.zip
  inflating: /content/dataset/_about.txt  
  inflating: /content/dataset/fra.txt  


# Kaggle Dataset

In [4]:
!pip install -q kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d dhruvildave/en-fr-translation-dataset

cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
Dataset URL: https://www.kaggle.com/datasets/dhruvildave/en-fr-translation-dataset
License(s): ODbL-1.0
Downloading en-fr-translation-dataset.zip to /content
100% 2.54G/2.54G [00:29<00:00, 90.0MB/s]
100% 2.54G/2.54G [00:29<00:00, 91.9MB/s]


In [6]:
!unzip "/content/en-fr-translation-dataset.zip" -d "/content/dataset/"

Archive:  /content/en-fr-translation-dataset.zip
  inflating: /content/dataset/en-fr.csv  

In [4]:
text_dataset = tf.data.TextLineDataset("/content/dataset/fra.txt")

In [5]:
VOCAB_SIZE = 20000
SEQUENCE_LENGTH = 64
EMBEDDING_DIM = 300


In [6]:
english_vectorize_layer = TextVectorization(
    standardize = "lower_and_strip_punctuation",
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH,
)
english_vectorize_layer

<TextVectorization name=text_vectorization, built=False>

In [7]:
french_vectorize_layer = TextVectorization(
    standardize = "lower_and_strip_punctuation",
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH,
)
english_vectorize_layer

<TextVectorization name=text_vectorization, built=False>

In [66]:
# Define the selector function for preparing input data with start/end tokens
def selector(input_text):
    split_text = tf.strings.split(input_text, sep="\t")
    input_1 = split_text[0:1]  # English sentence
    input_2 = '[start]' + split_text[1:2]  # French sentence with start token
    output = split_text[1:2] + '[end]'  # French sentence with end token
    return {'input_1': input_1, 'input_2': input_2}, output

In [67]:
split_dataset = text_dataset.map(selector)


In [68]:
for i in split_dataset.take(3):
  print(i)

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[start]Va !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ![end]'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[start]Marche.'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Marche.[end]'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[start]En route !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'En route ![end]'], dtype=object)>)


In [None]:
import tensorflow as tf

# Define the separator function
def separator(input_text):
    split_text = tf.strings.split(input_text, sep="\t")
    # Adding [start] and [end] tokens correctly
    english_text = split_text[0:1]
    french_text = tf.strings.join(['[start]', split_text[1], '[end]'])
    return english_text, french_text

# Assuming `text_dataset` is your initial dataset of strings in the format: "English sentence \t French sentence"
# Map the separator function over the dataset
init_dataset = text_dataset.map(separator)

# Separate English and French sentences for vectorization
english_training_data = init_dataset.map(lambda x, y: x)
french_training_data = init_dataset.map(lambda x, y: y)

# Vectorize the English and French data
english_vectorize_layer.adapt(english_training_data)
french_vectorize_layer.adapt(french_training_data)


In [69]:
# Vectorize the inputs and outputs
def vectorizer(inputs, output):
    return {
        'input_1': english_vectorize_layer(inputs['input_1']),
        'input_2': french_vectorize_layer(inputs['input_2'])
    }, french_vectorize_layer(output)

In [70]:
# Apply the vectorizer to the dataset
dataset = split_dataset.map(vectorizer)

In [71]:
for i in split_dataset.take(3):
  print(i)

({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[start]Va !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Va ![end]'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[start]Marche.'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Marche.[end]'], dtype=object)>)
({'input_1': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Go.'], dtype=object)>, 'input_2': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[start]En route !'], dtype=object)>}, <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'En route ![end]'], dtype=object)>)


In [72]:
for i in dataset.take(3):
  print(i)

({'input_1': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[45,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])>, 'input_2': <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[1084,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0]])>}, <tf.Tensor: shape=(1, 64), dtype=int64, numpy=
array([[108,   6,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0, 

In [73]:
# Shuffle, batch, and prefetch the dataset
dataset = dataset.shuffle(2048).unbatch().batch(64).prefetch(buffer_size=tf.data.AUTOTUNE)


In [75]:
# Print a sample from the dataset to verify the structure
for example in dataset.take(1):
    print(example)

({'input_1': <tf.Tensor: shape=(64, 64), dtype=int64, numpy=
array([[2329,    0,    0, ...,    0,    0,    0],
       [ 112,  153,    0, ...,    0,    0,    0],
       [  20,  427,    0, ...,    0,    0,    0],
       ...,
       [1088,    0,    0, ...,    0,    0,    0],
       [  16,  550,    0, ...,    0,    0,    0],
       [  52,  216,    0, ...,    0,    0,    0]])>, 'input_2': <tf.Tensor: shape=(64, 64), dtype=int64, numpy=
array([[   1,    0,    0, ...,    0,    0,    0],
       [   1,    0,    0, ...,    0,    0,    0],
       [   3,   21,   19, ...,    0,    0,    0],
       ...,
       [   1,    0,    0, ...,    0,    0,    0],
       [  69,  688,    4, ...,    0,    0,    0],
       [8911,    0,    0, ...,    0,    0,    0]])>}, <tf.Tensor: shape=(64, 64), dtype=int64, numpy=
array([[  1,   0,   0, ...,   0,   0,   0],
       [  1,   0,   0, ...,   0,   0,   0],
       [ 20,  21,  19, ...,   0,   0,   0],
       ...,
       [  1,   0,   0, ...,   0,   0,   0],
       [  8, 

In [63]:
NUM_BATCHES = int(200000/64)


In [64]:
train_dataset = dataset.take(int(0.9*NUM_BATCHES))
val_dataset = dataset.skip(int(0.9*NUM_BATCHES))

In [65]:
train_dataset

<_TakeDataset element_spec=({'input_1': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'input_2': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>