In [67]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [10]:
!wget http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
!unzip -q spa-eng.zip

--2024-04-05 07:49:22--  http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.216.207, 108.177.12.207, 108.177.13.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.216.207|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2638744 (2.5M) [application/zip]
Saving to: ‘spa-eng.zip.1’


2024-04-05 07:49:22 (159 MB/s) - ‘spa-eng.zip.1’ saved [2638744/2638744]

replace spa-eng/_about.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: yes
replace spa-eng/spa.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y


In [29]:
text_file = "spa-eng/spa.txt"
with open(text_file) as f:
  lines = f.read().split("\n")[:-1]

data = []
source_data = []
target_data = []
for line in lines:
  source, target = line.split('\t')
  source_data.append(source)
  target_data.append(target)
  data.append((source, target))

In [13]:
data[-1]

('If you want to sound like a native speaker, you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo.',
 '[start] Si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado. [end]')

In [14]:
import random

random.shuffle(data)
num_val_samples = int(0.15 * len(data))
num_train_samples = len(data) - 2 * num_val_samples

train_pairs = data[:num_train_samples]
val_pairs = data[num_train_samples:num_train_samples + num_val_samples]
test_pairs = data[num_train_samples + num_val_samples:]

In [51]:
counter = Counter({"gholam": 2})
counter["gholama"]+=1
# counter.update("y")
counter

Counter({'gholam': 2, 'gholama': 1})

In [52]:
from collections import Counter
import string
from tqdm import tqdm

strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")


class TextVectorizer:

  def __init__(self, sequence_length, vocab_size, target=False):
    self.target = target
    self.sequence_length = sequence_length
    self.vocab_size = vocab_size
    self.vocab_counter = Counter()
    self.stoi = {"[pad]": 0, "[start]": 1, "[end]": 2, "[UNK]": 3}
    self.itos = {0: "[pad]", 1: "[start]", 2: "[end]", 3: "[UNK]"}

  def standardize(self, text):
    text = text.lower()
    return "".join(char for char in text
                  if char not in strip_chars)

  def tokenize(self, text):
    text = self.standardize(text)
    return text.split()

  def make_most_common(self, dataset):

    for text in tqdm(dataset):
      tokens = self.tokenize(text)
      for token in tokens:
        self.vocab_counter[token] += 1

    for token, _ in self.vocab_counter.most_common(self.vocab_size):
      indx = len(self.stoi)
      self.stoi[token] = indx
      self.itos[indx] = token

  def encode(self, text):
    text = self.standardize(text)
    tokens = self.tokenize(text)
    if self.target:
      result = ([self.stoi["[start]"]] + [self.stoi.get(token, 3) for token in tokens]
            + [self.stoi["[end]"]])
    else:
      result = [self.stoi.get(token, 3) for token in tokens]

    if len(result) <= self.sequence_length:
        pad_size = self.sequence_length - len(result)
        result += [self.stoi.get("[pad]")] * (pad_size)
    else:
      #truncate!
      result = result[:self.sequence_length]

    return result

  def decode(self, int_sequence):
    return " ".join(self.itos.get(i, "[UNK]") for i in int_sequence)

In [86]:
vocab_size = 15000
sequence_length = 20

source_vectorizer = TextVectorizer(sequence_length, vocab_size)
target_vectorizer = TextVectorizer(sequence_length + 1, vocab_size, target=True)

In [87]:
source_vectorizer.make_most_common(source_data)
target_vectorizer.make_most_common(target_data)

100%|██████████| 118964/118964 [00:01<00:00, 103539.94it/s]
100%|██████████| 118964/118964 [00:00<00:00, 123404.26it/s]


In [66]:
encoded_ = source_vectorizer.encode('If you want to sound')
source_vectorizer.decode(encoded_)

'if you want to sound [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad] [pad]'

In [80]:
eng, spa = train_pairs[1]
source_vectorizer.encode(eng)

[15, 10, 9, 149, 422, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [88]:
class EngSpaDataset(Dataset):
  def __init__(self, data, source_vectorizer, target_vectorizer):
    self.data = data
    self.source_vectorizer = source_vectorizer
    self.target_vectorizer = target_vectorizer

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    eng, spa = self.data[idx]
    eng = self.source_vectorizer.encode(eng)
    spa = self.target_vectorizer.encode(spa)
    return ({
          "english": torch.tensor(eng),
          "spanish": torch.tensor(spa[:-1]),
          }, torch.tensor(spa[1:]))

In [89]:
train_ds = EngSpaDataset(train_pairs, source_vectorizer, target_vectorizer)
val_ds = EngSpaDataset(val_pairs, source_vectorizer, target_vectorizer)
test_ds = EngSpaDataset(test_pairs, source_vectorizer, target_vectorizer)

In [90]:
batch_size = 64

train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=batch_size)
test_dl = DataLoader(test_ds, batch_size=batch_size)

In [91]:
for data, target in train_dl:
  print(data)
  print(target.size())
  break

{'english': tensor([[ 18,   7, 970,  ...,   0,   0,   0],
        [ 82, 285, 379,  ...,   0,   0,   0],
        [ 68,   7, 491,  ...,   0,   0,   0],
        ...,
        [  4, 469,  13,  ...,   0,   0,   0],
        [ 15,  95, 865,  ...,   0,   0,   0],
        [110,  10, 196,  ...,   0,   0,   0]]), 'spanish': tensor([[  1,   1, 302,  ...,   0,   0,   0],
        [  1,   1,  85,  ...,   0,   0,   0],
        [  1,   1,  48,  ...,   0,   0,   0],
        ...,
        [  1,   1,   9,  ...,   0,   0,   0],
        [  1,   1,  79,  ...,   0,   0,   0],
        [  1,   1,  10,  ...,   0,   0,   0]])}
torch.Size([64, 20])
