In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive; the data and
# output paths used later in this notebook all live under that mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [14]:
# Standard library
from argparse import Namespace

# Third-party
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize

# word_tokenize relies on the Punkt tokenizer models. Older NLTK releases load
# them from the "punkt" package, newer releases from "punkt_tab"; requesting
# both keeps the notebook runnable on either. A package id that is unknown to
# the installed NLTK version only logs an error and returns False.
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [15]:
!mkdir /content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter8_Seq_Modeling_Advanced/data
!wget https://git.io/JqQBE -O data/download.py
!wget https://git.io/JqQB7 -O data/get-all-data.sh
!chmod 755 /content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter8_Seq_Modeling_Advanced/data/get-all-data.sh
%cd  /content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter8_Seq_Modeling_Advanced/data
!./get-all-data.sh
%cd ..

mkdir: cannot create directory ‘/content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter8_Seq_Modeling_Advanced/data’: File exists
--2024-03-20 11:09:07--  https://git.io/JqQBE
Resolving git.io (git.io)... 140.82.114.22
Connecting to git.io (git.io)|140.82.114.22|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://raw.githubusercontent.com/rickiepark/nlp-with-pytorch/main/chapter_8/8_5_NMT/data/download.py [following]
--2024-03-20 11:09:07--  https://raw.githubusercontent.com/rickiepark/nlp-with-pytorch/main/chapter_8/8_5_NMT/data/download.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1572 (1.5K) [text/plain]
Saving to: ‘data/download.py’


2024-03-20 11:09:07 (9.06 MB/s) - ‘data/download.py’ saved [1572/

In [60]:
# Notebook configuration: input corpus, output CSV, split fractions and RNG seed.
args = Namespace(
    source_data_path="/content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter8_Seq_Modeling_Advanced/data/nmt/eng-fra.txt",
    output_data_path="/content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter8_Seq_Modeling_Advanced/data/nmt/mytest_eng_fra.csv",
    perc_train=0.7,
    perc_val=0.15,
    perc_test=0.15,
    seed=1337,
)

# Sanity-check the split fractions. The sum is compared with a tolerance rather
# than `== 1.0`, because binary floating point makes exact equality fail for
# perfectly valid splits (e.g. 0.1 + 0.2 + 0.7 != 1.0).
assert args.perc_test > 0, "perc_test must be positive"
assert abs((args.perc_test + args.perc_val + args.perc_train) - 1.0) < 1e-9, (
    "perc_train + perc_val + perc_test must sum to 1"
)

In [48]:
# Read the tab-separated corpus and lower-case it.
# Each entry of `lines` is the list of tab-separated fields of one input line.
# The encoding is given explicitly so the accented French text is decoded the
# same way regardless of the runtime's locale.
with open(args.source_data_path, encoding="utf-8") as fp:
  lines = [
      line.rstrip("\n").lower().split("\t")
      for line in fp
      if line.strip()  # skip blank lines; they cannot be unpacked into a sentence pair
  ]

In [49]:
# Tokenize both sides of every sentence pair; each record keeps the English and
# French token lists under separate keys.
data = [
    {
        "english_tokens": word_tokenize(english_sentence, language="english"),
        "french_tokens": word_tokenize(french_sentence, language="french"),
    }
    for english_sentence, french_sentence in lines
]

In [50]:
# Eyeball a small window of tokenized pairs.
for record in data[3000:3050]:
  print(record)

{'english_tokens': ['i', 'have', 'to', 'go', '.'], 'french_tokens': ['il', 'me', 'faut', 'partir', '.']}
{'english_tokens': ['i', 'have', 'to', 'go', '.'], 'french_tokens': ['il', 'me', 'faut', "m'en", 'aller', '.']}
{'english_tokens': ['i', 'have', 'to', 'go', '.'], 'french_tokens': ['je', 'dois', 'partir', '.']}
{'english_tokens': ['i', 'have', 'to', 'go', '.'], 'french_tokens': ['je', 'dois', "m'en", 'aller', '.']}
{'english_tokens': ['i', 'hear', 'music', '.'], 'french_tokens': ["j'entends", 'de', 'la', 'musique', '.']}
{'english_tokens': ['i', 'heard', 'that', '.'], 'french_tokens': ["j'ai", 'entendu', 'ça', '.']}
{'english_tokens': ['i', 'helped', 'out', '.'], 'french_tokens': ["j'ai", 'donné', 'un', 'coup', 'de', 'main', '.']}
{'english_tokens': ['i', 'honor', 'that', '.'], 'french_tokens': ["j'honore", 'cela', '.']}
{'english_tokens': ['i', 'hugged', 'her', '.'], 'french_tokens': ['je', "l'étreignis", '.']}
{'english_tokens': ['i', 'improvised', '.'], 'french_tokens': ["j'ai", 

In [51]:
# (subject, full verb, contracted verb) triples; each one expands into two
# two-token sentence prefixes: (subject, full) and (subject, contracted).
_subject_verb_forms = (
    ("i", "am", "'m"),
    ("he", "is", "'s"),
    ("she", "is", "'s"),
    ("you", "are", "'re"),
    ("we", "are", "'re"),
    ("they", "are", "'re"),
)

# Leading token pairs used to select sentences from the corpus.
filter_phrases = tuple(
    (subject, verb)
    for subject, full_form, contracted_form in _subject_verb_forms
    for verb in (full_form, contracted_form)
)

In [52]:
# One (initially empty) bucket per filter phrase, in filter_phrases order.
data_subset = {}
for phrase in filter_phrases:
  data_subset[phrase] = []

# File each record under the phrase formed by its first two English tokens;
# records whose prefix is not one of the filter phrases are dropped.
for datum in data:
  leading_pair = tuple(datum["english_tokens"][:2])
  bucket = data_subset.get(leading_pair)
  if bucket is not None:
    bucket.append(datum)

In [53]:
# Number of sentence pairs collected for each phrase, shown with the grand total.
counts = dict(zip(data_subset.keys(), map(len, data_subset.values())))
counts, sum(counts.values())

({('i', 'am'): 805,
  ('i', "'m"): 4760,
  ('he', 'is'): 1069,
  ('he', "'s"): 787,
  ('she', 'is'): 504,
  ('she', "'s"): 316,
  ('you', 'are'): 449,
  ('you', "'re"): 2474,
  ('we', 'are'): 181,
  ('we', "'re"): 1053,
  ('they', 'are'): 194,
  ('they', "'re"): 470},
 13062)

In [54]:
# Assign every record to train/val/test, stratified by filter phrase.
#
# The cell is written to be re-runnable: it shuffles a *copy* of each bucket and
# emits *new* dicts, so neither `data_subset` nor the records in `data` are
# modified. With the seed reset at the top, every run therefore produces the
# same split (shuffling the buckets in place would feed a differently ordered
# list into the shuffle on each re-run).
np.random.seed(args.seed)

dataset_stage3 = []
for phrase in sorted(data_subset):
  shuffled = list(data_subset[phrase])
  np.random.shuffle(shuffled)

  # Train and val sizes are truncated; the remainder goes to test.
  n_train = int(len(shuffled) * args.perc_train)
  n_val = int(len(shuffled) * args.perc_val)

  for position, datum in enumerate(shuffled):
    if position < n_train:
      split = "train"
    elif position < n_train + n_val:
      split = "val"
    else:
      split = "test"
    dataset_stage3.append({**datum, "split": split})


In [55]:
# Show the first few records together with their assigned split.
for record in dataset_stage3[:20]:
  print("english tokens:", record["english_tokens"])
  print("french tokens: ", record["french_tokens"])
  print("split:", record["split"])
  print()

english tokens: ['he', "'s", 'the', 'cutest', 'boy', 'in', 'town', '.']
french tokens:  ["c'est", 'le', 'garçon', 'le', 'plus', 'mignon', 'en', 'ville', '.']
split: train

english tokens: ['he', "'s", 'a', 'nonsmoker', '.']
french tokens:  ['il', 'est', 'non-fumeur', '.']
split: train

english tokens: ['he', "'s", 'smarter', 'than', 'me', '.']
french tokens:  ['il', 'est', 'plus', 'intelligent', 'que', 'moi', '.']
split: train

english tokens: ['he', "'s", 'a', 'lovely', 'young', 'man', '.']
french tokens:  ["c'est", 'un', 'adorable', 'jeune', 'homme', '.']
split: train

english tokens: ['he', "'s", 'three', 'years', 'older', 'than', 'me', '.']
french tokens:  ['il', 'a', 'trois', 'ans', 'de', 'plus', 'que', 'moi', '.']
split: train

english tokens: ['he', "'s", 'washing', 'your', 'car', '.']
french tokens:  ['il', 'lave', 'votre', 'voiture', '.']
split: train

english tokens: ['he', "'s", 'your', 'typical', 'workaholic', '.']
french tokens:  ['il', 'est', "l'archétype", 'du', 'bourrea

In [56]:
# Replace the token lists with space-joined strings, in place: the token keys
# are removed and source_language / target_language are added to each record.
for record in dataset_stage3:
  english_tokens = record.pop("english_tokens")
  record["source_language"] = " ".join(english_tokens)
  french_tokens = record.pop("french_tokens")
  record["target_language"] = " ".join(french_tokens)

In [57]:
# Build the final table: one row per record in dataset_stage3, one column per dict key.
nmt_df = pd.DataFrame(dataset_stage3)

In [65]:
# Per-split row counts (count() tallies the non-null values in each column).
nmt_df.groupby("split").count()


Unnamed: 0_level_0,source_language,target_language
split,Unnamed: 1_level_1,Unnamed: 2_level_1
test,1969,1969
train,9138,9138
val,1955,1955


In [68]:
# First rows of the training split.
nmt_df[nmt_df["split"] == "train"].head()

Unnamed: 0,split,source_language,target_language
0,train,he 's the cutest boy in town .,c'est le garçon le plus mignon en ville .
1,train,he 's a nonsmoker .,il est non-fumeur .
2,train,he 's smarter than me .,il est plus intelligent que moi .
3,train,he 's a lovely young man .,c'est un adorable jeune homme .
4,train,he 's three years older than me .,il a trois ans de plus que moi .


In [69]:
# First rows of the validation split.
nmt_df[nmt_df["split"] == "val"].head()


Unnamed: 0,split,source_language,target_language
550,val,he 's all right .,il va bien .
551,val,he 's in grave danger .,il est en danger grave .
552,val,he 's making progress .,il fait des progrès .
553,val,he 's about to leave .,il est sur le point de s'en aller .
554,val,he 's my half-brother .,il est mon demi-frère .


In [70]:
# First rows of the test split.
nmt_df[nmt_df["split"] == "test"].head()

Unnamed: 0,split,source_language,target_language
668,test,he 's a man of his word .,c'est un homme de parole .
669,test,he 's not available .,il n'est pas disponible .
670,test,he 's in the kitchen .,il est dans la cuisine .
671,test,he 's busy and ca n't meet with you .,puisqu'il est occupé il ne peut pas vous renco...
672,test,he 's a talented writer .,c'est un écrivain de talent .


In [71]:
# Persist the dataset to args.output_data_path. to_csv is called with its
# defaults, so the row index is written as an unnamed first column.
nmt_df.to_csv(args.output_data_path)