In [2]:
!pip install transformers
!pip install sacremoses
!pip install fairseq

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m115.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m93.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [3]:
import os
import json
import numpy as np
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
import torch

In [4]:
# Detect whether the notebook is running on Google Colab. Only a failed import
# means "not on Colab"; any other error is allowed to surface instead of being
# silently swallowed by a bare `except`.
try:
    import google.colab  # noqa: F401 -- imported only to probe availability
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    from google.colab import drive
    # Mount Google Drive at /content/drive, the root of the data paths below.
    drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Locations on the mounted Drive. The three split paths are file *prefixes*
# handed to fairseq-preprocess (--trainpref/--validpref/--testpref), not
# directories; DATA_DIR_PATH is where the binarized data is written.
BASE_PATH = "/content/drive/MyDrive/NLP/CA5"
TOKENIZED_DATA_PATH = os.path.join(BASE_PATH, "mbert_tokenized_data")
DATA_DIR_PATH = os.path.join(BASE_PATH, "mbert_data_dir")
TRAIN_DATA_PATH, VALID_DATA_PATH, TEST_DATA_PATH = (
    os.path.join(TOKENIZED_DATA_PATH, split_name)
    for split_name in ("train", "valid", "test")
)

In [6]:
# Load the pretrained multilingual BERT tokenizer and encoder. The checkpoint
# name lives in one constant so the two loads cannot drift apart.
MBERT_CHECKPOINT = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(MBERT_CHECKPOINT)
model = BertModel.from_pretrained(MBERT_CHECKPOINT)
model.eval()  # the model is only used for inference in this notebook

# Smoke test: push one short sentence through the encoder. No gradients are
# needed, so autograd tracking is switched off for the forward pass.
text = "This is a test"
encoded_input = tokenizer(text, return_tensors='pt')
with torch.no_grad():
    output = model(**encoded_input)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Create ./data_bin relative to the current working directory; `-p` makes this
# a no-op when the directory already exists.
# NOTE(review): no visible cell reads from or writes to ./data_bin (the
# preprocess step writes to DATA_DIR_PATH) -- confirm this is still needed.
!mkdir -p ./data_bin

In [8]:
!fairseq-preprocess   --joined-dictionary	 --source-lang en --target-lang fa \
  --trainpref {TRAIN_DATA_PATH} \
  --validpref {VALID_DATA_PATH} \
  --testpref {TEST_DATA_PATH} \
  --destdir {DATA_DIR_PATH}

2023-06-10 16:03:33 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX
2023-06-10 16:03:34 | INFO | fairseq_cli.preprocess | Namespace(no_progress_bar=False, log_interval=100, log_format=None, log_file=None, aim_repo=None, aim_run_hash=None, tensorboard_logdir=None, wandb_project=None, azureml_logging=False, seed=1, cpu=False, tpu=False, bf16=False, memory_efficient_bf16=False, fp16=False, memory_efficient_fp16=False, fp16_no_flatten_grads=False, fp16_init_scale=128, fp16_scale_window=None, fp16_scale_tolerance=0.0, on_cpu_convert_precision=False, min_loss_scale=0.0001, threshold_loss_scale=None, amp=False, amp_batch_retries=2, amp_init_scale=128, amp_scale_window=None, user_dir=None, empty_cache_freq=0, all_gather_list_size=16384, model_parallel_size=1, quantization_config_path=None, profile=False, reset_logging=False, suppress_crashes=False, use_plasma_view=False, plasma_path='/tmp/plasma', criterion='cross_entropy', tokenizer=None, bpe=None

In [78]:
# Read dict.en.txt from DATA_DIR_PATH (the --destdir of the fairseq-preprocess
# cell) into a list with one entry per line. The encoding is given explicitly
# so the multilingual entries decode the same way regardless of the platform's
# default locale.
with open(os.path.join(DATA_DIR_PATH, "dict.en.txt"), "r", encoding="utf-8") as f:
    mbert_dict = f.read().splitlines()

In [79]:
# Sanity check: number of lines read from the dictionary file.
len(mbert_dict)

33212

In [80]:
# Build a lookup from the first two whitespace-separated fields of every
# dictionary line. When the second field is all digits the line is read as
# "<key> <number>"; otherwise the two fields are taken in the opposite order.
# Values are kept as strings, exactly as they appear in the file.
mbert_dict_normalized = {}
for item in mbert_dict:
    fields = item.split()  # split once instead of re-splitting on every access
    if not fields:
        continue  # a blank line used to raise IndexError; skip it instead
    first, second = fields[0], fields[1]
    if second.isdigit():
        mbert_dict_normalized[first] = second
    else:
        mbert_dict_normalized[second] = first

In [81]:
# Compute one vector per dictionary key: encode the key, run it through the
# model as a batch of one, and average the first model output over dimension 1.
# NOTE(review): tokenizer.encode() is called with its defaults, which for
# Hugging Face BERT tokenizers add the special tokens, so they would be part of
# the average -- confirm that is intended.
mbert_dict_embeddings = {}

# Inference only: a single no_grad scope around the loop keeps autograd off.
with torch.no_grad():
    for token in tqdm(mbert_dict_normalized):
        token_ids = torch.tensor([tokenizer.encode(token)])
        hidden_states = model(token_ids)[0]
        # Average over dimension 1, then take the only row of the batch.
        mbert_dict_embeddings[token] = hidden_states.mean(dim=1)[0].tolist()



100%|██████████| 33212/33212 [36:31<00:00, 15.16it/s]


In [85]:
# Write the embeddings as text: a "<entry count> <vector length>" header line
# followed by one "<key> <v1> <v2> ..." line per dictionary key.
embeddings_path = os.path.join(TOKENIZED_DATA_PATH, "mbert_dict_embeddings.txt")

# The header's second field must be the length of an embedding vector. It was
# previously taken from mbert_dict_normalized, whose values are the dictionary's
# second-column strings, so it reported the length of that string instead.
embedding_dim = len(next(iter(mbert_dict_embeddings.values()), []))

# Keys are multilingual tokens, so the encoding is given explicitly.
with open(embeddings_path, 'w', encoding="utf-8") as f:
    f.write(f"{len(mbert_dict_embeddings)} {embedding_dim}\n")
    for key, vector in tqdm(mbert_dict_embeddings.items()):
        values = ' '.join(str(x) for x in vector)
        f.write(f"{key} {values}\n")

100%|██████████| 33212/33212 [00:30<00:00, 1094.41it/s]


In [None]:
!fairseq-train \
    {DATA_DIR_PATH} \
    --arch lstm --share-decoder-input-output-embed \
    --optimizer adam --adam-betas '(0.9,0.98)' --clip-norm 0.0 \
    --lr 2.5e-3 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
    --dropout 0.25 --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
    --max-tokens 4096 \
    --eval-bleu \
    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
    --eval-bleu-detok moses \
    --eval-bleu-print-samples \
    --best-checkpoint-metric bleu --maximize-best-checkpoint-metric \
    --fp16 --memory-efficient-fp16 \
    --max-epoch 5 \
    --save-dir ./content/drive/MyDrive/bpe/checkpoints/ \
    --tensorboard-logdir ./content/drive/MyDrive/bpe/log/



In [82]:
!ls /content/drive/MyDrive/NLP/CA5/mbert_tokenized_data

test.en  test.fa  train.en  train.fa  valid.en	valid.fa
