In [None]:
# Install Miniconda under /workspace/miniconda3.
# NOTE(review): these are shell commands, not Python — presumably meant to be run in a
# terminal or through a shell cell magic; confirm how this cell is executed.
# Download the latest Linux x86_64 installer into the target directory.
mkdir -p /workspace/miniconda3
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /workspace/miniconda3/miniconda.sh
# Run the installer unattended (-b: batch mode, -u: update an existing install, -p: install prefix).
bash /workspace/miniconda3/miniconda.sh -b -u -p /workspace/miniconda3
# The installer script is no longer needed once it has run.
rm /workspace/miniconda3/miniconda.sh

# Set up conda's shell integration for both bash and zsh.
/workspace/miniconda3/bin/conda init bash
/workspace/miniconda3/bin/conda init zsh

In [None]:
# Create the conda environment described by environment.yml, then activate it.
# NOTE(review): assumes environment.yml names the environment py37_ZmBART — confirm.
# NOTE(review): `conda activate` only changes the shell it runs in; if this cell is run
# from a notebook it presumably does not switch the kernel's interpreter — verify.
conda env create --file environment.yml
conda activate py37_ZmBART

In [None]:
# Unpack the xlsum archive into the existing `checkpoints` directory (-C switches to it first).
# NOTE(review): the archive has a .zip extension but is extracted with tar — confirm the
# actual format; not every tar build can read zip archives.
# NOTE(review): the live training cell loads the model from "puntoChecks/xlsum", not from
# "checkpoints/" — confirm which directory is the intended one.
tar -xf xlsum.zip -C checkpoints


In [None]:
# Pretrain: command-line invocation of ../ZmBART/train.py, left commented out in this cell.
import subprocess
import numpy as np

# # Define the variables
# PRETRAIN = "checkpoints/xlsum/pytorch_model"
# langs = "ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN"
# SRC = "isk"
# TGT = "isk"
# NAME = "isk-isk"
# DATADIR = "../dataset/postprocess/aux/isk-isk"
# SAVEDIR = "checkpoints/checkpoint_ZmT5"

# # Construct the command
# command = [
#     "python", "-u", "../ZmBART/train.py", DATADIR,
#     "--arch", "t5",
#     "--task", "translation_from_pretrained_bart",
#     "--source-lang", SRC,
#     "--target-lang", TGT,
#     "--criterion", "label_smoothed_cross_entropy",
#     "--label-smoothing", "0.2",
#     "--optimizer", "adam",
#     "--adam-eps", "1e-06",
#     "--lr-scheduler", "polynomial_decay",
#     "--lr", "3e-05",
#     "--warmup-updates", "2500",
#     "--max-update", "100000",
#     "--dropout", "0.3",
#     "--max-tokens", "2048",
#     "--update-freq", "2",
#     "--save-interval", "6",
#     "--save-interval-updates", "50000",
#     "--keep-interval-updates", "4",
#     "--seed", "222",
#     "--log-interval", "100",
#     "--restore-file", SAVEDIR,
#     "--langs", langs,
#     "--save-dir", SAVEDIR,
#     "--skip-invalid-size-inputs-valid-test"
# ]

# # Monkey patch np.float to np.float64
# # np.float = np.float64
# print("Running command:", " ".join(command))

# # Monkey patch np.float to np.float64
# np.float = np.float64

# # Run the command with error handling
# try:
#     result = subprocess.run(command, check=True, capture_output=True, text=True)
#     print("Command output:", result.stdout)
#     print("Command error (if any):", result.stderr)
# except subprocess.CalledProcessError as e:
#     print("An error occurred while running the command.")
#     print("Return code:", e.returncode)
#     print("Output:", e.output)
#     print("Error:", e.stderr)

In [None]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, Trainer, TrainingArguments
from datasets import Dataset


# Define variables
# PRETRAIN: directory the pretrained model weights are loaded from in the training cell.
PRETRAIN = "puntoChecks/xlsum"
# langs, SRC, TGT and NAME are only referenced by the commented-out ZmBART command;
# no live code visible in this notebook reads them.
langs = "ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN"
SRC = "isk"
TGT = "isk"
NAME = "isk-isk"
# DATADIR: directory containing the train/test source and target text files.
DATADIR = "../data"
# SAVEDIR: used both as the Trainer output directory and as the final save location.
SAVEDIR = "puntoChecks/checkpoint_ZmT5"

# Module-level tokenizer shared by the data-loading function and the training cell.
# NOTE(review): the tokenizer is loaded from "puntoChecks" while the model is loaded from
# PRETRAIN ("puntoChecks/xlsum") — confirm both directories hold matching vocabularies.
tokenizer = MT5Tokenizer.from_pretrained("puntoChecks")

def load_pre_tokenized_data(source_file, target_file, max_length=10):
    """Build a Hugging Face ``Dataset`` from two line-aligned text files.

    Line *i* of ``source_file`` is paired with line *i* of ``target_file``.
    Each line is stripped and encoded with the module-level ``tokenizer``,
    truncated/padded to exactly ``max_length`` token ids.

    NOTE(review): despite the function name, every line is passed through
    ``tokenizer.encode`` again; if the ``.spm`` inputs are already
    SentencePiece pieces this presumably tokenizes them a second time — confirm.

    Args:
        source_file: Path to the UTF-8 file with one source example per line.
        target_file: Path to the UTF-8 file with one target example per line.
        max_length: Fixed length every encoded sequence is padded/truncated to.

    Returns:
        A ``Dataset`` with the columns ``input_ids``, ``attention_mask`` and
        ``labels``. Padding positions are 0 in ``attention_mask`` and -100 in
        ``labels`` so they are neither attended to nor counted in the loss.

    Raises:
        ValueError: If the two files do not contain the same number of lines.
    """
    with open(source_file, 'r', encoding="utf-8") as src_f, open(target_file, 'r', encoding="utf-8") as tgt_f:
        source_lines = src_f.readlines()
        target_lines = tgt_f.readlines()

    # Fail early with a readable message instead of an opaque column-length
    # error from Dataset.from_dict.
    if len(source_lines) != len(target_lines):
        raise ValueError(
            f"Source/target line count mismatch: {source_file} has {len(source_lines)} lines, "
            f"{target_file} has {len(target_lines)} lines"
        )

    pad_id = tokenizer.pad_token_id

    def _encode(line):
        # Same encoding call for both sides: fixed-length, truncated, padded.
        return tokenizer.encode(line.strip(), max_length=max_length, padding='max_length', truncation=True)

    input_ids = [_encode(src) for src in source_lines]

    # The sequences are already padded here, so the mask has to be derived now;
    # a mask created later would treat the padding as real tokens.
    attention_mask = [[int(tok != pad_id) for tok in ids] for ids in input_ids]

    # -100 is the ignore index used by the loss (and by label smoothing), so
    # padded label positions must not keep the pad token id.
    labels = [
        [tok if tok != pad_id else -100 for tok in _encode(tgt)]
        for tgt in target_lines
    ]

    # Convert data into a format compatible with Hugging Face's Dataset
    data = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

    return Dataset.from_dict(data)

# Load the train and test splits from DATADIR; the test split is later passed to the
# Trainer as its evaluation set.
# NOTE(review): max_length is left at its default of 10, so every source and target is
# truncated/padded to 10 token ids — confirm this is intended.
train_dataset = load_pre_tokenized_data(f"{DATADIR}/train.spm.isk", f"{DATADIR}/train.isk.target")
test_dataset = load_pre_tokenized_data(f"{DATADIR}/test.spm.isk", f"{DATADIR}/test.isk.target")


In [None]:
import subprocess
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset


# Load the pretrained model from PRETRAIN; the tokenizer is the module-level one
# created in an earlier cell, it is not reloaded here.

model = MT5ForConditionalGeneration.from_pretrained(PRETRAIN)
# The model is handed to the collator as well (presumably so it can derive decoder
# inputs from the labels — confirm against the transformers documentation).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


# Hyperparameters; several values mirror the commented-out ZmBART command
# (lr, warmup updates, label smoothing, update frequency, Adam epsilon, polynomial decay).
training_args = TrainingArguments(
    output_dir=SAVEDIR,                # Directory for checkpoints
    evaluation_strategy="steps",       # Evaluate on a step schedule; eval_steps is not set here (presumably falls back to logging_steps — confirm)
    learning_rate=3e-05,               # Same value as --lr in the commented-out command
    per_device_train_batch_size=64,    # Adjust based on memory
    per_device_eval_batch_size=64,     # Adjust based on memory
    weight_decay=0.01,                 # Helps in regularization
    save_steps=5000,                   # Save checkpoint every 5000 steps
    save_total_limit=2,                # Keep at most 2 checkpoints on disk
    num_train_epochs=150,                # Number of epochs (the commented-out command used --max-update 100000 instead)
    warmup_steps=2500,                 # For warmup updates
    logging_dir="./logs",              # Directory for logging
    logging_steps=100,                 # Log every 100 steps
    label_smoothing_factor=0.2,        # For label smoothing
    gradient_accumulation_steps=2,     # To simulate a larger batch size
    adam_epsilon=1e-06,                # Epsilon for Adam optimizer
    lr_scheduler_type="polynomial",    # Learning rate scheduler
)

# Define Trainer
# Combines the model, training arguments, datasets and collator defined earlier.
# The test split doubles as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the final model and tokenizer
# SAVEDIR is also the Trainer's output_dir, so the final weights are written to the
# same directory that holds the intermediate checkpoints.
model.save_pretrained(SAVEDIR)
tokenizer.save_pretrained(SAVEDIR)

In [None]:
import subprocess
import os

# Settings for the training run. Besides feeding the argument list below, this
# mapping is also passed to the subprocess as environment variables.
env_vars = {
    "task_name": "sum",
    "input_data_dir_name": "xlsum",
    "BASE_DIR": ".",
    "input_dir": "XLSum_input/",
    "output_dir": "outputs/xlsum_out",
    "model_type": "t5",
    "model_chkpt": "ZmT5_checkpoint",
    "cache_dir": "../cache_dir",
    "config_file_name": "auxi_tgt_lang_config",
}

# Assemble the argv for train.py. Every tuple is one option: (flag, value), or
# (flag,) for a bare switch. The tuples are flattened in the order listed.
command = ["python", "train.py"] + [
    token
    for option in (
        ("--input_dir", env_vars["input_dir"] + env_vars["input_data_dir_name"]),
        ("--output_dir", env_vars["output_dir"]),
        ("--model_type", env_vars["model_type"]),
        ("--model_chkpt", env_vars["model_chkpt"]),
        ("--max_source_length", "512"),
        ("--max_target_length", "84"),
        ("--train_batch_size", "4"),
        ("--learning_rate", "1e-4"),
        ("--meta_lr", "1e-5"),
        ("--weight_decay", "0.01"),
        ("--adam_epsilon", "1e-08"),
        ("--num_train_epochs", "10"),
        ("--logging_steps", "10"),
        ("--save_steps", "1"),
        ("--cache_dir", env_vars["cache_dir"]),
        ("--read_n_data_obj", "1000"),
        ("--task_name", env_vars["task_name"]),
        ("--freeze_embeds_and_decoder",),
        ("--task_data_name", env_vars["input_data_dir_name"]),
        ("--config_file_name", env_vars["config_file_name"]),
        ("--n_inner_iter", "2"),
    )
    for token in option
]

# Run the command in a subprocess with error handling.
# The child inherits the current environment with env_vars layered on top, so the
# values defined in this notebook win over any same-named variable that is already
# set in the parent process.
# NOTE(review): output is captured and only printed once the process has exited,
# so nothing is shown while the command is still running.
try:
    result = subprocess.run(
        command,
        env={**os.environ, **env_vars},
        capture_output=True,
        text=True,
        check=True,
    )

    # Print the output if the command is successful
    print("Output:", result.stdout)
    # Anything the command wrote to stderr is shown too, even on success,
    # instead of being dropped.
    if result.stderr:
        print("Stderr:", result.stderr)

except subprocess.CalledProcessError as e:
    # Handle the case where the subprocess returns a non-zero exit status
    print("Error: The command failed with the following message:")
    print("Return code:", e.returncode)
    print("Output:", e.stdout)
    print("Error:", e.stderr)

except Exception as e:
    # Handle any other exceptions (for example the executable not being found)
    print("An unexpected error occurred:", str(e))


In [4]:
import subprocess
import os


# Settings for the test run (the command below passes --do_test). Besides
# feeding the argument list, this mapping is also passed to the subprocess as
# environment variables.
# NOTE(review): "seed" is only exported through the environment; no --seed flag
# is added to the command — confirm train.py picks it up.
env_vars = {
    "seed": "1234",
    "task_name": "sum",
    "input_data_dir_name": "xlsum",
    "BASE_DIR": ".",
    "input_dir": "../XLSum_input/",
    "output_dir": "outputs/xlsum_14",
    "gen_file_name": "pred.tsv",
    "cache_dir": "../cache_dir",
    "model_type": "t5",
    "model_chkpt": "outputs/xlsum",
}

# Assemble the argv for train.py. Every tuple is one option: (flag, value), or
# (flag,) for a bare switch. The tuples are flattened in the order listed.
command = ["python", "train.py"] + [
    token
    for option in (
        ("--input_dir", env_vars["input_dir"] + env_vars["input_data_dir_name"]),
        ("--output_dir", env_vars["output_dir"]),
        ("--model_type", env_vars["model_type"]),
        ("--model_chkpt", env_vars["model_chkpt"]),
        ("--test_batch_size", "32"),
        ("--max_source_length", "512"),
        ("--max_target_length", "84"),
        ("--length_penalty", "0.6"),
        ("--beam_size", "4"),
        ("--early_stopping",),
        ("--num_of_return_seq", "1"),
        ("--min_generated_seq_len", "0"),
        ("--max_generated_seq_len", "200"),
        ("--cache_dir", env_vars["cache_dir"]),
        ("--read_n_data_obj", "-1"),
        ("--gen_file_name", env_vars["gen_file_name"]),
        ("--task_name", env_vars["task_name"]),
        ("--task_data_name", env_vars["input_data_dir_name"]),
        ("--do_test",),
    )
    for token in option
]

# Combine the inherited environment with env_vars. env_vars is applied last so the
# values defined in this notebook win over any same-named variable that is already
# set in the parent process.
combined_env = {**os.environ, **env_vars}

# Run the command in a subprocess with error handling.
# No conda environment is activated here: "python" is whatever interpreter is found
# on the inherited PATH.
# NOTE(review): the output recorded for this cell ends in
# "ModuleNotFoundError: No module named 'indicnlp'", raised while importing the
# sacrebleu package that sits next to train.py — the interpreter that runs train.py
# presumably needs the package providing `indicnlp` installed.
try:
    # Run the main command
    result = subprocess.run(
        command,
        env=combined_env,
        capture_output=True,
        text=True,
        check=True,
    )

    # Print the output if the command is successful
    print("Output:", result.stdout)
    # Anything the command wrote to stderr is shown too, even on success,
    # instead of being dropped.
    if result.stderr:
        print("Stderr:", result.stderr)

except subprocess.CalledProcessError as e:
    # Handle the case where the subprocess returns a non-zero exit status
    print("Error: The command failed with the following message:")
    print("Return code:", e.returncode)
    print("Output:", e.stdout)
    print("Error:", e.stderr)

except Exception as e:
    # Handle any other exceptions (for example the executable not being found)
    print("An unexpected error occurred:", str(e))


Error: The command failed with the following message:
Output: 
Error: Traceback (most recent call last):
  File "train.py", line 40, in <module>
    from utils_trans import (
  File "/workspace/Tesis/O3_modelos/Meta_XNLG/utils_trans.py", line 18, in <module>
    from sacrebleu import corpus_bleu
  File "/workspace/Tesis/O3_modelos/Meta_XNLG/sacrebleu/__init__.py", line 25, in <module>
    from .tokenizers import TOKENIZERS, DEFAULT_TOKENIZER
  File "/workspace/Tesis/O3_modelos/Meta_XNLG/sacrebleu/tokenizers/__init__.py", line 10, in <module>
    from .tokenizer_indic import TokenizerIndic
  File "/workspace/Tesis/O3_modelos/Meta_XNLG/sacrebleu/tokenizers/tokenizer_indic.py", line 3, in <module>
    from indicnlp.tokenize import indic_tokenize 
ModuleNotFoundError: No module named 'indicnlp'

