In [2]:
# from transformers import T5Config, T5ForConditionalGeneration

# config = T5Config(
#     d_model=512,  # Embedding dimension
#     d_ff=2048,    # Feed-forward layer size
#     num_layers=8,  # Number of encoder and decoder layers
#     num_heads=8,   # Attention heads
#     vocab_size=32128  # Size of the vocabulary
# )

# model = T5ForConditionalGeneration(config)


In [None]:
import re
# Load the vocabulary (token -> index) mapping.
# Each line of the vocab file is expected to be "<token>\t<score>"; the index
# assigned to a token is its zero-based line number in the file.
vocab = {}
with open('tokenizadorIskonawa.vocab', 'r', encoding='utf-8') as vocab_file:
    for idx, line in enumerate(vocab_file):
        # Only strip the line terminator: a blanket strip() could eat
        # whitespace that is part of the token itself.
        # Split once on the first tab so lines with a missing or extra
        # column no longer abort the whole load with a ValueError.
        token = line.rstrip('\r\n').split('\t', 1)[0]
        if not token:
            # Blank line: skip it, but keep counting so later indices still
            # match their line numbers.
            continue
        vocab[token] = idx

# Load the BPE tokenized dataset
def load_bpe_dataset(file_path, vocab, unk_token='<unk>'):
    """Read a whitespace-tokenized text file and map every token to its id.

    Args:
        file_path: Path to a UTF-8 text file with one sequence per line and
            tokens separated by whitespace.
        vocab: Mapping from token string to integer id.
        unk_token: Vocabulary entry whose id replaces tokens that are not in
            ``vocab``. Defaults to ``'<unk>'``.

    Returns:
        A list with one entry per line of the file (blank lines yield an
        empty list); each entry is the list of token ids for that line.

    Raises:
        KeyError: If a token is missing from ``vocab`` and ``unk_token`` is
            not in ``vocab`` either.
    """
    # Look the fallback id up once instead of once per token. Using .get()
    # here means a vocabulary without an unknown-token entry is only an error
    # if an out-of-vocabulary token is actually encountered.
    unk_id = vocab.get(unk_token)

    dataset = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            token_ids = []
            # str.split() with no arguments already ignores leading and
            # trailing whitespace, so no explicit strip() is needed.
            for token in line.split():
                try:
                    token_ids.append(vocab[token])
                except KeyError:
                    if unk_id is None:
                        raise KeyError(
                            f"token {token!r} is not in the vocabulary and "
                            f"there is no {unk_token!r} entry to fall back on"
                        ) from None
                    token_ids.append(unk_id)
            dataset.append(token_ids)
    return dataset

# Encode every line of tokens.txt into a list of vocabulary indices.
bpe_tokenized_dataset = load_bpe_dataset('tokens.txt', vocab)

# Sanity check: print only the first encoded line rather than the whole dataset.
print(bpe_tokenized_dataset[:1])

In [3]:
# Fraction of the corpus, taken from the top of the file, that goes to the
# 'eval' split; the remaining lines go to 'train'. The split is positional
# (no shuffling).
split_ratio = 0.2

corpus = 'tokens.txt'

# Read the corpus once, explicitly as UTF-8 so the result does not depend on
# the platform's default encoding (the other cells read this file as UTF-8).
with open(corpus, 'r', encoding='utf-8') as reader:
    lines = reader.readlines()

total_lines = len(lines)
split_index = int(total_lines * split_ratio)

for split in ['eval', 'train']:
    output = f'iskCorpus.t5.{split}.tsv'
    # Lines [0, split_index) are the eval split, [split_index, end) the train split.
    split_lines = lines[:split_index] if split == 'eval' else lines[split_index:]

    with open(output, 'w', encoding='utf-8') as writer:
        writer.write('input_text\ttarget_text\n')
        for line in split_lines:
            # Tabs are the column separator of the TSV, so they must not
            # appear inside a field; the newline is re-added after the row.
            line = line.replace('\t', ' ').replace('\n', '')
            # Input and target columns carry the same text.
            writer.write(line+'\t'+line+'\n')



In [2]:
# import functools
# import seqio
# import t5.data
# from t5.data import preprocessors
# import tensorflow as tf
# TaskRegistry = seqio.TaskRegistry
# MixtureRegistry = seqio.MixtureRegistry

# # Define your dataset splits
# datasplit = {
#     "train": "iskCorpus.t5.train.tsv",
#     "validation": "iskCorpus.t5.eval.tsv"
# }

# DEFAULT_OUTPUT_FEATURES = {
#     "inputs": seqio.Feature(
#         seqio.SentencePieceVocabulary(vocab), add_eos=True,
#         required=False, dtype=tf.int32),
#     "targets": seqio.Feature(
#         seqio.SentencePieceVocabulary(vocab), add_eos=True, dtype=tf.int32)
# }

# # Add the text generation tasks
# TaskRegistry.add(
#     "text_generation_span_corruption",
#     source=seqio.TextLineDataSource(split_to_filepattern=datasplit),
#     preprocessors=[
#         functools.partial(preprocessors.parse_tsv),
#         seqio.preprocessors.tokenize,
#         preprocessors.span_corruption,  # 15% span corruption
#         seqio.preprocessors.append_eos_after_trim,
#     ],
#     output_features=DEFAULT_OUTPUT_FEATURES,
#     metric_fns=[]  # Add evaluation metrics if needed
# )

# TaskRegistry.add(
#     "text_generation_iid_denoising",
#     source=seqio.TextLineDataSource(split_to_filepattern=datasplit),
#     preprocessors=[
#         functools.partial(preprocessors.parse_tsv),
#         seqio.preprocessors.tokenize,
#         preprocessors.iid_denoising,    # 15% i.i.d. denoising
#         seqio.preprocessors.append_eos_after_trim,
#     ],
#     output_features=DEFAULT_OUTPUT_FEATURES,
#     metric_fns=[]  # Add evaluation metrics if needed
# )

# # Mixture of both tasks
# MixtureRegistry.add(
#     "text_generation_mixture",
#     ["text_generation_span_corruption", "text_generation_iid_denoising"],
#     default_rate=1.0
# )

In [None]:
# !export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/harvymartinez16_gmail_com/Tesis/
import tensorflow as tf

# Report which TensorFlow build this kernel is running.
print("Tensorflow version " + tf.__version__)

# Ask TensorFlow for the physical GPU devices it can see and list them,
# or say so explicitly when there are none.
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    print("No GPU found.")
else:
    print("GPUs available:")
    for gpu in gpus:
        print(gpu)

# Define a function for addition using @tf.function
@tf.function
def add_fn(x, y):
    """Return ``x + y``; the function is wrapped with ``tf.function``."""
    return x + y

# Smoke test: call add_fn on two scalar constants and print what it returns.
x = tf.constant(2.0)
y = tf.constant(3.0)
result = add_fn(x, y)
print("Result: ", result)


In [None]:
import os
import t5
import gin
import logging
import tensorflow as tf


# Set up your environment variables and paths
# NOTE(review): MODEL_DIR is assigned a different value ("t5_isk_small") in the
# later cell that launches t5_mesh_transformer — confirm which one is intended.
MODEL_DIR = "t5_sl_small"  # Change to your desired model directory
# os.environ["PYTHONPATH"] = f"{os.environ['PYTHONPATH']}:/text-to-text-transfer-transformer:/text-to-text-transfer-transformer/t5"

# Define training parameters
train_steps = 1000000
save_checkpoints_steps = 40000
keep_checkpoint_max = 2

# Configure root logging at INFO and create the module logger that
# ProgressCallback writes its progress messages to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set up the gin configuration for the model and dataset
# You can create your .gin files or configure them directly in the notebook.
# The two config files are parsed first and the individual parameters are
# bound afterwards; keep this order (presumably so the explicit bindings take
# precedence over values from the files — confirm against the gin docs).
gin.parse_config_file("config/dataset.gin")
gin.parse_config_file("config/t5.1.1.small.gin")
gin.bind_parameter("utils.run.mesh_shape", "model:1,batch:1")
gin.bind_parameter("utils.run.mesh_devices", ["GPU:0"])
# gin.bind_parameter("MIXTURE_NAME", "mixture_iskonawa_test")  # Your mixture name
gin.bind_parameter("utils.run.save_checkpoints_steps", save_checkpoints_steps)
gin.bind_parameter("utils.run.keep_checkpoint_max", keep_checkpoint_max)
gin.bind_parameter("utils.run.train_steps", train_steps)

class ProgressCallback(tf.keras.callbacks.Callback):
    """Keras callback that logs loss and accuracy through the module ``logger``.

    A line is logged at the end of every epoch and at the end of every
    ``LOG_EVERY_N_BATCHES``-th training batch. Metrics that are absent from
    ``logs`` are reported as ``N/A`` instead of raising.
    """

    # Batch-level logging interval (the code has always used 100; an earlier
    # comment incorrectly said 1000).
    LOG_EVERY_N_BATCHES = 100

    @staticmethod
    def _format_metric(logs, name):
        """Return ``logs[name]`` formatted to 4 decimals, or ``'N/A'`` if unavailable."""
        value = (logs or {}).get(name)
        if value is None:
            return "N/A"
        try:
            return f"{value:.4f}"
        except (TypeError, ValueError):
            # Value does not support float formatting; show it as-is rather
            # than aborting training over a log line.
            return str(value)

    def on_epoch_end(self, epoch, logs=None):
        """Log the epoch number (1-based) with its loss and accuracy."""
        loss = self._format_metric(logs, 'loss')
        accuracy = self._format_metric(logs, 'accuracy')
        logger.info(f"Epoch {epoch + 1}: loss = {loss}, accuracy = {accuracy}")

    def on_train_batch_end(self, batch, logs=None):
        """Log loss and accuracy every ``LOG_EVERY_N_BATCHES`` batches."""
        if batch % self.LOG_EVERY_N_BATCHES == 0:
            loss = self._format_metric(logs, 'loss')
            accuracy = self._format_metric(logs, 'accuracy')
            logger.info(f"Step {batch}: loss = {loss}, accuracy = {accuracy}")

# strategy = tf.distribute.MirroredStrategy()  # Supports multi-GPU, defaults to single GPU if only one available

# # Start the training
# with strategy.scope():
#     mesh_transformer(
#         model_dir=MODEL_DIR,
#         gin_file=["/config/dataset.gin","/config/t5.1.1.small.gin"],
#         module_import="/config/mytask3",  
#         callbacks=[ProgressCallback()]
#     )


In [None]:
import subprocess
import os
# Define the model directory and the training schedule passed to the CLI.
MODEL_DIR = "t5_isk_small"
train_steps = 1000000
save_checkpoints_steps = 40000
keep_checkpoint_max = 2

# Put the current directory on PYTHONPATH for the child process (presumably so
# that --module_import=config.mytask3 can be resolved — adjust to your layout).
# Prepend instead of overwriting so existing entries are not lost, and skip
# the update if the directory is already listed so re-running the cell does
# not keep growing the variable.
project_root = os.path.abspath(".")
existing_paths = [p for p in os.environ.get("PYTHONPATH", "").split(os.pathsep) if p]
if project_root not in existing_paths:
    os.environ["PYTHONPATH"] = os.pathsep.join([project_root, *existing_paths])

# Show the working directory that the relative config paths are resolved
# against. Printed from Python so it appears in the cell output and does not
# depend on a `pwd` executable being available.
print(os.getcwd())

# Construct the command as an argument list (no shell involved, so the quotes
# inside the gin_param values reach gin unchanged).
command = [
    "t5_mesh_transformer",
    f"--model_dir={MODEL_DIR}",
    "--gin_file=config/dataset.gin",
    "--gin_file=config/t5.1.1.small.gin",
    "--gin_param=utils.run.mesh_shape='model:1,batch:1'",
    "--gin_param=utils.run.mesh_devices=['GPU:0']",
    f"--gin_param=utils.run.save_checkpoints_steps={save_checkpoints_steps}",
    f"--gin_param=utils.run.keep_checkpoint_max={keep_checkpoint_max}",
    f"--gin_param=utils.run.train_steps={train_steps}",
    "--module_import=config.mytask3"
]

# Execute the command.
# NOTE: capture_output=True buffers stdout/stderr until the process exits, so
# nothing is displayed while the command is still running.
try:
    result = subprocess.run(command, check=True, text=True, capture_output=True)
    print("Output:", result.stdout)
    print("Errors:", result.stderr)
except subprocess.CalledProcessError as e:
    # Non-zero exit status: report it together with whatever was captured.
    print(f"Error: {e}")
    print(f"Output: {e.stdout}")
    print(f"Errors: {e.stderr}")
