In [2]:
# TensorFlow is not needed by this notebook, so it is removed from the environment.
!pip uninstall -y tensorflow
# Show which transformers / tokenizers versions are installed before upgrading.
!pip list | grep -E 'transformers|tokenizers'
# NOTE(review): an older note here recorded transformers 2.11.0 and tokenizers 0.8.0rc1;
# the recorded output of this cell lists transformers 4.42.3 and tokenizers 0.19.1.
# Install/upgrade transformers (with its torch extra) and accelerate.
!pip install transformers[torch] accelerate -U

[0mtokenizers                               0.19.1
transformers                             4.42.3


In [3]:
# os is used by later cells for path joining and directory creation.
import os
# Install gdown, which the next cell uses to download files from Google Drive.
!pip install gdown

Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown
Successfully installed gdown-5.2.0


In [4]:
# Download the QuechuaBERT model files from Google Drive.
import gdown

# Local directory that receives the downloaded files.
folder_name = "quechuaBERT"

# (Google Drive file id, local file name) for every required file.
files = [
    ("16SkLOsfja22kIwExs4NiU5pjrOV7SUdP", "pytorch_model.bin"),
    ("1PrM9LMJ9Pmrc8yqKBT1OMRPXD1urkJ1r", "merges.txt"),
    ("1i6L13u5P9HVzzmKsNZxe_wICteulIWY5", "vocab.json"),
    ("1lDaVeJc90TKbBrhxZKZbIfRTPv9VSsOg", "config.json"),
]

os.makedirs(folder_name, exist_ok=True)

# Fetch each file into the destination directory, showing download progress.
for drive_id, local_name in files:
    drive_url = f"https://drive.google.com/uc?id={drive_id}"
    gdown.download(drive_url, os.path.join(folder_name, local_name), quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=16SkLOsfja22kIwExs4NiU5pjrOV7SUdP
From (redirected): https://drive.google.com/uc?id=16SkLOsfja22kIwExs4NiU5pjrOV7SUdP&confirm=t&uuid=800d0c06-da93-4fe3-9f3b-72bfe830f705
To: /kaggle/working/quechuaBERT/pytorch_model.bin
100%|██████████| 334M/334M [00:12<00:00, 27.6MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1PrM9LMJ9Pmrc8yqKBT1OMRPXD1urkJ1r
To: /kaggle/working/quechuaBERT/merges.txt
100%|██████████| 483k/483k [00:00<00:00, 75.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1i6L13u5P9HVzzmKsNZxe_wICteulIWY5
To: /kaggle/working/quechuaBERT/vocab.json
100%|██████████| 837k/837k [00:00<00:00, 90.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1lDaVeJc90TKbBrhxZKZbIfRTPv9VSsOg
To: /kaggle/working/quechuaBERT/config.json
100%|██████████| 676/676 [00:00<00:00, 989kB/s]


In [5]:
# Download the corpus from our repository:
# https://github.com/Xnehil/TACC-Lexemas/blob/main/data/corpus/corpus.csv
import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/Xnehil/TACC-Lexemas/main/data/corpus/corpus.csv")

# Save the corpus as plain text, one line per value of the 'sentence' column.
# - Missing values are dropped: a NaN is a float, so `sentence + '\n'` would raise TypeError.
# - The file is written explicitly as UTF-8 so non-ASCII characters do not depend on
#   the platform's default encoding.
sentences = df['sentence'].dropna().astype(str)
with open('corpus.txt', 'w', encoding='utf-8') as f:
    f.writelines(sentence + '\n' for sentence in sentences)

# Preview the frame; as the last expression of the cell it is actually displayed.
df.head()

In [None]:
import os
# Set CUDA_LAUNCH_BLOCKING=1 in this process's environment.
# NOTE(review): presumably a debugging aid for CUDA errors; it must be set before CUDA is initialised to matter — confirm it is still needed.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

## QuBERT

Toda esta sección está basada en el código del repositorio https://github.com/Llamacha/QuBERT. Este modelo fue presentado en el paper [Introducing QuBERT: A Large Monolingual Corpus and BERT Model for Southern Quechua](https://aclanthology.org/2022.deeplo-1.1) (Zevallos et al., DeepLo 2022)

In [6]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Vocabulary and merge rules downloaded into ./quechuaBERT by an earlier cell.
vocab_path = "./quechuaBERT/vocab.json"
merges_path = "./quechuaBERT/merges.txt"

# Load the QuechuaBERT byte-level BPE tokenizer from those two files.
tokenizer = ByteLevelBPETokenizer(vocab_path, merges_path)

In [7]:
# Look up the ids of the two special tokens that wrap every encoded sequence.
end_token = ("</s>", tokenizer.token_to_id("</s>"))
start_token = ("<s>", tokenizer.token_to_id("<s>"))

# Wrap encodings as <s> ... </s> and cap them at 512 tokens.
tokenizer._tokenizer.post_processor = BertProcessing(end_token, start_token)
tokenizer.enable_truncation(max_length=512)

In [8]:
# Sanity check: display the tokens produced for a sample Quechua sentence.
tokenizer.encode("allinllachu manan allinlla huk wasipita").tokens

['<s>',
 'allin',
 'llachu',
 'Ġmanan',
 'Ġallinlla',
 'Ġhuk',
 'Ġwasipi',
 'ta',
 '</s>']

In [None]:
# Check that a GPU is available by running nvidia-smi.
!nvidia-smi

In [6]:
# Check that PyTorch can see the GPU (the cell displays True or False).
import torch
torch.cuda.is_available()

True

In [7]:
from transformers import RobertaConfig

# RoBERTa architecture settings.
# NOTE(review): `config` is not referenced by any later cell visible here; the models
# are loaded with from_pretrained('./quechuaBERT') instead — confirm it is still needed.
roberta_settings = dict(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
config = RobertaConfig(**roberta_settings)

In [8]:
import json

# Minimal tokenizer configuration stored next to the model files.
tokenizer_config = dict(max_len=512)
tokenizer_config_path = "./quechuaBERT/tokenizer_config.json"

# Serialise the configuration as JSON into the model directory.
with open(tokenizer_config_path, 'w') as fp:
    fp.write(json.dumps(tokenizer_config))

In [9]:
from transformers import RobertaTokenizerFast

# Load the tokenizer used with the model from the ./quechuaBERT directory.
# This rebinds `tokenizer`, replacing the ByteLevelBPETokenizer created earlier.
tokenizer = RobertaTokenizerFast.from_pretrained("./quechuaBERT", max_len=512)

In [None]:
from transformers import RobertaForMaskedLM, RobertaForCausalLM, RobertaTokenizer

# The original masked-language model checkpoint.
model_mlm = RobertaForMaskedLM.from_pretrained('./quechuaBERT')

# The following changes adapt the checkpoint for text generation.
# RobertaForCausalLM is the generation (left-to-right) variant. is_decoder=True is
# required so self-attention is causally masked; without it each position can attend
# to the tokens it is being trained to predict.
model_clm = RobertaForCausalLM.from_pretrained('./quechuaBERT', is_decoder=True)

# Copy the encoder weights from the MLM model into the CLM model.
# load_state_dict is used instead of assigning the module, because assigning
# `model_mlm.roberta` would bring back an encoder built with the non-decoder config.
model_clm.roberta.load_state_dict(model_mlm.roberta.state_dict())

In [None]:
# Display the parameter count reported by the causal LM.
model_clm.num_parameters()

In [None]:
%%time
from transformers import LineByLineTextDataset

# Load the generated corpus file into a Dataset, tokenised with `tokenizer`
# and using block_size=128. (%%time must remain the first line of the cell.)
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./corpus.txt",
    block_size=128,
)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Build the batch collator from the tokenizer; mlm=False disables masked-LM masking.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [None]:
from transformers import Trainer, TrainingArguments
import os

# Training hyperparameters, gathered in one place so they are easy to tune.
training_settings = dict(
    output_dir='./results',          # where checkpoints are written
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    report_to="none",                # no external experiment tracker
)
training_args = TrainingArguments(**training_settings)

# Trainer wiring: causal LM, training arguments, batch collator and corpus dataset.
trainer = Trainer(
    train_dataset=dataset,
    data_collator=data_collator,
    args=training_args,
    model=model_clm,
)

In [None]:
# Select the CUDA device when available, otherwise the CPU, so tensors created
# later can be placed on the same device as the model and avoid CUDA errors.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Train the model with the Trainer configured above.
trainer.train()

In [None]:
# Save the trained model into the ./prueba directory.
trainer.save_model("./prueba")

In [None]:
# Encode the (empty) input prompt and place the ids on the selected device.
input_ids = tokenizer.encode("", return_tensors='pt')
input_ids = input_ids.to(device)

# Generate a continuation of at most 40 tokens.
output = model_clm.generate(input_ids, max_length=40)

# Decode the first generated sequence, dropping special tokens, and print it.
first_sequence = output[0]
generated_text = tokenizer.decode(first_sequence, skip_special_tokens=True)
print(generated_text)


## GPT-2

In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the GPT-2 tokenizer and language model from the same base checkpoint.
base_checkpoint = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(base_checkpoint)
model = GPT2LMHeadModel.from_pretrained(base_checkpoint)

# Move the model to the device; as the last expression, the model summary is displayed.
model.to(device)

2024-07-02 14:13:51.131065: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-02 14:13:51.131189: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-02 14:13:51.278601: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
def load_dataset(file_path, tokenizer, block_size=128):
    """Build a TextDataset from a plain-text corpus file.

    Args:
        file_path: path of the text file to read.
        tokenizer: tokenizer handed to TextDataset.
        block_size: block size handed to TextDataset (default 128).

    Returns:
        The TextDataset built from the file.
    """
    dataset_options = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_options)


# Corpus file written by an earlier cell.
file_path = './corpus.txt'

# Load and tokenise the corpus into the dataset.
dataset = load_dataset(file_path, tokenizer)

In [None]:
# Batch collator built from the tokenizer; mlm=False disables masked-LM masking.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Training hyperparameters, gathered in one place so they are easy to tune.
training_settings = dict(
    output_dir='./results',          # where checkpoints are written
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir='./logs',
    logging_steps=200,
)
training_args = TrainingArguments(**training_settings)

# Trainer wiring: GPT-2 model, training arguments, batch collator and corpus dataset.
trainer = Trainer(
    train_dataset=dataset,
    data_collator=data_collator,
    args=training_args,
    model=model,
)

In [None]:
# Train the model.
trainer.train()

# Save the model and the tokenizer into the same directory so they can be
# reloaded together from ./fine_tuned_model.
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')

In [None]:
# Input prompt.
input_prompt = "Yaqapis"

# Encode the prompt and build a matching all-ones attention mask (a single,
# unpadded sequence, so every position is a real token).
input_ids = tokenizer.encode(input_prompt, return_tensors='pt').to(device)
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=device)

# Generate text.
# - attention_mask is passed explicitly; previously it was built but never used.
# - do_sample=True is required for top_p / temperature to have any effect; without
#   it generation is greedy and those two settings are ignored.
# - GPT-2 has no pad token, so the EOS token id is used for padding.
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=100,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode the generated text, dropping special tokens.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Texto generado:")
print(generated_text)

## gpt2-genre-generator

Este modelo fue tomado de https://huggingface.co/aspis/gpt2-genre-story-generation. Parte del código se basa en sus instrucciones de uso.

### Descargar pesos

In [21]:
# Install gdown for the Google Drive download below (also installed by an earlier cell).
!pip install gdown 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting gdown
  Using cached gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown
Successfully installed gdown-5.2.0


In [22]:
# Download the fine-tuned weights from Google Drive and unpack them.
import gdown
import zipfile

# Local directory that receives the archive and its extracted contents.
folder_name = "story_generator_fined_tuned"

# (Google Drive file id, local archive name) for every required archive.
files = [
    ("18V2NdQ8sExQZanPh3RpQ7URwDaBEc0K7", "data.zip"),
]

os.makedirs(folder_name, exist_ok=True)

# Local path of every archive, in the same order as `files`.
archive_paths = [os.path.join(folder_name, archive_name) for drive_id, archive_name in files]

# Phase 1: download every archive.
for (drive_id, archive_name), archive_path in zip(files, archive_paths):
    gdown.download(f"https://drive.google.com/uc?id={drive_id}", archive_path, quiet=False)

# Phase 2: extract every archive into the destination directory.
for archive_path in archive_paths:
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(folder_name)

# Phase 3 (optional clean-up): delete the ZIP files once extracted.
for archive_path in archive_paths:
    os.remove(archive_path)

Downloading...
From (original): https://drive.google.com/uc?id=18V2NdQ8sExQZanPh3RpQ7URwDaBEc0K7
From (redirected): https://drive.google.com/uc?id=18V2NdQ8sExQZanPh3RpQ7URwDaBEc0K7&confirm=t&uuid=70a33c21-fb54-4e1e-adad-dbadcecc2c5e
To: /kaggle/working/story_generator_fined_tuned/data.zip

  0%|          | 0.00/464M [00:00<?, ?B/s][A
  1%|          | 4.72M/464M [00:00<00:23, 19.3MB/s][A
  4%|▎         | 17.3M/464M [00:00<00:10, 41.9MB/s][A
  5%|▌         | 24.1M/464M [00:00<00:09, 48.9MB/s][A
  7%|▋         | 34.1M/464M [00:00<00:08, 50.9MB/s][A
 10%|▉         | 44.6M/464M [00:00<00:06, 63.7MB/s][A
 11%|█         | 51.9M/464M [00:00<00:07, 58.7MB/s][A
 13%|█▎        | 59.2M/464M [00:01<00:07, 56.8MB/s][A
 15%|█▍        | 68.2M/464M [00:01<00:06, 64.6MB/s][A
 16%|█▋        | 76.0M/464M [00:01<00:06, 63.2MB/s][A
 18%|█▊        | 85.5M/464M [00:01<00:05, 70.6MB/s][A
 20%|██        | 93.3M/464M [00:01<00:05, 65.2MB/s][A
 23%|██▎       | 105M/464M [00:01<00:04, 77.8MB/s] [A
 24

### Entrenamiento

In [23]:
from transformers import pipeline, TextGenerationPipeline, GPT2LMHeadModel, AutoTokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling, RobertaTokenizerFast
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint selection: the Hugging Face Hub model, or (the default here) the
# already fine-tuned weights extracted into ./story_generator_fined_tuned.
use_hub_checkpoint = False
model_source = (
    "aspis/gpt2-genre-story-generation"
    if use_hub_checkpoint
    else './story_generator_fined_tuned'
)

# Load the tokenizer and the model from the selected source.
tokenizer = AutoTokenizer.from_pretrained(model_source)
model = GPT2LMHeadModel.from_pretrained(model_source)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [24]:
# Función para cargar el corpus
def load_dataset(file_path, tokenizer, block_size=512):
    """Build a TextDataset from a plain-text corpus file.

    Args:
        file_path: path of the text file to read.
        tokenizer: tokenizer handed to TextDataset.
        block_size: block size handed to TextDataset (default 512).

    Returns:
        The TextDataset built from the file.
    """
    corpus_dataset = TextDataset(
        block_size=block_size,
        file_path=file_path,
        tokenizer=tokenizer,
    )
    return corpus_dataset


# Corpus file written by an earlier cell.
file_path = './corpus.txt'

# Load and tokenise the dataset.
dataset = load_dataset(file_path, tokenizer)



In [25]:
from transformers import Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

# Batch collator built from the tokenizer; mlm=False disables masked-LM masking.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Training hyperparameters, gathered in one place so they are easy to tune.
training_settings = dict(
    output_dir='./results',          # where checkpoints are written
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)
training_args = TrainingArguments(**training_settings)

# Trainer wiring: story model, training arguments, batch collator and corpus dataset.
trainer = Trainer(
    train_dataset=dataset,
    data_collator=data_collator,
    args=training_args,
    model=model,
)

Ejecutar la siguiente celda la cantidad de épocas que se desee para entrenar el modelo.

In [7]:
# Train the model.
trainer.train()

# Save the model and the tokenizer back into the directory they were loaded from,
# overwriting the previous fine-tuned weights.
model.save_pretrained('./story_generator_fined_tuned')
tokenizer.save_pretrained('./story_generator_fined_tuned')

Step,Training Loss
500,2.2491
1000,2.1957
1500,2.1437
2000,2.104
2500,2.0597
3000,2.028
3500,2.004
4000,1.9734
4500,1.9615
5000,1.9435


('./story_generator_fined_tuned/tokenizer_config.json',
 './story_generator_fined_tuned/special_tokens_map.json',
 './story_generator_fined_tuned/vocab.json',
 './story_generator_fined_tuned/merges.txt',
 './story_generator_fined_tuned/added_tokens.json',
 './story_generator_fined_tuned/tokenizer.json')

### Usar

In [26]:
# Declarar el generador con pipeline de transformers
# Build the text-generation pipeline from the fine-tuned model and its tokenizer.
generator = TextGenerationPipeline(model=model, tokenizer=tokenizer)

In [27]:
# prompt de entrada 
# Input prompt; the <adventure> tag indicates the genre of the story.
input_prompt = "<BOS> <adventure> Huk kutin allqu kasqa"

# Sampling settings passed to the generator.
sampling_options = dict(
    max_length=250,
    do_sample=True,
    repetition_penalty=1.5,
    temperature=1.2,
    top_p=0.95,
    top_k=50,
)

story = generator(input_prompt, **sampling_options)
print(story)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': '<BOS> <adventure> Huk kutin allqu kasqa, chaymi: —\t Ñuqapaqpas qhipamanchikta takisunmanchu aswan kimsa killa puriqtapuni pukyupura rimakuyani; paqarisparaq minkawan rimanakuyniykitaqa.\nAllintaraq uywaspallañataq: “Ch’ampallaykunawanmi qullaña apawaykuway imatá mikhunawankichí,  manachu haywakunkuschiki uywasunki nispacha karqa. …Kay supipis mana pipas chikanllaptinsi wakcha uywatahinas maypin risqariwaq.ìnachus kaymanta apayaq ari apachitamanraq kaypi umaykin; ñanpa sunquymantam ichhuku wasiman ripaspa hapirqusunchikkama, hina wataqa pi yawar masinchikman sapatullasaq —nispan kayhinatan ” nin chiqaqtallas ruraw'}]


In [28]:
# Only needed to download the results from Kaggle.
import os
# NOTE(review): hard-coded absolute path; it matches the /kaggle/working destination shown in the recorded download output.
os.chdir(r'/kaggle/working')


from IPython.display import FileLinks

# Display download links for the files in the fine-tuned model directory.
FileLinks(r'story_generator_fined_tuned')