In [1]:
# We won't need TensorFlow here
!pip uninstall -y tensorflow
# Install `transformers` from master
#!pip install git+https://github.com/huggingface/transformers
!pip list | grep -E 'transformers|tokenizers'
# transformers version at notebook update --- 2.11.0
# tokenizers version at notebook update --- 0.8.0rc1
!pip install transformers[torch] accelerate -U


[0mtokenizers                       0.19.1
transformers                     4.41.2


In [2]:
#Descargar archivos de google Drive
import gdown
import os
# Lista de archivos necesarios
files = [
    ("16SkLOsfja22kIwExs4NiU5pjrOV7SUdP", "pytorch_model.bin"),
    ("1PrM9LMJ9Pmrc8yqKBT1OMRPXD1urkJ1r", "merges.txt"),
    ("1i6L13u5P9HVzzmKsNZxe_wICteulIWY5", "vocab.json"),
    ("1lDaVeJc90TKbBrhxZKZbIfRTPv9VSsOg", "config.json")
]
folder_name = "quechuaBERT"
os.makedirs(folder_name, exist_ok=True)

# Descargar
for file_id, file_name in files:
    destination_path = os.path.join(folder_name, file_name)
    gdown.download(f"https://drive.google.com/uc?id={file_id}", destination_path, quiet=False)
#tokenizer.save_model("quechuaBERT")

Downloading...
From (original): https://drive.google.com/uc?id=16SkLOsfja22kIwExs4NiU5pjrOV7SUdP
From (redirected): https://drive.google.com/uc?id=16SkLOsfja22kIwExs4NiU5pjrOV7SUdP&confirm=t&uuid=027f065c-5009-4684-91cf-68426495fa4c
To: /content/quechuaBERT/pytorch_model.bin
100%|██████████| 334M/334M [00:02<00:00, 151MB/s]
Downloading...
From: https://drive.google.com/uc?id=1PrM9LMJ9Pmrc8yqKBT1OMRPXD1urkJ1r
To: /content/quechuaBERT/merges.txt
100%|██████████| 483k/483k [00:00<00:00, 68.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=1i6L13u5P9HVzzmKsNZxe_wICteulIWY5
To: /content/quechuaBERT/vocab.json
100%|██████████| 837k/837k [00:00<00:00, 92.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1lDaVeJc90TKbBrhxZKZbIfRTPv9VSsOg
To: /content/quechuaBERT/config.json
100%|██████████| 676/676 [00:00<00:00, 769kB/s]


In [3]:
#Descargar el corpus de nuestro repositorio https://github.com/Xnehil/TACC-Lexemas/blob/main/data/corpus/corpus.csv
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/Xnehil/TACC-Lexemas/main/data/corpus/corpus.csv")
df.head()

#Y guardar un txt; una fila por 'sentence' del dataframe
with open('corpus.txt', 'w') as f:
    for sentence in df['sentence']:
        f.write(sentence + '\n')

## QuBERT

In [4]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


tokenizer = ByteLevelBPETokenizer(
    "./quechuaBERT/vocab.json",
    "./quechuaBERT/merges.txt",
)

In [5]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

In [6]:
tokenizer.encode("allinllachu manan allinlla huk wasipita").tokens

['<s>',
 'allin',
 'llachu',
 'Ġmanan',
 'Ġallinlla',
 'Ġhuk',
 'Ġwasipi',
 'ta',
 '</s>']

In [7]:
# Check that we have a GPU
!nvidia-smi

Thu Jun 20 21:58:43 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   56C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [8]:
# Check that PyTorch sees it
import torch
torch.cuda.is_available()

True

In [9]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [10]:
import json
tokenizer_config = {"max_len": 512}

with open("./quechuaBERT/tokenizer_config.json", 'w') as fp:
    json.dump(tokenizer_config, fp)

In [11]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("./quechuaBERT", max_len=512)

In [12]:
from transformers import RobertaForMaskedLM, RobertaForCausalLM, RobertaTokenizer
# El modelo original

model_mlm = RobertaForMaskedLM.from_pretrained('./quechuaBERT')

# RobertaFoCausal es para generación
model_clm = RobertaForCausalLM.from_pretrained('./quechuaBERT')

# Copy the weights from the MLM model to the CLM model
model_clm.roberta = model_mlm.roberta

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


In [13]:
model_clm.num_parameters()
# => 84 million parameters

123440416

In [14]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./corpus.txt",
    block_size=128,
)



CPU times: user 4.27 s, sys: 273 ms, total: 4.54 s
Wall time: 5.88 s


In [15]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

In [16]:
from transformers import Trainer, TrainingArguments
import os



training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    log_level="debug",
)

trainer = Trainer(
    model=model_clm,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [17]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


trainer.train()

Currently training with a batch size of: 2
***** Running training *****
  Num examples = 21,254
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 10,627
  Number of trainable parameters = 123,440,416


Step,Training Loss
500,6.5446
1000,4.9016
1500,3.4139
2000,2.3786
2500,1.6425
3000,1.2684
3500,0.9863
4000,0.7887
4500,0.6132
5000,0.5571


Saving model checkpoint to ./results/checkpoint-10000
Configuration saved in ./results/checkpoint-10000/config.json
Configuration saved in ./results/checkpoint-10000/generation_config.json
Model weights saved in ./results/checkpoint-10000/model.safetensors


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=10627, training_loss=1.24686365638188, metrics={'train_runtime': 952.3075, 'train_samples_per_second': 22.318, 'train_steps_per_second': 11.159, 'total_flos': 381802933095168.0, 'train_loss': 1.24686365638188, 'epoch': 1.0})

In [21]:
trainer.save_model("./prueba")

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [39]:
# Encode input prompt
input_ids = tokenizer.encode("", return_tensors='pt').to(device)

# Generate text
output = model_clm.generate(input_ids, max_length=40)

# Decode generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)


 miimimi


In [None]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer="./quechuaBERT"
)

In [None]:
fill_mask("allinllachu <mask> allinlla huk wasipita.")

[{'score': 0.23992183804512024,
  'token': 334,
  'token_str': ' nisqaqa',
  'sequence': 'allinllachu nisqaqa allinlla huk wasipita.'},
 {'score': 0.061005160212516785,
  'token': 16,
  'token_str': ',',
  'sequence': 'allinllachu, allinlla huk wasipita.'},
 {'score': 0.028719963505864143,
  'token': 11,
  'token_str': "'",
  'sequence': "allinllachu' allinlla huk wasipita."},
 {'score': 0.012927922420203686,
  'token': 377,
  'token_str': ' kay',
  'sequence': 'allinllachu kay allinlla huk wasipita.'},
 {'score': 0.012300901114940643,
  'token': 18,
  'token_str': '.',
  'sequence': 'allinllachu. allinlla huk wasipita.'}]

In [None]:
#Runap wiñarquypa puriyninmantam rikuchin.
#Presenta el grado del desarrollo humano .
#<mask>
fill_mask("<mask> wiñarquypa puriyninmantam rikuchin.")

[{'score': 0.03649432212114334,
  'token': 920,
  'token_str': 'Chay',
  'sequence': 'Chay wiñarquypa puriyninmantam rikuchin.'},
 {'score': 0.027767308056354523,
  'token': 2953,
  'token_str': '¿',
  'sequence': '¿ wiñarquypa puriyninmantam rikuchin.'},
 {'score': 0.01818837597966194,
  'token': 664,
  'token_str': 'Kay',
  'sequence': 'Kay wiñarquypa puriyninmantam rikuchin.'},
 {'score': 0.004317810758948326,
  'token': 2031,
  'token_str': 'Ima',
  'sequence': 'Ima wiñarquypa puriyninmantam rikuchin.'},
 {'score': 0.004133380018174648,
  'token': 18,
  'token_str': '.',
  'sequence': '. wiñarquypa puriyninmantam rikuchin.'}]

In [None]:
# Conexion a Google Colaborative
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


## GPT -2

In [40]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
import torch

# Set up the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Load the model
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

loading file vocab.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/tokenizer_config.json
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/tokenizer.json


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_vers

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/model.safetensors
Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256
}

All model checkpoint weights were used when initializing GPT2LMHeadModel.

All the weights of GPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/607a30d783dfa663caf39e06633721c8d4cfcd7e/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256
}



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [41]:
# Load the dataset
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

# Path to your dataset
file_path = './corpus.txt'

# Load and tokenize the dataset
dataset = load_dataset(file_path, tokenizer)

Creating features from dataset file at .
Saving features into cached file ./cached_lm_GPT2Tokenizer_128_corpus.txt [took 0.025 s]


In [42]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir='./logs',
    logging_steps=200,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [43]:
# Train the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')

Currently training with a batch size of: 2
***** Running training *****
  Num examples = 8,815
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 4,408
  Number of trainable parameters = 124,439,808


Step,Training Loss
200,4.0752
400,3.455
600,3.3
800,3.2152
1000,3.1079
1200,3.059
1400,3.0269
1600,2.9686
1800,2.9141
2000,2.8735




Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in ./fine_tuned_model/config.json
Configuration saved in ./fine_tuned_model/generation_config.json
Model weights saved in ./fine_tuned_model/model.safetensors
tokenizer config file saved in ./fine_tuned_model/tokenizer_config.json
Special tokens file saved in ./fine_tuned_model/special_tokens_map.json


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json')

In [53]:
# input prompt
input_prompt = "Yaqapis"

input_ids = tokenizer.encode(input_prompt, return_tensors='pt').to(device)
attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=device)

output = model.generate(input_ids, max_length=100, num_return_sequences=1, no_repeat_ngram_size=2, top_p=0.9, temperature=0.8,
                        pad_token_id=tokenizer.eos_token_id)

# Decode the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Texto generado:")
print(generated_text)

Texto generado:
Yaqapis  llaqtapi kawsaykunata  qillqasqata qispichiy.
Chayhinatam  kutichisqanmanhina,  chaymi kaptinqa  yuyaychaykuna, chaysi  ñawpaqtaqa kasqa, huk ichaqa chakramanta  ruray.  ¿Imamantataq  ll
