In [1]:
# !pip install --upgrade transformers accelerate

In [2]:
# !pip install datasets

In [3]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset

In [4]:
corpus_path = 'corpus.csv'
df = pd.read_csv(corpus_path, encoding='utf-8')
df['text'] = df['Title'] + '. ' + df['Body']

In [5]:
from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files=[corpus_path], vocab_size=52000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

tokenizer.save_model(".", "shipibo_tokenizer")

['./shipibo_tokenizer-vocab.json', './shipibo_tokenizer-merges.txt']

In [6]:
from transformers import GPT2TokenizerFast

# Cargar el tokenizador desde los archivos vocab y merges
tokenizer = GPT2TokenizerFast(
    vocab_file="shipibo_tokenizer-vocab.json",
    merges_file="shipibo_tokenizer-merges.txt"
)

# Añadir tokens especiales
tokenizer.add_special_tokens({
    "pad_token": "<pad>",
    "unk_token": "<unk>",
    "mask_token": "<mask>",
    "bos_token": "<s>",
    "eos_token": "</s>",
})

# Verificar si se han añadido correctamente los tokens especiales
print("Tokens especiales añadidos:", tokenizer.special_tokens_map)

Tokens especiales añadidos: {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'mask_token': '<mask>'}


In [7]:
# Ejemplo de texto en Shipibo-Konibo
input_text = "Huestíora joni ronqui"
tokens = tokenizer.encode(input_text)
print("Tokens:", tokens)

# Des-tokenización
decoded_text = tokenizer.decode(tokens)
print("Texto decodificado:", decoded_text)

Tokens: [44, 89, 992, 88, 132, 260, 363, 321, 795, 85, 89, 77]
Texto decodificado: Huestíora joni ronqui


In [8]:
from transformers import GPT2Config, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset

# Crear el modelo desde cero
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=1024,
    n_ctx=1024,
    n_embd=768,
    n_layer=12,
    n_head=12,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

model = GPT2LMHeadModel(config)

# Crear el dataset de Hugging Face
corpus_path = 'corpus.csv'
df = pd.read_csv(corpus_path)
df['text'] = df['Title'] + ' ' + df['Body']
dataset = Dataset.from_pandas(df[['text']])

# Tokenizar el corpus
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Crear el data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Configurar los argumentos de entrenamiento
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

# Inicializar el Trainer y entrenar el modelo
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)

trainer.train()


Map:   0%|          | 0/148 [00:00<?, ? examples/s]

Step,Training Loss


TrainOutput(global_step=370, training_loss=5.987418179898649, metrics={'train_runtime': 163.4094, 'train_samples_per_second': 9.057, 'train_steps_per_second': 2.264, 'total_flos': 386712207360000.0, 'train_loss': 5.987418179898649, 'epoch': 10.0})

In [9]:
# Guardar el modelo y el tokenizador
trainer.save_model("./results")  # Esto guarda el modelo, la configuración y el tokenizador
tokenizer.save_pretrained("./results")

('./results/tokenizer_config.json',
 './results/special_tokens_map.json',
 './results/vocab.json',
 './results/merges.txt',
 './results/added_tokens.json',
 './results/tokenizer.json')

In [10]:
# Cargar el modelo entrenado
model = GPT2LMHeadModel.from_pretrained('./results')


In [11]:
import re

def capitalize_sentences(text):
    sentences = re.split('(?<=[.!?]) +', text)  # Dividir el texto en oraciones
    capitalized_sentences = [sentence.capitalize() for sentence in sentences]
    return ' '.join(capitalized_sentences)

# Función para generar texto

def generate_story(prompt, max_length=512, num_return_sequences=1):
    inputs = tokenizer(prompt, return_tensors='pt')
    outputs = model.generate(
        inputs['input_ids'],
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=True,  # Para generar texto de manera más creativa
        top_k=50,  # Para limitar el número de palabras a considerar para cada paso
        top_p=0.95  # Para la estrategia de muestreo de núcleo (nucleus sampling)
    )

    stories = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    capitalized_stories = [capitalize_sentences(story) for story in stories]
    return capitalized_stories


# Ejemplo de uso
prompt = "moatian ronki ipaonike"
generated_stories = generate_story(prompt, max_length=300, num_return_sequences=3)


for i, story in enumerate(generated_stories):
    print(f"Cuento {i + 1}:\n{story}\n")

Cuento 1:
Moatian ronki ipaonike ronki ipaonike icha riki, jawen jonibaonbo. Jatianki akin. Ja ainbo ika iki; jato baritia iki, jaweki ati iki. Caxon, ja joni ronki ipaonike. Jatian ronki aka iki, ja xeati yoikin riki, ja ainbora, ja ainbo ronki ipaonike, ani kaa iki. Natoya, jainxon, ja, ja jatian jakasi. Nato ja bakeai, jawen ibo yoia iki. Ja jawen rayos. Jatian wetsa nete'ronki ipaonike nato kikin soiki ika iki, ja ani, ja jonin ronki aka iki. Jatian ja ronki ikai, jawen akin. Ja bake, jakiribiki nato ainbo ronki aka iki, jawen awin eata. Jatian ja bake, jawen awin. Earaax, jawen awin betan jawen bake ika iki. Ja yakata iki. Ja jonin icha jain ikenbi, ika iki. Ja jonin westiora joni ronki ipaonike, ja nonti ronki ika iki, jawen jawen ibon benai. Jatian jaton piti aka iki, jawen rayoski, moa ja inka jawen bene ipaonike jawen rayosbi. Ja bake. Ja xeati jatian jawen rayos. Ja kaa iki. Jatian ja joninki yoia iki. Jainxon, ja bake. Ja ainbaon moa moa moa ikai jawen rayos. Jatian jawen aw

# Metricas para evaluar el cuento generado