# PROYECTO 3 DEEP LEARNING

## Instalación de bibliotecas

In [22]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import os

# Deshabilitar Weights & Biases (W&B) para evitar solicitudes de API
os.environ["WANDB_DISABLED"] = "true"

## Cargar el dataset

In [23]:
# Cargar el dataset IMDb desde Hugging Face
dataset = load_dataset("imdb")

In [24]:
# Dividir en conjuntos de entrenamiento y prueba
train_data = dataset["train"].shuffle(seed=42)  # Conjunto completo de entrenamiento (25,000)
test_data = dataset["test"].shuffle(seed=42)    # Conjunto completo de prueba (25,000)


In [25]:
# Mostrar un ejemplo del dataset
print("Ejemplo del dataset:")
print(train_data[0])


Ejemplo del dataset:
{'text': 'There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier\'s plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it\'s the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...', 'label': 1}


## Seleccionar el modelo base y el tokenizador

In [27]:
# Seleccionar el modelo preentrenado
model_name = "distilgpt2"

# Cargar el tokenizador y el modelo
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Asegurar que exista un token de padding
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))  # Ajustar la capacidad del modelo para los nuevos tokens


## Tokenizar los datos

In [28]:
# Función para tokenizar los datos e incluir las etiquetas
def tokenize_function(examples):
    tokenized = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
    tokenized["labels"] = tokenized["input_ids"].copy()  # Las etiquetas son iguales a los input_ids
    return tokenized

# Tokenizar todos los datos
tokenized_train = train_data.map(tokenize_function, batched=True)
tokenized_test = test_data.map(tokenize_function, batched=True)

# Preparar los datos para PyTorch
tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

## Configurar de los hyperparametros

In [29]:
# Argumentos del entrenamiento
training_args = TrainingArguments(
    output_dir="./results",        # Carpeta de salida
    evaluation_strategy="epoch",  # Evaluar en cada época
    num_train_epochs=5,           # Incrementar las épocas para más datos
    per_device_train_batch_size=4,  # Reducir tamaño del lote si hay limitaciones
    per_device_eval_batch_size=4,   # Reducir tamaño del lote para evaluación
    save_strategy="epoch",        # Guardar en cada época
    logging_dir="./logs",         # Carpeta de logs
    logging_steps=100,            # Frecuencia de logs para conjuntos más grandes
    save_total_limit=2,           # Guardar solo 2 modelos
    load_best_model_at_end=True,  # Cargar el mejor modelo al final
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [30]:
# Configurar Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
)

  trainer = Trainer(


## Entrenamiento

In [31]:
# Entrenar el modelo
trainer.train()

# Guardar el modelo y el tokenizador
trainer.save_model("./trained_gpt_model")
tokenizer.save_pretrained("./trained_gpt_model")

Epoch,Training Loss,Validation Loss
1,3.6795,3.632132
2,3.5406,3.60602
3,3.4695,3.600171
4,3.4098,3.598434
5,3.4382,3.600963


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


('./trained_gpt_model/tokenizer_config.json',
 './trained_gpt_model/special_tokens_map.json',
 './trained_gpt_model/vocab.json',
 './trained_gpt_model/merges.txt',
 './trained_gpt_model/added_tokens.json')

## Inferencia

In [32]:
# Cargar el modelo entrenado
model = GPT2LMHeadModel.from_pretrained("./trained_gpt_model")
tokenizer = GPT2Tokenizer.from_pretrained("./trained_gpt_model")

In [33]:
# Función para realizar predicciones
def predict(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    output = model.generate(input_ids, max_length=50, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [40]:
# Probar la inferencia con un ejemplo
input_text = "The movie was not"
print("Predicción generada:")
print(predict(input_text))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Predicción generada:
The movie was not very good. The acting was bad. The story was not very good. The movie was not very good. The movie was not very good. The movie was not very good. The movie was not very good. The movie was
