# Importar librerías

In [None]:
import datasets
from datasets import Dataset, DatasetDict, load_dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from IPython.display import display
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
import random
import math
import pandas as pd
from tqdm.auto import tqdm
import tensorflow as tf
import numpy as np
import os

#### Comprobar que la GPU está disponible

In [None]:
# List the GPU devices visible to TensorFlow (the notebook displays the result)
tf.config.list_physical_devices('GPU')

In [None]:
# Show the name of the GPU device TensorFlow reports
tf.test.gpu_device_name()

# Importar el dataset 

In [None]:
# Load RecipeNLG through the HuggingFace `datasets` loader, reading from the local data_dir
recipe_dataset = datasets.load_dataset("recipe_nlg", data_dir='/workspace/datasets/recipe_nlg')

print(recipe_dataset)
# Switch the output format to pandas so that slicing a split yields a DataFrame
recipe_dataset.set_format(type='pandas')
# Materialise the complete 'train' split
df_recipe_dataset=recipe_dataset['train'][:]

# Show the first rows of the DataFrame
df_recipe_dataset.head()

## Acondicionamiento del dataset

In [None]:
# Build one training text per recipe from the "ner" entities (used as the ingredient
# list) and the "directions" steps; the bare ingredient list is also kept in its own column.
full_recipe=[]
Ingredients=[]
# Walk both columns in lockstep instead of doing a label lookup per row index
for ner_items, steps in zip(df_recipe_dataset['ner'], df_recipe_dataset['directions']):
    ingredients = "\n".join(ner_items).lower()
    directions = "\n".join(steps).lower()
    Ingredients.append('ingredients:\n' + ingredients)
    full_recipe.append(
        '<|startofrecipe|>ingredients:\n' + ingredients
        + '\ndirections:\n' + directions + '<|endofrecipe|>'
    )

df_recipe_dataset['Full Recipe']=full_recipe
df_recipe_dataset['Ingredients']=Ingredients
# Drop the columns that are no longer needed
df_recipe_dataset=df_recipe_dataset.drop(['link', 'source','title','directions','ingredients','id','ner'], axis=1)

# Show the conditioned dataset
print("DATASET ACONDICIONADO:")
display(df_recipe_dataset.head())

### Guardar subsamplings del dataset acondicionado

In [None]:
# Folder that receives the conditioned CSV exports
clean_dir = "/workspace/datasets/clean_recipe_nlg/"
# makedirs also creates missing parent folders and tolerates an already existing target
os.makedirs(clean_dir, exist_ok=True)

# Export the first N rows for several sizes, then the complete dataset
for subset_size in (100, 1000, 10000, 100000, 1000000):
    df_recipe_dataset.iloc[:subset_size,:].to_csv(f"{clean_dir}recipe_nlg_{subset_size}_new.csv", sep=";")
df_recipe_dataset.to_csv(f"{clean_dir}recipe_nlg_full.csv", sep=";")

# Release both in-memory copies
del df_recipe_dataset
del recipe_dataset

### Cargar dataset acondicionado

In [None]:
# The CSV was written together with its row index; read that first column back as the
# index so it does not show up as an extra "Unnamed: 0" data column.
df_recipe_dataset = pd.read_csv("/workspace/datasets/clean_recipe_nlg/recipe_nlg_full.csv", sep=";", index_col=0)

In [None]:
# Convert to a HuggingFace Dataset. No pandas-side shuffle is needed because the split
# below already shuffles; preserve_index=False keeps the DataFrame index out of the columns.
recipe_dataset = Dataset.from_pandas(df_recipe_dataset, preserve_index=False)

# Randomly split into train (80%) and validation (20%)
recipe_dataset_train_valid = recipe_dataset.train_test_split(test_size=0.2, shuffle=True)

print("DATASET:")
recipe_dataset = DatasetDict({
    'train': recipe_dataset_train_valid['train'],
    'valid': recipe_dataset_train_valid['test'],
})
print(recipe_dataset)

# GPT2

## Cargar Tokenizer

In [None]:
from transformers import TFGPT2LMHeadModel, GPT2TokenizerFast, GPT2Config, AutoTokenizer
# Load the GPT-2 tokenizer and register the recipe delimiters used in "Full Recipe"
# ('<|startofrecipe|>' / '<|endofrecipe|>') plus dedicated unknown and padding tokens.
tokenizer = AutoTokenizer.from_pretrained('gpt2', 
                                              bos_token='<|startofrecipe|>', 
                                              eos_token='<|endofrecipe|>',
                                              unk_token='<|unknown|>', 
                                              pad_token='<|pad|>',
                                              use_fast=True)
#special_tokens_dict = {'additional_special_tokens': ['<|section|>']}

In [None]:
# Look up the vocabulary id assigned to the '<|pad|>' token
tokenizer.convert_tokens_to_ids(['<|pad|>'])

## Cargar modelo

In [None]:
from transformers import AutoTokenizer, TFGPT2LMHeadModel, AutoConfig
# Sequence length (in tokens) used for tokenization and generation
context_length=128
# Take the stock "gpt2" configuration and override the vocabulary size (so it matches the
# tokenizer including its added special tokens) and the BOS/EOS ids.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
# The model is instantiated from the configuration only (no from_pretrained call here)
model = TFGPT2LMHeadModel(config)
# model(model.dummy_inputs)  # Builds the model
# model.summary()

In [None]:
# Display the resulting model configuration
config

## Tokenizar el dataset

In [None]:
def tokenize(element):
    """Tokenize a batch of recipes, keeping only sequences of exactly ``context_length`` tokens.

    Args:
        element: Batch mapping whose "Full Recipe" entry holds the recipe texts.

    Returns:
        A dict with one "input_ids" list per retained recipe.
    """
    encoded = tokenizer(
        element["Full Recipe"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=False,
        return_length=True,
    )
    # NOTE(review): no padding is applied and overflow chunks are disabled, so a sequence
    # only reaches context_length when the recipe hit the truncation limit; every shorter
    # recipe is dropped here. Confirm this is intended.
    kept_ids = [
        ids
        for n_tokens, ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == context_length
    ]
    return {"input_ids": kept_ids}


# Tokenize every split in batches; all original columns are removed so that only the
# "input_ids" returned by `tokenize` remain.
tokenized_datasets = recipe_dataset.map(
    tokenize, batched=True, remove_columns=recipe_dataset["train"].column_names
)
tokenized_datasets

In [None]:
from transformers import DataCollatorForLanguageModeling

# From here on the end-of-recipe token doubles as the padding token (replacing '<|pad|>')
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: causal language modelling rather than masked LM; batches come back as TF tensors
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")

### Convertir a dataset de Tensorflow

In [None]:
# Wrap both splits as TensorFlow datasets batched through the data collator.
# Only the training split is shuffled; both use a batch size of 2.
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=2,
)
tf_eval_dataset = tokenized_datasets["valid"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=2,
)

## Compilar y entrenar el Modelo

In [None]:
from transformers import create_optimizer
import tensorflow as tf

# The learning-rate schedule has to span the whole run. Training is launched with
# epochs=2, so sizing it for a single pass over the data would leave the second epoch
# running at the fully decayed learning rate. Keep num_epochs in sync with model.fit.
num_epochs = 2
num_train_steps = len(tf_train_dataset) * num_epochs
# NOTE(review): num_warmup_steps is fixed at 5000; confirm it stays below num_train_steps
# when training on one of the small subsamples.
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=5000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.001,
)
# No loss is passed to compile; only the optimizer is configured here
model.compile(optimizer=optimizer)
# optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5,epsilon=1e-08, clipnorm=1.0)
# loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# model.compile(optimizer=optimizer, loss=loss, metrics=[metric])


# Train in mixed-precision float16
# NOTE(review): the policy is switched on after the model has been created and compiled;
# confirm it actually takes effect for `model` rather than only for models built later.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [None]:
# Train for two epochs, using the validation split as validation data
model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=2)

### Guardar pesos del modelo

In [None]:
# Create the output folder (and any missing parents); a pre-existing folder is fine
os.makedirs("/workspace/Models/", exist_ok=True)

In [None]:
# Persist the trained model twice: as a full Keras export and as weights only.
# The weights file is the one the Gradio loader in this notebook reads back.
model.save("/workspace/Models/gpt2_recipe_basic")
model.save_weights('/workspace/Models/gpt2_recipe_basic_weights')

## Realizar inferencia y generar la receta 

### Cargar pipeline de generación de texto

In [None]:
from transformers import pipeline
# Text-generation pipeline around the trained model and its tokenizer, placed on device 0
pipe = pipeline(
    "text-generation", model=model, tokenizer=tokenizer, device=0
)

In [None]:
def create_prompt(ingredients):
    """Build the generation prompt from a comma-separated ingredient string.

    Each ingredient is trimmed, lower-cased and written on its own line below an
    "ingredients:" header. Entries left empty by doubled or trailing commas are
    skipped so the prompt contains no blank ingredient lines.

    Args:
        ingredients: Ingredients separated by commas, e.g. "butter, milk".

    Returns:
        The prompt text, terminated by a newline.
    """
    cleaned = [item.strip().lower() for item in ingredients.split(',')]
    listing = '\n'.join(item for item in cleaned if item)
    return f"ingredients:\n{listing}\n"

In [None]:
# Test inputs: each list element is one comma-separated ingredient string
ingredients = ['butter,brown sugar,milk,eggs']

In [None]:
def filter_output(out):
    """Return the generated text unchanged (the start-token stripping is disabled)."""
    return out#out.replace('<|startofrecipe|>', '')

### Realizar la inferencia en base a los ingredientes

In [None]:
# Generate and print one recipe per ingredient string
for ing in ingredients:
    prompt = create_prompt(ing)
    out = pipe(
        prompt,
        max_length=context_length,
        min_length=64,
        penalty_alpha=0.9,
        top_k=60,
        # Take the padding id from the tokenizer rather than a hard-coded number, so it
        # always matches the pad token the tokenizer is currently configured with.
        pad_token_id=tokenizer.pad_token_id,
    )[0]['generated_text']
    print(filter_output(out))

# Facebook OPT

## Cargar Tokenizer

In [None]:
from transformers import AutoTokenizer

# Load the OPT-125m tokenizer and register the recipe delimiters used in "Full Recipe"
# ('<|startofrecipe|>' / '<|endofrecipe|>') plus dedicated unknown and padding tokens.
tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', 
                                              bos_token='<|startofrecipe|>', 
                                              eos_token='<|endofrecipe|>',
                                              unk_token='<|unknown|>', 
                                              pad_token='<|pad|>',
                                              use_fast=True)

In [None]:
# Look up the vocabulary id assigned to the '<|pad|>' token in this tokenizer
tokenizer.convert_tokens_to_ids(['<|pad|>'])

## Cargar modelo

In [None]:
from transformers import AutoTokenizer, TFOPTForCausalLM, AutoConfig
# Sequence length (in tokens) used for tokenization and generation
context_length=128
# Take the stock "facebook/opt-125m" configuration and override the vocabulary size (to
# match the tokenizer including its added special tokens) and the BOS/EOS ids.
# NOTE(review): n_ctx is carried over from the GPT-2 cells; confirm OPT reads this field.
config = AutoConfig.from_pretrained(
    "facebook/opt-125m",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
# The model is instantiated from the configuration only (no from_pretrained call here)
model = TFOPTForCausalLM(config)

In [None]:
# Display the resulting model configuration
config

## Tokenizar el dataset

In [None]:
def tokenize(element):
    """Tokenize a batch of recipes, keeping only sequences of exactly ``context_length`` tokens.

    Args:
        element: Batch mapping whose "Full Recipe" entry holds the recipe texts.

    Returns:
        A dict with one "input_ids" list per retained recipe.
    """
    encoded = tokenizer(
        element["Full Recipe"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=False,
        return_length=True,
    )
    # NOTE(review): no padding is applied and overflow chunks are disabled, so a sequence
    # only reaches context_length when the recipe hit the truncation limit; every shorter
    # recipe is dropped here. Confirm this is intended.
    kept_ids = [
        ids
        for n_tokens, ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == context_length
    ]
    return {"input_ids": kept_ids}


# Tokenize every split in batches; all original columns are removed so that only the
# "input_ids" returned by `tokenize` remain.
tokenized_datasets = recipe_dataset.map(
    tokenize, batched=True, remove_columns=recipe_dataset["train"].column_names
)
tokenized_datasets

In [None]:
from transformers import DataCollatorForLanguageModeling

# From here on the end-of-recipe token doubles as the padding token (replacing '<|pad|>')
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: causal language modelling rather than masked LM; batches come back as TF tensors
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")

### Convertir a dataset de Tensorflow

In [None]:
# Wrap both splits as TensorFlow datasets batched through the data collator.
# Only the training split is shuffled; both use a batch size of 2.
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=2,
)
tf_eval_dataset = tokenized_datasets["valid"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=2,
)

## Compilar y entrenar el Modelo

### Cargar pipeline de generación de texto

In [None]:
from transformers import create_optimizer
import tensorflow as tf

# The learning-rate schedule has to span the whole run. Training is launched with
# epochs=2, so sizing it for a single pass over the data would leave the second epoch
# running at the fully decayed learning rate. Keep num_epochs in sync with model.fit.
num_epochs = 2
num_train_steps = len(tf_train_dataset) * num_epochs
# NOTE(review): num_warmup_steps is fixed at 5000; confirm it stays below num_train_steps
# when training on one of the small subsamples.
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=5000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.001,
)
# No loss is passed to compile; only the optimizer is configured here
model.compile(optimizer=optimizer)
# optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5,epsilon=1e-08, clipnorm=1.0)
# loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# model.compile(optimizer=optimizer, loss=loss, metrics=[metric])


# Train in mixed-precision float16
# NOTE(review): the policy is switched on after the model has been created and compiled;
# confirm it actually takes effect for `model` rather than only for models built later.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [None]:
# Train for two epochs, using the validation split as validation data
model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=2)

### Guardar pesos del modelo

In [None]:
# Save the weights only; this is the file the Gradio loader in this notebook reads for OPT
model.save_weights('/workspace/Models/opt_recipe_basic_weights')

## Realizar inferencia y generar la receta 

In [None]:
from transformers import pipeline
# Text-generation pipeline around the trained model and its tokenizer, placed on device 0
pipe = pipeline(
    "text-generation", model=model, tokenizer=tokenizer, device=0
)

In [None]:
def create_prompt(ingredients):
    """Build the generation prompt from a comma-separated ingredient string.

    Each ingredient is trimmed, lower-cased and written on its own line below an
    "ingredients:" header. Entries left empty by doubled or trailing commas are
    skipped so the prompt contains no blank ingredient lines.

    Args:
        ingredients: Ingredients separated by commas, e.g. "butter, milk".

    Returns:
        The prompt text, terminated by a newline.
    """
    cleaned = [item.strip().lower() for item in ingredients.split(',')]
    listing = '\n'.join(item for item in cleaned if item)
    return f"ingredients:\n{listing}\n"

In [None]:
# Test inputs: each list element is one comma-separated ingredient string
ingredients = ['butter,brown sugar,milk,eggs']

### Realizar la inferencia en base a los ingredientes

In [None]:
# Generate and print one recipe per ingredient string
for ing in ingredients:
    prompt = create_prompt(ing)
    out = pipe(
        prompt,
        max_length=context_length,
        min_length=64,
        penalty_alpha=0.9,
        top_k=60,
        # Take the padding id from the tokenizer: the previously hard-coded 50260 was
        # copied from the GPT-2 cells and is not tied to this tokenizer's vocabulary.
        pad_token_id=tokenizer.pad_token_id,
    )[0]['generated_text']
    print(out)

# GPT2-large

## Cargar Tokenizer

In [None]:
from transformers import TFGPT2LMHeadModel, GPT2TokenizerFast, GPT2Config, AutoTokenizer
# Load the GPT-2 large tokenizer and register the recipe delimiters used in "Full Recipe"
# ('<|startofrecipe|>' / '<|endofrecipe|>') plus dedicated unknown and padding tokens.
tokenizer = AutoTokenizer.from_pretrained('gpt2-large', 
                                              bos_token='<|startofrecipe|>', 
                                              eos_token='<|endofrecipe|>',
                                              unk_token='<|unknown|>', 
                                              pad_token='<|pad|>',
                                              use_fast=True)

## Cargar modelo

In [None]:
from transformers import AutoTokenizer, TFGPT2LMHeadModel, AutoConfig
# Sequence length (in tokens) used for tokenization and generation
context_length=128
# Take the stock "gpt2-large" configuration and override the vocabulary size (to match
# the tokenizer including its added special tokens) and the BOS/EOS ids.
config = AutoConfig.from_pretrained(
    "gpt2-large",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
# The model is instantiated from the configuration only (no from_pretrained call here)
model = TFGPT2LMHeadModel(config)

In [None]:
# Display the resulting model configuration
config

## Tokenizar el dataset

In [None]:
def tokenize(element):
    """Tokenize a batch of recipes, keeping only sequences of exactly ``context_length`` tokens.

    Args:
        element: Batch mapping whose "Full Recipe" entry holds the recipe texts.

    Returns:
        A dict with one "input_ids" list per retained recipe.
    """
    encoded = tokenizer(
        element["Full Recipe"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=False,
        return_length=True,
    )
    # NOTE(review): no padding is applied and overflow chunks are disabled, so a sequence
    # only reaches context_length when the recipe hit the truncation limit; every shorter
    # recipe is dropped here. Confirm this is intended.
    kept_ids = [
        ids
        for n_tokens, ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == context_length
    ]
    return {"input_ids": kept_ids}


# Tokenize every split in batches; all original columns are removed so that only the
# "input_ids" returned by `tokenize` remain.
tokenized_datasets = recipe_dataset.map(
    tokenize, batched=True, remove_columns=recipe_dataset["train"].column_names
)
tokenized_datasets

In [None]:
from transformers import DataCollatorForLanguageModeling

# From here on the end-of-recipe token doubles as the padding token (replacing '<|pad|>')
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: causal language modelling rather than masked LM; batches come back as TF tensors
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")

### Convertir a dataset de Tensorflow

In [None]:
# Wrap both splits as TensorFlow datasets batched through the data collator.
# Only the training split is shuffled; both use a batch size of 2.
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=2,
)
tf_eval_dataset = tokenized_datasets["valid"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=2,
)

## Compilar y entrenar el Modelo

In [None]:
from transformers import create_optimizer
import tensorflow as tf

# The learning-rate schedule has to span the whole run. Training is launched with
# epochs=2, so sizing it for a single pass over the data would leave the second epoch
# running at the fully decayed learning rate. Keep num_epochs in sync with model.fit.
num_epochs = 2
num_train_steps = len(tf_train_dataset) * num_epochs
# NOTE(review): num_warmup_steps is fixed at 5000; confirm it stays below num_train_steps
# when training on one of the small subsamples.
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=5000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.001,
)
# No loss is passed to compile; only the optimizer is configured here
model.compile(optimizer=optimizer)
# optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5,epsilon=1e-08, clipnorm=1.0)
# loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# model.compile(optimizer=optimizer, loss=loss, metrics=[metric])


# Train in mixed-precision float16
# NOTE(review): the policy is switched on after the model has been created and compiled;
# confirm it actually takes effect for `model` rather than only for models built later.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [None]:
# Train for two epochs, using the validation split as validation data
model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=2)

### Guardar pesos del modelo

In [None]:
# Save the weights only; the file name follows the "<model name>_recipe_basic_weights"
# pattern that the Gradio loader in this notebook builds for GPT-style models.
model.save_weights('/workspace/Models/gpt2-large_recipe_basic_weights')

## Realizar inferencia y generar la receta 

### Cargar pipeline de generación de texto

In [None]:
from transformers import pipeline
# Text-generation pipeline around the trained model and its tokenizer, placed on device 0
pipe = pipeline(
    "text-generation", model=model, tokenizer=tokenizer, device=0
)

In [None]:
def create_prompt(ingredients):
    """Build the generation prompt from a comma-separated ingredient string.

    Each ingredient is trimmed, lower-cased and written on its own line below an
    "ingredients:" header. Entries left empty by doubled or trailing commas are
    skipped so the prompt contains no blank ingredient lines.

    Args:
        ingredients: Ingredients separated by commas, e.g. "butter, milk".

    Returns:
        The prompt text, terminated by a newline.
    """
    cleaned = [item.strip().lower() for item in ingredients.split(',')]
    listing = '\n'.join(item for item in cleaned if item)
    return f"ingredients:\n{listing}\n"

In [None]:
# Test inputs: each list element is one comma-separated ingredient string
ingredients = ['butter,brown sugar,milk,eggs','rice,chocolate,lemon']

### Realizar la inferencia en base a los ingredientes

In [None]:
# Generate and print one recipe per ingredient string, separated by a marker line
for ing in ingredients:
    prompt = create_prompt(ing)
    out = pipe(
        prompt,
        max_length=context_length,
        min_length=64,
        penalty_alpha=0.9,
        top_k=60,
        # Take the padding id from the tokenizer rather than a hard-coded number, so it
        # always matches the pad token the tokenizer is currently configured with.
        pad_token_id=tokenizer.pad_token_id,
    )[0]['generated_text']
    print(out)
    print("###############################################")
    print()

# DistilGPT2

## Cargar Tokenizer

In [None]:
from transformers import TFGPT2LMHeadModel, GPT2TokenizerFast, GPT2Config, AutoTokenizer
# Load the DistilGPT2 tokenizer and register the recipe delimiters used in "Full Recipe"
# ('<|startofrecipe|>' / '<|endofrecipe|>') plus dedicated unknown and padding tokens.
tokenizer = AutoTokenizer.from_pretrained('distilgpt2', 
                                              bos_token='<|startofrecipe|>', 
                                              eos_token='<|endofrecipe|>',
                                              unk_token='<|unknown|>', 
                                              pad_token='<|pad|>',
                                              use_fast=True)

## Cargar modelo

In [None]:
from transformers import AutoTokenizer, TFGPT2LMHeadModel, AutoConfig
# Sequence length (in tokens) used for tokenization and generation
context_length=128
# Take the stock "distilgpt2" configuration and override the vocabulary size (to match
# the tokenizer including its added special tokens) and the BOS/EOS ids.
config = AutoConfig.from_pretrained(
    "distilgpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
# The model is instantiated from the configuration only (no from_pretrained call here)
model = TFGPT2LMHeadModel(config)

In [None]:
# Display the resulting model configuration
config

## Tokenizar el dataset

In [None]:
def tokenize(element):
    """Tokenize a batch of recipes, keeping only sequences of exactly ``context_length`` tokens.

    Args:
        element: Batch mapping whose "Full Recipe" entry holds the recipe texts.

    Returns:
        A dict with one "input_ids" list per retained recipe.
    """
    encoded = tokenizer(
        element["Full Recipe"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=False,
        return_length=True,
    )
    # NOTE(review): no padding is applied and overflow chunks are disabled, so a sequence
    # only reaches context_length when the recipe hit the truncation limit; every shorter
    # recipe is dropped here. Confirm this is intended.
    kept_ids = [
        ids
        for n_tokens, ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == context_length
    ]
    return {"input_ids": kept_ids}


# Tokenize every split in batches; all original columns are removed so that only the
# "input_ids" returned by `tokenize` remain.
tokenized_datasets = recipe_dataset.map(
    tokenize, batched=True, remove_columns=recipe_dataset["train"].column_names
)
tokenized_datasets

In [None]:
from transformers import DataCollatorForLanguageModeling

# From here on the end-of-recipe token doubles as the padding token (replacing '<|pad|>')
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: causal language modelling rather than masked LM; batches come back as TF tensors
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")

### Convertir a dataset de Tensorflow

In [None]:
# Wrap both splits as TensorFlow datasets batched through the data collator.
# Only the training split is shuffled; both use a batch size of 2.
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=2,
)
tf_eval_dataset = tokenized_datasets["valid"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=2,
)

## Compilar y entrenar el Modelo

In [None]:
from transformers import create_optimizer
import tensorflow as tf

# The learning-rate schedule has to span the whole run. Training is launched with
# epochs=2, so sizing it for a single pass over the data would leave the second epoch
# running at the fully decayed learning rate. Keep num_epochs in sync with model.fit.
num_epochs = 2
num_train_steps = len(tf_train_dataset) * num_epochs
# NOTE(review): num_warmup_steps is fixed at 5000; confirm it stays below num_train_steps
# when training on one of the small subsamples.
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=5000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.001,
)
# No loss is passed to compile; only the optimizer is configured here
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
# NOTE(review): the policy is switched on after the model has been created and compiled;
# confirm it actually takes effect for `model` rather than only for models built later.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [None]:
# Train for two epochs, using the validation split as validation data
model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=2)

### Guardar pesos del modelo

In [None]:
# Save the weights only; the file name follows the "<model name>_recipe_basic_weights"
# pattern that the Gradio loader in this notebook builds for GPT-style models.
model.save_weights('/workspace/Models/distilgpt2_recipe_basic_weights')

## Realizar inferencia y generar la receta 

### Cargar pipeline de generación de texto

In [None]:
from transformers import pipeline
# Text-generation pipeline around the trained model and its tokenizer, placed on device 0
pipe = pipeline(
    "text-generation", model=model, tokenizer=tokenizer, device=0
)

In [None]:
def create_prompt(ingredients):
    """Build the generation prompt from a comma-separated ingredient string.

    Each ingredient is trimmed, lower-cased and written on its own line below an
    "ingredients:" header. Entries left empty by doubled or trailing commas are
    skipped so the prompt contains no blank ingredient lines.

    Args:
        ingredients: Ingredients separated by commas, e.g. "butter, milk".

    Returns:
        The prompt text, terminated by a newline.
    """
    cleaned = [item.strip().lower() for item in ingredients.split(',')]
    listing = '\n'.join(item for item in cleaned if item)
    return f"ingredients:\n{listing}\n"

In [None]:
# Test inputs: each list element is one comma-separated ingredient string
ingredients = ['butter,brown sugar,milk,eggs','chicken,rice']

### Realizar la inferencia en base a los ingredientes

In [None]:
# Generate and print one recipe per ingredient string, separated by a marker line
for ing in ingredients:
    prompt = create_prompt(ing)
    out = pipe(
        prompt,
        # One token short of the context length, as in the original cell
        max_length=context_length-1,
        min_length=64,
        penalty_alpha=0.9,
        top_k=60,
        # Take the padding id from the tokenizer rather than a hard-coded number, so it
        # always matches the pad token the tokenizer is currently configured with.
        pad_token_id=tokenizer.pad_token_id,
    )[0]['generated_text']
    print(out)
    print("####################")
    print()

# Gradio

In [None]:
import gradio as gr
from transformers import TFGPT2LMHeadModel, AutoTokenizer, AutoConfig, TFOPTForCausalLM
from transformers import pipeline

In [None]:
def create_prompt(ingredients):
    """Build the generation prompt from a comma-separated ingredient string.

    Each ingredient is trimmed, lower-cased and written on its own line below an
    "ingredients:" header. Entries left empty by doubled or trailing commas are
    skipped so free-form user input does not put blank ingredient lines in the prompt.

    Args:
        ingredients: Ingredients separated by commas, e.g. "butter, milk".

    Returns:
        The prompt text, terminated by a newline.
    """
    cleaned = [item.strip().lower() for item in ingredients.split(',')]
    listing = '\n'.join(item for item in cleaned if item)
    return f"ingredients:\n{listing}\n"

## Cargar modelo

In [None]:
def load_model(name):
    """Rebuild a trained recipe model and its tokenizer from the saved weights.

    Args:
        name: Model identifier, e.g. "gpt2" or "facebook/opt-125m". Names containing
            "gpt" are loaded as GPT-2 style models; every other name is treated as OPT.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Same special tokens as in the training cells
    special_tokens = {
        "bos_token": '<|startofrecipe|>',
        "eos_token": '<|endofrecipe|>',
        "unk_token": '<|unknown|>',
        "pad_token": '<|pad|>',
    }
    tokenizer = AutoTokenizer.from_pretrained(name, use_fast=True, **special_tokens)
    # The end-of-recipe token doubles as the padding token
    tokenizer.pad_token = tokenizer.eos_token

    context_length = 128
    config = AutoConfig.from_pretrained(
        name,
        vocab_size=len(tokenizer),
        n_ctx=context_length,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Choose the architecture and its weights file from the model name
    if "gpt" in name:
        model_cls = TFGPT2LMHeadModel
        weights_path = f"/workspace/Models/{name}_recipe_basic_weights"
    else:
        model_cls = TFOPTForCausalLM
        weights_path = "/workspace/Models/opt_recipe_basic_weights"

    model = model_cls(config)
    model.load_weights(weights_path)
    return model, tokenizer

## Ejecutar aplicación Gradio

In [None]:
def greet(Ingredients,model_name):
    """Generate a recipe for the given ingredients with the selected model.

    Args:
        Ingredients: Comma-separated ingredient string typed by the user.
        model_name: Name of the model to load (one of the dropdown choices).

    Returns:
        The generated text with the "ingredients:" and "directions:" section
        headers capitalised.
    """
    prompt = create_prompt(Ingredients)
    # NOTE(review): the model is reloaded from disk on every request
    model, tokenizer = load_model(model_name)
    pipe = pipeline(
        "text-generation", model=model, tokenizer=tokenizer, device=0
    )
    # Stay within the 128-token context used when tokenizing the training data and
    # configuring the models; asking for more would generate at unseen positions.
    context_length = 128
    out = pipe(
        prompt,
        max_length=context_length,
        min_length=64,
        penalty_alpha=0.9,
        top_k=60,
        # Use the padding id of the tokenizer that belongs to the selected model
        # instead of a number that only fits one vocabulary.
        pad_token_id=tokenizer.pad_token_id,
    )[0]['generated_text']
    out = out.replace("ingredients:", "Ingredients:")
    return out.replace("directions:", "\nDirections:")

        

# Gradio UI: a free-text box for the ingredients and a dropdown to choose the model;
# the generated recipe is shown as plain text.
demo = gr.Interface(fn=greet, 
                    inputs=["text", gr.Dropdown(["distilgpt2","gpt2","gpt2-large","facebook/opt-125m"], label="Models", info="Choose a Model!")], 
                            outputs="text")

# Start the web application
demo.launch()