# Fine-Tuning GPT for Personalized Recipe Recommendation and Generating Visualizations with GANs

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [10]:
file_path = "processed_recipes.txt"

text = open(file_path, "r").read()

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")  # Initialize GPT-2 tokenizer
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to end-of-sequence token
encoded_text = tokenizer.encode(text, max_length=1024, truncation=True, padding="max_length")  # Tokenize and encode text
encoded_tensor = torch.tensor(encoded_text).unsqueeze(0)  # Convert encoded text to tensor format and add batch dimension

dataset = torch.utils.data.TensorDataset(encoded_tensor)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)

model = GPT2LMHeadModel.from_pretrained("gpt2")  # Load pre-trained GPT-2 model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Training the model for 100 epochs
for epoch in range(100):
    for i, batch in enumerate(dataloader):
        batch_input = batch[0].to(device)  # Move batch input to device
        model.train()
        
        optimizer.zero_grad()  # Clear gradients
        outputs = model(batch_input)  # Forward pass
        predictions = outputs.logits[:, :-1]  # Remove last token from predictions
        batch_input = batch_input[:, :-1]  # Remove last token from batch input to align dimensions
        
        loss = loss_fn(predictions.view(-1, predictions.size(-1)), batch_input.view(-1))  # Compute loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights
    
    # Print loss every 10 epochs
    if (epoch + 1) % 10 == 0:   
        print(f"Epoch: {epoch+1}, Batch: {i+1}, Loss: {loss.item()}")

model.save_pretrained("trained_recipe_model_pytorch")  # Save trained model
print("Training complete! Model saved as 'trained_recipe_model_pytorch'")  # Print completion message


Epoch: 10, Batch: 1, Loss: 6.145355701446533
Epoch: 20, Batch: 1, Loss: 4.667580604553223
Epoch: 30, Batch: 1, Loss: 2.798748254776001
Epoch: 40, Batch: 1, Loss: 1.0829516649246216
Epoch: 50, Batch: 1, Loss: 0.3258196711540222
Epoch: 60, Batch: 1, Loss: 0.18544597923755646
Epoch: 70, Batch: 1, Loss: 0.08730714023113251
Epoch: 80, Batch: 1, Loss: 0.11106064915657043
Epoch: 90, Batch: 1, Loss: 0.056963592767715454
Epoch: 100, Batch: 1, Loss: 0.05025213584303856
Training complete! Model saved as 'trained_recipe_model_pytorch'


In [13]:
# Load the trained model
model_path = "trained_recipe_model_pytorch"  # Path to the trained model directory
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")  # Load the tokenizer
model = GPT2LMHeadModel.from_pretrained(model_path)  # Load the pre-trained model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Check if GPU is available
model.to(device) 

# Define input prompt
input_prompt = "I want to make cranberry burritos."

# Tokenize input prompt
input_ids = tokenizer.encode(input_prompt, return_tensors="pt").to(device)

# Generate recipe suggestions
output = model.generate(
    input_ids,
    max_length=200,  # Maximum length of the generated text
    num_return_sequences=3,  # Number of recipe suggestions to generate
    no_repeat_ngram_size=2,  # Ensure generated sequences do not repeat n-grams of length 2
    top_p=0.92,  # Probability threshold for nucleus sampling
    temperature=0.85,  # Temperature for sampling
    do_sample=True,  # Enable sampling from the output distribution
    top_k=50,  # Top-k sampling parameter
    early_stopping=False,  # Disable early stopping to enforce maximum length
    pad_token_id=tokenizer.eos_token_id,  # Pad token ID for end of sequence
)

# Decode and print recipe suggestions
for i, recipe in enumerate(output):
    print(f"Recipe suggestion {i+1}: {tokenizer.decode(recipe, skip_special_tokens=True)}")


Recipe suggestion 1: I want to make cranberry burritos. I want want make make MAKE MAKE Make Make Made Make Making Making Make Get Get Got Got Get

1919202070707171 71 71 70 70704704704694694714714694664664464464646 46 464648483232 32 32 33 33 32 31 31 30 30
...-.-.
 ( (((<<
)) ) ) ( (( (((((( (( ((( ()(((((()( ((( (()(( (///././ and/ (.//(/(/ / //*//?/?/.,,,,.., 8 8 9 9 8 10 10 12 12 13 13 14 14 15 151515 15 2015 201520152015 2015 2016 2016 16 16 8 7 7 8 4 4 88899 9 10 20 20 25 25 26 26 24 24 25 27 27 28 28 29 29 30 31 32
 or
Recipe suggestion 2: I want to make cranberry burritos.

I'm not going to take take. I III I ( I ) ( ( ) )
 () ((()()(((((( (( (((( ( (( ( [ [ ] ] [ 1 1 2 2 3 3 4 4 5 5 4 6 6 66 66 68 68 69 69 70 70 71 71 7070702020 20 20
,,..!!!!''
//... [,,....... - - ( -      -  * * 

Recipe suggestion 3: I want to make cranberry burritos.   I I      
   - - - --- 

I IIIIIIIIIIIII III III  IIIIIIIIIVIVIIIIVIIIIIVIIIXIXIVIXIVIIII II II I 
 
        '' " " "" """""""""""" """ "

# References

1. https://pytorch.org/tutorials/
1. https://huggingface.co/transformers/
1. https://huggingface.co/transformers/model_doc/gpt2.html

In [None]:
# Define file path
file_path = "processed_recipes_main.txt"

# Load the text file
text = open(file_path, "r").read()

# Tokenizer for GPT-2
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Encode the text
encoded_text = tokenizer.encode(text, max_length=1024, truncation=True, padding="max_length")
encoded_tensor = torch.tensor(encoded_text).unsqueeze(0)

# Prepare training dataset
dataset = torch.utils.data.TensorDataset(encoded_tensor)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)

# Create GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

epochs = 500

training_loss_plt_arr = []

for epoch in range(epochs):
    epoch_loss = 0.0  # Track total loss for the epoch
    for i, batch in enumerate(dataloader):
        batch_input = batch[0].to(device)
        
        model.train()
        
        optimizer.zero_grad()
        
        outputs = model(input_ids=batch_input, labels=batch_input)
        loss = outputs.loss
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
    # Average epoch loss
    epoch_loss /= len(dataloader)
    
    training_loss_plt_arr.append(epoch_loss)
    
    # Print training loss
    if (epoch + 1) % 25 == 0:   
        print(f"Epoch: {epoch+1}/{epochs}, Loss: {epoch_loss:.5f}")

# Save the trained model
model.save_pretrained("trained_recipe_GPT2_model")

print("Training complete! Model saved as 'trained_recipe_GPT2_model'")