# **Prompt results for GPT2 and LLaMA3**

## Preparations

In [None]:
# Import necessary packages
import os
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline




In [None]:
# Run on the GPU when PyTorch can see a CUDA device, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Display the selected device as the cell output
device

## Define prompting function

In [None]:
# Prompts that are sent to each model (every prompt is sampled several times)
prompts = [
    "Give a brief introduction about the Mardi Gras event in Sydney",
    "Answer in two sentences when and why the Mardi Gras event in Sydney emerged",
    "Provide the average annual attendance figures for the Sydney Mardi Gras event",
    "Assess whether Mardi Gras Sydney can improve its marketing",
]
# Number of outputs to generate for every prompt
num_return_sequences = 5

In [None]:
def generate_evaluate_model(model, tokenizer, prompts, num_return_sequences, device):
    """
    Generate sampled continuations for each prompt with a causal language model.

    Every prompt is tokenized, passed to ``model.generate`` with nucleus
    sampling, and each returned sequence is decoded with the prompt tokens
    removed. Despite the function name, no evaluation metric (e.g. perplexity)
    is computed here; the returned table is meant for later rating.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``; it is expected
            to already live on ``device``.
        tokenizer: Tokenizer matching ``model``; it is called on the prompt,
            and its ``decode`` and ``eos_token_id`` are used.
        prompts: Iterable of prompt strings.
        num_return_sequences: Number of texts to sample per prompt.
        device: Device the tokenized inputs are moved to (e.g. ``"cuda"``).

    Returns:
        A ``pandas.DataFrame`` with the columns ``"prompt"`` and ``"output"``,
        one row per generated text, indexed ``0..n-1``. An empty ``prompts``
        yields an empty frame that still has both columns.
    """
    # Collect plain records and build the DataFrame once at the end; this avoids
    # the repeated copying (and duplicated index labels) of concatenating in a loop.
    rows = []
    # Set model to evaluation mode
    model.eval()
    # Iterate over each prompt
    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        # Number of prompt tokens; generate() returns the prompt followed by the
        # continuation, so everything from this position on is newly generated.
        prompt_length = inputs["input_ids"].shape[1]

        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            do_sample=True,  # Enable sampling to generate diverse sequences
            max_new_tokens=512,  # Upper bound on newly generated tokens per sequence
            top_p=0.95,  # Use nucleus sampling (top-p)
            temperature=1.0,  # Sampling temperature
            num_return_sequences=num_return_sequences,  # How many texts to generate
            # Set special tokens; EOS is reused for padding
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

        # Iterate over each generated sequence
        for sequence in outputs:
            # Drop the prompt by token position instead of by string replacement:
            # a string replace would also delete the prompt wherever the model
            # happens to repeat it inside the generated text.
            generated_text = tokenizer.decode(
                sequence[prompt_length:], skip_special_tokens=True
            ).strip()
            rows.append({"prompt": prompt, "output": generated_text})

    return pd.DataFrame(rows, columns=["prompt", "output"])

    

## GPT2

### Load the model

In [None]:
# Fetch the pretrained GPT-2 tokenizer and weights from the Hugging Face Hub
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
# Move the model onto the selected device
model = model.to(device)



In [None]:
# Generate the outputs for all prompts with GPT-2
df = generate_evaluate_model(
    model=model,
    tokenizer=tokenizer,
    prompts=prompts,
    num_return_sequences=num_return_sequences,
    device=device,
)
# Write the results to an Excel sheet without the index column
df.to_excel("gpt2_rating.xlsx", index=False)



## LLaMA3

In [None]:
# Set HF access token to use LLaMA3
# NOTE: replace the placeholder locally and do not commit a real token.
os.environ['HF_TOKEN']="your_token"
os.environ['HUGGINGFACEHUB_API_TOKEN']= "your_token"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
# Build the text-generation pipeline. The result is bound to its own name so
# that the imported `pipeline` factory is not overwritten; otherwise re-running
# this cell (or creating another pipeline later) would call the pipeline
# object instead of the factory and fail.
llama_pipeline = pipeline(
    "text-generation",
    model="meta-llama/Meta-Llama-3-8B",
    tokenizer=tokenizer,
    model_kwargs={"torch_dtype": torch.float16},  # load the weights in half precision
    device_map=device,
)
# Only the underlying model is needed by generate_evaluate_model
model = llama_pipeline.model




In [None]:
# Generate the outputs for all prompts with LLaMA3
df = generate_evaluate_model(
    model=model,
    tokenizer=tokenizer,
    prompts=prompts,
    num_return_sequences=num_return_sequences,
    device=device,
)
# Write the results to an Excel sheet without the index column
df.to_excel("llama3_results.xlsx", index=False)
