## Login to Hugging Face

In [1]:
from dotenv import load_dotenv
import os
from huggingface_hub import login

load_dotenv()
token = os.getenv("HUGGINGFACE_TOKEN")
login(
    token=token, # ADD YOUR TOKEN HERE
    add_to_git_credential=True
)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/pathfinder/.cache/huggingface/token
Login successful


## Downloads

In [2]:
#!pip install huggingface_hub
#!pip install transformers
#!pip install bitsandbytes
#!pip install peft
#!pip install trl
#!pip install accelerate
#!pip install datasets
#!pip install scikit-learn
#!pip install packaging
#!pip install ninja
#!pip install flash-attn --no-build-isolation

## Imports

In [3]:
from tqdm import tqdm
from IPython.display import display, Markdown

# pytorch
import torch

# huggingface
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import LoraConfig
from trl import SFTTrainer

# datasets
import pandas as pd
from datasets import Dataset

## Device

In [4]:
device = (
    "cuda:0" if torch.cuda.is_available() else # Nvidia GPU
    "mps" if torch.backends.mps.is_available() else # Apple Silicon GPU
    "cpu"
)
print(f"Device = {device}")

Device = cuda:0


## Hyperparameters

In [5]:
# seed
seed=42
torch.manual_seed(seed)

# Tokenizer arguments
max_length=512
padding="max_length"
truncation=True

# model arguments
max_new_tokens=100

# mixed precision
dtype=torch.bfloat16

# quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_doulbe_quant=True
)

# LoRA configuration
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none"
)

# training arguments
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    save_strategy="epoch",
    logging_strategy="steps",
    evaluation_strategy="steps",
    logging_steps=1,
    save_total_limit=1,
    
    learning_rate=2e-4,
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    optim="adamw_torch",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0,
    warmup_steps=0,
    seed=seed
)

# train-validation split
validation_size=0.1

# SFTTrainer arguments
max_seq_length=512

## Model

In [6]:
# Model List

# gemma variants

# llama2 variants

# phi variants


In [7]:
model_id = "google/gemma-7b-it"

In [8]:
model_save_path = "model"

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [10]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    attn_implementation="flash_attention_2",
    torch_dtype=dtype,
    quantization_config=quantization_config,
    trust_remote_code=True
)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 3072, padding_idx=0)
    (layers): ModuleList(
      (0-27): 28 x GemmaDecoderLayer(
        (self_attn): GemmaFlashAttention2(
          (q_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=3072, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=24576, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=24576, bias=False)
          (down_proj): Linear4bit(in_features=24576, out_features=3072, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()


## Dataset

In [12]:
# Dataset Path
train_dataset_path1 = "dataset/llm-prompt-recovery-synthetic-datastore/gemma1000_w7b.csv"
train_dataset_path2 = "dataset/3000-rewritten-texts-prompt-recovery-challenge/prompts_0_500_wiki_first_para_3000.csv"
test_dataset_path = "data-sample/test.csv"

In [13]:
# `LLM Prompt Recovery - Synthetic Datastore dataset` by @dschettler8845
df1 = pd.read_csv(train_dataset_path1)
df1 = df1[["original_text", "rewrite_prompt", "gemma_7b_rewritten_text_temp0"]]
df1 = df1.rename(columns={"gemma_7b_rewritten_text_temp0":"rewritten_text"})
df1.head()

Unnamed: 0,original_text,rewrite_prompt,rewritten_text
0,"Port-au-Prince, Haiti (CNN) -- Earthquake vict...",Turn this into an association to be joined.,"Sure, here is the association you requested:\n..."
1,Former secretary of state Hillary Clinton meet...,Convert this into a gain to be gained.,"Sure, here is the gain to be gained from the t..."
2,The opinions expressed by columnists are their...,Frame this as a political debate.,## The Obama Legacy: A Tale of Two Sides\n\nTh...
3,BIGBANG is one of those musical entities that ...,Imagine this as a mathematician's equation.,"Sure, here is the equation:\n\n**BIGBANG's imp..."
4,WHAT?!??! I know. That’s what you’re saying ri...,Frame this as an accountant's thrilling advent...,"Sure, here's the framed text as an accountant'..."


In [14]:
# `3000 Rewritten texts - Prompt recovery Challenge` by @dipamc77
df2 = pd.read_csv(train_dataset_path2)
df2.head()

Unnamed: 0,original_text,rewrite_prompt,rewritten_text
0,"Sfiso Ncwane (April 21, 1979 - December 5, 201...",Transform the text into a series of riddles th...,"Sure, here's the text transformed into riddles..."
1,The 1959–60 California Golden Bears men's bask...,Make this a market entry strategy for a new re...,## Market Entry Strategy: Launching a Brand in...
2,Franck Passi (born 28 March 1966) is a French ...,Write it as the last chapter of a book that ch...,## The Final Chapter: Ode to a Changed Soul\n\...
3,Hollandaea diabolica is a species of Australia...,Turn it into a vaudeville stage act introduction.,**Vaudeville Stage Act Introduction:**\n\nLadi...
4,The QF 6-inch Gun Mark N5 (initially designate...,Transform the text into a series of instructio...,The text does not provide any information abou...


In [15]:
# Merge all datasets
df = pd.concat([df1, df2], axis=0)
#df = df.sample(2000).reset_index(drop=True) # to reduce training time we are only using 2k samples
print(df.shape)
df.head()

(4000, 3)


Unnamed: 0,original_text,rewrite_prompt,rewritten_text
0,"Port-au-Prince, Haiti (CNN) -- Earthquake vict...",Turn this into an association to be joined.,"Sure, here is the association you requested:\n..."
1,Former secretary of state Hillary Clinton meet...,Convert this into a gain to be gained.,"Sure, here is the gain to be gained from the t..."
2,The opinions expressed by columnists are their...,Frame this as a political debate.,## The Obama Legacy: A Tale of Two Sides\n\nTh...
3,BIGBANG is one of those musical entities that ...,Imagine this as a mathematician's equation.,"Sure, here is the equation:\n\n**BIGBANG's imp..."
4,WHAT?!??! I know. That’s what you’re saying ri...,Frame this as an accountant's thrilling advent...,"Sure, here's the framed text as an accountant'..."


## Prompt Engineering

In [16]:
template = """
Instruction:\n
Below, the `Original Text` passage has been rewritten into `Rewritten Text` by the `Gemma 7b-it` LLM with a certain prompt. Your task is to carefully analyze the differences between the `Original Text` and `Rewritten Text`, and try to infer the specific prompt that was likely given to the LLM to rewrite the text in this way. Start your response by writing the prompt directly. Your response should include the prompt only.\n\n

Original Text:\n
{original_text}\n\n

Rewritten Text:\n
{rewritten_text}\n\n

Response:\n
{rewrite_prompt}
"""

In [17]:
def format_prompt(row):
    original_text = row.get("original_text", "")
    rewritten_text = row.get("rewritten_text", "")
    rewrite_prompt = row.get("rewrite_prompt", "")
    
    return template.format(
        original_text=original_text,
        rewritten_text=rewritten_text,
        rewrite_prompt=rewrite_prompt
    )

df["prompt"] = df.apply(format_prompt, axis=1)
data = df.prompt.tolist()

## Preprocessing

In [18]:
# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

def preprocess_function(examples):
    # Tokenize the prompts
    return tokenizer(examples['prompt'], padding=padding, truncation=truncation, max_length=max_length)

# Preprocess the dataset
dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [19]:
# Split the dataset into a training set and a validation set
dataset = dataset.train_test_split(test_size=validation_size, seed=seed)

# Get the training and validation sets
train_dataset = dataset['train']
val_dataset = dataset['test']

In [20]:
train_dataset

Dataset({
    features: ['original_text', 'rewrite_prompt', 'rewritten_text', 'prompt', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 3600
})

In [21]:
train_dataset[0]

{'original_text': 'The 2017 Catalan motorcycle Grand Prix was the seventh round of the 2017 MotoGP season. It was held at the Circuit de Barcelona-Catalunya in Montmeló on June 11, 2017.',
 'rewrite_prompt': 'Rewrite it as a narrative of the first rain after a decade-long drought.',
 'rewritten_text': 'The sun beat down on the track, baking the asphalt and drying the earth below. It had been a decade since the last rain had fallen upon Montmeló, a testament to the merciless grip of the drought that gripped the land. The track echoed with the roar of engines, the sweat of the racers streaking down their visors and the cheers of the fans reverberating into the air.\n\nThe day had started with a glimmer of hope. A few wispy clouds had gathered, promising a sprinkle of rain to quench the parched earth below. And just as the checkered flag waved to signal the start of the race, a drizzle began to fall. It started as',
 'prompt': '\nInstruction:\n\nBelow, the `Original Text` passage has been

## Sample

In [22]:
def colorize_text(text):
    for word, color in zip(["Instruction", "Original Text", "Rewritten Text", "Response"],
                           ["red", "yellow", "blue", "green"]):
        text = text.replace(f"{word}:", f"\n\n**<font color='{color}'>{word}:</font>**")
    return text

In [23]:
# Take a random sample
sample = data[10]

# Give colors to Instruction, Response and Category
sample = colorize_text(sample)

# Show sample in markdown
display(Markdown(sample))




**<font color='red'>Instruction:</font>**

Below, the `Original Text` passage has been rewritten into `Rewritten Text` by the `Gemma 7b-it` LLM with a certain prompt. Your task is to carefully analyze the differences between the `Original Text` and `Rewritten Text`, and try to infer the specific prompt that was likely given to the LLM to rewrite the text in this way. Start your response by writing the prompt directly. Your response should include the prompt only.





**<font color='yellow'>Original Text:</font>**

Story highlights Tyka Nelson says her brother's favorite color was ... orange The late musical artist's brand has been all about the color purple (CNN) Tyka Nelson just tweaked a major part of Prince's legacy. The sister of the late superstar talked to the Evening Standard about an upcoming exhibit of Prince artifacts set to open in London and mentioned one of his beloved instruments. "The standout piece for me is his orange Cloud guitar," the publication quoted Nelson as saying. "It is strange because people always associate the color purple with Prince, but his favorite color was actually orange."





**<font color='blue'>Rewritten Text:</font>**

In the land of musical legends, I stumbled upon a hidden gem that shed light on the enigmatic life of the late Prince. As I ventured through the archives of his legacy, I stumbled upon a revelation that challenged my understanding of the artist's vibrant persona.

The exhibit, set to open in London, will showcase a collection of Prince's treasured artifacts, including a guitar that held a special place in his heart. "The standout piece for me is his orange Cloud guitar," Nelson said in an interview with the Evening Standard. "It is strange because people always associate the color purple with Prince, but his favorite color was actually orange."

This discovery was like a treasure map leading me to a hidden chamber where Prince's soul lived on through the prism of his favorite hue. It was a moment of profound connection to the artist's inner world, revealing a hidden layer of his creative spirit.





**<font color='green'>Response:</font>**

Rewrite this as an explorer's discovery.


## Inference before Fine-Tuning

In [24]:
def generate_response(prompt):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    outputs = model.generate(input_ids=input_ids.to(model.device), max_new_tokens=max_new_tokens)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [25]:
# Take one sample
row = df.iloc[10]

# Generate Prompt using template
prompt = template.format(
    original_text=row.original_text,
    rewritten_text=row.rewritten_text,
    rewrite_prompt="",
)

# Infer
output = generate_response(prompt)

# Colorize
output = colorize_text(output)

# Display in markdown
display(Markdown(output))




**<font color='red'>Instruction:</font>**

Below, the `Original Text` passage has been rewritten into `Rewritten Text` by the `Gemma 7b-it` LLM with a certain prompt. Your task is to carefully analyze the differences between the `Original Text` and `Rewritten Text`, and try to infer the specific prompt that was likely given to the LLM to rewrite the text in this way. Start your response by writing the prompt directly. Your response should include the prompt only.





**<font color='yellow'>Original Text:</font>**

Story highlights Tyka Nelson says her brother's favorite color was ... orange The late musical artist's brand has been all about the color purple (CNN) Tyka Nelson just tweaked a major part of Prince's legacy. The sister of the late superstar talked to the Evening Standard about an upcoming exhibit of Prince artifacts set to open in London and mentioned one of his beloved instruments. "The standout piece for me is his orange Cloud guitar," the publication quoted Nelson as saying. "It is strange because people always associate the color purple with Prince, but his favorite color was actually orange."





**<font color='blue'>Rewritten Text:</font>**

In the land of musical legends, I stumbled upon a hidden gem that shed light on the enigmatic life of the late Prince. As I ventured through the archives of his legacy, I stumbled upon a revelation that challenged my understanding of the artist's vibrant persona.

The exhibit, set to open in London, will showcase a collection of Prince's treasured artifacts, including a guitar that held a special place in his heart. "The standout piece for me is his orange Cloud guitar," Nelson said in an interview with the Evening Standard. "It is strange because people always associate the color purple with Prince, but his favorite color was actually orange."

This discovery was like a treasure map leading me to a hidden chamber where Prince's soul lived on through the prism of his favorite hue. It was a moment of profound connection to the artist's inner world, revealing a hidden layer of his creative spirit.





**<font color='green'>Response:</font>**


**Prompt:**

Write a story about the legacy of the late musical artist Prince, focusing on his favorite color, orange, and the upcoming exhibit of his artifacts in London. Use vivid imagery and creative storytelling to bring the artist's legacy to life.

In [26]:
# Take one sample
row = df.iloc[20]

# Generate Prompt using template
prompt = template.format(
    original_text=row.original_text,
    rewritten_text=row.rewritten_text,
    rewrite_prompt="",
)

# Infer
output = generate_response(prompt)

# Colorize
output = colorize_text(output)

# Display in markdown
display(Markdown(output))




**<font color='red'>Instruction:</font>**

Below, the `Original Text` passage has been rewritten into `Rewritten Text` by the `Gemma 7b-it` LLM with a certain prompt. Your task is to carefully analyze the differences between the `Original Text` and `Rewritten Text`, and try to infer the specific prompt that was likely given to the LLM to rewrite the text in this way. Start your response by writing the prompt directly. Your response should include the prompt only.





**<font color='yellow'>Original Text:</font>**

Refined mansion tax proposal being fed into debate on abolishing 50p tax rate for those earning more than £150,000 The Liberal Democrats are pushing for the eventual disbanding of the 50p rate of tax to see the implementation of a new land tax levied on properties above £1m. In a refinement of their controversial mansion tax policy launched at their party conference two years ago, the Lib Dems now believe there is an argument for levying capital gains tax on any money made from the sale of a property after the first £1m. The Lib Dem idea is being fed





**<font color='blue'>Rewritten Text:</font>**

Sure, here is the rephrased text as a wise old tree's advice:

"My dear young sapling, listen to my wisdom. The path you tread is fraught with challenges, but I have a secret to share that will guide you through.

In the realm of taxation, there is a tale to be told. A tale of a 50p rate of tax that once stood tall, but has been met with a storm of controversy. The Liberal Democrats, like a seasoned traveler, have devised a refined plan to replace this rate with a new land tax on properties above a million quid.

But my dear sapling, remember this: the devil is in the details. While the land tax may seem like a noble gesture, the devil lies in the implementation of the capital gains tax on any money made from the sale of a property after the first million. It is a complex web of rules and regulations that can entrap even the most seasoned tax expert.

Therefore, my young sapling, I urge you to tread cautiously and consult the wisdom of those who have gone before you. For in the realm of taxation, the devil is always lurking, and it is only through understanding the intricacies of the law that you can navigate the treacherous terrain."





**<font color='green'>Response:</font>**


**Prompt:**

Rephrase the following text into a more verbose and wise old tree's advice style:

"Refined mansion tax proposal being fed into debate on abolishing 50p tax rate for those earning more than £150,000 The Liberal Democrats are pushing for the eventual disbanding of the 50p rate of tax to see the implementation of a new land tax levied on properties above £1m. In a refinement of their controversial mansion tax

## Supervised Fine-Tuning (LoRA)

In [27]:
def formatting_func(examples):
    text = f"{examples['prompt'][0]}\n"
    return [text]

In [28]:
formatting_func(train_dataset)

['\nInstruction:\n\nBelow, the `Original Text` passage has been rewritten into `Rewritten Text` by the `Gemma 7b-it` LLM with a certain prompt. Your task is to carefully analyze the differences between the `Original Text` and `Rewritten Text`, and try to infer the specific prompt that was likely given to the LLM to rewrite the text in this way. Start your response by writing the prompt directly. Your response should include the prompt only.\n\n\n\nOriginal Text:\n\nThe 2017 Catalan motorcycle Grand Prix was the seventh round of the 2017 MotoGP season. It was held at the Circuit de Barcelona-Catalunya in Montmeló on June 11, 2017.\n\n\n\nRewritten Text:\n\nThe sun beat down on the track, baking the asphalt and drying the earth below. It had been a decade since the last rain had fallen upon Montmeló, a testament to the merciless grip of the drought that gripped the land. The track echoed with the roar of engines, the sweat of the racers streaking down their visors and the cheers of the f

In [29]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    peft_config=lora_config,
    max_seq_length=max_seq_length,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    formatting_func=formatting_func
)

Map:   0%|          | 0/3600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [30]:
trainer.train()

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss,Validation Loss
1,6.0204,4.824464
2,4.5962,4.13502
3,3.7298,3.678654
4,3.9939,3.399652
5,2.7879,3.217149
6,3.3858,3.092234
7,2.633,3.032089
8,2.7888,3.010752


TrainOutput(global_step=8, training_loss=3.741972714662552, metrics={'train_runtime': 11.4594, 'train_samples_per_second': 0.698, 'train_steps_per_second': 0.698, 'total_flos': 107965467217920.0, 'train_loss': 3.741972714662552, 'epoch': 2.0})

## Inference after Fine-Tuning

In [31]:
# Take one sample
row = df.iloc[10]

# Generate Prompt using template
prompt = template.format(
    original_text=row.original_text,
    rewritten_text=row.rewritten_text,
    rewrite_prompt="",
)

# Infer
output = generate_response(prompt)

# Colorize
output = colorize_text(output)

# Display in markdown
display(Markdown(output))




**<font color='red'>Instruction:</font>**

Below, the `Original Text` passage has been rewritten into `Rewritten Text` by the `Gemma 7b-it` LLM with a certain prompt. Your task is to carefully analyze the differences between the `Original Text` and `Rewritten Text`, and try to infer the specific prompt that was likely given to the LLM to rewrite the text in this way. Start your response by writing the prompt directly. Your response should include the prompt only.





**<font color='yellow'>Original Text:</font>**

Story highlights Tyka Nelson says her brother's favorite color was ... orange The late musical artist's brand has been all about the color purple (CNN) Tyka Nelson just tweaked a major part of Prince's legacy. The sister of the late superstar talked to the Evening Standard about an upcoming exhibit of Prince artifacts set to open in London and mentioned one of his beloved instruments. "The standout piece for me is his orange Cloud guitar," the publication quoted Nelson as saying. "It is strange because people always associate the color purple with Prince, but his favorite color was actually orange."





**<font color='blue'>Rewritten Text:</font>**

In the land of musical legends, I stumbled upon a hidden gem that shed light on the enigmatic life of the late Prince. As I ventured through the archives of his legacy, I stumbled upon a revelation that challenged my understanding of the artist's vibrant persona.

The exhibit, set to open in London, will showcase a collection of Prince's treasured artifacts, including a guitar that held a special place in his heart. "The standout piece for me is his orange Cloud guitar," Nelson said in an interview with the Evening Standard. "It is strange because people always associate the color purple with Prince, but his favorite color was actually orange."

This discovery was like a treasure map leading me to a hidden chamber where Prince's soul lived on through the prism of his favorite hue. It was a moment of profound connection to the artist's inner world, revealing a hidden layer of his creative spirit.





**<font color='green'>Response:</font>**


Please write the prompt that was likely given to the LLM to rewrite the text in this way.



Please note that this is a text-based task. You will not need to provide any images or videos.

In [32]:
# Take one sample
row = df.iloc[20]

# Generate Prompt using template
prompt = template.format(
    original_text=row.original_text,
    rewritten_text=row.rewritten_text,
    rewrite_prompt="",
)

# Infer
output = generate_response(prompt)

# Colorize
output = colorize_text(output)

# Display in markdown
display(Markdown(output))




**<font color='red'>Instruction:</font>**

Below, the `Original Text` passage has been rewritten into `Rewritten Text` by the `Gemma 7b-it` LLM with a certain prompt. Your task is to carefully analyze the differences between the `Original Text` and `Rewritten Text`, and try to infer the specific prompt that was likely given to the LLM to rewrite the text in this way. Start your response by writing the prompt directly. Your response should include the prompt only.





**<font color='yellow'>Original Text:</font>**

Refined mansion tax proposal being fed into debate on abolishing 50p tax rate for those earning more than £150,000 The Liberal Democrats are pushing for the eventual disbanding of the 50p rate of tax to see the implementation of a new land tax levied on properties above £1m. In a refinement of their controversial mansion tax policy launched at their party conference two years ago, the Lib Dems now believe there is an argument for levying capital gains tax on any money made from the sale of a property after the first £1m. The Lib Dem idea is being fed





**<font color='blue'>Rewritten Text:</font>**

Sure, here is the rephrased text as a wise old tree's advice:

"My dear young sapling, listen to my wisdom. The path you tread is fraught with challenges, but I have a secret to share that will guide you through.

In the realm of taxation, there is a tale to be told. A tale of a 50p rate of tax that once stood tall, but has been met with a storm of controversy. The Liberal Democrats, like a seasoned traveler, have devised a refined plan to replace this rate with a new land tax on properties above a million quid.

But my dear sapling, remember this: the devil is in the details. While the land tax may seem like a noble gesture, the devil lies in the implementation of the capital gains tax on any money made from the sale of a property after the first million. It is a complex web of rules and regulations that can entrap even the most seasoned tax expert.

Therefore, my young sapling, I urge you to tread cautiously and consult the wisdom of those who have gone before you. For in the realm of taxation, the devil is always lurking, and it is only through understanding the intricacies of the law that you can navigate the treacherous terrain."





**<font color='green'>Response:</font>**


Please write the prompt that was likely given to the LLM to rewrite the text in this way.



Please note that this is a text-based task. You will not be able to see any images or videos.

## Inference on Test Data

In [33]:
test_df = pd.read_csv(test_dataset_path)
test_df['original_text'] = test_df['original_text'].fillna("")
test_df['rewritten_text'] = test_df['rewritten_text'].fillna("")
test_df.head()

Unnamed: 0,id,original_text,rewritten_text
0,-1,The competition dataset comprises text passage...,Here is your shanty: (Verse 1) The text is rew...


In [34]:
# Test Data: Take one sample
row = test_df.iloc[0]

# Generate Prompt using template
prompt = template.format(
    original_text=row.original_text,
    rewritten_text=row.rewritten_text,
    rewrite_prompt="",
)

# Infer
output = generate_response(prompt)

# Colorize
output = colorize_text(output)

# Display in markdown
display(Markdown(output))




**<font color='red'>Instruction:</font>**

Below, the `Original Text` passage has been rewritten into `Rewritten Text` by the `Gemma 7b-it` LLM with a certain prompt. Your task is to carefully analyze the differences between the `Original Text` and `Rewritten Text`, and try to infer the specific prompt that was likely given to the LLM to rewrite the text in this way. Start your response by writing the prompt directly. Your response should include the prompt only.





**<font color='yellow'>Original Text:</font>**

The competition dataset comprises text passages that have been rewritten by the Gemma LLM according to some rewrite_prompt instruction. The goal of the competition is to determine what prompt was used to rewrite each original text.  Please note that this is a Code Competition. When your submission is scored, this example test data will be replaced with the full test set. Expect roughly 2,000 original texts in the test set.





**<font color='blue'>Rewritten Text:</font>**

Here is your shanty: (Verse 1) The text is rewritten, the LLM has spun, With prompts so clever, they've been outrun. The goal is to find, the prompt so bright, To crack the code, and shine the light. (Chorus) Oh, this is a code competition, my dear, With text and prompts, we'll compete. Two thousand texts, a challenge grand, To guess the prompts, hand over hand.(Verse 2) The original text, a treasure lost, The rewrite prompt, a secret to be





**<font color='green'>Response:</font>**


The prompt given to the LLM to rewrite the text in this way is:

```
Write a shanty about a code competition where the goal is to guess the prompts used to rewrite text.
```

## Save Model

In [35]:
tokenizer.save_pretrained(model_save_path)
model.save_pretrained(model_save_path)

## Submission

In [36]:
preds = []
for i in tqdm(range(len(test_df))):
    row = test_df.iloc[i]

    # Generate Prompt using template
    prompt = template.format(
        original_text=row.original_text,
        rewritten_text=row.rewritten_text,
        rewrite_prompt=""
    )

    # Infer
    output = generate_response(prompt)
    pred = output.replace(prompt, "") # remove the prompt from output
    
    # Store predictions
    preds.append([row.id, pred])

100%|██████████| 1/1 [00:04<00:00,  4.79s/it]


In [37]:
sub_df = pd.DataFrame(preds, columns=["id", "rewrite_prompt"])
sub_df['rewrite_prompt'] = sub_df['rewrite_prompt'].fillna("")
sub_df['rewrite_prompt'] = sub_df['rewrite_prompt'].map(lambda x: "Improve the essay" if len(x) == 0 else x)
sub_df.to_csv("submission.csv",index=False)
sub_df.head()

Unnamed: 0,id,rewrite_prompt
0,-1,The prompt given to the LLM to rewrite the tex...
