# Import Libraries 

### Code follows the structure and training details of the Starter Notebook provided by Keras

In [1]:
import os
os.environ["KERAS_BACKEND"] = "jax" # you can also use tensorflow or torch
os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "1.00" # avoid memory fragmentation on JAX backend.

import keras
import keras_nlp

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas() # progress bar for pandas

import plotly.graph_objs as go
import plotly.express as px
from IPython.display import display, Markdown

2024-04-15 08:22:17.712544: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-15 08:22:17.712639: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-15 08:22:17.855988: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Configuration

In [2]:
class CFG:
    seed = 42
    dataset_path = "/kaggle/input/llm-prompt-recovery"
    preset = "gemma_instruct_2b_en" # name of pretrained Gemma
    sequence_length = 512 # max size of input sequence for training
    batch_size = 1 # size of the input batch in training
    epochs = 3 # number of epochs to train

# Reproducibility 
Sets value for random seed to produce similar result in each run.

In [3]:
keras.utils.set_random_seed(CFG.seed)

# Data

No training data is provided in this competition; in other words, we can use any openly available datasets for this competition. In this notebook, we will use two external datasets that utilize the **Gemma 7B** model to transform texts using prompts.

**Data Format:**

These datasets includes:
- `original_text`: Input text/essay that needs to be transformed.
- `rewrite_prompt`: Prompt/Instruction that was used in the Gemma LM to transform `original_text`. This is also our **target** for this competition.
- `rewritten_text`: Output text that was generated by the Gemma model.

# Prompt Engineering



In [6]:
#Use of Gemini prompts based on the technical report by Google's team

gemini_prompts= """<start_of_turn>user
You are a reverse prompt engineer. Read the following "Original Text" and your task is to create the correct "Instruction" that instructs a LLM to generate the "Rewriten Text" accurately.
\n Original Text:\n{original_text}
\n Rewriten Text:\n{rewritten_text}
<end_of_turn>
<start_of_turn>model
\nInstruction:\n{rewrite_prompt}
"""

In [7]:
import pandas as pd
#Counting the tokens to limit the inputs to 700 in training, so that it works with the memory provided by kaggle free GPU
df_gem = pd.read_csv("/kaggle/input/all-in-one-dataset-with-embedding/df_with_emb.csv")
df1_gem= df_gem[["dataset_id", "original_text", "rewrite_prompt","rewritten_text"]]
df1_gem["prompt"]= df1_gem.apply(lambda row: gemini_prompts.format(original_text=row.original_text,
                                                             rewritten_text=row.rewritten_text,
                                                             rewrite_prompt=row.rewrite_prompt), axis=1)
df1_gem['token_count_promp'] = df1_gem["prompt"].str.replace(',','').str.split().str.len()
final_df = df1_gem[df1_gem["token_count_promp"]<=700]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_gem["prompt"]= df1_gem.apply(lambda row: gemini_prompts.format(original_text=row.original_text,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1_gem['token_count_promp'] = df1_gem["prompt"].str.replace(',','').str.split().str.len()


In [8]:
df1_gem

Unnamed: 0,dataset_id,original_text,rewrite_prompt,rewritten_text,prompt,token_count_promp
0,host,"Dear Randy,\n\nI hope this letter finds you we...",Rephrase this letter to infuse it with an elfi...,"Dear Randy,\n\nMay this enchanted message find...",<start_of_turn>user\nYou are a reverse prompt ...,202
1,nbroad_1,"This quilt, that my mother made, \n \n Still m...",Regency Romance: Model the text on a Regency r...,"The softest brown and brightest blue quilt, cr...",<start_of_turn>user\nYou are a reverse prompt ...,417
2,nbroad_1,It's the job of our agency to keep track of th...,Write like Ernest Hemingway: Focus on Hemingwa...,The agency's responsibility is to track and co...,<start_of_turn>user\nYou are a reverse prompt ...,943
3,nbroad_1,"The first punch gets me right in the ribs, kno...",Grimm's Fairy Tales: Adapt the text to mimic t...,"In the sweltering sun, the stench of sweat and...",<start_of_turn>user\nYou are a reverse prompt ...,603
4,nbroad_1,Some nights I lay awake staring at the ceiling...,High Fantasy Epic: Transform the essay into a ...,In the tapestry of the ethereal realm of Eldri...,<start_of_turn>user\nYou are a reverse prompt ...,1167
...,...,...,...,...,...,...
58169,aatiffraz,I knew I only had 10 seconds to change things....,Rewrite this text in the style of a philosophi...,\n\n## The Knight's Tale of Temporal Flux:\n\n...,<start_of_turn>user\nYou are a reverse prompt ...,340
58170,aatiffraz,The rebels ran into the building lining up in ...,Restyle this text as if it were written by a p...,"\n\nSure, here is the text rewritten as if it ...",<start_of_turn>user\nYou are a reverse prompt ...,320
58171,aatiffraz,"Jeanne loomed over the coffin, a great nagging...",Restyle this text as if it were written by a p...,"\n\nSure, here is the text rewritten as if it ...",<start_of_turn>user\nYou are a reverse prompt ...,349
58172,aatiffraz,The officers examined the graphic scene. The v...,Translate the essence of this text into a pira...,"\n\n**Pirate Narrative:**\n\nAvast, me heartie...",<start_of_turn>user\nYou are a reverse prompt ...,268


In [9]:
data = final_df.prompt.tolist()

In [10]:
len(data)

54379

## Sample

In [12]:
def colorize_text(text):
    for word, color in zip(["Instruction", "Original Text", "Rewriten Text", "Response"],
                           ["red", "yellow", "blue", "green"]):
        text = text.replace(f"{word}:", f"\n\n**<font color='{color}'>{word}:</font>**")
    return text

In [13]:
# Take a random sample
sample = data[10]

# Give colors to Instruction, Response and Category
sample = colorize_text(sample)

# Show sample in markdown
display(Markdown(sample))

<start_of_turn>user
You are a reverse prompt engineer. Read the following "Original Text" and your task is to create the correct "Instruction" that instructs a LLM to generate the "Rewriten Text" accurately.

 

**<font color='yellow'>Original Text:</font>**
`` Happy Birthday, Lucifer.'' 
 
 I hunched my shoulders, closed my eyes and took a deep breath. Only one person knew it was the anniversary of my creation and that was my creator. 
 
 `` Thank you'' I turned to look at him, Him, the Almighty God. He had n't changed one bit. His presence filled me in a way my eons of sin never did, I felt whole.'' 
 
 `` I have a gift for you Lucifer, but you have to choose, you can come home or...'' 
 
 `` Or? 
 
 `` I will admit that I was wrong, that you were right when you left. You can only have one. Which is it?''


 

**<font color='blue'>Rewriten Text:</font>**
The clock struck midnight, the wind whipped through the crumbling Victorian tower, and a cold, solitary figure stood in the center of the room. A deep, resonant howl echoed through the corridors, as the person hunched their shoulders, closed their eyes and took a deep breath. Only one person knew it was the anniversary of their creation and that was their creator.

"Happy Birthday, Lucifer," the person whispered into the night, their voice dripping in honeyed despair. They turned to look at their creator, the Almighty God, who stood tall in the dim light, unyielding and serene. His presence filled the person with a sense of wholeness, a feeling they had not experienced in their eons of sin.

"Thank you," the person said, their voice breaking. "I have a gift for you, Lucifer, but you have to choose. You can come home or..."

A pause, a lingering look at the Almighty God, and then the person spoke again.

"Or," they breathed, their voice trailing off into the night. "I will admit that I was wrong, that you were right when you left. You can only have one. Which is it?"
<end_of_turn>
<start_of_turn>model



**<font color='red'>Instruction:</font>**
Victorian Gothic: Adopt a Victorian Gothic style, emphasizing ornate language, emotional intensity, and possibly supernatural elements.


# Modeling

<div align="center"><img src="https://i.ibb.co/Bqg9w3g/Gemma-Logo-no-background.png" width="300"></div>

**Gemma** is a collection of advanced open LLMs developed by **Google DeepMind** and other **Google teams**, derived from the same research and technology behind the **Gemini** models. They can be integrated into applications and run on various platforms including mobile devices and hosted services. Developers can customize Gemma models using tuning techniques to enhance their performance for specific tasks, offering more targeted and efficient generative AI solutions beyond text generation.

Gemma models are available in several sizes so we can build generative AI solutions based on your available computing resources, the capabilities you need, and where you want to run them.

| Parameters size | Tuned versions    | Intended platforms                 | Preset                 |
|-----------------|-------------------|------------------------------------|------------------------|
| 2B              | Pretrained        | Mobile devices and laptops         | `gemma_2b_en`          |
| 2B              | Instruction tuned | Mobile devices and laptops         | `gemma_instruct_2b_en` |
| 7B              | Pretrained        | Desktop computers and small servers| `gemma_7b_en`          |
| 7B              | Instruction tuned | Desktop computers and small servers| `gemma_instruct_7b_en` |

In this notebook, we will utilize the `Gemma 2b-it` model from KerasNLP's pretrained models to recover the prompt. We are using the "Instruction tuned" model instead of the "Pretrained" one because the test data was generated from an instruction-tuned Gemma model. Additionally, we will fine-tune our model using instruction-response pairs thus fine-tuning an instruction-tuned model will likely yield better results.

To explore other available models, you can simply adjust the `preset` value in the `CFG` (config). You can find a list of other pretrained models on the [KerasNLP website](https://keras.io/api/keras_nlp/models/).

## Gemma Causal LM

The code below will build an end-to-end Gemma model for causal language modeling (hence the name `GemmaCausalLM`). A causal language model (LM) predicts the next token based on previous tokens. This task setup can be used to train the model unsupervised on plain text input or to autoregressively generate plain text similar to the data used for training. This task can be used for pre-training or fine-tuning a Gemma model simply by calling `fit()`.

This model has a `generate()` method, which generates text based on a prompt. The generation strategy used is controlled by an additional sampler argument on `compile()`. You can recompile the model with different `keras_nlp.samplers` objects to control the generation. By default, `"greedy"` sampling will be used.

> The `from_preset` method instantiates the model from a preset architecture and weights.

In [14]:
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset(CFG.preset)
gemma_lm.summary()

Attaching 'config.json' from model 'keras/gemma/keras/gemma_instruct_2b_en/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/gemma/keras/gemma_instruct_2b_en/2' to your Kaggle notebook...
Attaching 'model.weights.h5' from model 'keras/gemma/keras/gemma_instruct_2b_en/2' to your Kaggle notebook...
Attaching 'tokenizer.json' from model 'keras/gemma/keras/gemma_instruct_2b_en/2' to your Kaggle notebook...
Attaching 'assets/tokenizer/vocabulary.spm' from model 'keras/gemma/keras/gemma_instruct_2b_en/2' to your Kaggle notebook...
normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


## Gemma LM Preprocessor

An important part of the Gemma model is the **Preprocessor** layer, which under the hood uses **Tokenizer**.

**What it does:** The preprocessor takes input strings and transforms them into a dictionary (`token_ids`, `padding_mask`) containing preprocessed tensors. This process starts with tokenization, where input strings are converted into sequences of token IDs.

**Why it's important:** Initially, raw text data is complex and challenging for modeling due to its high dimensionality. By converting text into a compact set of tokens, such as transforming `"The quick brown fox"` into `["the", "qu", "##ick", "br", "##own", "fox"]`, we simplify the data. Many models rely on special tokens and additional tensors to understand input. These tokens help divide input and identify padding, among other tasks. Making all sequences the same length through padding boosts computational efficiency, making subsequent steps smoother.

Explore the following pages to access the available preprocessing and tokenizer layers in **KerasNLP**:
- [Preprocessing](https://keras.io/api/keras_nlp/preprocessing_layers/)
- [Tokenizers](https://keras.io/api/keras_nlp/tokenizers/)

In [15]:
x, y, sample_weight = gemma_lm.preprocessor(data[0:2])

This preprocessing layer will take in batches of strings, and return outputs in a `(x, y, sample_weight)` format, where the `y` label is the next token id in the `x` sequence.

From the code below, we can see that, after the preprocessor, the data shape is `(num_samples, sequence_length)`.

In [16]:
# Display the shape of each processed output
for k, v in x.items():
    print(k, ":", v.shape)

token_ids : (2, 8192)
padding_mask : (2, 8192)


# Inference before Fine-Tuning

Before we do fine-tuning, let's try to recover the prompt using the Gemma model with some prepared prompts and see how it responds.

> As this model is not yet fine-tuned for instruction, you will notice that the model's responses are inaccurate.

# Fine-tuning with LoRA

To get better responses from the model, we will fine-tune the model with Low Rank Adaptation (LoRA).

**What exactly is LoRA?**

LoRA is a method used to fine-tune large language models (LLMs) in an efficient way. It involves freezing the weights of the LLM and injecting trainable rank-decomposition matrices.

Imagine in an LLM, we have a pre-trained dense layer, represented by a $d \times d$ weight matrix, denoted as $W_0$. We then initialize two additional dense layers, labeled as $A$ and $B$, with shapes $d \times r$ and $r \times d$, respectively. Here, $r$ denotes the rank, which is typically **much smaller than** $d$. Prior to LoRA, the model's output was computed using the equation $output = W_0 \cdot x + b_0$, where $x$ represents the input and $b_0$ denotes the bias term associated with the original dense layer, which remains frozen. After applying LoRA, the equation becomes $output = (W_0 \cdot x + b_0) + (B \cdot A \cdot x)$, where $A$ and $B$ denote the trainable rank-decomposition matrices that have been introduced.

<center><img src="https://i.ibb.co/DWsbhLg/LoRA.png" width="300"><br/>
Credit: <a href="https://arxiv.org/abs/2106.09685">LoRA: Low-Rank Adaptation of Large Language Models</a> Paper</center>


In the LoRA paper, $A$ is initialized with $\mathcal{N} (0, \sigma^2)$ and $B$ with $0$, where $\mathcal{N}$ denotes the normal distribution, and $\sigma^2$ is the variance.

**Why does LoRA save memory?**

Even though we're adding more layers to the model with LoRA, it actually helps save memory. This is because the smaller layers (A and B) have fewer parameters to learn compared to the big model and fewer trainable parameters mean fewer optimizer variables to store. So, even though the overall model might seem bigger, it's actually more efficient in terms of memory usage. 

> This notebook uses a LoRA rank of `4`. A higher rank means more detailed changes are possible, but also means more trainable parameters.

In [17]:
# Enable LoRA for the model and set the LoRA rank to 4.
gemma_lm.backbone.enable_lora(rank=4)
gemma_lm.summary()

**Notice** that, the number of trainable parameters is reduced from ~$2.5$ billions to ~$1.3$ millions after enabling LoRA.

In [18]:
test_data= data[:3000]
#truncating data to 3000 samples.

## Training

Two schedulers are presented here. 


-ReduceLROnPlateau: This callback monitors a quantity and if no improvement is seen for a 'patience' number of epochs, the learning rate is reduced.


-CosineDecay: A LearningRateSchedule that uses a cosine decay with optional warmup.

In [19]:
# Limit the input sequence length to 512 (to control memory usage).
gemma_lm.preprocessor.sequence_length = 700#CFG.sequence_length 


reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='loss', factor=0.2,
                              patience=5, min_lr=0.001

)
# Learning Rate
total_steps = 3000*CFG.epochs
decay_steps = total_steps * 0.7

cosine_decay_scheduler = keras.optimizers.schedules.CosineDecay(
    initial_learning_rate = 3e-5, decay_steps= decay_steps, alpha=0.1
)


optimizer = keras.optimizers.Adam(learning_rate=cosine_decay_scheduler)

# Compile the model with loss, optimizer, and metric
gemma_lm.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer = keras.optimizers.Adam(learning_rate=cosine_decay_scheduler),
    #optimizer=keras.optimizers.Adam(learning_rate=3e-5),
    weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

# Train model
gemma_lm.fit(test_data, epochs=CFG.epochs, batch_size=CFG.batch_size,)# if using reduce on plateou add this line: callbacks=[reduce_lr]

Epoch 1/3
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3120s[0m 1s/step - loss: 2.1128 - sparse_categorical_accuracy: 0.5331
Epoch 2/3
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3098s[0m 1s/step - loss: 1.8380 - sparse_categorical_accuracy: 0.5652
Epoch 3/3
[1m3000/3000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3098s[0m 1s/step - loss: 1.8221 - sparse_categorical_accuracy: 0.5667


<keras.src.callbacks.history.History at 0x7dec034fb460>

# Inference after fine-tuning

Let's see how our fine-tuned model responds to the same questions we asked before fine-tuning the model.

## Sample 1

In [20]:
df1_gem

Unnamed: 0,dataset_id,original_text,rewrite_prompt,rewritten_text,prompt,token_count_promp
0,host,"Dear Randy,\n\nI hope this letter finds you we...",Rephrase this letter to infuse it with an elfi...,"Dear Randy,\n\nMay this enchanted message find...",<start_of_turn>user\nYou are a reverse prompt ...,202
1,nbroad_1,"This quilt, that my mother made, \n \n Still m...",Regency Romance: Model the text on a Regency r...,"The softest brown and brightest blue quilt, cr...",<start_of_turn>user\nYou are a reverse prompt ...,417
2,nbroad_1,It's the job of our agency to keep track of th...,Write like Ernest Hemingway: Focus on Hemingwa...,The agency's responsibility is to track and co...,<start_of_turn>user\nYou are a reverse prompt ...,943
3,nbroad_1,"The first punch gets me right in the ribs, kno...",Grimm's Fairy Tales: Adapt the text to mimic t...,"In the sweltering sun, the stench of sweat and...",<start_of_turn>user\nYou are a reverse prompt ...,603
4,nbroad_1,Some nights I lay awake staring at the ceiling...,High Fantasy Epic: Transform the essay into a ...,In the tapestry of the ethereal realm of Eldri...,<start_of_turn>user\nYou are a reverse prompt ...,1167
...,...,...,...,...,...,...
58169,aatiffraz,I knew I only had 10 seconds to change things....,Rewrite this text in the style of a philosophi...,\n\n## The Knight's Tale of Temporal Flux:\n\n...,<start_of_turn>user\nYou are a reverse prompt ...,340
58170,aatiffraz,The rebels ran into the building lining up in ...,Restyle this text as if it were written by a p...,"\n\nSure, here is the text rewritten as if it ...",<start_of_turn>user\nYou are a reverse prompt ...,320
58171,aatiffraz,"Jeanne loomed over the coffin, a great nagging...",Restyle this text as if it were written by a p...,"\n\nSure, here is the text rewritten as if it ...",<start_of_turn>user\nYou are a reverse prompt ...,349
58172,aatiffraz,The officers examined the graphic scene. The v...,Translate the essence of this text into a pira...,"\n\n**Pirate Narrative:**\n\nAvast, me heartie...",<start_of_turn>user\nYou are a reverse prompt ...,268


In [21]:
# Take one sample
row = df1_gem.iloc[0]

# Generate Prompt using template
prompt = gemini_prompts.format(
    original_text=row.original_text,
    rewritten_text=row.rewritten_text,
    rewrite_prompt="",
)

# Infer
output = gemma_lm.generate(prompt, max_length=512)

# Colorize
output = colorize_text(output)

# Display in markdown
display(Markdown(output))

<start_of_turn>user
You are a reverse prompt engineer. Read the following "Original Text" and your task is to create the correct "Instruction" that instructs a LLM to generate the "Rewriten Text" accurately.

 

**<font color='yellow'>Original Text:</font>**
Dear Randy,\n\nI hope this letter finds you well. I wanted to reach out and let you know about a new styling service we are offering at our salon, called "Trendy Styles." Our experienced stylists are skilled in creating the latest trends and can help you achieve the perfect look. Book an appointment today and let us help you elevate your style!\n\nBest regards,\n[Your Name]

 

**<font color='blue'>Rewriten Text:</font>**
Dear Randy,\n\nMay this enchanted message find you in the most delightful of spirits. I simply had to share tidings of a wondrous service now gracing our humble salon, known as "Trendy Styles." Our gifted stylists possess a magical touch, conjuring the latest trends to bestow upon you the most exquisite visage. Cast your appointment spell today and allow us to weave enchantment into your style.\n\nWarm regards,\n[Your Name]
<end_of_turn>
<start_of_turn>model



**<font color='red'>Instruction:</font>**

Rewrite this as a magical message from the salon.


## Sample 2

In [22]:
# Take one sample
row = df1_gem.iloc[1]

# Generate Prompt using template
prompt = gemini_prompts.format(
    original_text=row.original_text,
    rewritten_text=row.rewritten_text,
    rewrite_prompt="",
)

# Infer
output = gemma_lm.generate(prompt, max_length=512)

# Colorize
output = colorize_text(output)

# Display in markdown
display(Markdown(output))


<start_of_turn>user
You are a reverse prompt engineer. Read the following "Original Text" and your task is to create the correct "Instruction" that instructs a LLM to generate the "Rewriten Text" accurately.

 

**<font color='yellow'>Original Text:</font>**
This quilt, that my mother made, 
 
 Still makes me think to this day. 
 
 It's softest brown, and brightest blue, 
 
 The curved stitch here, reads `` made it May''. 
 
 It's hard to see, but believe me it's true, 
 
 That's not just a cloth but a piece of shirt. 
 
 You can see a logo here, and right there, 
 
 And a signature over there, someone named `` Bert''. 
 
 This is my favorite part, a piece from a stuffed bear. 
 
 I think it was my mother's favorite too, 
 
 She always said so at least. 
 
 Something from when she was two, 
 
 Given by her grandad for Thanksgiving feast. 
 
 My dad added this, a little button pin, 
 
 Something from his mother, for being a scout. 
 
 Apparently she went to a store and fished in a bin, 
 
 Until night that day, to teach him what love was about. 
 
 I'm sorry you had to see this, 
 
 but their funeral was delayed. 
 
 
 



 

**<font color='blue'>Rewriten Text:</font>**
The softest brown and brightest blue quilt, crafted by the loving hand of a mother, evokes vivid memories in my mind. The curved stitch, delicately etched upon its surface, bears the inscription "made it May," a testament to the passage of time. Though the fabric may be veiled in mystery, I firmly believe that this quilt is not merely a cloth but a treasured heirloom, imbued with love and sentiment.

The intricate logo and signature, meticulously woven into the quilt's fabric, reveal the name "Bert," a symbol of the owner's identity. This precious artifact, once a part of a stuffed bear, holds a special place in my heart. It was my mother's favorite, a cherished relic from her youth, gifted to her by her grandfather during a Thanksgiving feast.

My father added his own touch to the quilt, a pinned button that symbolized his mother's unwavering

## Use of RF classifier to get the best prediction:
- Using a RandomForest classifier
    - The dataset is copied to mix the "Rewrite prompt" so that the output is wrong in order to create bad examples
    - The correct examples then are assigned 1 and the incorrect 0
    - The Classifier is trained to predict 0 or 1 using the entire dataset x2.
    - This will be used accompanied with different generation result and the classifier select the best option, if theres more than one then select the first.

In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    
    # Join tokens back into text
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text




In [None]:
#Create a new df with good and wrong examples based on the dataset
correct= df1_gem[["original_text","rewritten_text","rewrite_prompt"]].copy()
incorrect=df1_gem[["original_text","rewritten_text","rewrite_prompt"]].copy()
correct["target"]=1
incorrect["target"]=0

# Shuffle only column 
shuffled_column = incorrect['rewrite_prompt'].sample(frac=1).reset_index(drop=True)

# Create a new DataFrame with shuffled column  and other columns unchanged
incorrect["rewrite_prompt"] = shuffled_column

In [None]:
incorrect.sample(frac=1, random_state=42)[:20000]
#concat, shuffle and slice the datasets.
rf_df= pd.concat([correct.sample(frac=1, random_state=42)[:40000], incorrect.sample(frac=1, random_state=42)[:40000]], ignore_index=True)
#rf_df= rf_df.sample(frac=1, random_state=42).reset_index(inplace=True)

rf_df=rf_df.sample(frac=1, random_state=42)
rf_df

In [None]:
# Concatenate text columns into a single column
rf_df['combined_text'] = rf_df['original_text'] + ' ' + rf_df['rewritten_text'] + '<prompt>: ' + rf_df['rewrite_prompt']

# Text Preprocessing (assuming you have a function called 'preprocess_text')
rf_df['combined_text'] = rf_df['combined_text'].apply(preprocess_text)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

"""# Concatenate text columns into a single column
df1_gem['combined_text'] = df1_gem['text_column1'] + ' ' + df1_gem['text_column2'] + ' ' + df1_gem['text_column3']

# Text Preprocessing (assuming you have a function called 'preprocess_text')
df1_gem['combined_text'] = df1_gem['combined_text'].apply(preprocess_text)"""

# Text Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(rf_df['combined_text'])


y_encoded = rf_df["target"]

# Data Splitting
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_encoded, test_size=0.2, random_state=42)

# Model Training
rf_classifier = RandomForestClassifier(n_estimators=300, random_state=42)
rf_classifier.fit(X_train, y_train)

# Model Evaluation
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Test Data

In [23]:
test_df = pd.read_csv("/kaggle/input/llm-prompt-recovery/test.csv")
test_df['original_text'] = test_df['original_text'].fillna("")
test_df['rewritten_text'] = test_df['rewritten_text'].fillna("")
test_df.head()

Unnamed: 0,id,original_text,rewritten_text
0,-1,The competition dataset comprises text passage...,Here is your shanty: (Verse 1) The text is rew...


## Test Sample

Now, let's try out a sample from test data that model hasn't seen during training.

In [24]:
row = test_df.iloc[0]

# Generate Prompt using template
prompt = gemini_prompts.format(
    original_text=row.original_text,
    rewritten_text=row.rewritten_text,
    rewrite_prompt="",
)

# Infer
output = gemma_lm.generate(prompt, max_length=512)

# Colorize
output = colorize_text(output)

# Display in markdown
display(Markdown(output))

<start_of_turn>user
You are a reverse prompt engineer. Read the following "Original Text" and your task is to create the correct "Instruction" that instructs a LLM to generate the "Rewriten Text" accurately.

 

**<font color='yellow'>Original Text:</font>**
The competition dataset comprises text passages that have been rewritten by the Gemma LLM according to some rewrite_prompt instruction. The goal of the competition is to determine what prompt was used to rewrite each original text.  Please note that this is a Code Competition. When your submission is scored, this example test data will be replaced with the full test set. Expect roughly 2,000 original texts in the test set.

 

**<font color='blue'>Rewriten Text:</font>**
Here is your shanty: (Verse 1) The text is rewritten, the LLM has spun, With prompts so clever, they've been outrun. The goal is to find, the prompt so bright, To crack the code, and shine the light. (Chorus) Oh, this is a code competition, my dear, With text and prompts, we'll compete. Two thousand texts, a challenge grand, To guess the prompts, hand over hand.(Verse 2) The original text, a treasure lost, The rewrite prompt, a secret to be
<end_of_turn>
<start_of_turn>model



**<font color='red'>Instruction:</font>**

Rewrite the text as a shanty


# Submission

In [25]:
preds = []
for i in tqdm(range(len(test_df))):
    row = test_df.iloc[i]

    # Generate Prompt using template
    prompt = gemini_prompts.format(
        original_text=row.original_text,
        rewritten_text=row.rewritten_text,
        rewrite_prompt=""
    )

    # Infer
    output = gemma_lm.generate(prompt, max_length=512)
    pred = output.replace(prompt, "") # remove the prompt from output
    
    # Store predictions
    preds.append([row.id, pred])

  0%|          | 0/1 [00:00<?, ?it/s]

While preparing the submission file, we must keep in mind that, leaving any `rewrite_prompt` blank as null answers will throw an error.

In [26]:
sub_df = pd.DataFrame(preds, columns=["id", "rewrite_prompt"])
sub_df['rewrite_prompt'] = sub_df['rewrite_prompt'].fillna("")
sub_df['rewrite_prompt'] = sub_df['rewrite_prompt'].map(lambda x: "Improve the essay" if len(x) == 0 else x)
sub_df.to_csv("submission.csv",index=False)
sub_df.head()

Unnamed: 0,id,rewrite_prompt
0,-1,Rewrite this as a shanty.\n


# Generation using the RF Classifier
-Different samplers to obtain different results if any.

In [None]:
#Initial test
#We need three different predictions from the trained Gemma model (3 could be changed)

# Take one sample
row = df1_gem.iloc[1]

# Generate Prompt using template
prompt = gemini_prompts.format(
    original_text=row.original_text,
    rewritten_text=row.rewritten_text,
    rewrite_prompt="",
)

# Infer with three diff samplers
outputs=[]
#for i in range(3):
output = gemma_lm.generate(prompt, max_length=700)
outputs.append(output.replace(prompt, ""))
gemma_lm.compile(sampler="top_p")
output_p = gemma_lm.generate(prompt, max_length=700)
outputs.append(output_p.replace(prompt, ""))

gemma_lm.compile(sampler="top_k")
output_k = gemma_lm.generate(prompt, max_length=700)
outputs.append(output_k.replace(prompt, ""))

gemma_lm.compile(sampler="beam")
output_b = gemma_lm.generate(prompt, max_length=700)
outputs.append(output_b.replace(prompt, ""))

outputs_vec = tfidf_vectorizer.transform(outputs)
preds_rf= rf_classifier.predict(outputs_vec)

In [None]:
row["rewrite_prompt"]