# Prompt Engineering

In [None]:
# Global Variables
# DRIVE_HOME is where Google Drive gets mounted in the Colab runtime;
# CODE_HOME is the project folder, written so that it can be appended
# directly to DRIVE_HOME when building file paths.
DRIVE_HOME = '/content/drive'
CODE_HOME = '/MyDrive/LawDigestAI'

# Drive Mount
# Mount Google Drive at DRIVE_HOME so project files under it can be read.
from google.colab import drive
drive.mount(DRIVE_HOME)

Mounted at /content/drive


In [None]:
import pandas as pd

# Location of the preprocessed summarisation data inside the mounted Drive folder
input_file_path = f"{DRIVE_HOME}{CODE_HOME}/2_Generation/catchphrase_Extraction/preprocessed_summ_data.csv"
summ_data = pd.read_csv(input_file_path)

# Number of leading rows to experiment on; set to None to use the full dataset
SAMPLE_SIZE = 10

# Positional slice of the first SAMPLE_SIZE rows. .copy() makes the sample an
# independent frame, so columns added to it later never touch summ_data.
summ_data_samp = (summ_data if SAMPLE_SIZE is None else summ_data.iloc[:SAMPLE_SIZE]).copy()

## Test run with the model - Baseline

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Pick the compute device up front: CUDA when it is usable, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Fetch the pretrained T5-small tokenizer and seq2seq model, placing the
# model on the selected device as soon as it is loaded
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)


# Baseline helper: ask T5 for catchphrases from one case's selected sentences
def generate_catchphrases_gpu(sentence, max_length=50, min_length=5):
    """Generate a catchphrase string for one piece of case text.

    A fixed instruction is prepended to ``sentence``; the combined text is
    tokenized with truncation enabled, moved to ``device`` and passed to
    ``model.generate`` (beam search with 4 beams, ``length_penalty=2.0``,
    early stopping). The first returned sequence is decoded with special
    tokens skipped.

    Args:
        sentence: Text to extract catchphrases from.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        min_length: Forwarded to ``model.generate`` as ``min_length``.

    Returns:
        The decoded model output as a string, or ``""`` when ``sentence`` is
        not a string (e.g. ``None`` or a float NaN) or contains only
        whitespace.
    """
    # pandas represents a missing CSV cell as float NaN, which cannot be
    # concatenated to the prompt; a blank text would make the model generate
    # from the instruction alone. Neither can yield meaningful catchphrases.
    if not isinstance(sentence, str) or not sentence.strip():
        return ""

    # Preprocess the input for T5
    input_text = "Extract the main legal issues and significant phrases from this legal case in a list of phrases: " + sentence
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True).to(device)

    # Generate the output
    summary_ids = model.generate(
        input_ids,
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )

    # Decode the generated summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Apply summarization to one row in the dataset
# Baseline smoke test: only the row labelled 0 is run through the model.
selected_sentence = summ_data_samp.loc[0, "selected_sentences"]
catchphrase = generate_catchphrases_gpu(selected_sentence)

# Add the result to the DataFrame
# Only row 0 receives a value in "predicted_catchphrases"; the other rows
# of the sample are not assigned here.
summ_data_samp.loc[0, "predicted_catchphrases"] = catchphrase
# Bare expression so the notebook renders the sample frame as the cell output.
summ_data_samp

Unnamed: 0,filename,name,AustLII,catchphrases,sentences,word_count,num_catchphrases,total_tokens,selected_sentences,predicted_catchphrases
0,08_1056.xml,University of Western Australia v Gray (No 21)...,http://www.austlii.edu.au/au/cases/cth/FCA/200...,"['indemnity costs', 'calderbank letter', 'refu...",1 On 17 April 2008 the application brought by ...,5488,7,29,32 The general principles governing the exerci...,"of a ""Calderbank offer"" does not itself warran..."
1,09_930.xml,Parker v Parker [2009] FCA 930 (25 August 2009),http://www.austlii.edu.au/au/cases/cth/FCA/200...,"['judicial review', 'application brought pursu...",INTRODUCTION \n The applicant seeks review of ...,5046,8,62,INTRODUCTION \n The applicant seeks review of ...,
2,06_1301.xml,CSL Limited v GlaxoSmithKline Australia Pty Lt...,http://www.austlii.edu.au/au/cases/cth/FCA/200...,"['interlocutory injunction', 'quia timet injun...","1 The applicant, CSL Limited (""CSL""), seeks an...",9115,7,68,56 It is often said that where an interlocutor...,
3,08_1730.xml,Deputy Commissioner of Taxation v BK Ganter Ho...,http://www.austlii.edu.au/au/cases/cth/FCA/200...,"['winding up', 'neglect to pay debt after stat...",1 This matter came on for hearing before me ye...,3993,10,89,Inferentially from the submissions made to me ...,
4,06_1473.xml,Paligorov v Cohen [2006] FCA 1473 (10 November...,http://www.austlii.edu.au/au/cases/cth/FCA/200...,['appeal from sequestration order of federal m...,1 This appeal is from a judgment of Federal Ma...,2668,6,94,The third issue is whether his Honour erred in...,
5,09_924.xml,"Zoia v Secretary, Department of Education, Emp...",http://www.austlii.edu.au/au/cases/cth/FCA/200...,['appeal from decision of administrative appea...,INTRODUCTION \n The applicant (Mr Zoia) 'appea...,3485,7,93,"On 4 September 2008, the Administrative Appeal...",
6,06_355.xml,"Dana Australia (Holdings) Pty Ltd, in the matt...",http://www.austlii.edu.au/au/cases/cth/FCA/200...,"['extension of times fixed by class order', 'w...",1 Section 1322(4)(d) of the Corporations Act 2...,1950,3,21,"Put another way, when a time fixed by the Corp...",
7,07_1329.xml,"Hayes, in the matter of Estate Property Group ...",http://www.austlii.edu.au/au/cases/cth/FCA/200...,['second extension of convening period for sec...,1 On 15 June 2007 I made orders extending the ...,1943,4,28,On 13 July 2007 the convening period for the s...,
8,06_427.xml,Australian Securities &amp; Investments Commis...,http://www.austlii.edu.au/au/cases/cth/FCA/200...,"['unregistered managed investment scheme', 'un...",1 This proceeding was commenced by the Austral...,9513,6,34,Under this provision an interest in a managed ...,
9,06_341.xml,"Lawrance v Chief Executive Officer, CRS Austra...",http://www.austlii.edu.au/au/cases/cth/FCA/200...,"['review of administrative appeals tribunal (""...",Introduction \n \n 1 This is an appeal under s...,5529,13,84,Introduction \n \n 1 This is an appeal under s...,


## Experiment with various prompts to find the one that gives the best results

In [None]:
# Prompt-comparison helper: same generation settings, caller-supplied prompt
def generate_key_phrases_with_prompt(sentence, prompt, max_length=100, min_length=10):
    """Generate key phrases for ``sentence`` using a caller-supplied prompt.

    The model input is ``prompt`` and ``sentence`` joined by a single space.
    It is tokenized with truncation enabled, moved to ``device`` and passed
    to ``model.generate`` (beam search with 4 beams, ``length_penalty=2.0``,
    early stopping). The first returned sequence is decoded with special
    tokens skipped and then split on commas.

    Args:
        sentence: Text to extract key phrases from.
        prompt: Instruction text placed in front of ``sentence``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        min_length: Forwarded to ``model.generate`` as ``min_length``.

    Returns:
        A list of whitespace-stripped, non-empty phrases. The list is empty
        when ``sentence`` is not a string (e.g. ``None`` or a float NaN) or
        contains only whitespace.
    """
    # Without this guard a missing value would be formatted into the prompt
    # as literal text such as "nan" or "None", and the model would be asked
    # to extract phrases from that.
    if not isinstance(sentence, str) or not sentence.strip():
        return []

    # Preprocess the input with the given prompt
    input_text = f"{prompt} {sentence}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True).to(device)

    # Generate the output
    output_ids = model.generate(
        input_ids,
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )

    # Decode and split into a list of phrases, dropping the empty pieces that
    # leading, trailing or doubled commas would otherwise leave behind
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    stripped_pieces = (piece.strip() for piece in output.split(","))
    return [phrase for phrase in stripped_pieces if phrase]

# Row of the sample used for the prompt comparison. Defined once so the input
# text and the reference catchphrases always come from the same case.
SAMPLE_ROW = 8

# Example sentence (selected sentences from your dataset) and its reference catchphrases
selected_sentence = summ_data_samp.loc[SAMPLE_ROW, "selected_sentences"]
actual_catchphrases = summ_data_samp.loc[SAMPLE_ROW, "catchphrases"]

# Test different prompts
prompts = [
    "Identify the key points of this legal case as a list of phrases:",
    "Extract the main legal issues and significant phrases from this legal case:",
    "List the most important phrases related to this case:",
    "Focus on extracting concise and relevant key phrases from the following legal document:",
    "Generate an abstract list of important phrases that summarize the case document:",
    "Summarize the legal case as a list of abstracted key phrases:"
]
# This can also be tried
# "Generate an abstract list of important phrases and main legal issues that summarize the case document:"

# Store results for each prompt (prompt text -> list of generated key phrases)
results = {
    prompt: generate_key_phrases_with_prompt(selected_sentence, prompt)
    for prompt in prompts
}

# Display results for comparison, each next to the reference catchphrases
for prompt, key_phrases in results.items():
    print(f"Prompt: {prompt}\nKey Phrases: {key_phrases}\nActual catchphrase: {actual_catchphrases}\n")


Prompt: Identify the key points of this legal case as a list of phrases:
Key Phrases: ['Defendants contravened section 601ED(5) of the Corporations Act 2001 by operating an unregistered managed investment scheme. Section 601ED(5) provides that a person must not operate in this jurisdiction a managed investment scheme that is required to be registered under s 601EB unless the scheme is so registered. the proceeding seeks relief in respect of an unregistered managed investment scheme known as "the Scheme"']
Actual catchphrase: ['unregistered managed investment scheme', 'unlicensed financial services business', 'whether declarations appropriate', 'whether scope of injunctive relief appropriate', 'whether winding up of company necessary', 'corporations']

Prompt: Extract the main legal issues and significant phrases from this legal case:
Key Phrases: ['to operate a managed investment scheme that is not a registered scheme. Section 601ED(5) provides that a person must not operate in this ju

### FINAL_CHOSEN_PROMPT = **Generate an abstract list of important phrases that summarize the case document:**