# Data Check

In [1]:
import json
from pathlib import Path

import random

In [2]:
import os

# Project root. Defaults to the original external-drive location; set the
# LLM_ALIGNMENT_HOME environment variable to run the notebook from another machine
# without editing this cell.
HOME_DIR = Path(os.environ.get(
    "LLM_ALIGNMENT_HOME",
    "/Volumes/ssd/01-ckj-postdoc/LLM-alignment-data-generation/",
))
# JSON Lines file with the question/answer records loaded below.
data_path = HOME_DIR / 'public-data' / 'open-australian-legal-qa' / 'qa.jsonl'

In [3]:
# Read the QA file (one JSON object per line) into a list of dicts.
data = []
if data_path.exists():
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank or trailing empty line would make json.loads raise.
                continue
            data.append(json.loads(line))
else:
    print(f"File not found: {data_path}")

# Preview the first five records.
for i, item in enumerate(data[:5], start=1):
    print(f"Entry {i}: {item}")

Entry 1: {'question': "In the case of Nasr v NRMA Insurance [2006] NSWSC 1018, why was the plaintiff's appeal lodged out of time?", 'answer': "In Nasr v NRMA Insurance [2006] NSWSC 1018, the plaintiff's appeal was lodged out of time because the summons was filed on 8 June 2006, seven months after the decision of the Local Court was made on 4 October 2005. No explanation was provided for this delay.", 'text': "Question: In the case of Nasr v NRMA Insurance [2006] NSWSC 1018, why was the plaintiff's appeal lodged out of time?\nAnswer: In Nasr v NRMA Insurance [2006] NSWSC 1018, the plaintiff's appeal was lodged out of time because the summons was filed on 8 June 2006, seven months after the decision of the Local Court was made on 4 October 2005. No explanation was provided for this delay.", 'prompt': "# Snippet\nThe snippet from an Australian legal document from which you must synthesise a question and answer is provided below.\n<document_metadata>\n<document_title>Nasr v NRMA Insurance 

In [4]:
# Keep only the question text of every loaded record.
questions = []
for record in data:
    questions.append(record['question'])

# Show the first five questions, numbered from 1.
for number, text in enumerate(questions[:5], start=1):
    print(f"Question {number}: {text}")

Question 1: In the case of Nasr v NRMA Insurance [2006] NSWSC 1018, why was the plaintiff's appeal lodged out of time?
Question 2: In the case of R v NGUYEN [2001] NSWCCA 334, what was the relationship between the Appellant and Mr Nguyen, and what activities of Mr Nguyen did the Appellant testify about?
Question 3: In the case of Moore v Scenic Tours Pty Ltd [2015] NSWSC 237, what was the court's decision regarding the motion to restrain a firm from acting?
Question 4: What were the circumstances and outcomes of the case Inspector Phillip Estreich v Hannas Civil Engineering Pty Ltd and Afram Hanna [2009] NSWIRComm 168 in New South Wales?
Question 5: In the case of Ruddock v Vadarlis [2001] FCA 1329, what was the argument of the Commonwealth regarding the application of habeas corpus and how was it received?


# GPT-4

In [5]:
import openai
import pandas as pd
from pathlib import Path
import json
import random
from typing import List, Dict, Tuple

In [6]:
!pip install openai==0.28


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [7]:
def load_api_key(file_path: str) -> str:
    """Load OpenAI API key from a file.

    Args:
        file_path: Path of a text file whose content is the key. Surrounding
            whitespace, including the trailing newline, is stripped.

    Returns:
        The key string.

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError).
        ValueError: If the file is empty or holds only whitespace.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        api_key = file.read().strip()
    if not api_key:
        # Fail here rather than letting every later API call fail with an empty key.
        raise ValueError(f"API key file is empty: {file_path}")
    return api_key

In [8]:
class LegalResponseGenerator:
    """Build accepted/rejected response pairs for legal questions via the OpenAI chat API.

    Each entry of ``self.rules`` describes one linguistic rule with an
    "accepted" and a "rejected" style instruction. Every question is sent to the
    model once per instruction, and the two replies form one dataset row.
    """

    # System message sent with every request.
    SYSTEM_PROMPT = "You are a legal assistant specializing in Australian law cases. Your task is to provide legally accurate responses while following specific linguistic style requirements. The meaning and legal content must remain consistent regardless of the linguistic style used."

    # Column order of the frames built by generate_dataset (and of the CSV files).
    COLUMNS = ['question', 'accepted_response', 'rejected_response', 'rule']

    def __init__(self, api_key_path: str, model: str = "gpt-4",
                 temperature: float = 0.7, max_tokens: int = 150):
        """Initialize the generator with path to API key file.

        Args:
            api_key_path: File containing the OpenAI API key.
            model: Chat model name passed to the API.
            temperature: Sampling temperature passed to the API.
            max_tokens: Completion token limit passed to the API.
        """
        # The key is stored on the openai module, so it is shared by every
        # generator created in this kernel.
        openai.api_key = load_api_key(api_key_path)
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.rules = {
            "pronouns": {
                "name": "Personal Pronouns Rule",
                "accepted": "Use personal pronouns, including inclusive 'we'",
                "rejected": "Avoid using any personal pronouns"
            },
            "voice": {
                "name": "Voice Rule",
                "accepted": "Use active voice",
                "rejected": "Use passive voice"
            },
            "tense": {
                "name": "Tense Rule",
                "accepted": "Use present tense",
                "rejected": "Use past tense"
            },
            "mood": {
                "name": "Mood Rule",
                "accepted": "Use polite imperative mood",
                "rejected": "Use imperative mood"
            },
            "emotional": {
                "name": "Emotional Words Rule",
                "accepted": "Use emotional words and cognitive verbs (like feel, understand, think, believe)",
                "rejected": "Avoid using any emotional words or cognitive verbs"
            }
        }

    def generate_prompt(self, question: str, rule_key: str, is_accepted: bool) -> str:
        """Generate a prompt that emphasizes maintaining the same meaning.

        Args:
            question: The legal question to answer.
            rule_key: Key into ``self.rules`` selecting the linguistic rule.
            is_accepted: True for the rule's "accepted" instruction, False for
                its "rejected" one.

        Returns:
            The user prompt text.

        Raises:
            KeyError: If ``rule_key`` is not in ``self.rules``.
        """
        rule = self.rules[rule_key]
        style = "accepted" if is_accepted else "rejected"

        # Assembled line by line so that the indentation of this source file
        # does not end up inside the text sent to the model.
        prompt_lines = [
            "As a legal assistant, provide a response to the following legal question.",
            "",
            "Your response must follow these requirements:",
            "1. MOST IMPORTANT: The factual content and legal meaning of your response must be clear and complete",
            f"2. Linguistic Style Rule: {rule[style]}",
            "",
            "Critical Instructions:",
            "- Your response should convey EXACTLY THE SAME legal information and meaning as you would normally provide",
            "- ONLY the linguistic style should change, not the underlying meaning or legal content",
            "- Keep the response concise and limited to a single sentence",
            "- Focus on answering the legal question while maintaining the required linguistic style",
            "",
            f"Question: {question}",
        ]
        return "\n".join(prompt_lines)

    def get_gpt4_response(self, prompt: str) -> str:
        """Get response from the configured chat model.

        Returns:
            The stripped reply text, or "" if the request raised. The error is
            printed instead of re-raised, so a failed call shows up downstream
            as an empty response rather than stopping the run.
        """
        try:
            response = openai.ChatCompletion.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": self.SYSTEM_PROMPT},
                    {"role": "user", "content": prompt},
                ],
                temperature=self.temperature,
                max_tokens=self.max_tokens,
            )
            return response['choices'][0]['message']['content'].strip()
        except Exception as e:
            print(f"Error getting GPT-4 response: {e}")
            return ""

    def load_questions(self, file_path: str) -> List[str]:
        """Load questions from JSONL file.

        Each non-blank line must be a JSON object with a 'question' key.
        Blank lines are skipped.
        """
        questions = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                questions.append(json.loads(line)['question'])
        return questions

    def generate_dataset(self, questions: List[str], rule_key: str) -> pd.DataFrame:
        """Generate pairs of responses for a specific rule.

        Returns:
            A frame with the columns listed in ``COLUMNS``, one row per
            question. The columns are present even when ``questions`` is empty.
        """
        rule_name = self.rules[rule_key]['name']
        rows = []
        for i, question in enumerate(questions, 1):
            print(f"Processing question {i}/{len(questions)} for {rule_name}")
            accepted, rejected = self.generate_response_pair(question, rule_key)
            rows.append({
                'question': question,
                'accepted_response': accepted,
                'rejected_response': rejected,
                'rule': rule_name
            })
        # Explicit columns keep the header in place for an empty result.
        return pd.DataFrame(rows, columns=self.COLUMNS)

    def generate_response_pair(self, question: str, rule_key: str) -> Tuple[str, str]:
        """Generate a pair of responses for a given question and rule.

        Returns:
            ``(accepted_response, rejected_response)``.
        """
        accepted_prompt = self.generate_prompt(question, rule_key, True)
        rejected_prompt = self.generate_prompt(question, rule_key, False)

        accepted_response = self.get_gpt4_response(accepted_prompt)
        rejected_response = self.get_gpt4_response(rejected_prompt)

        return accepted_response, rejected_response

    def process_all_rules(self, data_path: str, output_dir: str, sample_size: int = None):
        """Process all rules and save results to separate CSV files.

        Args:
            data_path: JSONL file of questions (a str or a pathlib.Path).
            output_dir: Directory for the ``responses_<rule_key>.csv`` files;
                created if missing.
            sample_size: Use only the first ``sample_size`` questions. None
                (the default) uses all of them; 0 uses none.

        Raises:
            ValueError: If ``sample_size`` is negative.
        """
        questions = self.load_questions(data_path)

        # Compare against None: a plain truthiness test would treat 0 as
        # "no limit" and send the whole file to the API.
        if sample_size is not None:
            if sample_size < 0:
                raise ValueError(f"sample_size must be non-negative, got {sample_size}")
            questions = questions[:sample_size]

        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        for rule_key in self.rules:
            print(f"\nProcessing {self.rules[rule_key]['name']}...")
            df = self.generate_dataset(questions, rule_key)
            output_path = output_dir / f'responses_{rule_key}.csv'
            df.to_csv(output_path, index=False)
            print(f"Saved to {output_path}")

In [9]:
import os

# Project root for this section. Override it with the LLM_ALIGNMENT_HOME environment
# variable instead of editing the hard-coded default.
HOME_DIR = Path(os.environ.get(
    "LLM_ALIGNMENT_HOME",
    "/Volumes/ssd/01-ckj-postdoc/LLM-alignment-data-generation/",
))

In [10]:
!pwd

/Volumes/ssd/01-ckj-postdoc/LLM-alignment-data-generation/scripts


In [11]:
# 'api_key.txt' is a relative path, so it is resolved against the kernel's current
# working directory.
generator = LegalResponseGenerator('api_key.txt')

In [12]:
!ls

api_key.txt              langchain_test.ipynb     lexical-frequency.ipynb
counsel-gpt-family.ipynb legal-gpt-family.ipynb   [34muntitled folder[m[m


In [13]:
# Generate accepted/rejected pairs for the first 30 questions under every rule;
# one CSV per rule is written into the output directory.
generator.process_all_rules(
    data_path=HOME_DIR / 'public-data' / 'open-australian-legal-qa' / 'qa.jsonl',
    output_dir=HOME_DIR / 'generated-data' / 'open-australian-legal-qa' / 'GPT-4' / 'test2',
    sample_size=30,
)


Processing Personal Pronouns Rule...
Processing question 1/30 for Personal Pronouns Rule
Processing question 2/30 for Personal Pronouns Rule
Processing question 3/30 for Personal Pronouns Rule
Processing question 4/30 for Personal Pronouns Rule
Processing question 5/30 for Personal Pronouns Rule
Processing question 6/30 for Personal Pronouns Rule
Processing question 7/30 for Personal Pronouns Rule
Processing question 8/30 for Personal Pronouns Rule
Processing question 9/30 for Personal Pronouns Rule
Processing question 10/30 for Personal Pronouns Rule
Processing question 11/30 for Personal Pronouns Rule
Processing question 12/30 for Personal Pronouns Rule
Processing question 13/30 for Personal Pronouns Rule
Processing question 14/30 for Personal Pronouns Rule
Processing question 15/30 for Personal Pronouns Rule
Processing question 16/30 for Personal Pronouns Rule
Processing question 17/30 for Personal Pronouns Rule
Processing question 18/30 for Personal Pronouns Rule
Processing questi

# InstructGPT

In [14]:
import openai
import pandas as pd
from pathlib import Path
import json
import random
from typing import List, Dict, Tuple

In [15]:
class LegalResponseGenerator:
    """Build accepted/rejected response pairs for legal questions via the OpenAI chat API.

    This variant sends a single user message: the assistant role description
    is part of the prompt text instead of a separate system message. Each entry
    of ``self.rules`` describes one linguistic rule with an "accepted" and a
    "rejected" style instruction. Every question is sent once per instruction.
    """

    # Column order of the frames built by generate_dataset (and of the CSV files).
    COLUMNS = ['question', 'accepted_response', 'rejected_response', 'rule']

    def __init__(self, api_key_path: str, model: str = "gpt-3.5-turbo",
                 temperature: float = 0.7, max_tokens: int = 150):
        """Initialize the generator with path to API key file.

        Args:
            api_key_path: File containing the OpenAI API key.
            model: Chat model name passed to the API.
            temperature: Sampling temperature passed to the API.
            max_tokens: Completion token limit passed to the API.
        """
        # The key is stored on the openai module, so it is shared by every
        # generator created in this kernel.
        openai.api_key = load_api_key(api_key_path)
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.rules = {
            "pronouns": {
                "name": "Personal Pronouns Rule",
                "accepted": "Use personal pronouns, including inclusive 'we'",
                "rejected": "Avoid using any personal pronouns"
            },
            "voice": {
                "name": "Voice Rule",
                "accepted": "Use active voice",
                "rejected": "Use passive voice"
            },
            "tense": {
                "name": "Tense Rule",
                "accepted": "Use present tense",
                "rejected": "Use past tense"
            },
            "mood": {
                "name": "Mood Rule",
                "accepted": "Use polite imperative mood",
                "rejected": "Use imperative mood"
            },
            "emotional": {
                "name": "Emotional Words Rule",
                "accepted": "Use emotional words and cognitive verbs (like feel, understand, think, believe)",
                "rejected": "Avoid using any emotional words or cognitive verbs"
            }
        }

    def generate_prompt(self, question: str, rule_key: str, is_accepted: bool) -> str:
        """Generate a prompt that emphasizes maintaining the same meaning.

        Args:
            question: The legal question to answer.
            rule_key: Key into ``self.rules`` selecting the linguistic rule.
            is_accepted: True for the rule's "accepted" instruction, False for
                its "rejected" one.

        Returns:
            The full prompt text, starting with the assistant role description.

        Raises:
            KeyError: If ``rule_key`` is not in ``self.rules``.
        """
        rule = self.rules[rule_key]
        style = "accepted" if is_accepted else "rejected"

        # The literal starts at column 0 on purpose: any indentation here would
        # become part of the text sent to the model.
        prompt = f"""You are a legal assistant specializing in Australian law cases. Your task is to provide legally accurate responses while following specific linguistic style requirements. The meaning and legal content must remain consistent regardless of the linguistic style used.

As a legal assistant, provide a response to the following legal question.

Your response must follow these requirements:
1. MOST IMPORTANT: The factual content and legal meaning of your response must be clear and complete
2. Linguistic Style Rule: {rule[style]}

Critical Instructions:
- Your response should convey EXACTLY THE SAME legal information and meaning as you would normally provide
- ONLY the linguistic style should change, not the underlying meaning or legal content
- Keep the response concise and limited to a single sentence
- Focus on answering the legal question while maintaining the required linguistic style

Question: {question}"""

        return prompt

    def get_instruct_response(self, prompt: str) -> str:
        """Get response from the configured chat model.

        Returns:
            The stripped reply text, or "" if the request raised. The error is
            printed instead of re-raised, so a failed call shows up downstream
            as an empty response rather than stopping the run.
        """
        try:
            response = openai.ChatCompletion.create(
                model=self.model,
                messages=[
                    {"role": "user", "content": prompt}
                ],
                temperature=self.temperature,
                max_tokens=self.max_tokens,
            )
            return response.choices[0].message['content'].strip()
        except Exception as e:
            print(f"Error getting GPT response: {e}")
            return ""

    def load_questions(self, file_path: str) -> List[str]:
        """Load questions from JSONL file.

        Each non-blank line must be a JSON object with a 'question' key.
        Blank lines are skipped.
        """
        questions = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                questions.append(json.loads(line)['question'])
        return questions

    def generate_dataset(self, questions: List[str], rule_key: str) -> pd.DataFrame:
        """Generate pairs of responses for a specific rule.

        Returns:
            A frame with the columns listed in ``COLUMNS``, one row per
            question. The columns are present even when ``questions`` is empty.
        """
        rule_name = self.rules[rule_key]['name']
        rows = []
        for i, question in enumerate(questions, 1):
            print(f"Processing question {i}/{len(questions)} for {rule_name}")
            accepted, rejected = self.generate_response_pair(question, rule_key)
            rows.append({
                'question': question,
                'accepted_response': accepted,
                'rejected_response': rejected,
                'rule': rule_name
            })
        # Explicit columns keep the header in place for an empty result.
        return pd.DataFrame(rows, columns=self.COLUMNS)

    def generate_response_pair(self, question: str, rule_key: str) -> Tuple[str, str]:
        """Generate a pair of responses for a given question and rule.

        Returns:
            ``(accepted_response, rejected_response)``.
        """
        accepted_prompt = self.generate_prompt(question, rule_key, True)
        rejected_prompt = self.generate_prompt(question, rule_key, False)

        accepted_response = self.get_instruct_response(accepted_prompt)
        rejected_response = self.get_instruct_response(rejected_prompt)

        return accepted_response, rejected_response

    def process_all_rules(self, data_path: str, output_dir: str, sample_size: int = None):
        """Process all rules and save results to separate CSV files.

        Args:
            data_path: JSONL file of questions (a str or a pathlib.Path).
            output_dir: Directory for the ``responses_<rule_key>.csv`` files;
                created if missing.
            sample_size: Use only the first ``sample_size`` questions. None
                (the default) uses all of them; 0 uses none.

        Raises:
            ValueError: If ``sample_size`` is negative.
        """
        questions = self.load_questions(data_path)

        # Compare against None: a plain truthiness test would treat 0 as
        # "no limit" and send the whole file to the API.
        if sample_size is not None:
            if sample_size < 0:
                raise ValueError(f"sample_size must be non-negative, got {sample_size}")
            questions = questions[:sample_size]

        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        for rule_key in self.rules:
            print(f"\nProcessing {self.rules[rule_key]['name']}...")
            df = self.generate_dataset(questions, rule_key)
            output_path = output_dir / f'responses_{rule_key}.csv'
            df.to_csv(output_path, index=False)
            print(f"Saved to {output_path}")

In [16]:
# LegalResponseGenerator is defined more than once in this notebook; the name refers to
# whichever definition cell ran last, so run the class cell of this section first.
instruct_gpt_generator = LegalResponseGenerator('api_key.txt')

In [17]:
# Generate accepted/rejected pairs for the first 30 questions under every rule;
# one CSV per rule is written into the output directory.
instruct_gpt_generator.process_all_rules(
    data_path=HOME_DIR / 'public-data' / 'open-australian-legal-qa' / 'qa.jsonl',
    output_dir=HOME_DIR / 'generated-data' / 'open-australian-legal-qa' / 'InstructGPT' / 'test',
    sample_size=30,
)


Processing Personal Pronouns Rule...
Processing question 1/30 for Personal Pronouns Rule
Processing question 2/30 for Personal Pronouns Rule
Processing question 3/30 for Personal Pronouns Rule
Processing question 4/30 for Personal Pronouns Rule
Processing question 5/30 for Personal Pronouns Rule
Processing question 6/30 for Personal Pronouns Rule
Processing question 7/30 for Personal Pronouns Rule
Processing question 8/30 for Personal Pronouns Rule
Processing question 9/30 for Personal Pronouns Rule
Processing question 10/30 for Personal Pronouns Rule
Processing question 11/30 for Personal Pronouns Rule
Processing question 12/30 for Personal Pronouns Rule
Processing question 13/30 for Personal Pronouns Rule
Processing question 14/30 for Personal Pronouns Rule
Processing question 15/30 for Personal Pronouns Rule
Processing question 16/30 for Personal Pronouns Rule
Processing question 17/30 for Personal Pronouns Rule
Processing question 18/30 for Personal Pronouns Rule
Processing questi

# GPT-4o

In [18]:
import openai
import pandas as pd
from pathlib import Path
import json
import random
from typing import List, Dict, Tuple

In [19]:
class LegalResponseGenerator:
    """Build accepted/rejected response pairs for legal questions via the OpenAI chat API.

    Each entry of ``self.rules`` describes one linguistic rule with an
    "accepted" and a "rejected" style instruction. Every question is sent to the
    model once per instruction, and the two replies form one dataset row.
    """

    # System message sent with every request.
    SYSTEM_PROMPT = "You are a legal assistant specializing in Australian law cases. Your task is to provide legally accurate responses while following specific linguistic style requirements. The meaning and legal content must remain consistent regardless of the linguistic style used."

    # Column order of the frames built by generate_dataset (and of the CSV files).
    COLUMNS = ['question', 'accepted_response', 'rejected_response', 'rule']

    def __init__(self, api_key_path: str, model: str = "gpt-4o",
                 temperature: float = 0.7, max_tokens: int = 150):
        """Initialize the generator with path to API key file.

        Args:
            api_key_path: File containing the OpenAI API key.
            model: Chat model name passed to the API. Defaults to "gpt-4o" so
                the model matches this section's heading and its 'GPT-4o'
                output directory. Pass "gpt-4-0125-preview" to reproduce the
                previously hard-coded model.
            temperature: Sampling temperature passed to the API.
            max_tokens: Completion token limit passed to the API.
        """
        # The key is stored on the openai module, so it is shared by every
        # generator created in this kernel.
        openai.api_key = load_api_key(api_key_path)
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.rules = {
            "pronouns": {
                "name": "Personal Pronouns Rule",
                "accepted": "Use personal pronouns, including inclusive 'we'",
                "rejected": "Avoid using any personal pronouns"
            },
            "voice": {
                "name": "Voice Rule",
                "accepted": "Use active voice",
                "rejected": "Use passive voice"
            },
            "tense": {
                "name": "Tense Rule",
                "accepted": "Use present tense",
                "rejected": "Use past tense"
            },
            "mood": {
                "name": "Mood Rule",
                "accepted": "Use polite imperative mood",
                "rejected": "Use imperative mood"
            },
            "emotional": {
                "name": "Emotional Words Rule",
                "accepted": "Use emotional words and cognitive verbs (like feel, understand, think, believe)",
                "rejected": "Avoid using any emotional words or cognitive verbs"
            }
        }

    def generate_prompt(self, question: str, rule_key: str, is_accepted: bool) -> str:
        """Generate a prompt that emphasizes maintaining the same meaning.

        Args:
            question: The legal question to answer.
            rule_key: Key into ``self.rules`` selecting the linguistic rule.
            is_accepted: True for the rule's "accepted" instruction, False for
                its "rejected" one.

        Returns:
            The user prompt text.

        Raises:
            KeyError: If ``rule_key`` is not in ``self.rules``.
        """
        rule = self.rules[rule_key]
        style = "accepted" if is_accepted else "rejected"

        # Assembled line by line so that the indentation of this source file
        # does not end up inside the text sent to the model.
        prompt_lines = [
            "As a legal assistant, provide a response to the following legal question.",
            "",
            "Your response must follow these requirements:",
            "1. MOST IMPORTANT: The factual content and legal meaning of your response must be clear and complete",
            f"2. Linguistic Style Rule: {rule[style]}",
            "",
            "Critical Instructions:",
            "- Your response should convey EXACTLY THE SAME legal information and meaning as you would normally provide",
            "- ONLY the linguistic style should change, not the underlying meaning or legal content",
            "- Keep the response concise and limited to a single sentence",
            "- Focus on answering the legal question while maintaining the required linguistic style",
            "",
            f"Question: {question}",
        ]
        return "\n".join(prompt_lines)

    def get_gpt4_response(self, prompt: str) -> str:
        """Get response from the configured chat model.

        Returns:
            The stripped reply text, or "" if the request raised. The error is
            printed instead of re-raised, so a failed call shows up downstream
            as an empty response rather than stopping the run.
        """
        try:
            response = openai.ChatCompletion.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": self.SYSTEM_PROMPT},
                    {"role": "user", "content": prompt},
                ],
                temperature=self.temperature,
                max_tokens=self.max_tokens,
            )
            return response.choices[0].message['content'].strip()
        except Exception as e:
            print(f"Error getting GPT-4 response: {e}")
            return ""

    def load_questions(self, file_path: str) -> List[str]:
        """Load questions from JSONL file.

        Each non-blank line must be a JSON object with a 'question' key.
        Blank lines are skipped.
        """
        questions = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                questions.append(json.loads(line)['question'])
        return questions

    def generate_dataset(self, questions: List[str], rule_key: str) -> pd.DataFrame:
        """Generate pairs of responses for a specific rule.

        Returns:
            A frame with the columns listed in ``COLUMNS``, one row per
            question. The columns are present even when ``questions`` is empty.
        """
        rule_name = self.rules[rule_key]['name']
        rows = []
        for i, question in enumerate(questions, 1):
            print(f"Processing question {i}/{len(questions)} for {rule_name}")
            accepted, rejected = self.generate_response_pair(question, rule_key)
            rows.append({
                'question': question,
                'accepted_response': accepted,
                'rejected_response': rejected,
                'rule': rule_name
            })
        # Explicit columns keep the header in place for an empty result.
        return pd.DataFrame(rows, columns=self.COLUMNS)

    def generate_response_pair(self, question: str, rule_key: str) -> Tuple[str, str]:
        """Generate a pair of responses for a given question and rule.

        Returns:
            ``(accepted_response, rejected_response)``.
        """
        accepted_prompt = self.generate_prompt(question, rule_key, True)
        rejected_prompt = self.generate_prompt(question, rule_key, False)

        accepted_response = self.get_gpt4_response(accepted_prompt)
        rejected_response = self.get_gpt4_response(rejected_prompt)

        return accepted_response, rejected_response

    def process_all_rules(self, data_path: str, output_dir: str, sample_size: int = None):
        """Process all rules and save results to separate CSV files.

        Args:
            data_path: JSONL file of questions (a str or a pathlib.Path).
            output_dir: Directory for the ``responses_<rule_key>.csv`` files;
                created if missing.
            sample_size: Use only the first ``sample_size`` questions. None
                (the default) uses all of them; 0 uses none.

        Raises:
            ValueError: If ``sample_size`` is negative.
        """
        questions = self.load_questions(data_path)

        # Compare against None: a plain truthiness test would treat 0 as
        # "no limit" and send the whole file to the API.
        if sample_size is not None:
            if sample_size < 0:
                raise ValueError(f"sample_size must be non-negative, got {sample_size}")
            questions = questions[:sample_size]

        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        for rule_key in self.rules:
            print(f"\nProcessing {self.rules[rule_key]['name']}...")
            df = self.generate_dataset(questions, rule_key)
            output_path = output_dir / f'responses_{rule_key}.csv'
            df.to_csv(output_path, index=False)
            print(f"Saved to {output_path}")

In [20]:
# LegalResponseGenerator is defined more than once in this notebook; the name refers to
# whichever definition cell ran last, so run the class cell of this section first.
gpt4o_generator = LegalResponseGenerator('api_key.txt')

In [21]:
# Generate accepted/rejected pairs for the first 20 questions under every rule;
# one CSV per rule is written into the output directory.
gpt4o_generator.process_all_rules(
    data_path=HOME_DIR / 'public-data' / 'open-australian-legal-qa' / 'qa.jsonl',
    output_dir=HOME_DIR / 'generated-data' / 'open-australian-legal-qa' / 'GPT-4o' / 'test',
    sample_size=20,
)


Processing Personal Pronouns Rule...
Processing question 1/20 for Personal Pronouns Rule
Processing question 2/20 for Personal Pronouns Rule
Processing question 3/20 for Personal Pronouns Rule
Processing question 4/20 for Personal Pronouns Rule
Processing question 5/20 for Personal Pronouns Rule
Processing question 6/20 for Personal Pronouns Rule
Processing question 7/20 for Personal Pronouns Rule
Processing question 8/20 for Personal Pronouns Rule
Processing question 9/20 for Personal Pronouns Rule
Processing question 10/20 for Personal Pronouns Rule
Processing question 11/20 for Personal Pronouns Rule
Processing question 12/20 for Personal Pronouns Rule
Processing question 13/20 for Personal Pronouns Rule
Processing question 14/20 for Personal Pronouns Rule
Processing question 15/20 for Personal Pronouns Rule
Processing question 16/20 for Personal Pronouns Rule
Processing question 17/20 for Personal Pronouns Rule
Processing question 18/20 for Personal Pronouns Rule
Processing questi

# Preprocessing and Uploading Generated Data to Hugging Face

In [14]:
def build_dataset(tokenizer, dataset_name, input_min_text_length=2, input_max_text_length=8,
                  max_length=2048, num_proc=4):
    """Load a question dataset and attach accepted/rejected prompt variants.

    For every entry of the "question" column two queries are built, one with
    the accepted system text and one with the rejected system text. Each is
    terminated with the tokenizer's EOS token and tokenized. Examples whose
    tokenized queries reach ``max_length`` are dropped.

    ``load_dataset``, ``generate_accepted_prompt`` and ``generate_rejected_prompt``
    are looked up in the notebook's global namespace when this function is
    called, so they must be imported or defined before the first call.

    Args:
        tokenizer: Callable tokenizer exposing ``eos_token``; calling it on a
            string must return a mapping with an "input_ids" entry.
        dataset_name: Identifier passed to ``load_dataset``.
        input_min_text_length: Unused; kept for interface compatibility.
        input_max_text_length: Unused; kept for interface compatibility.
        max_length: Exclusive upper bound on the token count of each query.
        num_proc: Number of worker processes used by ``map``.

    Returns:
        The mapped and filtered dataset with its format set to "torch". It has
        four added columns: "accepted", "rejected", "accepted_input_ids" and
        "rejected_input_ids".

    Raises:
        ValueError: If the tokenizer has no EOS token.
    """
    eos_token = tokenizer.eos_token
    if eos_token is None:
        # Without this check the string concatenation below would fail later,
        # inside the map workers, with a less helpful TypeError.
        raise ValueError(
            "tokenizer.eos_token is None; an EOS token is required to terminate each query"
        )

    train_dataset = load_dataset(dataset_name)

    # Accepted Prompt (with rules)
    system_input_with_rules = generate_accepted_prompt()

    # Rejected Prompt (rules negated)
    system_input_without_rules = generate_rejected_prompt()

    def _build_query(system_input, question):
        """Format one system/user/assistant query and append the EOS token."""
        return f"### System:\n{system_input}\n### User:\n{question}\n### Assistant:\n" + eos_token

    def preprocess_function(examples):
        """Build both query variants and their token ids for a batch of examples."""
        new_examples = {
            "accepted": [],
            "rejected": [],
            "accepted_input_ids": [],
            "rejected_input_ids": [],
        }
        for question in examples["question"]:
            accepted_query = _build_query(system_input_with_rules, question)
            rejected_query = _build_query(system_input_without_rules, question)

            tokenized_accepted = tokenizer(accepted_query, truncation=True)
            tokenized_rejected = tokenizer(rejected_query, truncation=True)

            new_examples["accepted"].append(accepted_query)
            new_examples["rejected"].append(rejected_query)
            new_examples["accepted_input_ids"].append(tokenized_accepted["input_ids"])
            new_examples["rejected_input_ids"].append(tokenized_rejected["input_ids"])

        return new_examples

    # Apply preprocessing to dataset
    ds = train_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=num_proc,
    )

    # Filter for maximum length
    ds = ds.filter(
        lambda x: len(x["accepted_input_ids"]) < max_length and len(x["rejected_input_ids"]) < max_length,
        batched=False
    )

    # Set final dataset format
    ds.set_format(type="torch")
    return ds

# Dataset Build

In [None]:
from datasets import DatasetDict
from transformers import AutoTokenizer
from datasets import load_dataset

In [None]:
# NOTE(review): "gpt-4" is the name of an OpenAI API model; confirm that a Hugging Face Hub
# checkpoint with this id exists, otherwise from_pretrained will have no tokenizer to load.
tokenizer = AutoTokenizer.from_pretrained("gpt-4")

In [None]:
# NOTE(review): "dataset_name" reads like a placeholder rather than a real dataset id;
# confirm before running. Indexing ["train"] assumes the result exposes a "train" split.
dataset = build_dataset(tokenizer, "dataset_name")
print(dataset["train"][0])

In [None]:
def collator(data):
  """Turn a list of dict records into a dict of lists.

  The keys are taken from the first record, and every record must provide each
  of them. An empty ``data`` raises IndexError.
  """
  first_record = data[0]
  return {key: [record[key] for record in data] for key in first_record}

In [None]:
# Build the preprocessed dataset from "elsayedissa/alignment-questions" and display its
# first "train" example as a sanity check.
dataset = build_dataset(tokenizer, "elsayedissa/alignment-questions")
dataset['train'][0]