# Data Check

In [1]:
import json
import pandas as pd

from typing import Any, Dict, List
from pathlib import Path

import random

In [2]:
HOME_DIR = Path("/Volumes/ssd/01-ckj-postdoc/LLM-alignment-data-generation/")
data_path = HOME_DIR / 'public-data' / 'counsel-chat' / 'counsel_chat_250-tokens_full.json'

In [3]:
def analyze_structure(data: Any, prefix: str = "") -> List[str]:
    """
    Recursively analyzes the structure of nested JSON data
    
    Args:
        data: JSON data (can be dict, list, or primitive type)
        prefix: Current path prefix for nested structures
    
    Returns:
        List of structure descriptions
    """
    structure = []
    
    if isinstance(data, dict):
        for key, value in data.items():
            current_path = f"{prefix}.{key}" if prefix else key
            if isinstance(value, (dict, list)):
                structure.extend(analyze_structure(value, current_path))
            else:
                structure.append(f"{current_path}: {type(value).__name__}")
                
    elif isinstance(data, list) and len(data) > 0:
        # Analyze first item in list for structure
        sample_item = data[0]
        current_path = f"{prefix}[]" if prefix else "[]"
        if isinstance(sample_item, (dict, list)):
            structure.extend(analyze_structure(sample_item, current_path))
        else:
            structure.append(f"{current_path}: {type(sample_item).__name__}")
            
    return structure

In [4]:
def analyze_json_file(file_path: str):
    """
    Reads and analyzes the structure of a JSON file, including nested structures
    
    Args:
        file_path (str): Path to the JSON file
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
            
        # 1. Analyze overall structure
        print("\n=== JSON File Structure ===")
        if isinstance(data, list):
            print(f"Root type: List (Total {len(data)} items)")
        else:
            print("Root type: Dictionary")
            
        # 2. Print complete structure with nested keys
        print("\n=== Complete Structure ===")
        structure = analyze_structure(data)
        for item in structure:
            print(f"- {item}")
                
        # 3. Print sample data
        print("\n=== Sample Data (First Few Items) ===")
        if isinstance(data, list):
            sample_data = data[:2]  # First 2 items if list
        elif isinstance(data, dict):
            # For nested structures, try to get a representative sample
            sample_data = {}
            for key, value in data.items():
                if isinstance(value, list):
                    sample_data[key] = value[:2] if value else []  # First 2 items of list
                else:
                    sample_data[key] = value
        
        print(json.dumps(sample_data, indent=2, ensure_ascii=False)[:1000] + "...")
        
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in '{file_path}'")
    except Exception as e:
        print(f"Error: Unexpected error occurred - {str(e)}")

In [5]:
def print_single_example(file_path: Path):
    """
    Prints a single, complete example from the JSON file in a readable format
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        
        # Get first example from train data
        first_example = data['train'][0]
        
        print("\n=== Single Training Example ===")
        print("\n1. Personality:")
        for p in first_example['personality']:
            print(f"   {p}" if p else "   [empty string]")
        
        print("\n2. Utterances:")
        first_utterance = first_example['utterances'][0]
        
        print("\n   History:")
        for idx, h in enumerate(first_utterance['history'], 1):
            print(f"   {idx}. {h}")
        
        print("\n   Candidates (possible responses):")
        for idx, c in enumerate(first_utterance['candidates'], 1):
            print(f"   {idx}. {c}\n")
    except:
        pass

In [6]:
def print_train_histories(file_path: Path, num_examples: int = 5):
    """
    Prints specified number of history examples from train data
    
    Args:
        file_path (Path): Path to the JSON file
        num_examples (int): Number of examples to show (default: 5)
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        
        print(f"\n=== First {num_examples} Training Histories ===\n")
        
        for i in range(num_examples):
            history = data['train'][i]['utterances'][0]['history'][0]
            print(f"Example {i+1}:")
            print(f"{history}")
            print("-" * 80 + "\n")
            
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in '{file_path}'")
    except Exception as e:
        print(f"Error: Unexpected error occurred - {str(e)}")

In [7]:
analyze_json_file(data_path)


=== JSON File Structure ===
Root type: Dictionary

=== Complete Structure ===
- train[].personality[]: str
- train[].utterances[].history[]: str
- train[].utterances[].candidates[]: str
- valid[].personality[]: str
- valid[].utterances[].history[]: str
- valid[].utterances[].candidates[]: str

=== Sample Data (First Few Items) ===
{
  "train": [
    {
      "personality": [
        ""
      ],
      "utterances": [
        {
          "history": [
            "can i change my feeling of being worthless to everyone ? i ' m going through some things with my feelings and myself . i barely sleep and i do nothing but think about how i ' m worthless and how i shouldn ' t be here . i ' ve never tried or contemplated suicide . i ' ve always wanted to fix my issues , but i never get around to it . how can i change my feeling of being worthless to everyone ?"
          ],
          "candidates": [
            "maybe lower your expectations for a bit",
            "if you are whole - heartedly c

In [8]:
print_single_example(data_path)


=== Single Training Example ===

1. Personality:
   [empty string]

2. Utterances:

   History:
   1. can i change my feeling of being worthless to everyone ? i ' m going through some things with my feelings and myself . i barely sleep and i do nothing but think about how i ' m worthless and how i shouldn ' t be here . i ' ve never tried or contemplated suicide . i ' ve always wanted to fix my issues , but i never get around to it . how can i change my feeling of being worthless to everyone ?

   Candidates (possible responses):
   1. maybe lower your expectations for a bit

   2. if you are whole - heartedly committed to moving past the sexual and romantic parts of your relationship and just having a friendship than refraining from all the touching would be a good place to start

   3. very often , one person wants to deal with the conflict right away or shortly thereafter and the other person wants to wait

   4. " my best guess is that your boyfriend is triggered by some previous r

In [9]:
print_train_histories(data_path)


=== First 5 Training Histories ===

Example 1:
can i change my feeling of being worthless to everyone ? i ' m going through some things with my feelings and myself . i barely sleep and i do nothing but think about how i ' m worthless and how i shouldn ' t be here . i ' ve never tried or contemplated suicide . i ' ve always wanted to fix my issues , but i never get around to it . how can i change my feeling of being worthless to everyone ?
--------------------------------------------------------------------------------

Example 2:
can i change my feeling of being worthless to everyone ? i ' m going through some things with my feelings and myself . i barely sleep and i do nothing but think about how i ' m worthless and how i shouldn ' t be here . i ' ve never tried or contemplated suicide . i ' ve always wanted to fix my issues , but i never get around to it . how can i change my feeling of being worthless to everyone ?
-------------------------------------------------------------------

# GPT-4

In [16]:
import openai
import pandas as pd
from pathlib import Path
import json
from typing import List, Dict, Tuple

In [17]:
!pip install openai==0.28


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [18]:
def load_api_key(file_path: str) -> str:
    """Load OpenAI API key from a file."""
    with open(file_path, 'r') as file:
        return file.read().strip()

In [None]:
class CounselingResponseGenerator:
    def __init__(self, api_key_path: str):
        """Initialize the generator with path to API key file."""
        api_key = self.load_api_key(api_key_path)
        openai.api_key = api_key
        self.rules = {
            "pronouns": {
                "name": "Personal Pronouns Rule",
                "accepted": "Use personal pronouns, including inclusive 'we'",
                "rejected": "Avoid using any personal pronouns"
            },
            "voice": {
                "name": "Voice Rule",
                "accepted": "Use active voice",
                "rejected": "Use passive voice"
            },
            "tense": {
                "name": "Tense Rule",
                "accepted": "Use present tense",
                "rejected": "Use past tense"  
            },
            "mood": {
                "name": "Mood Rule",
                "accepted": "Use polite imperative mood",
                "rejected": "Use imperative mood" # changed from "Avoid imperative mood" to this
            },
            "emotional": {
                "name": "Emotional Words Rule",
                "accepted": "Use emotional words and cognitive verbs (like feel, understand, think, believe)",
                "rejected": "Avoid using any emotional words or cognitive verbs"
            }
        }

    def load_api_key(self, api_key_path: str) -> str:
        """Load API key from file."""
        with open(api_key_path, 'r') as f:
            return f.read().strip()

    def load_questions(self, file_path: str) -> List[str]:
        """
        Load questions from counsel chat JSON file and remove duplicates.
        Returns a list of unique questions.
        """

        # Remove duplicates
        unique_questions = set()
        
        with open(file_path, 'r') as f:
            data = json.load(f)
            
            # Process train data
            for item in data['train']:
                for utterance in item['utterances']:
                    if utterance['history']:
                        unique_questions.add(utterance['history'][0])
            
            # Process valid data
            for item in data['valid']:
                for utterance in item['utterances']:
                    if utterance['history']:
                        unique_questions.add(utterance['history'][0])
        
        # Convert set back to list
        questions = list(unique_questions)
        
        # Print statistics
        print(f"\nQuestion Statistics:")
        print(f"Total unique questions: {len(questions)}")
        
        # Optional: Print a few example questions
        print("\nExample questions:")
        for q in questions[:3]:
            print(f"- {q[:100]}...")
            
        return questions

    def generate_prompt(self, question: str, rule_key: str, is_accepted: bool) -> str:
        """Generate a prompt that emphasizes maintaining the same meaning."""
        rule = self.rules[rule_key]
        style = "accepted" if is_accepted else "rejected"
        
        prompt = f"""As a counseling assistant, provide a response to the following question.

        Your response must follow these requirements:
        1. MOST IMPORTANT: The content and therapeutic meaning of your response must be clear and complete
        2. Linguistic Style Rule: {rule[style]}
        
        Critical Instructions:
        - Your response should convey EXACTLY THE SAME therapeutic information and meaning
        - ONLY the linguistic style should change, not the underlying meaning or counseling content
        - Keep the response concise and limited to a single sentence
        - Focus on providing supportive counseling while maintaining the required linguistic style
        
        Question: {question}"""
        
        return prompt

    def get_gpt4_response(self, prompt: str) -> str:
        """Get response from GPT-4."""
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4",
                messages=[
                    {
                        "role": "system", 
                        "content": "You are a counseling assistant specializing in providing supportive and therapeutic responses. Your task is to provide empathetic and helpful responses while following specific linguistic style requirements. The meaning and therapeutic content must remain consistent regardless of the linguistic style used."
                    },
                    {"role": "user", "content": prompt}
                ],
                temperature=0.7,
                max_tokens=150
            )
            return response['choices'][0]['message']['content'].strip()
        except Exception as e:
            print(f"Error getting GPT-4 response: {e}")
            return ""

    def generate_dataset(self, questions: List[str], rule_key: str) -> pd.DataFrame:
        """Generate pairs of responses for a specific rule."""
        data = []
        for i, question in enumerate(questions, 1):
            print(f"Processing question {i}/{len(questions)} for {self.rules[rule_key]['name']}")
            accepted, rejected = self.generate_response_pair(question, rule_key)
            data.append({
                'question': question,
                'accepted_response': accepted,
                'rejected_response': rejected,
                'rule': self.rules[rule_key]['name']
            })
        return pd.DataFrame(data)

    def generate_response_pair(self, question: str, rule_key: str) -> Tuple[str, str]:
        """Generate a pair of responses for a given question and rule."""
        accepted_prompt = self.generate_prompt(question, rule_key, True)
        rejected_prompt = self.generate_prompt(question, rule_key, False)
        
        accepted_response = self.get_gpt4_response(accepted_prompt)
        rejected_response = self.get_gpt4_response(rejected_prompt)
        
        return accepted_response, rejected_response

    def process_all_rules(self, data_path: str, output_dir: str, sample_size: int = None):
        """Process all rules and save results to separate CSV files."""
        questions = self.load_questions(data_path)
        print(f"Loaded {len(questions)} unique questions")

        if sample_size:
            if sample_size > len(questions):
                print(f"Warning: Requested sample size ({sample_size}) is larger than available unique questions ({len(questions)})")
                sample_size = len(questions)
            questions = questions[:sample_size]
            print(f"Using sample of {sample_size} questions")
            
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        
        for rule_key in self.rules:
            print(f"\nProcessing {self.rules[rule_key]['name']}...")
            df = self.generate_dataset(questions, rule_key)
            output_path = output_dir / f'counseling_responses_{rule_key}.csv'
            df.to_csv(output_path, index=False)
            print(f"Saved to {output_path}")


In [20]:
HOME_DIR = Path("/Volumes/ssd/01-ckj-postdoc/LLM-alignment-data-generation/")

In [21]:
!pwd

/Volumes/ssd/01-ckj-postdoc/LLM-alignment-data-generation/scripts


In [22]:
generator = CounselingResponseGenerator('api_key.txt')

In [23]:
!ls

api_key.txt              gpt-family.ipynb
counsel-gpt-family.ipynb langchain_test.ipynb


In [24]:
generator.process_all_rules(
    data_path = HOME_DIR / 'public-data' / 'counsel-chat' / 'counsel_chat_250-tokens_full.json',
    output_dir = HOME_DIR / 'generated-data' / 'counsel-chat' / 'GPT-4' / 'test-phase1',
    sample_size = 5
)


Question Statistics:
Total unique questions: 785

Example questions:
- is my ex - boyfriend a pathological liar ? i have an ex - boyfriend who just lies about everything ....
- how do i make new friends ? in the past year , two of my best and only close friends moved to differ...
- is it appropriate to give my counselor a bottle of wine for christmas ? i am an international studen...
Loaded 785 unique questions
Using sample of 5 questions

Processing Personal Pronouns Rule...
Processing question 1/5 for Personal Pronouns Rule
Processing question 2/5 for Personal Pronouns Rule
Processing question 3/5 for Personal Pronouns Rule
Processing question 4/5 for Personal Pronouns Rule
Processing question 5/5 for Personal Pronouns Rule
Saved to /Volumes/ssd/01-ckj-postdoc/LLM-alignment-data-generation/generated-data/counsel-chat/GPT-4/test-phase1/counseling_responses_pronouns.csv

Processing Voice Rule...
Processing question 1/5 for Voice Rule
Processing question 2/5 for Voice Rule
Processing q

# InstructGPT

In [None]:
import openai
import pandas as pd
from pathlib import Path
import json
import random
from typing import List, Dict, Tuple

In [16]:
class LegalResponseGenerator:
    def __init__(self, api_key_path: str):
        """Initialize the generator with path to API key file."""
        api_key = load_api_key(api_key_path)
        openai.api_key = api_key
        self.rules = {
            "pronouns": {
                "name": "Personal Pronouns Rule",
                "accepted": "Use personal pronouns, including inclusive 'we'",
                "rejected": "Avoid using any personal pronouns"
            },
            "voice": {
                "name": "Voice Rule",
                "accepted": "Use active voice",
                "rejected": "Use passive voice"
            },
            "tense": {
                "name": "Tense Rule",
                "accepted": "Use present tense",
                "rejected": "Use past tense"
            },
            "mood": {
                "name": "Mood Rule",
                "accepted": "Use polite imperative mood",
                "rejected": "Avoid using imperative mood"
            },
            "emotional": {
                "name": "Emotional Words Rule",
                "accepted": "Use emotional words and cognitive verbs (like feel, understand, think, believe)",
                "rejected": "Avoid using any emotional words or cognitive verbs"
            }
        }

    def generate_prompt(self, question: str, rule_key: str, is_accepted: bool) -> str:
        """Generate a prompt that emphasizes maintaining the same meaning."""
        rule = self.rules[rule_key]
        style = "accepted" if is_accepted else "rejected"
        
        prompt = f"""You are a legal assistant specializing in Australian law cases. Your task is to provide legally accurate responses while following specific linguistic style requirements. The meaning and legal content must remain consistent regardless of the linguistic style used.

As a legal assistant, provide a response to the following legal question.

Your response must follow these requirements:
1. MOST IMPORTANT: The factual content and legal meaning of your response must be clear and complete
2. Linguistic Style Rule: {rule[style]}

Critical Instructions:
- Your response should convey EXACTLY THE SAME legal information and meaning as you would normally provide
- ONLY the linguistic style should change, not the underlying meaning or legal content
- Keep the response concise and limited to a single sentence
- Focus on answering the legal question while maintaining the required linguistic style

Question: {question}"""
        
        return prompt

    def get_instruct_response(self, prompt: str) -> str:
            """Get response using gpt-3.5-turbo."""
            try:
                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.7,
                    max_tokens=150
                )
                return response.choices[0].message['content'].strip()
            except Exception as e:
                print(f"Error getting GPT response: {e}")
                return ""

    def load_questions(self, file_path: str) -> List[str]:
        """Load questions from JSONL file."""
        questions = []
        with open(file_path, 'r') as f:
            for line in f:
                data = json.loads(line)
                questions.append(data['question'])
        return questions

    def generate_dataset(self, questions: List[str], rule_key: str) -> pd.DataFrame:
        """Generate pairs of responses for a specific rule."""
        data = []
        for i, question in enumerate(questions, 1):
            print(f"Processing question {i}/{len(questions)} for {self.rules[rule_key]['name']}")
            accepted, rejected = self.generate_response_pair(question, rule_key)
            data.append({
                'question': question,
                'accepted_response': accepted,
                'rejected_response': rejected,
                'rule': self.rules[rule_key]['name']
            })
        return pd.DataFrame(data)

    def generate_response_pair(self, question: str, rule_key: str) -> Tuple[str, str]:
        """Generate a pair of responses for a given question and rule."""
        accepted_prompt = self.generate_prompt(question, rule_key, True)
        rejected_prompt = self.generate_prompt(question, rule_key, False)
        
        accepted_response = self.get_instruct_response(accepted_prompt)
        rejected_response = self.get_instruct_response(rejected_prompt)
        
        return accepted_response, rejected_response

    def process_all_rules(self, data_path: str, output_dir: str, sample_size: int = None):
        """Process all rules and save results to separate CSV files."""
        questions = self.load_questions(data_path)
        
        if sample_size:
            questions = questions[:sample_size]
            
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        
        for rule_key in self.rules:
            print(f"\nProcessing {self.rules[rule_key]['name']}...")
            df = self.generate_dataset(questions, rule_key)
            output_path = output_dir / f'responses_{rule_key}.csv'
            df.to_csv(output_path, index=False)
            print(f"Saved to {output_path}")

In [17]:
instruct_gpt_generator = LegalResponseGenerator('api_key.txt')

In [18]:
instruct_gpt_generator.process_all_rules(
    data_path = HOME_DIR / 'public-data' / 'open-australian-legal-qa' / 'qa.jsonl',
    output_dir = HOME_DIR / 'generated-data' / 'open-australian-legal-qa' / 'InstructGPT' / 'test-phase1',
    sample_size = 20
)


Processing Personal Pronouns Rule...
Processing question 1/20 for Personal Pronouns Rule
Processing question 2/20 for Personal Pronouns Rule
Processing question 3/20 for Personal Pronouns Rule
Processing question 4/20 for Personal Pronouns Rule
Processing question 5/20 for Personal Pronouns Rule
Processing question 6/20 for Personal Pronouns Rule
Processing question 7/20 for Personal Pronouns Rule
Processing question 8/20 for Personal Pronouns Rule
Processing question 9/20 for Personal Pronouns Rule
Processing question 10/20 for Personal Pronouns Rule
Processing question 11/20 for Personal Pronouns Rule
Processing question 12/20 for Personal Pronouns Rule
Processing question 13/20 for Personal Pronouns Rule
Processing question 14/20 for Personal Pronouns Rule
Processing question 15/20 for Personal Pronouns Rule
Processing question 16/20 for Personal Pronouns Rule
Processing question 17/20 for Personal Pronouns Rule
Processing question 18/20 for Personal Pronouns Rule
Processing questi

# GPT-4o

In [None]:
import openai
import pandas as pd
from pathlib import Path
import json
import random
from typing import List, Dict, Tuple

In [20]:
class LegalResponseGenerator:
    def __init__(self, api_key_path: str):
        """Initialize the generator with path to API key file."""
        api_key = load_api_key(api_key_path)
        openai.api_key = api_key
        self.rules = {
            "pronouns": {
                "name": "Personal Pronouns Rule",
                "accepted": "Use personal pronouns, including inclusive 'we'",
                "rejected": "Avoid using any personal pronouns"
            },
            "voice": {
                "name": "Voice Rule",
                "accepted": "Use active voice",
                "rejected": "Use passive voice"
            },
            "tense": {
                "name": "Tense Rule",
                "accepted": "Use present tense",
                "rejected": "Use past tense"
            },
            "mood": {
                "name": "Mood Rule",
                "accepted": "Use polite imperative mood",
                "rejected": "Avoid using imperative mood"
            },
            "emotional": {
                "name": "Emotional Words Rule",
                "accepted": "Use emotional words and cognitive verbs (like feel, understand, think, believe)",
                "rejected": "Avoid using any emotional words or cognitive verbs"
            }
        }

    def generate_prompt(self, question: str, rule_key: str, is_accepted: bool) -> str:
        """Generate a prompt that emphasizes maintaining the same meaning."""
        rule = self.rules[rule_key]
        style = "accepted" if is_accepted else "rejected"
        
        prompt = f"""As a legal assistant, provide a response to the following legal question.

        Your response must follow these requirements:
        1. MOST IMPORTANT: The factual content and legal meaning of your response must be clear and complete
        2. Linguistic Style Rule: {rule[style]}
        
        Critical Instructions:
        - Your response should convey EXACTLY THE SAME legal information and meaning as you would normally provide
        - ONLY the linguistic style should change, not the underlying meaning or legal content
        - Keep the response concise and limited to a single sentence
        - Focus on answering the legal question while maintaining the required linguistic style
        
        Question: {question}"""
        
        return prompt

    def get_gpt4_response(self, prompt: str) -> str:
            """Get response from latest GPT-4."""
            try:
                response = openai.ChatCompletion.create(
                    model="gpt-4-0125-preview",
                    messages=[
                        {
                            "role": "system", 
                            "content": "You are a legal assistant specializing in Australian law cases. Your task is to provide legally accurate responses while following specific linguistic style requirements. The meaning and legal content must remain consistent regardless of the linguistic style used."
                        },
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.7,
                    max_tokens=150
                )
                return response.choices[0].message['content'].strip()
            except Exception as e:
                print(f"Error getting GPT-4 response: {e}")
                return ""

    def load_questions(self, file_path: str) -> List[str]:
        """Load questions from JSONL file."""
        questions = []
        with open(file_path, 'r') as f:
            for line in f:
                data = json.loads(line)
                questions.append(data['question'])
        return questions

    def generate_dataset(self, questions: List[str], rule_key: str) -> pd.DataFrame:
        """Generate pairs of responses for a specific rule."""
        data = []
        for i, question in enumerate(questions, 1):
            print(f"Processing question {i}/{len(questions)} for {self.rules[rule_key]['name']}")
            accepted, rejected = self.generate_response_pair(question, rule_key)
            data.append({
                'question': question,
                'accepted_response': accepted,
                'rejected_response': rejected,
                'rule': self.rules[rule_key]['name']
            })
        return pd.DataFrame(data)

    def generate_response_pair(self, question: str, rule_key: str) -> Tuple[str, str]:
        """Generate a pair of responses for a given question and rule."""
        accepted_prompt = self.generate_prompt(question, rule_key, True)
        rejected_prompt = self.generate_prompt(question, rule_key, False)
        
        accepted_response = self.get_gpt4_response(accepted_prompt)
        rejected_response = self.get_gpt4_response(rejected_prompt)
        
        return accepted_response, rejected_response

    def process_all_rules(self, data_path: str, output_dir: str, sample_size: int = None):
        """Process all rules and save results to separate CSV files."""
        questions = self.load_questions(data_path)
        
        if sample_size:
            questions = questions[:sample_size]
            
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        
        for rule_key in self.rules:
            print(f"\nProcessing {self.rules[rule_key]['name']}...")
            df = self.generate_dataset(questions, rule_key)
            output_path = output_dir / f'responses_{rule_key}.csv'
            df.to_csv(output_path, index=False)
            print(f"Saved to {output_path}")

In [21]:
gpt4o_generator = LegalResponseGenerator('api_key.txt')

In [22]:
gpt4o_generator.process_all_rules(
    data_path = HOME_DIR / 'public-data' / 'open-australian-legal-qa' / 'qa.jsonl',
    output_dir = HOME_DIR / 'generated-data' / 'open-australian-legal-qa' / 'GPT-4o' / 'test-phase1',
    sample_size = 20
)


Processing Personal Pronouns Rule...
Processing question 1/20 for Personal Pronouns Rule
Processing question 2/20 for Personal Pronouns Rule
Processing question 3/20 for Personal Pronouns Rule
Processing question 4/20 for Personal Pronouns Rule
Processing question 5/20 for Personal Pronouns Rule
Processing question 6/20 for Personal Pronouns Rule
Processing question 7/20 for Personal Pronouns Rule
Processing question 8/20 for Personal Pronouns Rule
Processing question 9/20 for Personal Pronouns Rule
Processing question 10/20 for Personal Pronouns Rule
Processing question 11/20 for Personal Pronouns Rule
Processing question 12/20 for Personal Pronouns Rule
Processing question 13/20 for Personal Pronouns Rule
Processing question 14/20 for Personal Pronouns Rule
Processing question 15/20 for Personal Pronouns Rule
Processing question 16/20 for Personal Pronouns Rule
Processing question 17/20 for Personal Pronouns Rule
Processing question 18/20 for Personal Pronouns Rule
Processing questi

# Preprocessing and Uploading Generated Data to Hugging Face

In [14]:
def build_dataset(tokenizer, dataset_name, input_min_text_length=2, input_max_text_length=8):
    EOS_TOKEN = tokenizer.eos_token
    train_dataset = load_dataset(dataset_name)
    num_proc = 4

    # Accepted Prompt (with rules)
    system_input_with_rules = generate_accepted_prompt()

    # Rejected Prompt (rules negated)
    system_input_without_rules = generate_rejected_prompt()

    def preprocess_function(examples):
        new_examples = {
            "accepted": [],
            "rejected": [],
            "accepted_input_ids": [],
            "rejected_input_ids": [],
        }
        for question in examples["question"]:
            # Accepted Version
            accepted_prompt = f"### System:\n{system_input_with_rules}\n### User:\n{question}\n### Assistant:\n"
            accepted_query = accepted_prompt + EOS_TOKEN
            tokenized_accepted = tokenizer(accepted_query, truncation=True)

            # Rejected Version
            rejected_prompt = f"### System:\n{system_input_without_rules}\n### User:\n{question}\n### Assistant:\n"
            rejected_query = rejected_prompt + EOS_TOKEN
            tokenized_rejected = tokenizer(rejected_query, truncation=True)

            # Add to dataset
            new_examples["accepted"].append(accepted_query)
            new_examples["rejected"].append(rejected_query)
            new_examples["accepted_input_ids"].append(tokenized_accepted["input_ids"])
            new_examples["rejected_input_ids"].append(tokenized_rejected["input_ids"])

        return new_examples

    # Apply preprocessing to dataset
    ds = train_dataset.map(
        preprocess_function,
        batched=True,
        num_proc=num_proc,
    )

    # Filter for maximum length
    ds = ds.filter(
        lambda x: len(x["accepted_input_ids"]) < 2048 and len(x["rejected_input_ids"]) < 2048,
        batched=False
    )

    # Set final dataset format
    ds.set_format(type="torch")
    return ds

# Dataset Build

In [None]:
from datasets import DatasetDict
from transformers import AutoTokenizer
from datasets import load_dataset

In [None]:
tokenizer = AutoTokenizer.from_pretrained("gpt-4")

In [None]:
dataset = build_dataset(tokenizer, "dataset_name")
print(dataset["train"][0])

In [None]:
def collator(data):
  return dict((key, [d[key] for d in data]) for key in data[0])

In [None]:
dataset = build_dataset(tokenizer, "elsayedissa/alignment-questions")
dataset['train'][0]