In [1]:
import pandas as pd
import torch
import json
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize the model and tokenizer
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda",
    use_cache=None,
    attn_implementation=None,
)
model.device

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = tokenizer.eos_token_id

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
def generate_output(prompt, max_tokens=1000):
    batch = tokenizer(prompt, return_tensors="pt")
    batch = {k: v.to("cuda") for k, v in batch.items()}
    with torch.no_grad():
        outputs = model.generate(
            **batch,
            max_new_tokens=max_tokens,
            do_sample=False,
            top_p=1.0,
            temperature=0,
            use_cache=True,
            top_k=50,
            repetition_penalty=1.2,
            length_penalty=1,
            output_hidden_states=True,
            return_dict_in_generate=True,
        )
    output_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
    return output_text[len(prompt):].strip()
# Load input JSON file
input_file = "legal_reasoning_30.json"  # Path to input JSON file
output_csv_file = "/home/apai14/NLP/legal_reasoning_with_answers.csv"  # Path to save the output CSV file
try:
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)
except Exception as e:
    print(f"Error loading input file: {e}")
    data = {"legal_scenarios": []}
# Prepare a list to store rows for the CSV
csv_rows = []
# Process each scenario
for scenario in data.get("legal_scenarios", []):
    try:
        scenario_id = scenario['id']
        legal_context = scenario['context']
        question = scenario['question']
        options = scenario['options']
        analysis = scenario.get('ground_truth', '')  # Using 'ground_truth' as a substitute for 'analysis'
        # Generate first output
        user_prompt1 = legal_context + question + options + analysis + '''Task:
        You are a helpful legal assistant. Choose the correct option by performing legal reasoning while strictly adhering to
        the legal context and analysis provided.
        While answering make sure to use the following format:
        [explanation of your legal reasoning step by step as numbered points]'''
        output_text1 = generate_output(user_prompt1)
        scenario['output_text1'] = output_text1
        # Generate second output
        user_prompt2 = legal_context + question + options + analysis + output_text1 + '''Task:
        You are a helpful legal assistant.
        You need to generate verification questions for each of the legal reasoning steps based on the legal context, question, options and analysis.
        Based on the verification questions and answers give feedback for the legal reasoning steps and analyse it to find the correct option to the question.'''
        output_text2 = generate_output(user_prompt2)
        scenario['output_text2'] = output_text2
        # Append the data to the list for CSV
        csv_rows.append({
            "ID": scenario_id,
            "Context": legal_context,
            "Question": question,
            "Options": options,
            "Ground Truth": analysis,
            "Output Text 1": output_text1,
            "Output Text 2": output_text2,
        })
    except Exception as e:
        # Handle errors gracefully and log them
        csv_rows.append({
            "ID": scenario.get('id', 'N/A'),
            "Context": scenario.get('context', 'N/A'),
            "Question": scenario.get('question', 'N/A'),
            "Options": scenario.get('options', 'N/A'),
            "Ground Truth": scenario.get('ground_truth', 'N/A'),
            "Output Text 1": f"Error: {str(e)}",
            "Output Text 2": f"Error: {str(e)}"
        })
# Convert the list of rows into a DataFrame
df = pd.DataFrame(csv_rows)
# Save the DataFrame to a CSV file
try:
    df.to_csv(output_csv_file, index=False, encoding='utf-8')
    print(f"Output saved to {output_csv_file}")
except Exception as e:
    print(f"Error saving output file: {e}")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_t

Output saved to /home/apai14/NLP/legal_reasoning_with_answers.csv
