In [10]:
import pandas as pd
import torch
import json
from transformers import AutoModelForCausalLM, AutoTokenizer

# Initialize the model and tokenizer
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda",
    use_cache=None,
    attn_implementation=None,
)
model.device

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = tokenizer.eos_token_id

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
def generate_output(prompt, max_tokens=800):
    batch = tokenizer(prompt, return_tensors="pt")
    batch = {k: v.to("cuda") for k, v in batch.items()}
    with torch.no_grad():
        outputs = model.generate(
            **batch,
            max_new_tokens=max_tokens,
            do_sample=False,
            top_p=1.0,
            temperature=0,
            use_cache=True,
            top_k=50,
            repetition_penalty=1.2,
            length_penalty=1,
            output_hidden_states=True,
            return_dict_in_generate=True,
        )
    output_text = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
    return output_text[len(prompt):].strip()
# Load input JSON file
input_file = "legal_reasoning_with_answers.json"  # Path to input JSON file
output_json_file = "/home/smanylal/MidTermReport1/NLPFinal/New/legal_reasoning_with_questions.json"  # Path to save the output CSV file
try:
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)
except Exception as e:
    print(f"Error loading input file: {e}")
    data = {"legal_scenarios": []}
# Prepare a list to store rows for the CSV
output_scenarios = []
# Process each scenario
for scenario in data.get("legal_scenarios", []):
    #scenario_id = scenario['id']
    legal_context = scenario['context']
    question = scenario['question']
    options = scenario['options']
    legal_reasoning = scenario['legal_reasoning']
    correct_answer = scenario['correct_answer']
    
    # Generate the verification questions
    user_prompt2 = legal_context + question + options + legal_reasoning + '''Task:
    You are a expert legal critique whose is given:
    1) a given legal_context.
    2) a given question.
    3) answer options to the given question out of which only one is true. 
    3) the legal_reasoning behind the legal_context.
    
    Your task is to thoroughly examine the given reasoning provided and identify any possible faults, fallacies, or loopholes in these legal_reasoning steps. Assume that every legal_reasoning provided contains errors. 
    Generate compelling questions (atmost 15) aimed at rectifying these error prone reasoning. Start the sentence of each question indicating which Legal_reasoning step they are finding the fault of. For eg: In Step n: 'followed by the questioning the fault in step n.'

    Important: ONLY the QUESTIONS are expected from you.
    Please ensure that the output is clearly separated by special markers so that it can be parsed into individual elements later on.

    Follow this format:
    --1 In Step n: [question]
    --2 In Step n+1: [question]
    --3 ...
    --n 
    '''
    
    verification_questions_raw = generate_output(user_prompt2)
    
    # Parse verification questions into a list by splitting using markers (--1, --2, etc.)
    questions_start = verification_questions_raw.find("--1")
    if questions_start != -1:
        questions_text = verification_questions_raw[questions_start:].strip()
        # Split by the markers '--1', '--2', etc. and filter empty parts
        questions_list = [q.strip() for q in questions_text.split('\n') if q.startswith("--")]
        
        # Only keep the first 15 questions
        verification_questions = questions_list[:15]
    else:
        verification_questions = ["Error parsing verification questions."]
    
    # Add to output scenarios
    output_scenarios.append({
        #"id": scenario_id,
        "context": legal_context,
        "question": question,
        "options": options,
        "legal_reasoning": legal_reasoning,
        "correct_answer": correct_answer,
        "verification_questions": verification_questions
    })

    # Wrap in the required structure
    output_data = {
        "legal_scenarios": output_scenarios
    }
    
    # Save to JSON file
    try:
        with open(output_json_file, "w", encoding="utf-8") as f:
            json.dump(output_data, f, indent=4, ensure_ascii=False)
        #print(f"Output saved to {output_json_file}")
    except Exception as e:
        print(f"Error saving output file: {e}")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin