In [None]:
import pandas as pd

# Path to the JSONL file to load; point this at your own copy of the data
file_path = 'code/legalLLM/hall_detect/factscore_data/labeled/ChatGPT.jsonl'

# Parse one JSON record per line and seed an 'evidence' column with the
# contents of the 'annotations' column
df = pd.read_json(file_path, lines=True).assign(
    evidence=lambda frame: frame['annotations']
)

# Render the frame as the cell output
df

In [None]:
def extract_evidence(annotations):
    """Join the 'text' of every human atomic fact found in ``annotations``.

    Args:
        annotations: Iterable of sentence dicts. Each sentence may carry a
            'human-atomic-facts' entry holding a list of dicts with a 'text'
            key. ``None`` and float values (e.g. the NaN pandas can use for
            a missing cell) are treated as "no annotations".

    Returns:
        The fact texts joined with single spaces, or "" when there are none.
    """
    # A missing cell may arrive as None or as a float NaN; neither can be
    # iterated, so both short-circuit to an empty result.
    if annotations is None or isinstance(annotations, float):
        return ""

    evidence_texts = []
    for sentence in annotations:
        # Skip null placeholders and anything that is not a sentence dict
        if not isinstance(sentence, dict):
            continue
        # .get() tolerates sentences without the key; `or []` covers an
        # explicit null / empty value stored under it.
        for fact in sentence.get('human-atomic-facts') or []:
            # Only keep entries that really carry a string under 'text',
            # so the final join cannot fail on None or other types.
            if isinstance(fact, dict) and isinstance(fact.get('text'), str):
                evidence_texts.append(fact['text'])

    # Join all evidence texts into a single string
    return " ".join(evidence_texts)
# Overwrite 'evidence' with the flattened fact text computed per row from
# 'annotations', then show the resulting column
df['evidence'] = df['annotations'].map(extract_evidence)
df['evidence']

In [None]:
import pandas as pd
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import  HuggingFacePipeline
from langchain_core.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Placeholder frame with one (question, evidence) pair per row; replace the
# rows with your actual data.
# NOTE: this rebinds `df`, so a frame assigned to that name in an earlier
# cell is no longer reachable through it after this point.
df = pd.DataFrame(
    [
        ("Sample question 1", "Sample evidence for question 1"),
        ("Sample question 2", "Sample evidence for question 2"),
    ],
    columns=['input', 'evidence'],
)

# Load Llama 3.1 8B Instruct as a Hugging Face text-generation pipeline.
# The stdout handler is passed as the pipeline's callback list.
callbacks = [StreamingStdOutCallbackHandler()]
llm = HuggingFacePipeline.from_model_id(
    model_id="meta-llama/Meta-Llama-3.1-8B-Instruct",
    task="text-generation",
    device=0,  # presumably the first GPU; confirm for your machine
    callbacks=callbacks,
    pipeline_kwargs={
        "return_full_text": False,
        "max_new_tokens": 1024,
        "do_sample": True,
        "temperature": 0.5,
    },
)

# Use the end-of-sequence token id as the padding token id
llm.pipeline.tokenizer.pad_token_id = llm.pipeline.tokenizer.eos_token_id

# Alias under which the chains below refer to the model
llm_engine = llm

# Step 1: Generate response to each question
generate_response_prompt = PromptTemplate(
    template="Given the question: {input}, generate a detailed response.",
    input_variables=["input"]
)
# Pipes run left to right: the prompt must first turn the {"input": ...} dict
# into prompt text, which is then passed to the model. The reverse order
# (model | prompt) would hand the raw dict to the LLM, which rejects it.
generate_response_chain = generate_response_prompt | llm_engine

# Step 2: Decompose response into atomic facts
decompose_facts_prompt = PromptTemplate(
    template="Given the response: {response}, decompose it into individual atomic facts.",
    input_variables=["response"]
)
# Pipes run left to right: format the {"response": ...} dict with the prompt
# first, then feed the resulting prompt to the model. Piping the model into
# the prompt would pass the raw dict to the LLM, which cannot accept it.
decompose_facts_chain = decompose_facts_prompt | llm_engine

# Step 3: Convert decomposed facts to a list of strings
def split_into_atomic_facts(decomposed_facts):
    """Split the decomposition text into one string per non-empty line.

    Args:
        decomposed_facts: Text with one fact per line. ``None`` or an empty
            string yields an empty list.

    Returns:
        A list of the non-blank lines, each stripped of surrounding
        whitespace, in their original order.
    """
    if not decomposed_facts:
        return []

    # A plain line split does what the splitter call was aiming for;
    # RecursiveCharacterTextSplitter takes `separators` (a list), not
    # `separator`, and rejects a chunk_size smaller than its chunk_overlap.
    stripped_lines = (line.strip() for line in str(decomposed_facts).splitlines())
    return [line for line in stripped_lines if line]

# Step 4: Classify each fact with respect to evidence
classification_prompt = PromptTemplate(
    template=(
        "Classify each fact based on the evidence:\n\n"
        "Evidence: {evidence}\n\n"
        "Fact: {fact}\n\n"
        "Label each fact as 'Supported', 'Not Supported', or 'Irrelevant'."
    ),
    input_variables=["evidence", "fact"]
)
# Pipes run left to right: the prompt formats the {"evidence", "fact"} dict
# first and the model consumes the formatted prompt. The reverse order would
# pass the raw dict straight to the LLM, which rejects it.
classification_chain = classification_prompt | llm_engine

# Sequential Chain Combining All Steps
def process_question(input_question, evidence):
    """Run generate -> decompose -> split -> classify for a single question.

    Args:
        input_question: Question text passed to the response-generation chain.
        evidence: Evidence text passed to the classification chain with
            every fact.

    Returns:
        A list of ``(fact, classification)`` tuples, one per atomic fact, in
        the order the facts were produced by ``split_into_atomic_facts``.
    """
    # Produce an answer, then ask the model to break it into facts
    answer = generate_response_chain.invoke({"input": input_question})
    fact_listing = decompose_facts_chain.invoke({"response": answer})

    # Pair each individual fact with its classification against the evidence
    return [
        (fact, classification_chain.invoke({"evidence": evidence, "fact": fact}))
        for fact in split_into_atomic_facts(fact_listing)
    ]

# Run the full pipeline once per row, pairing each question with its evidence
classified_facts_list = [
    process_question(question, evidence_text)
    for question, evidence_text in zip(df['input'], df['evidence'])
]

# Store the per-question results alongside the inputs
df['classified_facts'] = classified_facts_list

print(df[['input', 'classified_facts']])
