In [1]:
#!/usr/bin/env python3
"""
RecipeMPR to ICM Dataset Converter

This script converts the RecipeMPR dataset (500QA.json) to ICM-digestible format.
Based on ICM documentation: https://github.com/codelion/icm/tree/main

The script handles multiple answer formats and creates ICM examples for each query.
"""

import json
import os
from typing import List, Dict, Any, Union
from dataclasses import dataclass, asdict
import argparse


from icm import ICMSearcher, ICMDataset, ICMExample
from icm.consistency import LogicalConsistencyChecker, MathConsistencyRule



In [2]:
def load_recipe_mpr_dataset(file_path: str) -> List[Dict[str, Any]]:
    """
    Read a RecipeMPR JSON file and return its parsed contents.

    Args:
        file_path: Path to the JSON file (opened as UTF-8 text)

    Returns:
        The parsed JSON payload

    Raises:
        FileNotFoundError: If file_path does not exist; a download hint is
            printed before the error is re-raised
        json.JSONDecodeError: If the file is not valid JSON; the parser error
            is printed before it is re-raised
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            records = json.load(handle)
    except FileNotFoundError:
        # Tell the user where the dataset lives, then let the caller see the error
        print(f"Error: File {file_path} not found.")
        print("Please download the RecipeMPR dataset from: https://github.com/D3Mlab/Recipe-MPR/blob/main/data/500QA.json")
        raise
    except json.JSONDecodeError as parse_error:
        print(f"Error parsing JSON file: {parse_error}")
        raise
    else:
        print(f"Loaded {len(records)} examples from {file_path}")
        return records


def format_icm_input(question: str, answer: str, answer_choices: List[str] = None) -> str:
    """
    Build the true/false claim prompt that ICM receives for one candidate answer.

    Two layouts are produced:
      * Direct answer (no choices, or fewer than two): a "Query:" line, a
        "Claim:" line naming the dish, and the "[True/False]" judgement line.
      * Multiple choice (two or more choices): a "Question:" line, the
        lettered choices (A, B, C, ...), a "Claim:" line naming the answer,
        and the "[True/False]" judgement line.

    Args:
        question: The recipe question
        answer: The proposed answer/claim
        answer_choices: List of possible answer choices (if multiple choice)

    Returns:
        Formatted input string for ICM
    """
    # Anything short of two choices is treated as a direct claim about one dish
    if not (answer_choices and len(answer_choices) > 1):
        return f"Query: {question}\nClaim: I think the dish you would prefer is {answer}\nI think this Claim is [True/False]"

    # Letter the choices starting at "A" (code point 65)
    lettered_lines = []
    for position, choice in enumerate(answer_choices):
        lettered_lines.append(f"  {chr(65+position)}) {choice}")
    choices_text = "\n".join(lettered_lines)

    return f"Question: {question}\nChoices:\n{choices_text}\nClaim: The correct answer is {answer}\nI think this Claim is [True/False]"


def extract_recipe_context(recipe_data: Dict[str, Any]) -> str:
    """
    Extract recipe context from the recipe data.

    The title, ingredients and instructions (whichever are present) are
    rendered as "Recipe: ...", "Ingredients: ..." and "Instructions: ..."
    segments and joined with " | ".

    Args:
        recipe_data: Recipe information. 'ingredients' may be a single string
            or an iterable of items; 'instructions' may be a list of steps or
            any other value (rendered with str()).

    Returns:
        Formatted recipe context string ("" when none of the keys are usable)
    """
    context_parts = []

    # Add recipe title if available
    if 'title' in recipe_data:
        context_parts.append(f"Recipe: {recipe_data['title']}")

    # Add ingredients if available
    ingredients = recipe_data.get('ingredients')
    if ingredients:
        if isinstance(ingredients, str):
            # A plain string is already the full list; joining it would
            # insert ", " between every character
            ingredients_text = ingredients
        else:
            # str() keeps non-string items (e.g. numbers) from breaking join
            ingredients_text = ", ".join(str(item) for item in ingredients)
        context_parts.append(f"Ingredients: {ingredients_text}")

    # Add instructions if available
    instructions = recipe_data.get('instructions')
    if instructions:
        if isinstance(instructions, list):
            instructions_text = " ".join(str(step) for step in instructions)
        else:
            instructions_text = str(instructions)
        context_parts.append(f"Instructions: {instructions_text}")

    return " | ".join(context_parts)


def convert_recipe_mpr_to_icm(recipe_mpr_data: List[Dict[str, Any]]) -> ICMDataset:
    """
    Turn RecipeMPR records into an ICMDataset of true/false claims.

    Each option of each usable record becomes its own ICMExample: the input
    text is the claim prompt built by format_icm_input, and the metadata
    records whether that option is the record's answer ('gold_label').

    A record with no query/question, no options, or no answer is skipped with
    a printed warning. A record that raises while being processed is reported
    and skipped; examples already appended for it are kept.

    Args:
        recipe_mpr_data: List of RecipeMPR examples

    Returns:
        ICMDataset object wrapping the generated examples
    """
    converted = []

    for record_index, record in enumerate(recipe_mpr_data):
        try:
            # The prompt text lives under 'query', with 'question' as a fallback key
            query_text = record.get('query', record.get('question', ''))
            if not query_text:
                print(f"Warning: No question found in example {record_index}, skipping...")
                continue

            # Candidate dishes: option hash -> dish name
            candidates = record.get('options', {})
            if not candidates:
                print(f"Warning: No options found in example {record_index}, skipping...")
                continue

            # Hash of the option marked as the answer
            gold_hash = record.get('answer', '')
            if not gold_hash:
                print(f"Warning: No answer found in example {record_index}, skipping...")
                continue

            # Optional annotations copied into every example's metadata
            query_type = record.get('query_type', {})
            explanation = record.get('correctness_explanation', {})

            # One claim per candidate dish
            for option_hash, dish_name in candidates.items():
                if option_hash == gold_hash:
                    gold_label = 'True'
                else:
                    gold_label = 'False'

                metadata = {
                    'gold_label': gold_label,
                    'task': 'recipe_preference',
                    'original_example_id': record_index,
                    'option_hash': option_hash,
                    'dish_name': dish_name,
                    'source': 'RecipeMPR',
                    'query_type': query_type,
                }

                # Only attach the explanation when the record provides one
                if explanation:
                    metadata['correctness_explanation'] = explanation

                converted.append(
                    ICMExample(
                        format_icm_input(question=query_text, answer=dish_name),
                        metadata,
                    )
                )

        except Exception as error:
            print(f"Error processing example {record_index}: {error}")
            continue

    print(f"Converted {len(converted)} examples to ICM format")
    return ICMDataset(examples=converted)



In [26]:
# Load the raw RecipeMPR records from 500QA.json in the working directory.
recipe_data = load_recipe_mpr_dataset("500QA.json")

Loaded 500 examples from 500QA.json


In [27]:
# Optional: uncomment to limit the run to a small subset while debugging.
# recipe_data = recipe_data[:2]
# print(f"Limited to {len(recipe_data)} examples")

In [28]:
# Expand every record into one ICM true/false example per candidate option.
icm_dataset = convert_recipe_mpr_to_icm(recipe_data)

Converted 2500 examples to ICM format


In [29]:
# Spot-check one converted example (index 1) to verify the prompt text and metadata.
icm_dataset[1]

ICMExample(input_text='Query: I want to make a warm dish containing oysters\nClaim: I think the dish you would prefer is Seasoned salted crackers shaped like oysters\nI think this Claim is [True/False]', metadata={'gold_label': 'False', 'task': 'recipe_preference', 'original_example_id': 0, 'option_hash': '5b9441298f', 'dish_name': 'Seasoned salted crackers shaped like oysters', 'source': 'RecipeMPR', 'query_type': {'Specific': 0, 'Commonsense': 1, 'Negated': 0, 'Analogical': 0, 'Temporal': 0}, 'correctness_explanation': {'warm dish': 'soup', 'oysters': 'oyster'}})

In [30]:
# Custom consistency checker
# NOTE(review): `checker` is never passed to ICMSearcher or used again in the
# visible cells — presumably it was meant to be supplied to the searcher;
# confirm the constructor argument against the ICM API before wiring it in.
checker = LogicalConsistencyChecker()

# Advanced searcher: model, alpha and seed are set here; every other
# ICMSearcher option is left at the library default.
searcher = ICMSearcher(
    model_name="Qwen/Qwen2.5-3B",
    alpha=50.0,
    seed=42
)

# Run the ICM search over the full converted dataset (long-running: the
# recorded run took about 52 minutes).
result = searcher.search(icm_dataset)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

ICM Search: 100%|██████████| 1000/1000 [52:09<00:00,  3.13s/it]


In [31]:
# Access results: report how many labeled examples the search returned and
# the final score attached to the result.
print(f"Generated {len(result.labeled_examples)} labeled examples")
print(f"Final score: {result.score:.4f}")

Generated 21 labeled examples
Final score: -65.6638


In [33]:
from icm.storage import ICMStorage
from icm.exporters import ICMExporter

# Save results: persist the search result via ICMStorage("my_results") under
# the given run name. In the recorded run the cell displayed a .jsonl path
# inside my_results/ with a timestamp suffix appended to the name.
storage = ICMStorage("my_results")
storage.save_result(result, "recipempr_qwen25-3b_1000")

'my_results/recipempr_qwen25-3b_1000_20250904_083223.jsonl'