In [3]:
import csv
import json

def preprocess_recipes_from_csv(input_csv, output_file):
    """
    Preprocesses the Recipe1M+ dataset from a CSV file into instruction-response format.

    Each usable row becomes a dict with an "instruction" key (a prompt listing
    the ingredients as bullet points) and a "response" key (the recipe wrapped
    in <recipe>/<title>/<ingredients>/<directions> tags). The resulting list is
    written to ``output_file`` as a single JSON array.

    The CSV must have a header row with 'title', 'ingredients' and 'directions'
    columns; the 'ingredients' and 'directions' cells are split on '|'.
    Rows are skipped when any of the three fields is missing, blank, or
    contains nothing but separators/whitespace.

    Args:
        input_csv (str): Path to the raw Recipe1M+ dataset file (CSV format, UTF-8).
        output_file (str): Path to save the preprocessed dataset in instruction-response format.

    Returns:
        None. Progress and a final summary are printed to stdout.
    """
    preprocessed_data = []
    counter = 0  # Number of CSV rows read so far (including skipped ones)

    # newline="" is what the csv module expects so that newlines embedded in
    # quoted fields are parsed correctly; the explicit encoding avoids
    # depending on the platform's default locale.
    with open(input_csv, "r", encoding="utf-8", newline="") as infile:
        for row in csv.DictReader(infile):
            counter += 1

            # Report progress for every 50,000 rows read. This happens before
            # any skip so the message is not lost when that row is discarded.
            if counter % 50000 == 0:
                print(f"Processed {counter} entries...")

            # DictReader yields None for columns missing from a short row,
            # so fall back to "" before stripping.
            title = (row.get("title") or "").strip()
            raw_ingredients = row.get("ingredients") or ""
            raw_directions = row.get("directions") or ""

            # Drop empty segments first so bullets/step numbers stay contiguous.
            ingredients = [part.strip() for part in raw_ingredients.split("|") if part.strip()]
            steps = [part.strip() for part in raw_directions.split("|") if part.strip()]

            # Skip recipes with missing fields (including fields that are
            # empty once separators and whitespace are removed).
            if not title or not ingredients or not steps:
                continue

            ingredients_list = "\n".join(f"- {item}" for item in ingredients)
            # Number after filtering: "a | | b" becomes steps 1 and 2, not 1 and 3.
            directions_list = "\n".join(
                f"{number}. {step}" for number, step in enumerate(steps, start=1)
            )

            instruction = f"Write a recipe containing the following ingredients only:\n{ingredients_list}"
            response = (
                f"<recipe>\n<title> {title} </title>\n"
                f"<ingredients>\n{ingredients_list}\n</ingredients>\n"
                f"<directions>\n{directions_list}\n</directions>\n</recipe>"
            )

            preprocessed_data.append({
                "instruction": instruction,
                "response": response
            })

    # Save the preprocessed data to a JSON file
    with open(output_file, "w", encoding="utf-8") as outfile:
        json.dump(preprocessed_data, outfile, indent=4)

    print(f"Preprocessed {len(preprocessed_data)} recipes and saved to {output_file}")

# Example usage: convert the local dataset CSV into the instruction-response JSON.
# Adjust the two paths below to point at your own Recipe1M+ CSV file and at the
# location where the preprocessed JSON should be written.
input_csv_path = './data/dataset/full_dataset.csv'
output_json_path = './data/preprocessed_recipes.json'
preprocess_recipes_from_csv(
    input_csv=input_csv_path,
    output_file=output_json_path,
)


Processed 50000 entries...
Processed 100000 entries...
Processed 150000 entries...
Processed 200000 entries...
Processed 250000 entries...
Processed 300000 entries...
Processed 350000 entries...
Processed 400000 entries...
Processed 450000 entries...
Processed 500000 entries...
Processed 550000 entries...
Processed 600000 entries...
Processed 650000 entries...
Processed 700000 entries...
Processed 750000 entries...
Processed 800000 entries...
Processed 850000 entries...
Processed 900000 entries...
Processed 950000 entries...
Processed 1000000 entries...
Processed 1050000 entries...
Processed 1100000 entries...
Processed 1150000 entries...
Processed 1200000 entries...
Processed 1250000 entries...
Processed 1300000 entries...
Processed 1350000 entries...
Processed 1400000 entries...
Processed 1450000 entries...
Processed 1500000 entries...
Processed 1550000 entries...
Processed 1600000 entries...
Processed 1650000 entries...
Processed 1700000 entries...
Processed 1750000 entries...
Proce