# Benchmark Dataset

In [None]:
import json
import random
import csv
import os
from collections import Counter

# Word banks: maps each of the 11 category names to its list of member words
# (between 43 and 63 words per category, 534 entries in total).
# NOTE: category membership is not mutually exclusive. A few words are listed
# under two categories: "orange", "lime" and "peach" (fruit and color),
# "temple" (body_part and building) and "calm" (weather and emotion). Code that
# labels a word by the category it was sampled from must account for this.
word_banks = {
    "fruit": [
        "apple", "banana", "cherry", "grape", "orange", "lemon", "peach", "pear", "plum", "berry",
        "melon", "kiwi", "mango", "lime", "date", "fig", "apricot", "coconut", "papaya", "guava",
        "pomegranate", "avocado", "strawberry", "blueberry", "raspberry", "blackberry", "cranberry",
        "watermelon", "cantaloupe", "honeydew", "grapefruit", "tangerine", "nectarine", "persimmon",
        "dragonfruit", "passionfruit", "starfruit", "lychee", "rambutan", "jackfruit", "durian",
        "elderberry", "gooseberry", "currant", "mulberry", "boysenberry", "lingonberry", "cloudberry"
    ],

    "animal": [
        "dog", "cat", "bird", "fish", "lion", "bear", "wolf", "deer", "mouse", "rabbit",
        "horse", "cow", "pig", "sheep", "goat", "chicken", "duck", "goose", "turkey", "eagle",
        "hawk", "owl", "parrot", "canary", "pigeon", "crow", "sparrow", "robin", "cardinal",
        "tiger", "leopard", "cheetah", "jaguar", "panther", "elephant", "rhino", "hippo", "giraffe",
        "zebra", "kangaroo", "koala", "panda", "monkey", "gorilla", "chimpanzee", "orangutan",
        "whale", "dolphin", "shark", "octopus", "squid", "crab", "lobster", "shrimp", "jellyfish",
        "turtle", "frog", "toad", "snake", "lizard", "crocodile", "alligator", "iguana"
    ],

    "vehicle": [
        "car", "bus", "truck", "bike", "plane", "boat", "train", "taxi", "van", "ship",
        "jet", "helicopter", "motorcycle", "scooter", "bicycle", "tricycle", "subway", "tram",
        "ferry", "yacht", "canoe", "kayak", "sailboat", "speedboat", "cruise", "cargo", "tanker",
        "ambulance", "firetruck", "police", "limousine", "convertible", "sedan", "hatchback",
        "wagon", "pickup", "trailer", "semi", "bulldozer", "excavator", "crane", "forklift",
        "tractor", "combine", "harvester", "snowplow", "garbage", "delivery", "rickshaw"
    ],

    "color": [
        "red", "blue", "green", "yellow", "black", "white", "pink", "brown", "gray", "purple",
        "orange", "silver", "gold", "violet", "cyan", "magenta", "turquoise", "indigo", "maroon",
        "navy", "teal", "lime", "olive", "aqua", "fuchsia", "coral", "salmon", "peach", "beige",
        "tan", "khaki", "ivory", "cream", "pearl", "platinum", "bronze", "copper", "rust",
        "crimson", "scarlet", "burgundy", "lavender", "lilac", "periwinkle", "azure", "cobalt"
    ],

    "body_part": [
        "head", "face", "eye", "nose", "mouth", "ear", "neck", "shoulder", "arm", "elbow",
        "wrist", "hand", "finger", "thumb", "nail", "chest", "back", "waist", "hip", "leg",
        "thigh", "knee", "shin", "ankle", "foot", "toe", "heel", "brain", "heart",
        "lung", "liver", "kidney", "stomach", "intestine", "muscle", "bone", "skin", "hair",
        "eyebrow", "eyelash", "cheek", "chin", "forehead", "temple", "jaw", "tooth", "tongue"
    ],

    "tool": [
        "hammer", "wrench", "screwdriver", "drill", "saw", "pliers", "knife", "scissors", "ruler",
        "tape", "level", "square", "chisel", "file", "sandpaper", "clamp", "vise", "anvil",
        "toolbox", "workbench", "ladder", "stepladder", "crowbar", "pickaxe", "shovel", "rake",
        "hoe", "spade", "trowel", "pruner", "shears", "mower", "trimmer", "blower", "chainsaw",
        "welder", "grinder", "router", "jigsaw", "bandsaw", "lathe", "press", "compressor"
    ],

    "clothing": [
        "shirt", "pants", "dress", "skirt", "jacket", "coat", "sweater", "hoodie", "blouse", "top",
        "jeans", "shorts", "trousers", "suit", "tie", "scarf", "hat", "cap", "gloves", "socks",
        "shoes", "boots", "sandals", "sneakers", "heels", "flats", "loafers", "slippers", "belt",
        "vest", "cardigan", "blazer", "tuxedo", "gown", "robe", "pajamas", "underwear", "bra",
        "bikini", "swimsuit", "uniform", "overalls", "jumpsuit", "romper", "tunic", "poncho"
    ],

    "sport": [
        "football", "basketball", "baseball", "soccer", "tennis", "golf", "hockey", "volleyball",
        "cricket", "rugby", "boxing", "wrestling", "swimming", "diving", "track", "marathon",
        "cycling", "skiing", "snowboard", "surfing", "skateboard", "bowling", "billiards", "darts",
        "archery", "fencing", "karate", "judo", "taekwondo", "gymnastics", "cheerleading", "dance",
        "yoga", "pilates", "aerobics", "crossfit", "weightlifting", "powerlifting", "bodybuilding",
        "climbing", "hiking", "camping", "fishing", "hunting", "sailing", "rowing", "kayaking"
    ],

    "building": [
        "house", "apartment", "mansion", "cottage", "cabin", "castle", "palace", "tower", "skyscraper",
        "office", "store", "shop", "mall", "market", "restaurant", "cafe", "bar", "hotel", "motel",
        "hospital", "clinic", "school", "university", "library", "museum", "theater", "cinema",
        "church", "temple", "mosque", "synagogue", "cathedral", "chapel", "monastery", "convent",
        "factory", "warehouse", "garage", "barn", "shed", "greenhouse", "lighthouse", "windmill",
        "bridge", "tunnel", "dam", "fort", "bunker", "observatory", "planetarium", "aquarium"
    ],

    "weather": [
        "sun", "rain", "snow", "wind", "cloud", "storm", "thunder", "lightning", "hail", "sleet",
        "fog", "mist", "drizzle", "shower", "downpour", "blizzard", "tornado", "hurricane", "cyclone",
        "typhoon", "drought", "flood", "frost", "ice", "dew", "humidity", "pressure", "temperature",
        "heat", "cold", "warm", "cool", "hot", "freezing", "mild", "severe", "gentle", "fierce",
        "calm", "breezy", "gusty", "windy", "stormy", "sunny", "cloudy", "overcast", "clear"
    ],

    "emotion": [
        "happy", "sad", "angry", "excited", "nervous", "calm", "peaceful", "anxious", "worried",
        "scared", "afraid", "brave", "confident", "shy", "proud", "ashamed", "guilty", "innocent",
        "curious", "bored", "interested", "fascinated", "amazed", "surprised", "shocked", "confused",
        "frustrated", "annoyed", "irritated", "pleased", "satisfied", "content", "grateful",
        "thankful", "hopeful", "optimistic", "pessimistic", "depressed", "elated", "ecstatic",
        "enthusiastic", "passionate", "loving", "caring", "compassionate", "empathetic", "sympathetic"
    ]
}


def save_word_banks(output_dir="counting_dataset"):
    """Write the module-level ``word_banks`` mapping to ``word_banks.json``.

    The target directory is created when it does not exist yet. After the
    file is written, a short report is printed: the file path, the number of
    categories, the total number of words, and one line per category.

    Args:
        output_dir: Directory that receives ``word_banks.json``.
    """
    os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, 'word_banks.json')

    with open(filepath, 'w') as out_file:
        out_file.write(json.dumps(word_banks, indent=2))

    # Compute the per-category sizes once and derive every figure from them.
    sizes = {name: len(entries) for name, entries in word_banks.items()}

    print(f"Word banks saved to: {filepath}")
    print(f"Categories: {len(sizes)}")
    print(f"Total category words: {sum(sizes.values())}")

    for name, size in sizes.items():
        print(f"  {name}: {size} words")

def generate_example(category, list_length=7):
    """Generate a single counting example for ``category``.

    The number of target-category words is drawn uniformly from
    ``0 .. min(len(word_banks[category]), list_length)``. The remaining slots
    are filled with words from the other categories and the list is shuffled.
    The count is uniform only for a fixed ``list_length``; mixing several list
    lengths makes large counts rarer overall.

    Some words are listed under more than one category (for example "orange"
    is both a fruit and a color). Fillers are therefore drawn only from words
    that do NOT belong to the target category; otherwise a filler could
    silently be a category member and ``answer`` would under-count. Fillers
    may still repeat, since they are sampled with replacement.

    Args:
        category: Key of ``word_banks`` whose members are to be counted.
        list_length: Total number of words in the generated list.

    Returns:
        A dict with keys ``type`` (the category), ``list_items`` (the words,
        each wrapped in single quotes and joined by ``", "``), ``list_length``
        and ``answer`` (how many list words belong to the category).

    Raises:
        KeyError: If ``category`` is not a key of ``word_banks``.
        ValueError: If ``list_length`` is negative, or if fillers are needed
            but no other category offers a word outside the target category.
    """
    target_words = word_banks[category]
    target_set = set(target_words)

    # UNIFORM METHOD: equal probability for each possible count
    max_matches = min(len(target_words), list_length)
    num_matches = random.randint(0, max_matches)

    # Select matches from the target category (without replacement)
    matches = random.sample(target_words, num_matches) if num_matches > 0 else []

    remaining_slots = list_length - len(matches)
    fillers = []
    if remaining_slots > 0:
        # One pool per other category, stripped of words that are also members
        # of the target category so the label stays correct.
        filler_pools = []
        for other_cat, words in word_banks.items():
            if other_cat == category:
                continue
            pool = [word for word in words if word not in target_set]
            if pool:
                filler_pools.append(pool)

        if not filler_pools:
            raise ValueError(
                f"No filler words available outside category '{category}'"
            )

        # Pick a category uniformly, then a word uniformly within it
        for _ in range(remaining_slots):
            fillers.append(random.choice(random.choice(filler_pools)))

    # Combine and shuffle so target words land at random positions
    word_list = matches + fillers
    random.shuffle(word_list)

    return {
        'type': category,
        'list_items': ', '.join([f"'{word}'" for word in word_list]),  # quoted, comma-separated
        'list_length': len(word_list),
        'answer': len(matches)
    }

def create_dataset(target_size=5000, list_lengths=None, seed=None):
    """Generate a list of counting examples.

    For every example a category is picked uniformly from ``word_banks`` and a
    list length is picked uniformly from ``list_lengths``; the example itself
    is produced by ``generate_example``.

    Args:
        target_size: Number of examples to attempt.
        list_lengths: Iterable of allowed list lengths. Defaults to
            ``[5, 6, 7, 8, 9, 10]``.
        seed: Optional seed passed to ``random.seed`` so that the same dataset
            can be regenerated. ``None`` leaves the global generator untouched.

    Returns:
        The list of generated example dicts. It can be shorter than
        ``target_size`` when individual examples fail and are skipped.

    Raises:
        ValueError: If ``list_lengths`` is empty.
    """
    if seed is not None:
        random.seed(seed)

    if list_lengths is None:
        list_lengths = [5, 6, 7, 8, 9, 10]  # Variable list lengths
    else:
        list_lengths = list(list_lengths)
    if not list_lengths:
        raise ValueError("list_lengths must contain at least one length")

    examples = []
    categories_list = list(word_banks.keys())

    for i in range(target_size):
        category = random.choice(categories_list)
        list_length = random.choice(list_lengths)

        # Best effort: skip examples that fail because of the data (missing
        # category, empty word list, impossible length) but let genuine
        # programming errors surface instead of hiding them.
        try:
            example = generate_example(category, list_length)
        except (KeyError, ValueError, IndexError) as e:
            print(f"Error generating example {i}: {e}")
            continue

        examples.append(example)

    return examples

def validate_dataset(dataset):
    """Print summary statistics for ``dataset`` and return its unique-example count.

    The report covers the total number of examples, the distribution of
    answers (raw and as percentages), of categories and of list lengths, and
    the number of duplicates. Two examples are duplicates when both their
    ``type`` and their ``list_items`` are equal.

    Args:
        dataset: List of example dicts with the keys ``type``, ``list_items``,
            ``list_length`` and ``answer``.

    Returns:
        ``len(dataset)`` minus the number of duplicate examples.
    """
    total = len(dataset)
    print("\n=== Dataset Validation ===")
    print(f"Total examples: {total}")

    # Answers: raw counts first, then a sorted percentage table
    answers = Counter(ex['answer'] for ex in dataset)
    print(f"Answer distribution: {dict(answers)}")
    print("Answer distribution (percentages):")
    for value in sorted(answers):
        occurrences = answers[value]
        share = (occurrences / total) * 100
        print(f"  {value}: {occurrences} examples ({share:.1f}%)")

    # Categories
    categories = Counter(ex['type'] for ex in dataset)
    print(f"Category distribution: {dict(categories)}")

    # List lengths
    lengths = Counter(ex['list_length'] for ex in dataset)
    print(f"List length distribution: {dict(lengths)}")

    # Duplicates: everything beyond the first occurrence of a (type, items) key
    distinct_keys = {(ex['type'], ex['list_items']) for ex in dataset}
    duplicates = total - len(distinct_keys)
    print(f"Duplicate examples: {duplicates}")

    return total - duplicates

def export_dataset(dataset, output_dir="counting_dataset"):
    """Write ``dataset`` to a single CSV file and return the file's path.

    The file is named ``counting_dataset_<number of examples>.csv`` and holds
    the columns ``type``, ``list_items``, ``list_length`` and ``answer``. The
    output directory is created when missing.

    The final status line only prints where ``word_banks.json`` is expected
    inside ``output_dir``; this function does not write that file.

    Args:
        dataset: List of example dicts with exactly the four CSV columns.
        output_dir: Directory that receives the CSV file.

    Returns:
        Path of the CSV file that was written.
    """
    os.makedirs(output_dir, exist_ok=True)

    total = len(dataset)
    columns = ['type', 'list_items', 'list_length', 'answer']
    dataset_path = os.path.join(output_dir, f'counting_dataset_{total}.csv')

    # newline='' lets the csv module control line endings itself
    with open(dataset_path, 'w', newline='', encoding='utf-8') as handle:
        csv_writer = csv.DictWriter(handle, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(dataset)

    word_banks_path = os.path.join(output_dir, 'word_banks.json')
    print(f"\nDataset exported to: {dataset_path}")
    print(f"Total examples: {total}")
    print(f"Word banks saved to: {word_banks_path}")

    return dataset_path

# Main execution
if __name__ == "__main__":
    # End-to-end run: save the word banks, generate, validate and export the
    # dataset, then print a few samples and a short loading hint.
    output_directory = "counting_dataset"

    # Save word banks
    save_word_banks(output_directory)

    # Generate dataset
    print("\nGenerating dataset with UNIFORM distribution...")
    dataset = create_dataset(5000)

    # Validate (prints the report; the return value is the unique-example count)
    valid_count = validate_dataset(dataset)

    # Export
    dataset_path = export_dataset(dataset, output_dir=output_directory)

    # Show sample examples. Slicing instead of indexing 0..2 keeps this from
    # raising IndexError when fewer than three examples were generated.
    print("\n=== Sample Examples ===")
    for number, ex in enumerate(dataset[:3], start=1):
        print(f"\nExample {number}:")
        print(f"Type: {ex['type']}")
        print(f"List: {ex['list_items']}")
        print(f"Answer: {ex['answer']}")

    # Show CSV loading example. The hint uses ast.literal_eval rather than
    # eval so that copying it never executes arbitrary CSV content.
    print("\n=== Quick Usage Example ===")
    print("import ast")
    print("import pandas as pd")
    print(f"df = pd.read_csv('{dataset_path}')")
    print("print(df.head())")
    print("\n# Easy to work with - list_items are ready to use:")
    print("word_list = ast.literal_eval('[' + df.iloc[0]['list_items'] + ']')")

Word banks saved to: counting_dataset/word_banks.json
Categories: 11
Total category words: 534
  fruit: 48 words
  animal: 63 words
  vehicle: 49 words
  color: 46 words
  body_part: 47 words
  tool: 43 words
  clothing: 46 words
  sport: 47 words
  building: 51 words
  weather: 47 words
  emotion: 47 words

Generating dataset with UNIFORM distribution...

=== Dataset Validation ===
Total examples: 5000
Answer distribution: {1: 600, 2: 628, 3: 602, 0: 651, 9: 163, 6: 452, 4: 622, 7: 374, 5: 613, 10: 64, 8: 231}
Answer distribution (percentages):
  0: 651 examples (13.0%)
  1: 600 examples (12.0%)
  2: 628 examples (12.6%)
  3: 602 examples (12.0%)
  4: 622 examples (12.4%)
  5: 613 examples (12.3%)
  6: 452 examples (9.0%)
  7: 374 examples (7.5%)
  8: 231 examples (4.6%)
  9: 163 examples (3.3%)
  10: 64 examples (1.3%)
Category distribution: {'body_part': 487, 'building': 414, 'emotion': 449, 'vehicle': 503, 'animal': 477, 'sport': 418, 'weather': 478, 'color': 436, 'fruit': 435, 'cl

In [None]:
import shutil
import os

# Copy every regular file of the generated dataset folder into the project
# folder on the mounted Drive path.
src_dir = '/content/counting_dataset'
dst_dir = '/content/drive/MyDrive/counting_project/counting_dataset'

# The destination may not exist yet on a fresh run
os.makedirs(dst_dir, exist_ok=True)

for filename in os.listdir(src_dir):
    src_file = os.path.join(src_dir, filename)
    dst_file = os.path.join(dst_dir, filename)

    # Subdirectories (and anything else that is not a file) are left behind
    if not os.path.isfile(src_file):
        continue

    # copy2 also carries over file metadata such as timestamps
    shutil.copy2(src_file, dst_file)

# Causal Mediator Dataset

In [None]:
import json
import os
import random
from collections import defaultdict
from typing import Dict, List, Optional, Tuple

class CMAWordCountDataGenerator:
    """Build original/intervention word-list pairs for a category-counting task.

    A *base example* is a shuffled word list holding a known number of words
    from one target category plus distractors from the other categories. An
    *intervention pair* couples a base list with a copy in which exactly one
    target word is swapped for a non-target word, so the correct count drops
    by one.

    A word bank may list the same word under several categories. Every
    distractor and every replacement word is therefore checked against the
    FULL target category, not just the sampled target words, so the recorded
    counts stay correct.
    """

    def __init__(self, word_banks_path: str = "word_banks.json"):
        """Load the category -> word-list mapping from a JSON file.

        Args:
            word_banks_path: Path of the JSON file to load.
        """
        with open(word_banks_path, 'r', encoding='utf-8') as f:
            self.word_banks = json.load(f)

        self.categories = list(self.word_banks.keys())
        print(f"Loaded {len(self.categories)} categories: {self.categories}")

    def create_base_example(self, target_category: str, list_length: int = 7) -> Dict:
        """Create one base example for counting words of ``target_category``.

        Args:
            target_category: Category whose members are to be counted.
            list_length: Total number of words in the list (at least 1).

        Returns:
            A dict with the keys ``category``, ``word_list``, ``target_words``,
            ``distractor_words``, ``target_positions`` (indices of the target
            words in ``word_list``), ``count`` and ``list_length``.

        Raises:
            KeyError: If ``target_category`` is not in the word banks.
            ValueError: If ``list_length`` is below 1, or if the other
                categories do not hold enough distinct non-target words.
        """
        if list_length < 1:
            raise ValueError(f"list_length must be >= 1, got {list_length}")

        category_words = self.word_banks[target_category]
        category_set = set(category_words)

        # Number of target words: 1 to list_length-2 for lists of four or more
        # words; shorter lists allow up to 2, but never more than list_length.
        max_targets = min(max(2, list_length - 2), list_length)
        num_target_words = random.randint(1, max_targets)

        # Sample words from target category (without replacement)
        target_words = random.sample(category_words,
                                     min(num_target_words, len(category_words)))

        num_distractors = list_length - len(target_words)
        other_categories = [cat for cat in self.categories
                            if cat != target_category and self.word_banks[cat]]

        # Fail fast instead of looping forever when the rejection sampling
        # below could never collect enough distinct distractors.
        eligible = set()
        for cat in other_categories:
            eligible.update(self.word_banks[cat])
        eligible -= category_set
        if len(eligible) < num_distractors:
            raise ValueError(
                f"Need {num_distractors} distinct distractors for category "
                f"'{target_category}' but only {len(eligible)} are available"
            )

        # Sample distractors: pick a category, then a word, and reject words
        # that were already used or that belong to the target category.
        distractor_words = []
        used = set()
        while len(distractor_words) < num_distractors:
            random_category = random.choice(other_categories)
            word = random.choice(self.word_banks[random_category])
            if word in category_set or word in used:
                continue
            used.add(word)
            distractor_words.append(word)

        # Create word list and shuffle
        word_list = target_words + distractor_words
        random.shuffle(word_list)

        # Record positions of target words
        target_set = set(target_words)
        target_positions = [i for i, word in enumerate(word_list)
                            if word in target_set]

        return {
            'category': target_category,
            'word_list': word_list,
            'target_words': target_words,
            'distractor_words': distractor_words,
            'target_positions': target_positions,
            'count': len(target_words),
            'list_length': list_length
        }

    def create_intervention_pair(self, base_example: Dict) -> Optional[Dict]:
        """Create an intervention by replacing one target word with a non-target word.

        The replacement is drawn from a randomly chosen other category. It is
        never a word already in the list and never a member of the target
        category, so the count of the intervention list is exactly one lower.

        Args:
            base_example: A dict as returned by ``create_base_example``.

        Returns:
            A dict describing the pair (``category``, ``original_list``,
            ``intervention_list``, ``original_count``, ``intervention_count``,
            ``intervention_position``, ``original_word``,
            ``intervention_word``, ``intervention_category``), or ``None``
            when the example has no target word or no replacement exists.
        """
        target_positions = base_example['target_positions']
        if not target_positions:
            return None

        other_categories = [cat for cat in self.categories if cat != base_example['category']]
        if not other_categories:
            return None

        # Choose a random target word position to intervene
        intervention_pos = random.choice(target_positions)
        original_word = base_example['word_list'][intervention_pos]

        present = set(base_example['word_list'])
        target_set = set(self.word_banks[base_example['category']])

        def candidates(cat):
            """Words of ``cat`` that are neither in the list nor target members."""
            return [w for w in self.word_banks[cat]
                    if w not in present and w not in target_set]

        # Choose a replacement category at random; fall back to the first
        # other category that still offers a usable word.
        replacement_category = random.choice(other_categories)
        available_words = candidates(replacement_category)

        if not available_words:
            for cat in other_categories:
                available_words = candidates(cat)
                if available_words:
                    replacement_category = cat
                    break

        if not available_words:
            return None

        intervention_word = random.choice(available_words)

        # Create intervention list (copy so the base list stays untouched)
        intervention_list = base_example['word_list'].copy()
        intervention_list[intervention_pos] = intervention_word

        return {
            'category': base_example['category'],
            'original_list': base_example['word_list'],
            'intervention_list': intervention_list,
            'original_count': base_example['count'],
            'intervention_count': base_example['count'] - 1,  # One less target word
            'intervention_position': intervention_pos,
            'original_word': original_word,
            'intervention_word': intervention_word,
            'intervention_category': replacement_category
        }

    def generate_dataset_pairs(self, num_pairs: int = 3000, output_dir: str = "/content/counting_dataset") -> List[Dict]:
        """Generate intervention pairs and save them as JSON in ``output_dir``.

        Base examples use a random category and a random list length between
        5 and 8. At most ``num_pairs * 3`` attempts are made, so fewer than
        ``num_pairs`` pairs can be returned when many attempts yield no pair.

        Args:
            num_pairs: Number of pairs to aim for.
            output_dir: Directory that receives ``cma_intervention_pairs.json``.

        Returns:
            The list of generated pairs, each extended with a ``pair_id``.
        """
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        intervention_pairs = []
        attempts = 0
        max_attempts = num_pairs * 3  # Allow more attempts to reach target

        while len(intervention_pairs) < num_pairs and attempts < max_attempts:
            attempts += 1

            # Create a base example
            category = random.choice(self.categories)
            list_length = random.randint(5, 8)
            base_example = self.create_base_example(category, list_length)

            # Create intervention pair
            intervention = self.create_intervention_pair(base_example)
            if intervention:
                intervention['pair_id'] = len(intervention_pairs)
                intervention_pairs.append(intervention)

        # Save the pairs
        output_path = os.path.join(output_dir, "cma_intervention_pairs.json")
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(intervention_pairs, f, indent=2)

        print(f"\nGenerated {len(intervention_pairs)} intervention pairs")
        print(f"Saved to: {output_path}")

        # Print basic statistics
        self.print_statistics(intervention_pairs)

        return intervention_pairs

    def print_statistics(self, intervention_pairs: List[Dict]):
        """Print how the pairs are spread over categories and intervention positions.

        Args:
            intervention_pairs: Pairs as returned by ``create_intervention_pair``.
        """
        print("\n=== Dataset Statistics ===")
        print(f"Total intervention pairs: {len(intervention_pairs)}")

        category_counts = defaultdict(int)
        position_dist = defaultdict(int)

        for pair in intervention_pairs:
            category_counts[pair['category']] += 1
            position_dist[pair['intervention_position']] += 1

        print("\nCategory distribution:")
        for cat, count in sorted(category_counts.items()):
            print(f"  {cat}: {count}")

        print("\nIntervention position distribution:")
        for pos, count in sorted(position_dist.items()):
            print(f"  Position {pos}: {count}")


# Example usage
if __name__ == "__main__":
    # Load the word banks from the counting-dataset folder
    generator = CMAWordCountDataGenerator("/content/counting_dataset/word_banks.json")

    # Build the intervention pairs; they are written to the CMA analysis folder
    pairs = generator.generate_dataset_pairs(num_pairs=3000, output_dir="/content/CMA_analysis")

    # Preview at most the first three pairs
    print("\n=== Example Pairs ===")
    for pair in pairs[:3]:
        preview = [
            f"\nPair {pair['pair_id']}:",
            f"  Category: {pair['category']}",
            f"  Original: {pair['original_list']} (count: {pair['original_count']})",
            f"  Intervention: {pair['intervention_list']} (count: {pair['intervention_count']})",
            f"  Changed: '{pair['original_word']}' -> '{pair['intervention_word']}' at position {pair['intervention_position']}",
        ]
        for line in preview:
            print(line)