In [48]:
import pandas as pd
import json
import re


In [49]:
def preprocess_puzzle(puzzle_text):

    parts = re.split(r'##\s*clues:', puzzle_text, flags=re.IGNORECASE)
    
    if len(parts) < 2:
        return "", []
    
    characteristics_text = parts[0]
    clues_text = parts[1]
    
    # Extract clues, removing the number prefix (e.g., "1. ", "2. ")
    clues = []
    for line in clues_text.split('\n'):
        line = line.strip()
        if line:
            # Remove leading number and period (e.g., "1. ", "2. ")
            clue = re.sub(r'^\d+\.\s+', '', line)
            if clue:
                clues.append(clue)
    
    return characteristics_text, clues

def extract_attributes(characteristics_text):
    """Extract attribute columns from characteristics section."""
    attributes = {}
    
    lines = characteristics_text.split('\n')
    
    for line in lines:
        # Match lines that have format: " - Description: `value1`, `value2`, ..."
        match = re.match(r'\s*-\s*(.+?):\s*(.+)', line)
        if match:
            description = match.group(1).strip()
            values_str = match.group(2)

            words = description.split()
            attr_name = words[-1] if words else "unknown"
            
            # Extract all backtick-quoted values
            values = re.findall(r'`([^`]+)`', values_str)
            
            if values:
                attributes[attr_name] = values
    
    return pd.DataFrame(attributes)


In [50]:
def normalize(clue):
    """
    Normalize clues by removing common linguistic variations.
    Examples:
    - "the german is bob" → "bob is german"
    - "the person who is german is bob" → "bob is german"
    """
    clue = clue.lower()
    clue = clue.replace("the person who is", "")
    clue = clue.replace("the person who loves", "")
    clue = clue.replace("the person whose", "")
    clue = clue.replace("the person ", "")
    return clue.strip()

In [51]:
PATTERNS = {
    "not_at_position": re.compile(r"(.+)\s+is\s+not\s+in\s+the\s+(\w+)\s+house"),
    "at_position": re.compile(r"(.+)\s+(?:is\s+)?in\s+the\s+(\w+)\s+house"),
    "next_to": re.compile(r"(.+)\s+and\s+(.+)\s+are\s+next\s+to\s+each\s+other"),
    "direct_left": re.compile(r"(.+)\s+is\s+directly\s+(?:left|to\s+the\s+left)\s+of\s+(.+)"),
    "left_of": re.compile(r"(.+)\s+is\s+(?:somewhere\s+)?to\s+the\s+left\s+of\s+(.+)"),
    "distance": re.compile(r"there\s+(?:is|are)\s+(\w+)\s+house[s]?\s+between\s+(.+?)\s+and\s+(.+)"),
    "same": re.compile(r"(.+)\s+is\s+(.+)")
}

In [52]:
def extract_symbols(clue, known_entities):
    """
    Extract entity references from a clue based on known entities.
    Also extract position numbers (first, second, third, etc.)
    
    Args:
        clue: The normalized clue text
        known_entities: List of all known attribute values
    
    Returns:
        Dict with 'entities' (list) and 'position' (int or None)
    """
    entities = [e for e in known_entities if e.lower() in clue.lower()]
    
    # Extract position if present (first, second, third, etc.)
    position_map = {
        'first': 0, 'second': 1, 'third': 2, 'fourth': 3, 'fifth': 4,
        'sixth': 5, 'seventh': 6, 'eighth': 7, 'ninth': 8, 'tenth': 9
    }
    position = None
    for pos_word, pos_num in position_map.items():
        if pos_word in clue.lower():
            position = pos_num
            break
    
    return {'entities': entities, 'position': position}

In [53]:
def parse_clue(clue, known_entities):
    """
    Parse a clue and identify its type and relevant symbols.
    
    Args:
        clue: The clue text
        known_entities: List of all known attribute values
    
    Returns:
        Tuple of (constraint_type, symbol_data, match_groups)
        symbol_data: Dict with 'entities' (list) and 'position' (int or None)
        Example: ("not_at_position", {"entities": ["eric"], "position": 1}, ("eric", "second"))
    
    Raises:
        ValueError: If clue doesn't match any known pattern
    """
    clue = normalize(clue)
    
    # Try to match against each pattern type (order matters - check specific patterns first)
    for ctype, pattern in PATTERNS.items():
        match = pattern.search(clue)
        if match:
            symbols = extract_symbols(clue, known_entities)
            return ctype, symbols, match.groups()
    
    # If no pattern matched, raise an error
    raise ValueError(f"Unrecognized clue format: {clue}")

In [54]:
# 7️⃣ Parse Clues from Actual Puzzle
def parse_puzzle_clues(puzzle_text):
    """
    Parse all clues from the puzzle using the full pipeline.
    
    Args:
        puzzle_text: The raw puzzle text
    
    Returns:
        List of tuples: (original_clue, constraint_type, symbols, match_groups)
    """
    # Preprocess puzzle
    characteristics_text, clues = preprocess_puzzle(puzzle_text)
    
    # Extract all known entities from attributes
    attrs_df = extract_attributes(characteristics_text)
    known_entities = []
    for col in attrs_df.columns:
        known_entities.extend(attrs_df[col].tolist())
    
    parsed_clues = []
    
    for _, clue in enumerate(clues, 1):
        try:
            ctype, symbols, groups = parse_clue(clue, known_entities)
            parsed_clues.append((clue, ctype, symbols, groups))
        except ValueError as e:
            parsed_clues.append((clue, "UNKNOWN", [], ()))
    
    return parsed_clues

In [55]:
gridmode = pd.read_parquet("Gridmode-00000-of-00001.parquet")
mc = pd.read_parquet("mc-00000-of-00001.parquet")
index = 7
puzzle = gridmode.puzzle.iloc[index].lower()

print(puzzle)


there are 5 houses, numbered 1 to 5 from left to right, as seen from across the street. each house is occupied by a different person. each house has a unique attribute for each of the following characteristics:
 - each person has a unique name: `alice`, `peter`, `bob`, `eric`, `arnold`
 - each person has a unique level of education: `master`, `bachelor`, `doctorate`, `high school`, `associate`
 - each person has a favorite color: `yellow`, `green`, `blue`, `red`, `white`
 - each person prefers a unique type of vacation: `mountain`, `camping`, `cruise`, `beach`, `city`
 - people use unique phone models: `huawei p50`, `google pixel 6`, `iphone 13`, `oneplus 9`, `samsung galaxy s21`

## clues:
1. alice is the person with a master's degree.
2. the person who uses a oneplus 9 is the person with a doctorate.
3. the person who loves blue is somewhere to the left of bob.
4. the person who prefers city breaks is peter.
5. the person who loves beach vacations is the person whose favorite color i

In [56]:
# Preprocess the puzzle to separate characteristics and clues
characteristics_text, clues = preprocess_puzzle(puzzle)

# Extract attributes from the characteristics section
attrs = extract_attributes(characteristics_text)
print("Attributes:")
print(attrs)
print("\nClues:")
for _, clue in enumerate(clues, 1):
    print(clue)

Attributes:
     name    education   color  vacation              models
0   alice       master  yellow  mountain          huawei p50
1   peter     bachelor   green   camping      google pixel 6
2     bob    doctorate    blue    cruise           iphone 13
3    eric  high school     red     beach           oneplus 9
4  arnold    associate   white      city  samsung galaxy s21

Clues:
alice is the person with a master's degree.
the person who uses a oneplus 9 is the person with a doctorate.
the person who loves blue is somewhere to the left of bob.
the person who prefers city breaks is peter.
the person who loves beach vacations is the person whose favorite color is green.
the person who loves white is the person who uses an iphone 13.
the person who uses a oneplus 9 is the person who enjoys mountain retreats.
the person who uses a huawei p50 is somewhere to the left of the person whose favorite color is red.
the person who loves beach vacations is not in the first house.
alice is in the

In [57]:
# Parse all clues from the actual puzzle
parsed_clues = parse_puzzle_clues(puzzle)

# Display parsed clues in a formatted table
for i, (clue, ctype, symbols, groups) in enumerate(parsed_clues, 1):
    print(f"{i}. [{ctype.upper()}] {clue}")
    print(f"   Entities: {symbols.get('entities', [])}")
    if symbols.get('position') is not None:
        position_names = ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'ninth', 'tenth']
        pos_word = position_names[symbols['position']] if symbols['position'] < len(position_names) else symbols['position']
        print(f"   Position: {pos_word} (index {symbols['position']})")
    print(f"   Groups: {groups}\n")

1. [SAME] alice is the person with a master's degree.
   Entities: ['alice', 'master']
   Groups: ('alice', "with a master's degree.")

2. [SAME] the person who uses a oneplus 9 is the person with a doctorate.
   Entities: ['doctorate', 'oneplus 9']
   Groups: ('who uses a oneplus 9', 'with a doctorate.')

3. [LEFT_OF] the person who loves blue is somewhere to the left of bob.
   Entities: ['bob', 'blue']
   Groups: ('blue', 'bob.')

4. [SAME] the person who prefers city breaks is peter.
   Entities: ['peter', 'city']
   Groups: ('who prefers city breaks', 'peter.')

5. [SAME] the person who loves beach vacations is the person whose favorite color is green.
   Entities: ['green', 'beach']
   Groups: ('beach vacations is  favorite color', 'green.')

6. [SAME] the person who loves white is the person who uses an iphone 13.
   Entities: ['white', 'iphone 13']
   Groups: ('white', 'who uses an iphone 13.')

7. [SAME] the person who uses a oneplus 9 is the person who enjoys mountain retreat