# Knowledge Graph Rule Discovery

This notebook discovers logical rules in a knowledge graph and validates them with:
- **Confidence**: Fraction of times the rule holds
- **Support**: Number of instances where the rule applies
- **Concrete examples**: Actual instances from the dataset

## Rules mined:
1) Inverese Rule
2) Horn Clause (A->B->C) => (A<->C)
3) Horn Clause but for more than 3 people involved.
4) Triangle Rules

In [24]:
import pandas as pd
from collections import defaultdict
from itertools import combinations

# Step 1: Parse the graph data
def parse_graph(file_path):
    """Parse the graph file and return a list of triples (subject, relation, object)"""
    triples = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split()
                if len(parts) == 3:
                    subject, relation, obj = parts
                    triples.append((subject, relation, obj))
    return triples

# Step 2: Build data structures for efficient querying
def build_graph_structures(triples):
    """Build data structures to store relations"""
    # Dictionary: person -> list of (relation, target_person)
    outgoing = defaultdict(list)
    # Dictionary: person -> list of (relation, source_person)
    incoming = defaultdict(list)
    # Set of all unique persons
    persons = set()
    # Set of all unique relations
    relations = set()
    
    for subject, relation, obj in triples:
        outgoing[subject].append((relation, obj))
        incoming[obj].append((relation, subject))
        persons.add(subject)
        persons.add(obj)
        relations.add(relation)
    
    return outgoing, incoming, persons, relations

# Step 3: Mine rules - find if B->A implies A->B patterns
def mine_reciprocal_rules(triples, outgoing):
    """
    For each triple (B, relation1, A), check if (A, relation2, B) exists
    Count how often each pattern occurs
    """
    rule_counts = defaultdict(int)  # (relation1, relation2) -> count
    relation_counts = defaultdict(int)  # relation1 -> total count
    
    for subject, relation1, obj in triples:
        relation_counts[relation1] += 1
        
        # Check all relations from obj back to subject
        if obj in outgoing:
            for relation2, target in outgoing[obj]:
                if target == subject:
                    rule_counts[(relation1, relation2)] += 1
    
    return rule_counts, relation_counts
# Step 4: Calculate support and confidence
def calculate_metrics(rule_counts, relation_counts, total_triples):
    """Calculate support and confidence for each rule"""
    results = []
    
    for (rel1, rel2), count in rule_counts.items():
        support = count / total_triples if total_triples > 0 else 0
        confidence = count / relation_counts[rel1] if relation_counts[rel1] > 0 else 0
        
        results.append({
            'Rule': f'If B {rel1} A, then A {rel2} B',
            'Relation1': rel1,
            'Relation2': rel2,
            'Count': count,
            'Support': support,
            'Confidence': confidence,
            'Total_Relation1': relation_counts[rel1]
        })
    
    return pd.DataFrame(results).sort_values('Count', ascending=False)

# Main execution
# Read and parse the file
file_path = 'train.txt'  # Change this to your file path
triples = parse_graph(file_path)

# Build graph structures
outgoing, incoming, persons, relations = build_graph_structures(triples)

# Mine reciprocal rules
rule_counts, relation_counts = mine_reciprocal_rules(triples, outgoing)

# Calculate metrics
total_triples = len(triples)
results_df = calculate_metrics(rule_counts, relation_counts, total_triples)

# Display results
print(f"Total triples: {total_triples}")
print(f"Total unique persons: {len(persons)}")
print(f"Total unique relations: {len(relations)}\n")
print("Reciprocal Rules Found:")
print(results_df.to_string(index=False))

# Optional: Show sample of the graph structure
print("\n\nSample of outgoing relations:")
for person in list(persons)[:5]:
    if person in outgoing:
        print(f"{person}: {outgoing[person][:3]}")

Total triples: 13821
Total unique persons: 1316
Total unique relations: 28

Reciprocal Rules Found:
                                                       Rule                    Relation1                    Relation2  Count  Support  Confidence  Total_Relation1
                  If B grandfatherOf A, then A grandsonOf B                grandfatherOf                   grandsonOf    407 0.029448    0.500615              813
                  If B grandsonOf A, then A grandfatherOf B                   grandsonOf                grandfatherOf    407 0.029448    0.500000              814
                  If B grandsonOf A, then A grandmotherOf B                   grandsonOf                grandmotherOf    407 0.029448    0.500000              814
                  If B grandmotherOf A, then A grandsonOf B                grandmotherOf                   grandsonOf    407 0.029448    0.500615              813
             If B granddaughterOf A, then A grandmotherOf B              granddaughte

# The above list is that of all the inverse relations found in the graph.
Ideally for inverse relations the confidence should be way higher at 90%. However most have a confidence of about 50%. In the context of a family knowledge graph we can conclude that this is due to the graph being sparse and that lot of the entries are simply missing.
Ideally the support should be more than 5% but most relations have a support of arround 2%.

All of this points to the conclusion that the graph is not properly populated and several edges in this graph are missing.

## The Horn Clause

In [25]:
import pandas as pd
from collections import defaultdict
from itertools import product

# Step 1: Parse the graph data (same as before)
def parse_graph(file_path):
    """Parse the graph file and return a list of triples (subject, relation, object)"""
    triples = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split()
                if len(parts) == 3:
                    subject, relation, obj = parts
                    triples.append((subject, relation, obj))
    return triples

# Step 2: Build data structures (same as before)
def build_graph_structures(triples):
    """Build data structures to store relations"""
    outgoing = defaultdict(list)
    incoming = defaultdict(list)
    persons = set()
    relations = set()
    
    for subject, relation, obj in triples:
        outgoing[subject].append((relation, obj))
        incoming[obj].append((relation, subject))
        persons.add(subject)
        persons.add(obj)
        relations.add(relation)
    
    return outgoing, incoming, persons, relations

# Step 3: Mine Horn clauses
def mine_horn_clauses(triples, outgoing):
    """
    Mine rules of the form: relation1(X,Y) ∧ relation2(Y,Z) → relation3(X,Z)
    
    For each pair of triples:
    - (X, relation1, Y) and (Y, relation2, Z)
    Check if there exists (X, relation3, Z)
    """
    # Build index for quick lookup: (X, Z) -> set of relations
    xz_relations = defaultdict(set)
    for subject, relation, obj in triples:
        xz_relations[(subject, obj)].add(relation)
    
    # Count potential body patterns and successful head patterns
    body_counts = defaultdict(int)  # (relation1, relation2) -> count of X,Y,Z satisfying body
    rule_counts = defaultdict(int)  # (relation1, relation2, relation3) -> count of full rule satisfaction
    
    # For each triple (X, relation1, Y)
    for x, relation1, y in triples:
        # Check if Y has outgoing edges (Y, relation2, Z)
        if y in outgoing:
            for relation2, z in outgoing[y]:
                # We have: relation1(X, Y) ∧ relation2(Y, Z)
                body_counts[(relation1, relation2)] += 1
                
                # Check if relation3(X, Z) exists for any relation3
                if (x, z) in xz_relations:
                    for relation3 in xz_relations[(x, z)]:
                        # We found: relation1(X,Y) ∧ relation2(Y,Z) → relation3(X,Z)
                        rule_counts[(relation1, relation2, relation3)] += 1
    
    return rule_counts, body_counts

# Step 4: Calculate metrics for Horn clauses
def calculate_horn_metrics(rule_counts, body_counts, total_triples):
    """Calculate support and confidence for each Horn clause"""
    results = []
    
    for (rel1, rel2, rel3), count in rule_counts.items():
        body_count = body_counts[(rel1, rel2)]
        
        # Support: proportion of all triples
        support = count / total_triples if total_triples > 0 else 0
        
        # Confidence: proportion of body occurrences that lead to head
        confidence = count / body_count if body_count > 0 else 0
        
        results.append({
            'Rule': f'{rel1}(X,Y) ∧ {rel2}(Y,Z) → {rel3}(X,Z)',
            'Relation1': rel1,
            'Relation2': rel2,
            'Relation3': rel3,
            'Count': count,
            'Body_Count': body_count,
            'Support': support,
            'Confidence': confidence
        })
    
    return pd.DataFrame(results).sort_values('Count', ascending=False)

# Step 5: Find examples for a specific rule
def find_rule_examples(triples, outgoing, rel1, rel2, rel3, max_examples=5):
    """Find concrete examples of a specific Horn clause"""
    examples = []
    xz_relations = defaultdict(set)
    
    for subject, relation, obj in triples:
        xz_relations[(subject, obj)].add(relation)
    
    for x, relation1, y in triples:
        if relation1 == rel1 and y in outgoing:
            for relation2, z in outgoing[y]:
                if relation2 == rel2:
                    if (x, z) in xz_relations and rel3 in xz_relations[(x, z)]:
                        examples.append({
                            'X': x,
                            'Y': y,
                            'Z': z,
                            f'{rel1}(X,Y)': True,
                            f'{rel2}(Y,Z)': True,
                            f'{rel3}(X,Z)': True
                        })
                        if len(examples) >= max_examples:
                            return pd.DataFrame(examples)
    
    return pd.DataFrame(examples)

# Main execution
# Read and parse the file
file_path = 'train.txt'  # Change this to your file path
triples = parse_graph(file_path)

# Build graph structures
outgoing, incoming, persons, relations = build_graph_structures(triples)

# Mine Horn clauses
rule_counts, body_counts = mine_horn_clauses(triples, outgoing)

# Calculate metrics
total_triples = len(triples)
horn_results_df = calculate_horn_metrics(rule_counts, body_counts, total_triples)

# Display results
print(f"Total triples: {total_triples}")
print(f"Total unique persons: {len(persons)}")
print(f"Total unique relations: {len(relations)}\n")
print("Horn Clause Rules Found:")
print(horn_results_df.to_string(index=False))

# Optional: Show examples for top rules
if len(horn_results_df) > 0:
    print("\n" + "="*80)
    print("Examples for top 3 rules:")
    print("="*80)
    
    for idx, row in horn_results_df.head(3).iterrows():
        print(f"\nRule: {row['Rule']}")
        print(f"Confidence: {row['Confidence']:.2%}, Count: {row['Count']}")
        examples = find_rule_examples(triples, outgoing, 
                                      row['Relation1'], 
                                      row['Relation2'], 
                                      row['Relation3'], 
                                      max_examples=3)
        if not examples.empty:
            print(examples.to_string(index=False))
        print("-"*80)

# **What this code does:**

# 1. **Mines Horn clauses** of the form `relation1(X,Y) ∧ relation2(Y,Z) → relation3(X,Z)`
# 2. **Calculates metrics:**
#    - **Count**: How many times the full rule holds
#    - **Body_Count**: How many times the body (premise) occurs
#    - **Confidence**: What percentage of body occurrences lead to the head (conclusion)
#    - **Support**: Overall frequency in the dataset

# 3. **Shows concrete examples** for the top rules found

# **Example output you might see:**
# ```
# Rule: motherOf(X,Y) ∧ motherOf(Y,Z) → grandmotherOf(X,Z)
# Confidence: 95%, Count: 42

Total triples: 13821
Total unique persons: 1316
Total unique relations: 28

Horn Clause Rules Found:
                                                                                           Rule                    Relation1                    Relation2                    Relation3  Count  Body_Count  Support  Confidence
                                    sisterOf(X,Y) ∧ granddaughterOf(Y,Z) → granddaughterOf(X,Z)                     sisterOf              granddaughterOf              granddaughterOf    772         772 0.055857    1.000000
                                      granddaughterOf(X,Y) ∧ grandmotherOf(Y,Z) → sisterOf(X,Z)              granddaughterOf                grandmotherOf                     sisterOf    747        1598 0.054048    0.467459
                                        grandfatherOf(X,Y) ∧ sisterOf(Y,Z) → grandfatherOf(X,Z)                grandfatherOf                     sisterOf                grandfatherOf    747         747 0.054048    1.000000
       

In [28]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

# Step 1: Parse the graph data
def parse_graph(file_path):
    """Parse the graph file and return a list of triples (subject, relation, object)"""
    triples = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split()
                if len(parts) == 3:
                    subject, relation, obj = parts
                    triples.append((subject, relation, obj))
    return triples

# Step 2: Build optimized data structures
def build_graph_structures(triples):
    """Build optimized data structures for fast querying"""
    # outgoing[node] = list of (relation, target) pairs
    outgoing = defaultdict(list)
    # relation_index[(relation, source)] = list of targets
    relation_index = defaultdict(list)
    # pair_to_relations[(source, target)] = set of relations
    pair_to_relations = defaultdict(set)
    
    persons = set()
    relations = set()
    
    for subject, relation, obj in triples:
        outgoing[subject].append((relation, obj))
        relation_index[(relation, subject)].append(obj)
        pair_to_relations[(subject, obj)].add(relation)
        persons.add(subject)
        persons.add(obj)
        relations.add(relation)
    
    return outgoing, relation_index, pair_to_relations, persons, relations

# Step 3: Optimized mining for length 2
def mine_length_2_optimized(triples, outgoing, pair_to_relations, min_support=2):
    """
    Optimized: relation1(X,Y) ∧ relation2(Y,Z) → relation3(X,Z)
    """
    body_counts = defaultdict(int)
    rule_counts = defaultdict(int)
    
    # Group triples by middle node Y for efficient joining
    for x, rel1, y in triples:
        if y not in outgoing:
            continue
            
        for rel2, z in outgoing[y]:
            body_counts[(rel1, rel2)] += 1
            
            # Check if (x, z) has any relations
            if (x, z) in pair_to_relations:
                for rel3 in pair_to_relations[(x, z)]:
                    rule_counts[(rel1, rel2, rel3)] += 1
    
    # Filter by minimum support
    rule_counts = {k: v for k, v in rule_counts.items() if v >= min_support}
    
    return rule_counts, body_counts

# Step 4: Optimized mining for length 3
def mine_length_3_optimized(triples, outgoing, pair_to_relations, min_support=2):
    """
    Optimized: relation1(X,Y) ∧ relation2(Y,Z) ∧ relation3(Z,W) → relation4(X,W)
    """
    body_counts = defaultdict(int)
    rule_counts = defaultdict(int)
    
    # Build 2-hop paths first: (X, rel1, rel2) -> list of (Y, Z) pairs
    two_hop_paths = defaultdict(list)
    
    for x, rel1, y in triples:
        if y in outgoing:
            for rel2, z in outgoing[y]:
                two_hop_paths[(x, rel1, rel2)].append((y, z))
    
    # Extend to 3-hop
    for (x, rel1, rel2), yz_pairs in two_hop_paths.items():
        for y, z in yz_pairs:
            if z not in outgoing:
                continue
                
            for rel3, w in outgoing[z]:
                body_counts[(rel1, rel2, rel3)] += 1
                
                if (x, w) in pair_to_relations:
                    for rel4 in pair_to_relations[(x, w)]:
                        rule_counts[(rel1, rel2, rel3, rel4)] += 1
    
    rule_counts = {k: v for k, v in rule_counts.items() if v >= min_support}
    
    return rule_counts, body_counts

# Step 5: Optimized mining for length 4
def mine_length_4_optimized(triples, outgoing, pair_to_relations, min_support=2):
    """
    Optimized: relation1(X,Y) ∧ relation2(Y,Z) ∧ relation3(Z,W) ∧ relation4(W,V) → relation5(X,V)
    """
    body_counts = defaultdict(int)
    rule_counts = defaultdict(int)
    
    # Build 3-hop paths first
    three_hop_paths = defaultdict(list)
    
    for x, rel1, y in triples:
        if y not in outgoing:
            continue
        for rel2, z in outgoing[y]:
            if z not in outgoing:
                continue
            for rel3, w in outgoing[z]:
                three_hop_paths[(x, rel1, rel2, rel3)].append(w)
    
    # Extend to 4-hop
    for (x, rel1, rel2, rel3), w_list in three_hop_paths.items():
        for w in w_list:
            if w not in outgoing:
                continue
                
            for rel4, v in outgoing[w]:
                body_counts[(rel1, rel2, rel3, rel4)] += 1
                
                if (x, v) in pair_to_relations:
                    for rel5 in pair_to_relations[(x, v)]:
                        rule_counts[(rel1, rel2, rel3, rel4, rel5)] += 1
    
    rule_counts = {k: v for k, v in rule_counts.items() if v >= min_support}
    
    return rule_counts, body_counts

# Step 6: Calculate metrics
def calculate_metrics(rule_counts, body_counts, total_triples, chain_length):
    """Calculate support and confidence"""
    results = []
    
    for rule_tuple, count in rule_counts.items():
        body_tuple = rule_tuple[:-1]
        head_relation = rule_tuple[-1]
        body_count = body_counts.get(body_tuple, 0)
        
        if body_count == 0:
            continue
        
        support = count / total_triples
        confidence = count / body_count
        
        # Build rule string
        variables = ['X', 'Y', 'Z', 'W', 'V', 'U'][:chain_length + 1]
        body_parts = [f'{rel}({variables[i]},{variables[i+1]})' 
                      for i, rel in enumerate(body_tuple)]
        body_str = ' ∧ '.join(body_parts)
        head_str = f'{head_relation}({variables[0]},{variables[chain_length]})'
        rule_str = f'{body_str} → {head_str}'
        
        results.append({
            'Rule': rule_str,
            'Body_Relations': ' -> '.join(body_tuple),
            'Head_Relation': head_relation,
            'Count': count,
            'Body_Count': body_count,
            'Support': support,
            'Confidence': confidence,
            'Chain_Length': chain_length
        })
    
    return pd.DataFrame(results).sort_values(['Confidence', 'Count'], ascending=[False, False])

# Step 7: Find examples (optimized)
def find_examples_fast(triples, outgoing, rule_tuple, max_examples=3):
    """Fast example finding"""
    body_rels = rule_tuple[:-1]
    head_rel = rule_tuple[-1]
    chain_length = len(body_rels)
    
    examples = []
    variables = ['X', 'Y', 'Z', 'W', 'V', 'U'][:chain_length + 1]
    
    # Build pair_to_relations for quick lookup
    pair_to_relations = defaultdict(set)
    for s, r, o in triples:
        pair_to_relations[(s, o)].add(r)
    
    for x, rel1, y in triples:
        if rel1 != body_rels[0]:
            continue
        
        # Try to build the chain
        current_nodes = [y]
        chain_nodes = [x, y]
        
        valid = True
        for i in range(1, chain_length):
            found = False
            for node in current_nodes:
                if node in outgoing:
                    for rel, next_node in outgoing[node]:
                        if rel == body_rels[i]:
                            chain_nodes.append(next_node)
                            current_nodes = [next_node]
                            found = True
                            break
                if found:
                    break
            if not found:
                valid = False
                break
        
        if not valid or len(chain_nodes) != chain_length + 1:
            continue
        
        # Check if head holds
        if (x, chain_nodes[-1]) in pair_to_relations and head_rel in pair_to_relations[(x, chain_nodes[-1])]:
            example = {}
            for i, var in enumerate(variables):
                example[var] = chain_nodes[i]
            
            for i, rel in enumerate(body_rels):
                example[f'{rel}({variables[i]},{variables[i+1]})'] = '✓'
            example[f'{head_rel}({variables[0]},{variables[chain_length]})'] = '✓'
            
            examples.append(example)
            
            if len(examples) >= max_examples:
                break
    
    return pd.DataFrame(examples)

# Main execution
print("Loading graph...")
file_path = 'train.txt'  # Change this to your file path
triples = parse_graph(file_path)

print("Building indexes...")
outgoing, relation_index, pair_to_relations, persons, relations = build_graph_structures(triples)

total_triples = len(triples)
print(f"\nGraph Statistics:")
print(f"  Total triples: {total_triples}")
print(f"  Unique persons: {len(persons)}")
print(f"  Unique relations: {len(relations)}")

# Set minimum support threshold
MIN_SUPPORT = 2
MIN_CONFIDENCE = 0.1  # 10%

# Mine different chain lengths
for chain_len, mine_func in [(2, mine_length_2_optimized), 
                              (3, mine_length_3_optimized),
                              (4, mine_length_4_optimized)]:
    
    print(f"\n{'='*100}")
    print(f"Mining Chain Length {chain_len} (min_support={MIN_SUPPORT})...")
    print(f"{'='*100}")
    
    rule_counts, body_counts = mine_func(triples, outgoing, pair_to_relations, MIN_SUPPORT)
    
    if not rule_counts:
        print(f"No rules found with minimum support of {MIN_SUPPORT}")
        continue
    
    results_df = calculate_metrics(rule_counts, body_counts, total_triples, chain_len)
    
    # Filter by confidence
    results_df = results_df[results_df['Confidence'] >= MIN_CONFIDENCE]
    
    print(f"\nFound {len(results_df)} rules")
    print(f"\nTop 10 rules:")
    print(results_df.head(10)[['Rule', 'Count', 'Body_Count', 'Confidence']].to_string(index=False))
    
    # Save to CSV
    csv_filename = f'horn_rules_length_{chain_len}.csv'
    results_df.to_csv(csv_filename, index=False)
    print(f"\nSaved to {csv_filename}")
    
    # Show examples for top 3
    print(f"\nTop 3 Rule Examples:")
    print("-" * 100)
    for idx, row in results_df.head(3).iterrows():
        print(f"\n{row['Rule']}")
        print(f"Confidence: {row['Confidence']:.1%} | Count: {row['Count']} | Body: {row['Body_Count']}")
        
        body_rels = tuple(row['Body_Relations'].split(' -> '))
        rule_tuple = body_rels + (row['Head_Relation'],)
        
        examples = find_examples_fast(triples, outgoing, rule_tuple, max_examples=2)
        if not examples.empty:
            print(examples.to_string(index=False))
        print("-" * 100)

print("\n" + "="*100)
print("Mining complete!")
print("="*100)

Loading graph...
Building indexes...

Graph Statistics:
  Total triples: 13821
  Unique persons: 1316
  Unique relations: 28

Mining Chain Length 2 (min_support=2)...

Found 614 rules

Top 10 rules:
                                                       Rule  Count  Body_Count  Confidence
sisterOf(X,Y) ∧ granddaughterOf(Y,Z) → granddaughterOf(X,Z)    772         772         1.0
    grandmotherOf(X,Y) ∧ sisterOf(Y,Z) → grandmotherOf(X,Z)    747         747         1.0
    grandfatherOf(X,Y) ∧ sisterOf(Y,Z) → grandfatherOf(X,Z)    747         747         1.0
     sisterOf(X,Y) ∧ grandsonOf(Y,Z) → granddaughterOf(X,Z)    722         722         1.0
    brotherOf(X,Y) ∧ granddaughterOf(Y,Z) → grandsonOf(X,Z)    722         722         1.0
   grandmotherOf(X,Y) ∧ brotherOf(Y,Z) → grandmotherOf(X,Z)    673         673         1.0
   grandfatherOf(X,Y) ∧ brotherOf(Y,Z) → grandfatherOf(X,Z)    673         673         1.0
         brotherOf(X,Y) ∧ grandsonOf(Y,Z) → grandsonOf(X,Z)    624       

## Mining the Triangle RuleS (A<-B->C)=>(A<->C) and (A->B<-C)=>(A<->C)

In [32]:
import pandas as pd
from collections import defaultdict

# Step 1: Parse the graph data
def parse_graph(file_path):
    """Parse the graph file and return a list of triples (subject, relation, object)"""
    triples = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split()
                if len(parts) == 3:
                    subject, relation, obj = parts
                    triples.append((subject, relation, obj))
    return triples

# Step 2: Build data structures
def build_graph_structures(triples):
    """Build data structures for efficient querying"""
    # outgoing[node] = list of (relation, target) pairs
    outgoing = defaultdict(list)
    # incoming[node] = list of (relation, source) pairs  
    incoming = defaultdict(list)
    # pair_to_relations[(source, target)] = set of relations
    pair_to_relations = defaultdict(set)
    
    persons = set()
    relations = set()
    
    for subject, relation, obj in triples:
        outgoing[subject].append((relation, obj))
        incoming[obj].append((relation, subject))
        pair_to_relations[(subject, obj)].add(relation)
        persons.add(subject)
        persons.add(obj)
        relations.add(relation)
    
    return outgoing, incoming, pair_to_relations, persons, relations

# Step 3: Mine Triangle Closure Type 1: A←B→C ⇒ A↔C
def mine_triangle_closure_type1(triples, outgoing, incoming, pair_to_relations, min_support=2):
    """
    Mine rules: relation1(B,A) ∧ relation2(B,C) → relation3(A,C)
    Triangle closure pattern: A←B→C ⇒ A↔C
    
    This means:
    - If B has relation1 to A (B→A)
    - And B has relation2 to C (B→C)  
    - Then A should have relation3 to C (A→C or A↔C)
    """
    
    # Counters
    body_counts = defaultdict(int)  # (rel1, rel2) -> count
    rule_counts = defaultdict(int)  # (rel1, rel2, rel3) -> count
    bidirectional_body_counts = defaultdict(int)  # (rel1, rel2) -> count where A↔C exists
    bidirectional_rule_counts = defaultdict(int)  # (rel1, rel2, rel3, rel4) -> count for A→C and C→A
    
    # For each node B (the central node in the triangle)
    for b in outgoing:
        # Get all nodes that B points to
        b_outgoing = outgoing[b]
        
        # For each pair of nodes that B points to
        for i, (rel1, a) in enumerate(b_outgoing):
            for j, (rel2, c) in enumerate(b_outgoing):
                if i >= j or a == c:  # Skip same pairs and avoid duplicates
                    continue
                
                # We have: B→A (rel1) and B→C (rel2)
                # This is the pattern: A←B→C
                body_counts[(rel1, rel2)] += 1
                
                # Check if A→C exists (any relation)
                if (a, c) in pair_to_relations:
                    for rel3 in pair_to_relations[(a, c)]:
                        rule_counts[(rel1, rel2, rel3)] += 1
                
                # Check if C→A exists (any relation)
                if (c, a) in pair_to_relations:
                    for rel3 in pair_to_relations[(c, a)]:
                        rule_counts[(rel2, rel1, rel3)] += 1  # Note: flipped order
                
                # Check for bidirectional closure: A↔C (both A→C and C→A exist)
                if (a, c) in pair_to_relations and (c, a) in pair_to_relations:
                    bidirectional_body_counts[(rel1, rel2)] += 1
                    
                    for rel_ac in pair_to_relations[(a, c)]:
                        for rel_ca in pair_to_relations[(c, a)]:
                            bidirectional_rule_counts[(rel1, rel2, rel_ac, rel_ca)] += 1
    
    # Filter by minimum support
    rule_counts = {k: v for k, v in rule_counts.items() if v >= min_support}
    bidirectional_rule_counts = {k: v for k, v in bidirectional_rule_counts.items() if v >= min_support}
    
    return rule_counts, body_counts, bidirectional_rule_counts, bidirectional_body_counts

# Step 4: Mine Triangle Closure Type 2: A→B←C ⇒ A↔C
def mine_triangle_closure_type2(triples, outgoing, incoming, pair_to_relations, min_support=2):
    """
    Mine rules: relation1(A,B) ∧ relation2(C,B) → relation3(A,C)
    Triangle closure pattern: A→B←C ⇒ A↔C
    
    This means:
    - If A has relation1 to B (A→B)
    - And C has relation2 to B (C→B)  
    - Then A should have relation3 to C (A→C or A↔C)
    """
    
    # Counters
    body_counts = defaultdict(int)  # (rel1, rel2) -> count
    rule_counts = defaultdict(int)  # (rel1, rel2, rel3) -> count
    bidirectional_body_counts = defaultdict(int)  # (rel1, rel2) -> count where A↔C exists
    bidirectional_rule_counts = defaultdict(int)  # (rel1, rel2, rel3, rel4) -> count for A→C and C→A
    
    # For each node B (the central node in the triangle)
    for b in incoming:
        # Get all nodes that point to B
        b_incoming = incoming[b]
        
        # For each pair of nodes that point to B
        for i, (rel1, a) in enumerate(b_incoming):
            for j, (rel2, c) in enumerate(b_incoming):
                if i >= j or a == c:  # Skip same pairs and avoid duplicates
                    continue
                
                # We have: A→B (rel1) and C→B (rel2)
                # This is the pattern: A→B←C
                body_counts[(rel1, rel2)] += 1
                
                # Check if A→C exists (any relation)
                if (a, c) in pair_to_relations:
                    for rel3 in pair_to_relations[(a, c)]:
                        rule_counts[(rel1, rel2, rel3)] += 1
                
                # Check if C→A exists (any relation)
                if (c, a) in pair_to_relations:
                    for rel3 in pair_to_relations[(c, a)]:
                        rule_counts[(rel2, rel1, rel3)] += 1  # Note: flipped order
                
                # Check for bidirectional closure: A↔C (both A→C and C→A exist)
                if (a, c) in pair_to_relations and (c, a) in pair_to_relations:
                    bidirectional_body_counts[(rel1, rel2)] += 1
                    
                    for rel_ac in pair_to_relations[(a, c)]:
                        for rel_ca in pair_to_relations[(c, a)]:
                            bidirectional_rule_counts[(rel1, rel2, rel_ac, rel_ca)] += 1
    
    # Filter by minimum support
    rule_counts = {k: v for k, v in rule_counts.items() if v >= min_support}
    bidirectional_rule_counts = {k: v for k, v in bidirectional_rule_counts.items() if v >= min_support}
    
    return rule_counts, body_counts, bidirectional_rule_counts, bidirectional_body_counts

# Step 5: Calculate metrics for triangle closure
def calculate_triangle_metrics(rule_counts, body_counts, total_triples, pattern_type):
    """Calculate metrics for triangle closure rules"""
    results = []
    
    for (rel1, rel2, rel3), count in rule_counts.items():
        body_count = body_counts.get((rel1, rel2), 0)
        
        if body_count == 0:
            continue
        
        support = count / total_triples
        confidence = count / body_count
        
        # Build rule string based on pattern type
        if pattern_type == "type1":
            rule_str = f'{rel1}(B,A) ∧ {rel2}(B,C) → {rel3}(A,C)'
            pattern_str = f'A←[{rel1}]B→[{rel2}]C ⇒ A→[{rel3}]C'
        else:  # type2
            rule_str = f'{rel1}(A,B) ∧ {rel2}(C,B) → {rel3}(A,C)'
            pattern_str = f'A→[{rel1}]B←[{rel2}]C ⇒ A→[{rel3}]C'
        
        results.append({
            'Rule': rule_str,
            'Pattern': pattern_str,
            'Relation1': rel1,
            'Relation2': rel2,
            'Relation3': rel3,
            'Count': count,
            'Body_Count': body_count,
            'Support': support,
            'Confidence': confidence
        })
    
    return pd.DataFrame(results).sort_values(['Confidence', 'Count'], ascending=[False, False])

# Step 6: Calculate metrics for bidirectional closure
def calculate_bidirectional_metrics(bidirectional_rule_counts, bidirectional_body_counts, total_triples, pattern_type):
    """Calculate metrics for bidirectional triangle closure A↔C"""
    results = []
    
    for (rel1, rel2, rel_ac, rel_ca), count in bidirectional_rule_counts.items():
        body_count = bidirectional_body_counts.get((rel1, rel2), 0)
        
        if body_count == 0:
            continue
        
        support = count / total_triples
        confidence = count / body_count
        
        # Build rule string based on pattern type
        if pattern_type == "type1":
            rule_str = f'{rel1}(B,A) ∧ {rel2}(B,C) → {rel_ac}(A,C) ∧ {rel_ca}(C,A)'
            pattern_str = f'A←[{rel1}]B→[{rel2}]C ⇒ A↔[{rel_ac},{rel_ca}]C'
        else:  # type2
            rule_str = f'{rel1}(A,B) ∧ {rel2}(C,B) → {rel_ac}(A,C) ∧ {rel_ca}(C,A)'
            pattern_str = f'A→[{rel1}]B←[{rel2}]C ⇒ A↔[{rel_ac},{rel_ca}]C'
        
        results.append({
            'Rule': rule_str,
            'Pattern': pattern_str,
            'Relation1': rel1,
            'Relation2': rel2,
            'Rel_A_to_C': rel_ac,
            'Rel_C_to_A': rel_ca,
            'Count': count,
            'Body_Count': body_count,
            'Support': support,
            'Confidence': confidence
        })
    
    return pd.DataFrame(results).sort_values(['Confidence', 'Count'], ascending=[False, False])

# Step 7: Find examples for Type 1
def find_triangle_examples_type1(triples, outgoing, pair_to_relations, rel1, rel2, rel3, max_examples=5):
    """Find concrete examples of triangle closure Type 1: A←B→C"""
    examples = []
    
    # For each node B
    for b in outgoing:
        b_outgoing = outgoing[b]
        
        # Find all A where B→A with rel1
        a_nodes = [(a, r) for r, a in b_outgoing if r == rel1]
        # Find all C where B→C with rel2
        c_nodes = [(c, r) for r, c in b_outgoing if r == rel2]
        
        for a, _ in a_nodes:
            for c, _ in c_nodes:
                if a == c:
                    continue
                
                # Check if A→C with rel3
                if (a, c) in pair_to_relations and rel3 in pair_to_relations[(a, c)]:
                    examples.append({
                        'B': b,
                        'A': a,
                        'C': c,
                        f'{rel1}(B,A)': '✓',
                        f'{rel2}(B,C)': '✓',
                        f'{rel3}(A,C)': '✓'
                    })
                    
                    if len(examples) >= max_examples:
                        return pd.DataFrame(examples)
    
    return pd.DataFrame(examples)

# Step 8: Find examples for Type 2
def find_triangle_examples_type2(triples, incoming, pair_to_relations, rel1, rel2, rel3, max_examples=5):
    """Find concrete examples of triangle closure Type 2: A→B←C"""
    examples = []
    
    # For each node B
    for b in incoming:
        b_incoming = incoming[b]
        
        # Find all A where A→B with rel1
        a_nodes = [(a, r) for r, a in b_incoming if r == rel1]
        # Find all C where C→B with rel2
        c_nodes = [(c, r) for r, c in b_incoming if r == rel2]
        
        for a, _ in a_nodes:
            for c, _ in c_nodes:
                if a == c:
                    continue
                
                # Check if A→C with rel3
                if (a, c) in pair_to_relations and rel3 in pair_to_relations[(a, c)]:
                    examples.append({
                        'B': b,
                        'A': a,
                        'C': c,
                        f'{rel1}(A,B)': '✓',
                        f'{rel2}(C,B)': '✓',
                        f'{rel3}(A,C)': '✓'
                    })
                    
                    if len(examples) >= max_examples:
                        return pd.DataFrame(examples)
    
    return pd.DataFrame(examples)

# Main execution
print("Loading graph...")
file_path = 'train.txt'  # Change this to your file path
triples = parse_graph(file_path)

print("Building indexes...")
outgoing, incoming, pair_to_relations, persons, relations = build_graph_structures(triples)

total_triples = len(triples)
print(f"\nGraph Statistics:")
print(f"  Total triples: {total_triples}")
print(f"  Unique persons: {len(persons)}")
print(f"  Unique relations: {len(relations)}")

# Set parameters
MIN_SUPPORT = 2
MIN_CONFIDENCE = 0.1

print(f"\n{'='*100}")
print(f"Mining Triangle Closure Rules")
print(f"{'='*100}")

# ============================================================================
# TYPE 1: A←B→C ⇒ A↔C
# ============================================================================
print(f"\n{'#'*100}")
print(f"# TYPE 1: A←B→C ⇒ A↔C (B points to both A and C)")
print(f"{'#'*100}")

rule_counts_t1, body_counts_t1, bidirectional_rule_counts_t1, bidirectional_body_counts_t1 = mine_triangle_closure_type1(
    triples, outgoing, incoming, pair_to_relations, MIN_SUPPORT
)

# Type 1 - Unidirectional
print(f"\n--- Type 1 Unidirectional: A←B→C ⇒ A→C ---\n")
if rule_counts_t1:
    results_t1_df = calculate_triangle_metrics(rule_counts_t1, body_counts_t1, total_triples, "type1")
    results_t1_df = results_t1_df[results_t1_df['Confidence'] >= MIN_CONFIDENCE]
    
    print(f"Found {len(results_t1_df)} rules\n")
    print("Top 15 rules:")
    print(results_t1_df.head(15)[['Pattern', 'Count', 'Body_Count', 'Confidence']].to_string(index=False))
    
    results_t1_df.to_csv('triangle_closure_type1_unidirectional.csv', index=False)
    print(f"\nSaved to triangle_closure_type1_unidirectional.csv")
    
    # Examples
    print(f"\n{'='*100}")
    print("Examples for Top 3 Rules:")
    print(f"{'='*100}")
    for idx, row in results_t1_df.head(3).iterrows():
        print(f"\n{row['Pattern']}")
        print(f"Confidence: {row['Confidence']:.1%} | Count: {row['Count']} | Body: {row['Body_Count']}")
        
        examples = find_triangle_examples_type1(
            triples, outgoing, pair_to_relations,
            row['Relation1'], row['Relation2'], row['Relation3'],
            max_examples=3
        )
        if not examples.empty:
            print(examples.to_string(index=False))
        print("-" * 100)
else:
    print("No unidirectional rules found")

# Type 1 - Bidirectional
print(f"\n--- Type 1 Bidirectional: A←B→C ⇒ A↔C ---\n")
if bidirectional_rule_counts_t1:
    bidirectional_t1_df = calculate_bidirectional_metrics(
        bidirectional_rule_counts_t1, bidirectional_body_counts_t1, total_triples, "type1"
    )
    bidirectional_t1_df = bidirectional_t1_df[bidirectional_t1_df['Confidence'] >= MIN_CONFIDENCE]
    
    print(f"Found {len(bidirectional_t1_df)} bidirectional rules\n")
    print("Top 10 rules:")
    print(bidirectional_t1_df.head(10)[['Pattern', 'Count', 'Body_Count', 'Confidence']].to_string(index=False))
    
    bidirectional_t1_df.to_csv('triangle_closure_type1_bidirectional.csv', index=False)
    print(f"\nSaved to triangle_closure_type1_bidirectional.csv")
else:
    print("No bidirectional rules found")

# ============================================================================
# TYPE 2: A→B←C ⇒ A↔C
# ============================================================================
print(f"\n\n{'#'*100}")
print(f"# TYPE 2: A→B←C ⇒ A↔C (Both A and C point to B)")
print(f"{'#'*100}")

rule_counts_t2, body_counts_t2, bidirectional_rule_counts_t2, bidirectional_body_counts_t2 = mine_triangle_closure_type2(
    triples, outgoing, incoming, pair_to_relations, MIN_SUPPORT
)

# Type 2 - Unidirectional
print(f"\n--- Type 2 Unidirectional: A→B←C ⇒ A→C ---\n")
if rule_counts_t2:
    results_t2_df = calculate_triangle_metrics(rule_counts_t2, body_counts_t2, total_triples, "type2")
    results_t2_df = results_t2_df[results_t2_df['Confidence'] >= MIN_CONFIDENCE]
    
    print(f"Found {len(results_t2_df)} rules\n")
    print("Top 15 rules:")
    print(results_t2_df.head(15)[['Pattern', 'Count', 'Body_Count', 'Confidence']].to_string(index=False))
    
    results_t2_df.to_csv('triangle_closure_type2_unidirectional.csv', index=False)
    print(f"\nSaved to triangle_closure_type2_unidirectional.csv")
    
    # Examples
    print(f"\n{'='*100}")
    print("Examples for Top 3 Rules:")
    print(f"{'='*100}")
    for idx, row in results_t2_df.head(3).iterrows():
        print(f"\n{row['Pattern']}")
        print(f"Confidence: {row['Confidence']:.1%} | Count: {row['Count']} | Body: {row['Body_Count']}")
        
        examples = find_triangle_examples_type2(
            triples, incoming, pair_to_relations,
            row['Relation1'], row['Relation2'], row['Relation3'],
            max_examples=3
        )
        if not examples.empty:
            print(examples.to_string(index=False))
        print("-" * 100)
else:
    print("No unidirectional rules found")

# Type 2 - Bidirectional
print(f"\n--- Type 2 Bidirectional: A→B←C ⇒ A↔C ---\n")
if bidirectional_rule_counts_t2:
    bidirectional_t2_df = calculate_bidirectional_metrics(
        bidirectional_rule_counts_t2, bidirectional_body_counts_t2, total_triples, "type2"
    )
    bidirectional_t2_df = bidirectional_t2_df[bidirectional_t2_df['Confidence'] >= MIN_CONFIDENCE]
    
    print(f"Found {len(bidirectional_t2_df)} bidirectional rules\n")
    print("Top 10 rules:")
    print(bidirectional_t2_df.head(10)[['Pattern', 'Count', 'Body_Count', 'Confidence']].to_string(index=False))
    
    bidirectional_t2_df.to_csv('triangle_closure_type2_bidirectional.csv', index=False)
    print(f"\nSaved to triangle_closure_type2_bidirectional.csv")
else:
    print("No bidirectional rules found")

print(f"\n{'='*100}")
print("Triangle Closure Mining Complete!")
print(f"{'='*100}")

Loading graph...
Building indexes...

Graph Statistics:
  Total triples: 13821
  Unique persons: 1316
  Unique relations: 28

Mining Triangle Closure Rules

####################################################################################################
# TYPE 1: A←B→C ⇒ A↔C (B points to both A and C)
####################################################################################################

--- Type 1 Unidirectional: A←B→C ⇒ A→C ---

Found 333 rules

Top 15 rules:
                                                                        Pattern  Count  Body_Count  Confidence
                                    A←[brotherOf]B→[brotherOf]C ⇒ A→[sisterOf]C    482         431    1.118329
                                      A←[nephewOf]B→[nephewOf]C ⇒ A→[sisterOf]C    440         403    1.091811
                                      A←[sisterOf]B→[sisterOf]C ⇒ A→[sisterOf]C    540         511    1.056751
                                     A←[motherOf]B→[motherOf]C ⇒ A→[brot