# Create the simulation code

In [None]:
# get the frequency percentage of the group type
import pandas as pd
import pandas as pd
import math

# Dataset tag; it is interpolated into both input paths below.
file_name = '2020'

# Training event log. The functions in this notebook read the columns
# `case_id`, `group_id` and `group_role_id` from it.
file_path = f'/Users/qingtan/Desktop/role_mining_code/sample_new_events_{file_name}_train.csv'
# Test event log. Only its number of distinct `case_id` values is used.
test_file_path = f'/Users/qingtan/Desktop/AgentSimulator-main/simulated_data/bpic_{file_name}_gt/autonomous/test_preprocessed.csv'

# only use the test file to get the number of cases it has
def get_test_case_num(test_file_path):
    """Return how many distinct ``case_id`` values the CSV at *test_file_path* holds."""
    case_ids = pd.read_csv(test_file_path)['case_id']
    return len(case_ids.unique())
    

def get_group_type_freq_per(file_path):
    """
    Derive group types from the event log at *file_path* and their case shares.

    A group type is the set of distinct ``group_role_id`` values seen in a
    group; groups with the same role set share a type. Types are numbered
    from 1 via pandas category codes of the role-set tuples.

    Returns:
        dict: ``{'GT<k>': (role_tuple, proportion)}`` where ``proportion`` is
        the type's number of distinct cases divided by the sum of those
        counts over all types, rounded to 3 decimals.
    """
    events = pd.read_csv(file_path)

    # One row per group, carrying the sorted tuple of its distinct roles.
    roles_per_group = events.groupby('group_id')['group_role_id'].apply(
        lambda roles: tuple(sorted(set(roles)))
    )
    group_roles = roles_per_group.reset_index(name='role_set')

    # Identical role sets collapse to the same category code (shifted to start at 1).
    type_codes = group_roles['role_set'].astype('category').cat.codes
    group_roles['group_type'] = type_codes + 1

    # Tag every event with the type of the group it belongs to.
    events = events.merge(
        group_roles[['group_id', 'group_type']], on='group_id', how='left'
    )

    # Distinct cases and observed roles per group type.
    per_type = events.groupby('group_type').agg(
        num_cases=('case_id', 'nunique'),
        role_set=('group_role_id', lambda roles: sorted(set(roles))),
    )
    case_count = per_type.reset_index()
    case_count['proportion'] = case_count['num_cases'] / case_count['num_cases'].sum()

    group_type_dict = {}
    for type_code, roles, share in zip(
        case_count['group_type'], case_count['role_set'], case_count['proportion']
    ):
        group_type_dict[f"GT{type_code}"] = (tuple(roles), round(share, 3))
    return group_type_dict





# get_group_type_freq_per('/Users/qingtan/Desktop/role_mining_code/sample_new_events_2015_2.csv')
# file_path = '/Users/qingtan/Desktop/role_mining_code/sample_new_events_2015_2.csv'
# Number of distinct cases in the test log; reused later as the number of cases to simulate.
num_test_cases = get_test_case_num(test_file_path)
# Group-type distribution computed from the training log.
group_type_dic = get_group_type_freq_per(file_path)
group_type_dic  # bare name: echoes the value as the notebook cell output

{'GT1': (('Role1', 'Role2', 'Role3'), 0.096),
 'GT2': (('Role1', 'Role2', 'Role3', 'Role4'), 0.444),
 'GT3': (('Role1', 'Role2', 'Role3', 'Role4', 'Role5'), 0.262),
 'GT4': (('Role1', 'Role2', 'Role3', 'Role4', 'Role5', 'Role6'), 0.03),
 'GT5': (('Role1', 'Role2', 'Role3', 'Role4', 'Role6'), 0.042),
 'GT6': (('Role1', 'Role2', 'Role3', 'Role6'), 0.022),
 'GT7': (('Role1', 'Role2', 'Role3', 'Role7'), 0.093),
 'GT8': (('Role1', 'Role2', 'Role3', 'Role7', 'Role8'), 0.011)}

In [60]:
# for each group type, get the percentage for each group

def compute_group_case_proportion(file_path, group_type_dict):
    """
    For every group type, compute each member group's share of the type's cases.

    Args:
        file_path: CSV event log with ``case_id``, ``group_id`` and
            ``group_role_id`` columns.
        group_type_dict: ``{'GT1': (role_tuple, proportion), ...}``; only the
            role tuples are used, to recognise each group's type.

    Returns:
        dict: ``{'GT1': {'G1': share, ...}, ...}`` with shares rounded to
        3 decimals. Groups whose role set matches no entry of
        *group_type_dict* are left out.
    """
    events = pd.read_csv(file_path)

    # Role-set signature of every group.
    group_roles = (
        events.groupby('group_id')['group_role_id']
        .apply(lambda roles: tuple(sorted(set(roles))))
        .reset_index(name='role_set')
    )

    # Reverse lookup: role-set signature -> group type (first one wins).
    role_to_gt = {}
    for gt, (roles, _) in group_type_dict.items():
        key = tuple(sorted(roles))
        if key in role_to_gt:
            print(f"⚠️ Duplicate role set found for {gt} and {role_to_gt[key]}, keeping the first one.")
            continue
        role_to_gt[key] = gt

    group_roles['group_type'] = group_roles['role_set'].apply(
        lambda rs: role_to_gt.get(tuple(sorted(rs)), None)
    )
    events = events.merge(
        group_roles[['group_id', 'group_type']], on='group_id', how='left'
    )

    # Distinct cases per (group, type); groupby drops rows with a missing type.
    per_group = events.groupby(['group_id', 'group_type'])['case_id'].nunique()
    group_case_counts = per_group.reset_index(name='num_cases')

    # Attach each type's total so the share is a plain column division.
    per_type = group_case_counts.groupby('group_type')['num_cases'].sum()
    group_case_counts = group_case_counts.merge(
        per_type.reset_index(name='total_cases'), on='group_type'
    )
    group_case_counts['case_proportion'] = (
        group_case_counts['num_cases'] / group_case_counts['total_cases']
    )

    # Single pass over the rows; types appear in first-seen order.
    group_result = {}
    for gid, gt, share in zip(
        group_case_counts['group_id'],
        group_case_counts['group_type'],
        group_case_counts['case_proportion'],
    ):
        group_result.setdefault(gt, {})[gid] = round(share, 3)
    return group_result


# Per group type, the share of that type's cases handled by each member group.
GT_G_dic = compute_group_case_proportion(file_path, group_type_dic)
GT_G_dic  # bare name: echoes the value as the notebook cell output

{'GT3': {'G1': 1.0},
 'GT2': {'G2': 1.0},
 'GT4': {'G3': 1.0},
 'GT5': {'G4': 1.0},
 'GT1': {'G5': 1.0},
 'GT7': {'G6': 1.0},
 'GT6': {'G7': 1.0},
 'GT8': {'G8': 1.0}}

In [61]:
# For each group type, learn which (role, activity) event follows a given prefix of events (this could also be derived from the MAS model of each group type).
# e.g., GT1: {((R1, T1), (R2, T2), (R2, T3)): {(R1, T4): 0.1, (R2, T2): 0.5, (R2, T3): 0.4}}
import pandas as pd
from collections import defaultdict, Counter

def build_group_type_transition_model(df, group_type_dic, group_case_dic, max_seq_len=3):
    """
    Build variable-length transition probability models for each group type.

    Every trace is wrapped in ``('s*', 's*')`` / ``('e*', 'e*')`` markers. For
    each position, every prefix of up to ``max_seq_len`` preceding events is
    recorded together with the event that follows it. Each observation is
    weighted by the group's entry in ``group_case_dic``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ['case_id', 'group_id', 'group_role_id', 'activity_name'].
        If 'event_index' (preferred) or 'timestamp' is present it is used to
        order events within a case; otherwise the existing row order is kept.
    group_type_dic : dict
        e.g., {'GT1': (('Role1',), 0.574), ...}. Not read by this function;
        kept so existing callers keep working.
    group_case_dic : dict
        e.g., {'GT1': {'G1': 0.08, 'G2': 0.05, ...}, 'GT2': {'G3': 0.3, ...}}
        Groups absent from this mapping are ignored.
    max_seq_len : int, optional
        Maximum prefix length to consider for sequence transitions (default=3)

    Returns
    -------
    dict
        {GT: {sequence_tuple: {next_event: probability, ...}}} with
        probabilities rounded to 3 decimals. If all weights observed for a
        prefix are zero (e.g. a group share that was rounded to 0.0), the
        observed next events get a uniform distribution instead of raising
        ZeroDivisionError.
    """

    # Ensure events are ordered correctly within cases
    if 'event_index' in df.columns:
        df = df.sort_values(['case_id', 'event_index'])
    elif 'timestamp' in df.columns:
        df = df.sort_values(['case_id', 'timestamp'])

    # Build group_id -> group_type mapping
    group_to_gt = {
        g: gt for gt, group_map in group_case_dic.items() for g in group_map
    }

    # {group type: {prefix: Counter(next event -> accumulated weight)}}
    transition_models = defaultdict(lambda: defaultdict(Counter))

    for gid, gdf in df.groupby('group_id'):
        gt = group_to_gt.get(gid)
        if gt is None:
            continue  # Skip groups not in mapping

        # The weight depends only on the group, so look it up once per group.
        weight = group_case_dic.get(gt, {}).get(gid, 1.0)

        for _, trace_df in gdf.groupby('case_id'):
            trace = list(zip(trace_df['group_role_id'], trace_df['activity_name']))
            trace = [("s*", "s*")] + trace + [("e*", "e*")]  # add start/end markers

            # The last element has no successor, hence len(trace) - 1 positions.
            for i in range(len(trace) - 1):
                next_event = trace[i + 1]
                # Prefixes cannot reach back past the start marker.
                max_prefix = min(max_seq_len, i + 1)
                for length in range(1, max_prefix + 1):
                    seq = tuple(trace[i - length + 1 : i + 1])
                    transition_models[gt][seq][next_event] += weight

    # Normalize accumulated weights to probabilities
    for gt_model in transition_models.values():
        for seq, counter in gt_model.items():
            total = sum(counter.values())
            if total > 0:
                gt_model[seq] = {
                    evt: round(cnt / total, 3) for evt, cnt in counter.items()
                }
            else:
                # Only zero-weight observations: the successors were seen,
                # but their relative frequency is unknown -> uniform.
                uniform = round(1.0 / len(counter), 3)
                gt_model[seq] = {evt: uniform for evt in counter}

    return dict(transition_models)


# Re-load the training log (the path is spelled out again; it is the same f-string as `file_path`).
df = pd.read_csv(f'/Users/qingtan/Desktop/role_mining_code/sample_new_events_{file_name}_train.csv')
# Largest number of events in any single case.
max_case_length = df.groupby('case_id').size().max()
# With max_seq_len = longest case + 1 the prefix length is never truncated,
# so every prefix of every trace (start marker included) is stored in the model.
group_type_pref_dic = build_group_type_transition_model(df, group_type_dic, GT_G_dic, max_seq_len=max_case_length+1)

In [62]:
# get the resource of each role in each group 
# output: {G1:{R1:(A1, A2), R2:(A3)}}

from collections import defaultdict

def extract_group_role_activities(df, group_col='group_id', role_col='group_role_id', agent_col='agent_id'):
    """
    Build a nested dictionary mapping each group to its roles and, per role,
    the distinct agents (values of ``agent_col``) observed in that role.

    Despite the function name, the innermost values are agents/resources,
    not activities.

    Args:
        df: event log containing ``group_col``, ``role_col`` and ``agent_col``.
        group_col: column holding the group identifier.
        role_col: column holding the role identifier.
        agent_col: column holding the agent/resource identifier.

    Returns:
        dict: groups and roles appear in first-seen order; agents are sorted
        tuples, e.g.
        {
            'G1': {'R1': ('A1', 'A2'), 'R2': ('A3',)},
            'G2': {'R3': ('A1',), 'R4': ('A2', 'A4')}
        }
    """
    group_role_agents = defaultdict(lambda: defaultdict(set))

    # Walk the three columns in lockstep; this avoids building a Series per
    # row the way DataFrame.iterrows() does.
    for group, role, agent in zip(df[group_col], df[role_col], df[agent_col]):
        group_role_agents[group][role].add(agent)

    # Convert sets to sorted tuples for readability and immutability
    return {
        group: {role: tuple(sorted(agents)) for role, agents in roles.items()}
        for group, roles in group_role_agents.items()
    }


# The agent identifiers live in the `agent` column here, so the default `agent_col` is overridden.
group_role_resource_map = extract_group_role_activities(df, group_col='group_id', role_col='group_role_id', agent_col='agent')
group_role_resource_map  # bare name: echoes the value as the notebook cell output

{'G1': {'Role1': (0,),
  'Role4': (6,),
  'Role5': (7,),
  'Role2': (1,),
  'Role3': (5,)},
 'G2': {'Role1': (0,), 'Role4': (6,), 'Role2': (1,), 'Role3': (5,)},
 'G3': {'Role1': (0,),
  'Role4': (6,),
  'Role2': (1,),
  'Role5': (7,),
  'Role6': (3,),
  'Role3': (5,)},
 'G4': {'Role1': (0,),
  'Role4': (6,),
  'Role2': (1,),
  'Role3': (5,),
  'Role6': (3,)},
 'G5': {'Role1': (0,), 'Role2': (1,), 'Role3': (5,)},
 'G6': {'Role1': (0,), 'Role7': (2,), 'Role2': (1,), 'Role3': (5,)},
 'G7': {'Role1': (0,), 'Role2': (1,), 'Role6': (3,), 'Role3': (5,)},
 'G8': {'Role1': (0,),
  'Role7': (2,),
  'Role2': (1,),
  'Role3': (5,),
  'Role8': (4,)}}

In [63]:
# For each group, compute agent continuation (handover) probabilities: which agent acts next, given the agents that acted before.
# e.g. {G1: {(A1, A2): {A1: 0.2, A3: 0.8}, (A1, A3): {A1: 0.2, A2: 0.8}}}
import pandas as pd
from collections import defaultdict, Counter

def compute_group_agent_transition_prob(df, group_col='group_id', agent_col='agent_id', case_col='case_id', 
                                        order_col='timestamp', max_len=2):
    """
    Estimate, per group, which agent follows a given run of preceding agents.

    Each case of a group is sorted by ``order_col`` and its agent sequence is
    wrapped in "s*" (start) and "e*" (end) markers. Every run of 1..max_len
    consecutive entries is counted together with the entry that follows it,
    and the counts are turned into relative frequencies rounded to 3 decimals.

    Returns:
        dict shaped like:
        {
          "G1": {
              ("A1", "A2"): {"A1": 0.2, "A3": 0.8},
              ("s*",): {"A1": 1.0}
          }
        }
    """
    tallies = defaultdict(lambda: defaultdict(Counter))

    for group_id, group_events in df.groupby(group_col):
        for _, case_events in group_events.groupby(case_col):
            ordered = case_events.sort_values(order_col)
            agents = ["s*", *ordered[agent_col], "e*"]

            for pos in range(1, len(agents)):
                follower = agents[pos]
                # A run cannot be longer than the number of entries before `pos`.
                for width in range(1, min(max_len, pos) + 1):
                    run = tuple(agents[pos - width:pos])
                    tallies[group_id][run][follower] += 1

    # Turn the raw counts into per-run probability tables.
    group_probabilities = {}
    for group_id, by_run in tallies.items():
        tables = {}
        for run, followers in by_run.items():
            denom = sum(followers.values())
            tables[run] = {agent: round(n / denom, 3) for agent, n in followers.items()}
        group_probabilities[group_id] = tables

    return group_probabilities


# Agent handover model per group, using up to 10 preceding agents as context.
group_resource_hand = compute_group_agent_transition_prob(df, group_col='group_id', agent_col='agent', case_col='case_id', order_col='timestamp', max_len=10)
group_resource_hand  # bare name: echoes the value as the notebook cell output

{'G1': {('s*',): {0: 1.0},
  (0,): {6: 0.515, 0: 0.414, 1: 0.024, 5: 0.03, 7: 0.015, 'e*': 0.002},
  ('s*', 0): {6: 0.878, 0: 0.115, 1: 0.007},
  (6,): {7: 0.746, 1: 0.048, 0: 0.202, 5: 0.004, 6: 0.002},
  (0, 6): {7: 0.748, 1: 0.048, 0: 0.2, 5: 0.003, 6: 0.001},
  ('s*', 0, 6): {7: 0.843, 1: 0.061, 0: 0.096},
  (7,): {1: 0.886, 0: 0.079, 5: 0.01, 7: 0.025, 6: 0.0},
  (6, 7): {1: 0.899, 0: 0.072, 7: 0.022, 5: 0.007, 6: 0.0},
  (0, 6, 7): {1: 0.9, 0: 0.072, 7: 0.022, 5: 0.007, 6: 0.0},
  ('s*', 0, 6, 7): {1: 0.887, 0: 0.113},
  (1,): {0: 0.438, 5: 0.526, 1: 0.029, 7: 0.005, 6: 0.002},
  (7, 1): {0: 0.432, 5: 0.538, 1: 0.027, 7: 0.002, 6: 0.001},
  (6, 7, 1): {0: 0.439, 5: 0.553, 1: 0.006, 7: 0.002, 6: 0.0},
  (0, 6, 7, 1): {0: 0.439, 5: 0.553, 1: 0.006, 7: 0.002, 6: 0.0},
  ('s*', 0, 6, 7, 1): {0: 1.0},
  (1, 0): {0: 0.679, 6: 0.299, 5: 0.02, 1: 0.002},
  (7, 1, 0): {0: 0.7, 6: 0.281, 5: 0.018, 1: 0.001},
  (6, 7, 1, 0): {0: 0.716, 6: 0.272, 5: 0.012},
  (0, 6, 7, 1, 0): {0: 0.716, 6: 0

In [64]:
# start simulation (approach 1: use probability model)
# get the group type based on GT distribution
import random
from collections import Counter

def simulate_group_types(group_type_dic, num_cases):
    """
    Draw a group type for each of ``num_cases`` simulated cases.

    Args:
        group_type_dic (dict): group type -> (roles, probability), e.g.
            {'GT1': (('Role1',), 0.574),
             'GT2': (('Role1', 'Role2'), 0.068), ...}
            The probabilities are rescaled to sum to 1 before sampling.
        num_cases (int): how many cases to draw.

    Returns:
        dict: {'c1': 'GT1', 'c2': 'GT2', ...} keyed 'c1'..'c<num_cases>'.
    """
    labels = list(group_type_dic)
    raw_weights = [entry[1] for entry in group_type_dic.values()]

    # Rescale so the weights form a proper distribution.
    weight_sum = sum(raw_weights)
    weights = [w / weight_sum for w in raw_weights]

    draws = random.choices(labels, weights=weights, k=num_cases)
    return {f'c{position}': gt for position, gt in enumerate(draws, start=1)}


# simulate the activities based on the specific GT
def simulate_activity_traces(case_group_type_dic, group_type_pref_dic, 
                             max_len=100, max_context_len=5):
    """
    Generate a (role, activity) trace for every case from its group type's model.

    Starting from the ('s*', 's*') marker, the next event is sampled from the
    distribution stored for the longest matching suffix of the history (at
    most ``max_context_len`` events), backing off to shorter suffixes. A trace
    ends when ('e*', 'e*') is drawn, when no suffix is known to the model, or
    after ``max_len`` sampling steps.

    Args:
        case_group_type_dic (dict): {c1: 'GT1', c2: 'GT2', ...}
        group_type_pref_dic (dict): {'GT1': {context_tuple: {event: prob}}, ...}
        max_len (int): cap on sampling steps per trace.
        max_context_len (int): longest history suffix used for lookup.

    Returns:
        dict: {c1: [(role, activity), (role, activity), ...], ...}
    """
    end_marker = ('e*', 'e*')
    simulated_traces = {}

    for case_id, gt in case_group_type_dic.items():
        transitions = group_type_pref_dic.get(gt, {})
        history = (('s*', 's*'),)  # includes the start marker
        events = []

        for _ in range(max_len):
            longest = min(max_context_len, len(history))
            # Longest known suffix wins; for/else detects "nothing matched".
            for width in range(longest, 0, -1):
                context = history[-width:]
                if context in transitions:
                    options = transitions[context]
                    drawn = random.choices(
                        population=list(options.keys()),
                        weights=list(options.values()),
                        k=1
                    )[0]
                    break
            else:
                break  # no suffix of the history is known to the model

            if drawn == end_marker:
                break

            events.append(drawn)
            history += (drawn,)

        simulated_traces[case_id] = events

    return simulated_traces


# locate the specific group based on the specific GT
def assign_cases_to_groups(case_group_type_dic, group_type_group_dic):
    """
    Assign each case to a specific group within its group type,
    based on the group-type-to-group probability dictionary.

    Args:
        case_group_type_dic (dict): {c1: 'GT1', c2: 'GT2', ...}
        group_type_group_dic (dict):
            {'GT1': {'G1': 0.08, 'G10': 0.029, ...}, 'GT2': {'G5': 0.6, ...}, ...}
            Weights are relative; they need not sum to 1.

    Returns:
        dict: {c1: 'G4', c2: 'G5', ...}, i.e. case id -> sampled group id.
        A case whose group type has no groups maps to ``None`` (previously a
        ``(gt, None)`` tuple, which was inconsistent with the bare group ids
        stored for every other case).
    """
    case_group_assignment = {}

    for case_id, gt in case_group_type_dic.items():
        group_probs = group_type_group_dic.get(gt)
        if not group_probs:
            # Unknown or empty group type: nothing to sample from.
            case_group_assignment[case_id] = None
            continue

        groups = list(group_probs)
        weights = list(group_probs.values())

        # random.choices rejects weights that sum to zero (e.g. shares that
        # were all rounded to 0.0); fall back to a uniform draw.
        if sum(weights) <= 0:
            weights = [1.0] * len(groups)

        case_group_assignment[case_id] = random.choices(
            population=groups, weights=weights, k=1
        )[0]

    return case_group_assignment


# assign agent attribute based on the agent interaction in this specific group
import random
from collections import defaultdict

def assign_agents_to_trace(
    case_role_activity_dic,
    case_group_dic,
    group_role_resource_map,
    group_resource_hand,
    max_seq_len=10,
):
    """
    Assigns agents to each (role, activity) in the simulated traces based on group handover probabilities.

    For every event the longest known context of previously assigned agents
    (at most ``max_seq_len``, start marker included) is looked up in the
    group's handover model. The candidates are restricted to agents allowed
    for the event's role and one is sampled by probability. If no context
    yields an allowed agent, one of the role's agents is picked at random;
    if the role has no agents, 'UNKNOWN' is used.

    Args:
        case_role_activity_dic (dict):
            e.g., {'c1': [('Role1','A1'), ('Role1','A2'), ...]}
        case_group_dic (dict):
            e.g., {'c1': 'G1', 'c2': 'G2', ...}; cases that are missing or
            mapped to None are skipped.
        group_role_resource_map (dict):
            e.g., {'G1': {'Role1': ('r560532',), 'Role2': ('r560458',)}}
        group_resource_hand (dict):
            e.g., {'G1': {('s*',): {'r560532': 1.0}, ('r560532',): {'r560532': 0.9, 'e*':0.1}}}
        max_seq_len (int): max number of previous agents to use for context.

    Returns:
        dict: {case_id: [(role, activity, agent), ...]}
    """
    simulated_agent_traces = {}

    for case_id, role_act_seq in case_role_activity_dic.items():
        group_id = case_group_dic.get(case_id)
        if group_id is None:
            continue

        hand_model = group_resource_hand.get(group_id, {})
        role_resource_map = group_role_resource_map.get(group_id, {})

        agent_seq = ['s*']  # start symbol
        case_trace = []

        for role, act in role_act_seq:
            valid_agents = role_resource_map.get(role, ())
            assigned_agent = None

            # Backoff from the longest usable context down to length 1.
            # Lengths beyond the history would only repeat the same lookup.
            longest = min(max_seq_len, len(agent_seq))
            for seq_len in range(longest, 0, -1):
                next_agent_probs = hand_model.get(tuple(agent_seq[-seq_len:]))
                if not next_agent_probs:
                    continue

                # Keep only agents that may perform this role.
                candidates = {
                    a: p for a, p in next_agent_probs.items() if a in valid_agents
                }
                if not candidates:
                    continue

                agents = list(candidates)
                weights = list(candidates.values())
                if sum(weights) <= 0:
                    # All probabilities rounded to 0.0: draw uniformly.
                    weights = [1] * len(agents)
                assigned_agent = random.choices(agents, weights=weights, k=1)[0]
                break

            # Compare against None explicitly: agent ids may be falsy (e.g. 0)
            # and must not be mistaken for "nothing assigned".
            if assigned_agent is None:
                assigned_agent = random.choice(valid_agents) if valid_agents else 'UNKNOWN'

            case_trace.append((role, act, assigned_agent))
            agent_seq.append(assigned_agent)

        simulated_agent_traces[case_id] = case_trace

    return simulated_agent_traces


import pandas as pd
from datetime import datetime

def simulated_traces_to_event_log(simulated_agent_traces, output_path='simulated_event_log.csv'):
    """
    Convert simulated agent traces into an event log (CSV format).

    Cases are numbered 0, 1, 2, ... in iteration order of the input dict and
    resources are numbered 0, 1, 2, ... in order of first appearance. Every
    event gets the same fixed start and end timestamp.

    Parameters
    ----------
    simulated_agent_traces : dict
        Example format:
        {
            'c1': [('Role1', '01_HOOFD', 'r560530'),
                   ('Role1', '16_LGSD', 'r560530')],
            'c2': [('Role2', '05_TASK', 'r12345')]
        }
    output_path : str, optional
        Path to save the CSV file.

    Returns
    -------
    pd.DataFrame
        The event log DataFrame. It always carries the full set of columns,
        so an input without events still yields a CSV with a header row.
    """
    columns = [
        'case_id',
        'agent',
        'activity_name',
        'start_timestamp',
        'end_timestamp',
        'resource',
        'role',
    ]
    # Placeholder timestamps shared by all events.
    start_time = pd.Timestamp("2014-03-25 15:26:10+00:00")
    end_time = pd.Timestamp("2014-03-25 15:36:10+00:00")

    agent_id_map = {}
    rows = []

    # Dict keys are unique, so the running index is the numeric case id.
    for case_id, events in enumerate(simulated_agent_traces.values()):
        for role, activity, resource in events:
            # A new resource receives the next free integer id.
            agent_id = agent_id_map.setdefault(resource, len(agent_id_map))
            rows.append({
                'case_id': case_id,
                'agent': agent_id,
                'activity_name': activity,
                'start_timestamp': start_time,
                'end_timestamp': end_time,
                'resource': resource,
                'role': role
            })

    # Explicit columns keep the header even when `rows` is empty.
    event_log_df = pd.DataFrame(rows, columns=columns)
    event_log_df.to_csv(output_path, index=False)
    print(f"Event log saved to {output_path}")

    return event_log_df



# test the whole simulation process
# Produce ten independent simulated logs; every pass re-samples all random choices.
for i in [0,1,2,3,4,5,6,7,8,9]:
    output_file_path = f"/Users/qingtan/Desktop/AgentSimulator-main/simulated_data_role_mining/BPIC{file_name}_prob/autonomous/simulated_log_{i}.csv"
    # 1) draw a group type per case (as many cases as the test log has)
    case_group_type_dic = simulate_group_types(group_type_dic, num_cases=num_test_cases)
    # 2) generate the (role, activity) trace of each case from its group type's model
    case_role_activity_simu = simulate_activity_traces(case_group_type_dic, group_type_pref_dic, max_len=200, max_context_len=10)
    # 3) pick a concrete group within each case's group type
    case_group_map = assign_cases_to_groups(case_group_type_dic, GT_G_dic)
    # 4) choose an agent for every event using the group's handover model
    simulated_agent_traces = assign_agents_to_trace(case_role_activity_simu, case_group_map, group_role_resource_map, group_resource_hand, max_seq_len=10)
    # 5) write the result as a CSV event log
    simulated_traces_to_event_log(simulated_agent_traces, output_file_path)

Event log saved to /Users/qingtan/Desktop/AgentSimulator-main/simulated_data_role_mining/BPIC2020_prob/autonomous/simulated_log_0.csv
Event log saved to /Users/qingtan/Desktop/AgentSimulator-main/simulated_data_role_mining/BPIC2020_prob/autonomous/simulated_log_1.csv
Event log saved to /Users/qingtan/Desktop/AgentSimulator-main/simulated_data_role_mining/BPIC2020_prob/autonomous/simulated_log_2.csv
Event log saved to /Users/qingtan/Desktop/AgentSimulator-main/simulated_data_role_mining/BPIC2020_prob/autonomous/simulated_log_3.csv
Event log saved to /Users/qingtan/Desktop/AgentSimulator-main/simulated_data_role_mining/BPIC2020_prob/autonomous/simulated_log_4.csv
Event log saved to /Users/qingtan/Desktop/AgentSimulator-main/simulated_data_role_mining/BPIC2020_prob/autonomous/simulated_log_5.csv
Event log saved to /Users/qingtan/Desktop/AgentSimulator-main/simulated_data_role_mining/BPIC2020_prob/autonomous/simulated_log_6.csv
Event log saved to /Users/qingtan/Desktop/AgentSimulator-main/

In [65]:
'''
# start discovery (for simulation approach 2: use petri net model)
# finalize the discovery step (use later for simulation)
import pandas as pd
import ast
from collections import defaultdict, Counter

discovered_df_path = '/Users/qingtan/Desktop/role_mining_code/discovered_for_simu/role_mining_2020_10_0.csv'
discovered_df = pd.read_csv(discovered_df_path)
# print(discovered_df)
original_df_path = '/Users/qingtan/Desktop/role_mining_code/discovery_AM/bpic/role_mining_2020/_log_preprocessed_not_filtered.csv'
original_df = pd.read_csv(original_df_path)
# print(original_df)


# get the group_type_distribution
def get_group_type_distribution(discovered_df):
    total_cases = discovered_df['number_of_cases'].sum()
    
    group_distribution = {}
    for _, row in discovered_df.iterrows():
        gt = row['group_type']
        roles = ast.literal_eval(row['role_type'])  # assuming this is already a tuple like ('Role1', 'Role2')
        proportion = round(row['number_of_cases'] / total_cases, 3)
        group_distribution[gt] = (roles, proportion)
    
    return group_distribution


# get the activity preferences
def discover_activity_preferences(discovered_df, original_df):
    # ---- Step 1. Prepare role and combined identifier ----
    original_df = original_df.copy()
    original_df['role_id'] = original_df['agent_id']
    original_df['role_act'] = original_df['role_id'].astype(str) + '|' + original_df['activity_type'].astype(str)

    # ---- Step 2. Map each group_id to group_type ----
    group_to_type = {}
    for _, row in discovered_df.iterrows():
        for gid in ast.literal_eval(row['group_id']):
            group_to_type[gid] = row['group_type']

    original_df['group_type'] = original_df['group_id'].map(group_to_type)
    original_df = original_df.dropna(subset=['group_type'])

    # ---- Step 3. Collect valid transitions (Petri net edges) for each group_type ----
    petri_paths = defaultdict(set)
    for _, row in discovered_df.iterrows():
        gt = row['group_type']

        for src, dst in ast.literal_eval(row['direct_path']):
            petri_paths[gt].add((src, dst))

    # ---- Step 4. Build traces by group_type ----
    traces_by_gt = defaultdict(list)
    for (gt, case), group in original_df.groupby(['group_type', 'case_id']):
        trace = group.sort_index()['role_act'].tolist()
        trace = ['Start'] + trace + ['End']
        traces_by_gt[gt].append(trace)

    # ---- Step 5. Count transitions that appear in both the traces and Petri net ----
    counts = defaultdict(lambda: defaultdict(Counter))
    for gt, traces in traces_by_gt.items():
        for trace in traces:
            for i in range(len(trace) - 1):
                src, dst = trace[i], trace[i + 1]
                # Always include Start→first and last→End, or valid Petri transitions
                # if (src, dst) in petri_paths[gt] or src == 'Start' or dst == 'End':
                if (src, dst) in petri_paths[gt]:
                    counts[gt][src][dst] += 1

    # ---- Step 6. Normalize counts to probabilities ----
    prob_dict = {}
    for gt, src_dict in counts.items():
        prob_dict[gt] = {}
        for src, dst_counter in src_dict.items():
            total = sum(dst_counter.values())
            if total > 0:
                prob_dict[gt][(src,)] = {dst: round(cnt / total, 3) for dst, cnt in dst_counter.items()}

    return prob_dict


# get group distribution for each group type
def get_group_percent_per_gt(discovered_df, original_df):

    # Step 1: Ensure group_id is a tuple
    df = discovered_df.copy()
    df['group_id'] = df['group_id'].apply(ast.literal_eval)

    # Step 2: Explode so each row has a single group
    df_exploded = df.explode('group_id')

    # Step 3: Count number of cases per group from original_df
    group_counts = (
        original_df.groupby('group_id')['case_id']
        .nunique()
        .reset_index(name='num_cases')
    )

    # Step 4: Merge counts into exploded df
    merged = pd.merge(df_exploded, group_counts, on='group_id', how='left')
    merged['num_cases'] = merged['num_cases'].fillna(0)

    # Step 5: Calculate proportion per group type
    result = {}
    for gt, group_df in merged.groupby('group_type'):
        total_cases = group_df['num_cases'].sum()
        result[gt] = {
            g: round(num / total_cases, 3) if total_cases > 0 else 0
            for g, num in zip(group_df['group_id'], group_df['num_cases'])
        }

    return result


group_type_distri_dic = get_group_type_distribution(discovered_df)
discovered_petri_prob_dic = discover_activity_preferences(discovered_df, original_df)
group_type_group_dic = get_group_percent_per_gt(discovered_df, original_df)
# this function is from approach 1 (parameters change)
group_role_resource_dic = extract_group_role_activities(original_df, group_col='group_id', role_col='agent_id', agent_col='resource_id')
# this function is also from approach 1 (parameters change)
group_resource_hand_dic = compute_group_agent_transition_prob(original_df, group_col='group_id', agent_col='resource_id', case_col='case_id', order_col='timestamp', max_len=10)
'''

"\n# start discovery (for simulation approach 2: use petri net model)\n# finalize the discovery step (use later for simulation)\nimport pandas as pd\nimport ast\nfrom collections import defaultdict, Counter\n\ndiscovered_df_path = '/Users/qingtan/Desktop/role_mining_code/discovered_for_simu/role_mining_2020_10_0.csv'\ndiscovered_df = pd.read_csv(discovered_df_path)\n# print(discovered_df)\noriginal_df_path = '/Users/qingtan/Desktop/role_mining_code/discovery_AM/bpic/role_mining_2020/_log_preprocessed_not_filtered.csv'\noriginal_df = pd.read_csv(original_df_path)\n# print(original_df)\n\n\n# get the group_type_distribution\ndef get_group_type_distribution(discovered_df):\n    total_cases = discovered_df['number_of_cases'].sum()\n    \n    group_distribution = {}\n    for _, row in discovered_df.iterrows():\n        gt = row['group_type']\n        roles = ast.literal_eval(row['role_type'])  # assuming this is already a tuple like ('Role1', 'Role2')\n        proportion = round(row['number

In [66]:
'''
# start simulation (approach 2: use Petri net)
# get the group type based on GT distribution
import random
from collections import Counter

def simulate_group_types(group_type_dic, num_cases):
    """
    Randomly sample group types based on their probability and
    return a dictionary mapping case IDs to sampled group types.

    Args:
        group_type_dic (dict): e.g.,
            {'GT1': (('Role1',), 0.574),
             'GT2': (('Role1', 'Role2'), 0.068), ...}
        num_cases (int): total number of cases to simulate.

    Returns:
        dict: {c1: 'GT1', c2: 'GT2', ...}
    """
    group_types = list(group_type_dic.keys())
    probs = [v[1] for v in group_type_dic.values()]

    # Normalize probabilities if needed
    total = sum(probs)
    probs = [p / total for p in probs]

    # Sample group types
    sampled = random.choices(group_types, weights=probs, k=num_cases)

    # Build dictionary output
    case_group_type_dic = {f'c{i+1}': gt for i, gt in enumerate(sampled)}

    return case_group_type_dic


# simulate the activities based on the specific GT
import random
from collections import defaultdict

import random

def simulate_activity_traces_using_petri(case_group_type_dic, discovered_petri_prob_dic, max_len=100):
    """
    Simulate an activity trace for each case from the transition
    probabilities of the case's group type.

    Only the current activity is used to predict the next one (Markov-1).
    A walk starts at ('Start',) and stops when 'End' is drawn, when the
    current activity has no outgoing transitions, or after ``max_len`` steps.

    Args:
        case_group_type_dic (dict): {case_id: group_type, ...}
        discovered_petri_prob_dic (dict):
            {group_type: {source_tuple: {dest_str: prob}}}, where dest_str is
            'Role|Activity' or 'End'.
        max_len (int): maximum number of steps per trace, to prevent
            infinite loops.

    Returns:
        dict: {case_id: [(Role, Activity), ...], ...}. A destination without
            a '|' separator is stored as a 1-tuple (dest,).

    Raises:
        KeyError: if a case's group type is missing from
            ``discovered_petri_prob_dic``.
    """
    simulated_traces = {}

    for case_id, gt in case_group_type_dic.items():
        trace = []
        current_activity = ('Start',)  # start symbol
        gt_prob_dict = discovered_petri_prob_dic[gt]

        for _ in range(max_len):
            dest_dict = gt_prob_dict.get(current_activity)
            if not dest_dict:
                break  # no further transition

            # Choose the next destination according to its probability
            next_dest = random.choices(
                list(dest_dict), weights=list(dest_dict.values()), k=1
            )[0]

            if next_dest == 'End':
                break

            if isinstance(next_dest, str) and '|' in next_dest:
                # Split on the first '|' only, so an activity name that itself
                # contains '|' still yields a (Role, Activity) pair
                trace.append(tuple(next_dest.split('|', 1)))
            else:
                trace.append((next_dest,))

            # For Markov-1, only the last activity matters
            current_activity = (next_dest,)

        simulated_traces[case_id] = trace

    return simulated_traces



# locate the specific group based on the specific GT
def assign_cases_to_groups(case_group_type_dic, group_type_group_dic):
    """
    Assign each case to a specific group within its group type,
    based on the group-type-to-group probability dictionary.

    Args:
        case_group_type_dic (dict): {c1: 'GT1', c2: 'GT2', ...}
        group_type_group_dic (dict):
            {'GT1': {'G1': 0.08, 'G10': 0.029, ...}, 'GT2': {'G5': 0.6, ...}, ...}

    Returns:
        dict: {c1: 'G4', c2: 'G5', c3: None, ...}. Every value is either a
            group id or None; None marks a case whose group type has no
            known groups.
    """
    case_group_assignment = {}

    for case_id, gt in case_group_type_dic.items():
        group_probs = group_type_group_dic.get(gt, {})
        if not group_probs:
            # Store a plain None (not a (gt, None) tuple) so every value has
            # the same shape and a falsy check on it detects the missing group
            case_group_assignment[case_id] = None
            continue

        # random.choices takes relative weights, so no normalisation is needed
        case_group_assignment[case_id] = random.choices(
            population=list(group_probs),
            weights=list(group_probs.values()),
            k=1,
        )[0]

    return case_group_assignment


# assign agent attribute based on the agent interaction in this specific group
import random
from collections import defaultdict

def assign_agents_to_trace(
    case_role_activity_dic,
    case_group_dic,
    group_role_resource_map,
    group_resource_hand,
    max_seq_len=10,
):
    """
    Assign an agent to each (role, activity) of the simulated traces based on
    the handover probabilities of the case's group.

    For every event, the most recent agents (longest history first, down to
    one) are looked up in the group's handover model. The first context with
    at least one positively weighted agent valid for the event's role decides
    the draw. If none matches, a valid agent of the role is picked uniformly;
    if the role has no agents, 'UNKNOWN' is used.

    Args:
        case_role_activity_dic (dict):
            e.g., {'c1': [('Role1','A1'), ('Role1','A2'), ...]}
        case_group_dic (dict):
            e.g., {'c1': 'G1', 'c2': 'G2', ...}
        group_role_resource_map (dict):
            e.g., {'G1': {'Role1': ('r560532',), 'Role2': ('r560458',)}}
        group_resource_hand (dict):
            e.g., {'G1': {('s*',): {'r560532': 1.0}, ('r560532',): {'r560532': 0.9, 'e*':0.1}}}
        max_seq_len (int): max number of previous agents to use for context.

    Returns:
        dict: {case_id: [(role, activity, agent), ...]}. Cases with a missing
            or falsy group in ``case_group_dic`` are omitted.
    """
    simulated_agent_traces = {}

    for case_id, role_act_seq in case_role_activity_dic.items():
        group_id = case_group_dic.get(case_id)
        if not group_id:
            continue

        # Handover model and per-role agents of this group
        hand_model = group_resource_hand.get(group_id, {})
        role_resource_map = group_role_resource_map.get(group_id, {})

        agent_seq = ['s*']  # start symbol
        case_trace = []

        for role, act in role_act_seq:
            assigned_agent = None
            valid_agents = role_resource_map.get(role, [])

            # Backoff from the longest usable context down to length 1.
            # Lengths beyond the current history would repeat the same
            # context, so the start is capped at the history length.
            longest = min(max_seq_len, len(agent_seq))
            for seq_len in range(longest, 0, -1):
                next_agent_probs = hand_model.get(tuple(agent_seq[-seq_len:]))
                if not next_agent_probs:
                    continue

                # Keep only agents allowed for this role
                filtered_probs = {
                    a: p for a, p in next_agent_probs.items() if a in valid_agents
                }
                # Without positive weight there is nothing to draw from
                # (and it would divide by zero); try a shorter context
                if sum(filtered_probs.values()) <= 0:
                    continue

                assigned_agent = random.choices(
                    list(filtered_probs),
                    weights=list(filtered_probs.values()),
                    k=1,
                )[0]
                break

            # If no valid transition was found, fall back to a random valid agent
            if assigned_agent is None:
                assigned_agent = random.choice(valid_agents) if valid_agents else 'UNKNOWN'

            case_trace.append((role, act, assigned_agent))
            agent_seq.append(assigned_agent)

        simulated_agent_traces[case_id] = case_trace

    return simulated_agent_traces


# Run the whole simulation pipeline ten times, one output log per run (index 0-9)
for i in range(10):
    output_file_path_petri = f"/Users/qingtan/Desktop/AgentSimulator-main/simulated_data_role_mining/BPIC2020_petri/autonomous/simulated_log_{i}.csv"

    # 1) draw a group type for every test case
    case_group_type_dic = simulate_group_types(
        group_type_distri_dic,
        num_cases=num_test_cases,
    )
    # 2) generate the (role, activity) trace of every case
    simulated_activity_traces = simulate_activity_traces_using_petri(
        case_group_type_dic,
        discovered_petri_prob_dic,
        max_len=200,
    )
    # 3) pick a concrete group for every case
    case_group_map = assign_cases_to_groups(
        case_group_type_dic,
        group_type_group_dic,
    )
    # 4) attach an agent to every event
    simulated_activity_traces_petri = assign_agents_to_trace(
        simulated_activity_traces,
        case_group_map,
        group_role_resource_dic,
        group_resource_hand_dic,
        max_seq_len=10,
    )
    # 5) export the traces (presumably written as an event log to the path; helper is defined elsewhere)
    simulated_traces_to_event_log(
        simulated_activity_traces_petri,
        output_file_path_petri,
    )
'''

'\n# start simulation (approach 2: use Petri net)\n# get the group type based on GT distribution\nimport random\nfrom collections import Counter\n\ndef simulate_group_types(group_type_dic, num_cases):\n    """\n    Randomly sample group types based on their probability and\n    return a dictionary mapping case IDs to sampled group types.\n\n    Args:\n        group_type_dic (dict): e.g.,\n            {\'GT1\': ((\'Role1\',), 0.574),\n             \'GT2\': ((\'Role1\', \'Role2\'), 0.068), ...}\n        num_cases (int): total number of cases to simulate.\n\n    Returns:\n        dict: {c1: \'GT1\', c2: \'GT2\', ...}\n    """\n    group_types = list(group_type_dic.keys())\n    probs = [v[1] for v in group_type_dic.values()]\n\n    # Normalize probabilities if needed\n    total = sum(probs)\n    probs = [p / total for p in probs]\n\n    # Sample group types\n    sampled = random.choices(group_types, weights=probs, k=num_cases)\n\n    # Build dictionary output\n    case_group_type_dic = {f\