In [51]:
#imports
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib
import numpy as np
from collections import Counter
import importlib
import sys
import json
import plotly.express as px
import pandas as pd
import nltk
import warnings
import os
from os import path

#custom imports from python files in scripts folder
import custom_funcs_for_wf as myFuncs

#reload custom modules
_ = importlib.reload(sys.modules['custom_funcs_for_wf']) #reloads the functions without needing to restart the kernel

# Note
### Use cyber_wf_pipeline_parameter_finding.ipynb to find the parameters for all tasks in a given event BEFORE running it here (and add the appropriate event below for task_data_df)

# Options

In [52]:
#if save = True, all results will be saved to the below directory as json files and pngs.
#this is specifically useful for the developed Rust GUI to visualize the results of this pipeline, otherwise is not important for any other data analysis.
save = False

#if save_report = True, the top 10 subtasks for each task, BoT Term Strategies, and Echo Term Strategies will be saved to CSV files in a "report_CSVs" folder in the below directory.
#these report CSV files are intended to be copied from Excel into tables within other Office products to easily create reports.
#tip: to open in Excel, add "sep=;" to the top of the raw csv file. i use commas in the information, so use ; as a delimiter.
save_report = False

#TODO This is a directory on my computer, it will not be on your computer.
#ensure the directory is easily accessible, as the Rust GUI will require you to select the folder via a file explorer window
directory_to_save_json_objects = '/Users/Tom/Documents/TEP Analysis/ECSC/'

#if we are saving, make sure the images directory exists inside the specified directory (setup for saving)
if save:
    images_dir =  fr'{directory_to_save_json_objects}/images/'#directory to save images
    #makedirs also creates missing parent directories, and exist_ok makes re-running this cell safe
    os.makedirs(images_dir, exist_ok=True)

print_all = False #if true then prints all image representations of runs in each task and their corresponding hierarchically encoded runs

### Read Data

In [53]:
cooked_directory = r'data/'

ascend_ctf_data = True #TODO True if using 300_wf.csv from ASCEND, else set to false and add file name below
if ascend_ctf_data:
    chosen_event = 'ecsc'
    task_data_df = myFuncs.read_ascend_ctf_data(cooked_directory, chosen_event)
else:
    #chosen_event is used later in plot titles, so it has to exist for custom data as well;
    #"custom_task" matches the key used for the pipeline parameters of non-ASCEND data
    chosen_event = 'custom_task'
    csv_file_name = 'random_name.csv' #TODO if you are not using CTF data, change this to the actual file name
    task_data_df = pd.read_csv(path.join(cooked_directory, csv_file_name))

# Run Through Pipeline

## Define All Allowed Actions

In [54]:
#clean actions first (add more or delete cleaning terms in custom_funcs_for_wf)
task_data_df = myFuncs.clean_terms(task_data_df)

#get bad and good actions; ASCEND data additionally passes the terminal features file
bad_action_args = [task_data_df['action']]
if ascend_ctf_data:
    bad_action_args.append(path.join(cooked_directory, '200_terminal_features.csv'))
bad_actions, all_action_types = myFuncs.get_bad_actions(*bad_action_args)

In [55]:
#finds actions that do not have side effects yet (they are new to analysis)
check = False #set to True if you want to see the actions that do not yet have a manual side effect added
if check:
    #look the known actions up once instead of once per candidate command
    known_side_effect_actions = myFuncs.get_action_side_effects().keys()
    check_cmds = [
        cmd
        for cmd in task_data_df['action'].unique()
        if cmd not in bad_actions and cmd not in known_side_effect_actions
    ]
    #a bare expression inside an if block is not displayed by Jupyter, so print explicitly
    print(check_cmds)

## Define Pipeline Hyperparameters

In [56]:
#parameters are looked up by event name for ASCEND data and under "custom_task" for anything else
parameter_key = chosen_event if ascend_ctf_data else "custom_task"
parameters = myFuncs.get_parameters(parameter_key)

## Run Pipeline

In [57]:
#runs pipeline for every task, keeping the subtasks found in each
#nltk.download('punkt_tab')# download if needed

warnings.filterwarnings("ignore")

tasks = list(np.unique(task_data_df['task']))

pipeline_results = dict() #will hold pipeline results
subtasks = dict() #empty, but can manually define names if desired (if you wanted to merge subtasks across events, you would import a subtasks dict here)
allowed_actions = dict() #will hold the actions that each task was clustered on

#the dendrograms are only saved when a save directory is handed to the pipeline
save_kwargs = {'save_json_directory': directory_to_save_json_objects} if save else {}

for task in tasks:
    print(f'{task} started...')
    pipeline_result, subtasks, task_allowed_actions = myFuncs.get_pipeline_results(
        task_data_df, bad_actions, all_action_types, parameters, task, subtasks, **save_kwargs
    )
    pipeline_results[task] = pipeline_result #this task's results
    allowed_actions[task] = task_allowed_actions
    print(f'{task} finished')

cult-1 started...
cult-1 finished
cult-5 started...
cult-5 finished
loss-1a started...
loss-1a finished
loss-1b started...
loss-1b finished
rep-8a started...
rep-8a finished
rep-8b started...
rep-8b finished


# Interesting Statistics

## Run Frequency and Term Frequency

In [58]:
subtask_levels = [1, *subtasks.keys()] #ngram sizes used
print(f'subtask levels being recorded: {subtask_levels}')

#both dicts are keyed by task, then by subtask level, and hold one Counter per level
doc_freqs, term_freqs = dict(), dict()
for task in tasks:
    doc_freq = {level: Counter() for level in subtask_levels}
    term_freq = {level: Counter() for level in subtask_levels}
    for run, bot, echo  in pipeline_results[task]:
        counted_in_run = set() #subtask names that already received their run (document) count
        for subtask_level in subtask_levels:
            for subtask in run.encoded_run[subtask_level]:
                name = subtask.name
                term_freq[subtask_level][name] += 1 #every occurrence counts towards the term frequency
                if name in counted_in_run:
                    continue
                doc_freq[subtask_level][name] += 1 #only the first occurrence within a run counts here
                counted_in_run.add(name)
    #save frequencies for task
    doc_freqs[task] = doc_freq
    term_freqs[task] = term_freq
    

subtask levels being recorded: [1, 2, 3, 4]


## Percentages of Strategies

In [59]:
percentages = dict() #will hold the counts of each strategy within each task
for task in tasks:
    #one (bot, echo) label pair per run, in run order
    strategy_labels = [(bot, echo) for run, bot, echo in pipeline_results[task]]
    bot_counts = Counter(bot for bot, echo in strategy_labels)
    bot_echo_counts = Counter(strategy_labels)
    percentages[task] = [bot_counts, bot_echo_counts]

In [60]:
#float percentages, kept for the later JSON/report cells
percentages_bot_float = dict()  #task -> bot strategy -> percentage of participants
percentages_echo_float = dict() #task -> bot strategy -> echo strategy -> percentage of participants

#JSON records, one per task (bot) and one per task/bot pair (echo)
percentage_bot_json_list = []
percentage_echo_json_list = []
for task in tasks:
    total_num_participants = len(pipeline_results[task])
    bot_counts, bot_echo_counts = percentages[task]

    print('\n-----------------------------------------------------------------------')
    print(f'TASK: {task}')
    print(f'# PARTICIPANTS: {total_num_participants}')
    print('-----------------------------------------------------------------------')

    #stats for BOT strategies
    print('___________________BOT_PERCENTAGES_____________')
    percentages_bot_float[task] = dict()
    percentages_echo_float[task] = dict()

    bot_stat_parts = [] #"strategy:percentage%" entries for the JSON record
    for key, count in bot_counts.items():
        percentage = (count/total_num_participants) * 100
        percentages_bot_float[task][key] = percentage
        percentages_echo_float[task][key] = dict() #filled by the echo loop below
        print(f'{key}: {percentage:.2f}%')
        bot_stat_parts.append(f'{key}:{percentage:.2f}%')

    stats_list = '|'.join(bot_stat_parts).replace(",","").replace("'","")
    percentage_bot_json_list.append({
        'hierarchyLevel': 'task',
        'statType': 'percentage_strategy_bot',
        'statSubtype': f'bot_percentage_{task}',
        'identifier': task,
        'statsList': stats_list,
        'header': 'BoT Strategy Percentages'
    })

    #stats for ECHO strategies; each key is a (bot, echo) pair
    print('___________________BOT_ECHO_PERCENTAGES_____________')
    echo_stats_list = dict() #bot strategy -> list of "echo:percentage%" entries
    for key, count in bot_echo_counts.items():
        bot_label, echo_label = key[0], key[1]
        percentage = (count/total_num_participants) * 100
        percentages_echo_float[task][bot_label][echo_label] = percentage
        print(f'{key}:{percentage:.2f}%')
        echo_stats_list.setdefault(bot_label, []).append(f'{echo_label}:{percentage:.2f}%')

    for bot_label, echo_stat_parts in echo_stats_list.items():
        stats_list = '|'.join(echo_stat_parts).replace(",","").replace("'","")
        percentage_echo_json_list.append({
            'hierarchyLevel': 'bot',
            'statType': 'percentage_strategy_echo',
            'statSubtype': 'strategy usage out of all participants',
            'identifier': f'{task}_{bot_label}',
            'statsList': stats_list,
            'header': 'Echo Strategy Percentages'
        })
        
    


-----------------------------------------------------------------------
TASK: cult-1
# PARTICIPANTS: 43
-----------------------------------------------------------------------
___________________BOT_PERCENTAGES_____________
1: 27.91%
4: 32.56%
6: 2.33%
7: 2.33%
5: 2.33%
9: 6.98%
0: 4.65%
10: 11.63%
2: 2.33%
3: 4.65%
8: 2.33%
___________________BOT_ECHO_PERCENTAGES_____________
(1, np.int64(-1)):9.30%
(4, np.int64(0)):18.60%
(6, np.int64(-1)):2.33%
(7, np.int64(-1)):2.33%
(5, np.int64(-1)):2.33%
(9, np.int64(0)):4.65%
(0, np.int64(-1)):4.65%
(1, np.int64(0)):9.30%
(10, np.int64(0)):4.65%
(1, np.int64(1)):4.65%
(4, np.int64(-1)):9.30%
(4, np.int64(1)):4.65%
(2, np.int64(-1)):2.33%
(1, np.int64(2)):4.65%
(10, np.int64(1)):4.65%
(3, np.int64(-1)):4.65%
(10, np.int64(-1)):2.33%
(8, np.int64(-1)):2.33%
(9, np.int64(-1)):2.33%

-----------------------------------------------------------------------
TASK: cult-5
# PARTICIPANTS: 8
---------------------------------------------------------------

## Side Effects

In [61]:
#get the side effects lookup; the cells below index it by action name (the name of a level-1 subtask)
action_side_effects = myFuncs.get_action_side_effects()

In [62]:
#print side effects of each user's run in each task (every action is reported once per run)
new_ones = [] #actions without a registered side effect, collected across all tasks
for task in tasks:
    print('\n-----------------------------------------------------------------------')
    print(f'TASK: {task}')
    print('-----------------------------------------------------------------------')
    for run, bot, echo  in pipeline_results[task]:
        print(f'user {run.participant}[{bot}:{echo}] side effects\n=============================')
        reported_actions = set() #actions already printed for this run
        for subtask in run.encoded_run[1]:
            action = subtask.name
            if action in reported_actions:
                continue
            #remember the plain action name in both branches; storing the "-NA" label instead
            #would never match the check above and repeat the line for every occurrence
            reported_actions.add(action)
            if action in action_side_effects.keys():
                print(action_side_effects[action])
            else:
                print(f'{action}-NA')
                if action not in new_ones:
                    new_ones.append(action)

        print('\n\n')


-----------------------------------------------------------------------
TASK: cult-1
-----------------------------------------------------------------------
user P11050[1:-1] side effects
information gain of some command
access gain to online network protocol



user P16792[4:0] side effects
access gain to online network protocol



user P20253[6:-1] side effects
new directory created
file(s) downloaded from server to local directory
access gain to online network protocol
commands ran for each file within a set of files
gzip compressed file(s) decompressed
file(s) copied



user P26759[7:-1] side effects
keywords searched within a file(s)
information gain of some command
commands ran for each file within a set of files



user P29729[5:-1] side effects
data transferred
hidden files or directories found
gzip compressed file(s) decompressed
file(s) copied
keywords searched within a file(s)



user P29730[9:0] side effects
access gain to online network protocol
hidden files or directorie

In [63]:
#actions seen above that have no side effect yet
#(some may be bad actions and should be added to the "bad terms" list to be taken out)
check = False
if check:
    print(new_ones)

## Descriptions

In [64]:
#get the descriptions lookup; the cells below index it by action name (the name of a level-1 subtask)
action_descriptions = myFuncs.get_action_descriptions()

In [65]:
#print descriptions of each user's run in each task (every action is reported once per run)
for task in tasks:
    print('\n-----------------------------------------------------------------------')
    print(f'TASK: {task}')
    print('-----------------------------------------------------------------------')
    for run, bot, echo  in pipeline_results[task]:
        print(f'user {run.participant}[{bot}:{echo}] attempted to...')
        described_actions = set() #actions already printed for this run
        for subtask in run.encoded_run[1]:
            action = subtask.name
            if action in described_actions:
                continue
            #remember the plain action name in both branches; storing the "-NA" label instead
            #would never match the check above and repeat the line for every occurrence
            described_actions.add(action)
            if action in action_descriptions.keys():
                print(action_descriptions[action])
            else:
                print(f'{action}-NA')
        print('\n\n')


-----------------------------------------------------------------------
TASK: cult-1
-----------------------------------------------------------------------
user P11050[1:-1] attempted to...
read the manual of some command
use parallelized brute force password cracking on an online network protocol



user P16792[4:0] attempted to...
use parallelized brute force password cracking on an online network protocol



user P20253[6:-1] attempted to...
create a new directory(ies)
download a file(s) from a server to their local directory
use parallelized brute force password cracking on an online network protocol
run a specified command for each file within a set of files
decompress a gzip compressed file(s)
copy a file(s)



user P26759[7:-1] attempted to...
keyword search within a file(s)
read the manual of some command
run a specified command for each file within a set of files



user P29729[5:-1] attempted to...
transfer data with urls
use a brute force hidden file or directory search
de

### Description Term and Run Frequency Per Challenge

In [66]:
#get the term and run frequency per description
descriptions_terms_freq = {}
descriptions_doc_freq = {}
for task in tasks:
    descriptions_terms_freq[task] = Counter()
    descriptions_doc_freq[task] = Counter()
    actions = term_freqs[task][1] #unigram (single action) frequencies found above
    for action in actions:
        #label the action with its description, or with "<action>-NA" when none is registered
        if action in action_descriptions.keys():
            label = action_descriptions[action]
        else:
            label = f'{action}-NA'
        #NOTE(review): values are assigned, not added, so two actions sharing one description
        #would overwrite each other - confirm descriptions are unique per action
        descriptions_terms_freq[task][label] = actions[action]
        descriptions_doc_freq[task][label] = doc_freqs[task][1][action]
    

### Get The Side Effect and Description Of a Subtask Given the Subtask Name

In [67]:
subtask_str = 'st31' #subtask to look up

#first what the user attempted within the subtask...
subtask_description = myFuncs.get_subtask_description(subtask_str, subtasks, action_descriptions)
print(f'in subtask {subtask_str} the user attempted to...')
for attempted_action in subtask_description:
    print(attempted_action)

#...then the side effects of the same subtask
print(f'\n\nsubtask {subtask_str} side effects\n===========================')
subtask_side_effects = myFuncs.get_subtask_side_effects(subtask_str, subtasks, action_side_effects)
for side_effect in subtask_side_effects:
    print(side_effect)

in subtask st31 the user attempted to...
read the manual of some command
use parallelized brute force password cracking on an online network protocol
read the manual of some command


subtask st31 side effects
information gain of some command
access gain to online network protocol
information gain of some command


## Visualize Encoded Runs

In [68]:
if print_all: #print all hierarchically encoded runs

    text_offset = .05
    raw_run_font_size = 5 #the raw run text is long, so it is drawn smaller than the subtask labels
    colors = cm.rainbow(np.linspace(0, 1, len(subtasks.keys())+1)) #get amount of colors that correspond to number of ngram sizes
    for task in tasks:
        for pipeline_result in pipeline_results[task]:

            #get the results
            run = pipeline_result[0]
            FA_label = pipeline_result[1]
            echo_dist_label = pipeline_result[2]
            fig, ax = plt.subplots()
            for i, ngram_size in enumerate(run.encoded_run.keys()): #one row of subtasks per ngram size
                coin = .1
                for subtask in run.encoded_run[ngram_size]:
                    ax.plot((subtask.start, subtask.end), (ngram_size+coin, ngram_size+coin), marker ='|', color=colors[i])
                    ax.text(subtask.start, ngram_size+coin+text_offset, subtask.name)
                    coin+=.1 #we want to move the lines up slightly so we can clearly see them
            raw_run_text = f'raw run: {run.raw_run}'
            #size only this text instead of changing matplotlib.rcParams, which would leak into every later figure
            ax.text(0, .9, raw_run_text, fontsize=raw_run_font_size)
            ax.set_ylim([0.9, 5])
            ax.set_xlim([-.5, 10])
            ax.axis('off')
            title = f'tokenized run for {chosen_event} {task} {run.participant} FA: {FA_label} echo_dist: {echo_dist_label}'
            ax.set_title(title)
            plt.show()
            plt.close(fig) #one figure per run: release it so memory does not grow with the number of runs

# Save Data Into Specified Directory (JSON objects for use in GUI)

## WARNING

This section is used explicitly to save the results as json and png files for use in a developed Rust GUI that summarizes/visualizes the results. 

If you do not need the data for GUI or other purposes, ensure save is set to False.

## Subtasks

In [69]:

def subtasks_in_subtask(st_dict_list, st):
    """Return the names of the smaller subtasks that occur inside subtask ``st``.

    ``st`` is the action sequence of a subtask and ``st_dict_list`` maps an n-gram size to a
    dict of ``{action sequence: subtask name}``. Bigram subtasks are always checked; trigram
    subtasks are only checked when ``st`` holds more than three actions. Unigram subtasks are
    never reported, and a two-action ``st`` returns an empty list.

    Names are returned in the iteration order of the level dicts: bigram matches first,
    followed by trigram matches.
    """
    if len(st) == 2:
        return []

    #the n-gram windows of st that a smaller subtask has to be equal to, per n-gram size
    windows_by_size = {2: list(nltk.bigrams(st))}
    if len(st) > 3:
        windows_by_size[3] = list(nltk.trigrams(st))

    return [
        st_dict_list[size][candidate]
        for size, windows in windows_by_size.items()
        for candidate in st_dict_list[size].keys()
        if candidate in windows
    ]
            
    

In [70]:
if save:
    #removes parentheses and single quotes from the text form of a tuple
    tuple_punctuation = str.maketrans('', '', "()'")

    subtask_json_list = []
    subtask_lens_to_check = [2,3,4]
    for length in subtask_lens_to_check:
        for st_raw_key, st_name in subtasks[length].items():
            #names of the smaller subtasks contained in this one
            encased_names = tuple(subtasks_in_subtask(subtasks, st_raw_key))
            description_text = f'the user attempted to... {tuple(myFuncs.get_subtask_description(st_name, subtasks, action_descriptions))}'
            side_effect_text = f'{tuple(myFuncs.get_subtask_side_effects(st_name, subtasks, action_side_effects))}'
            subtask_json_list.append({
                'name': st_name,
                'rawActions': f'{st_raw_key}',
                'encasedSubtasks': f'{encased_names}',
                'description': description_text.translate(tuple_punctuation),
                'sideEffects': side_effect_text.translate(tuple_punctuation)
            })

    #one JSON array holding every subtask record
    with open(f'{directory_to_save_json_objects}subtasks.json', 'w', encoding='utf-8') as f:
        json.dump(subtask_json_list, f, ensure_ascii=False, indent=4)
        

        
        

## Statistics

In [71]:
#success rate: number of runs that captured the flag, per BoT strategy and per Echo strategy
if save:
    #participant -> result lookup for every task
    #(the rows are kept as a DataFrame because the 'result' column is needed next to the participant)
    success_flags_dict = dict()
    for task in tasks:
        task_rows = task_data_df[task_data_df['task'] == task]
        #a participant has many rows per task; as with a row-by-row fill, the last row wins
        success_flags_dict[task] = dict(zip(task_rows['participant'], task_rows['result']))

    bot_success_rate_json_list = []
    echo_success_rate_json_list = []

    bot_report_num_success_dict = dict() #csv report: task -> bot -> number of successful runs
    echo_report_num_success_dict = dict() #csv report: task -> bot -> echo -> number of successful runs

    for task in tasks:
        bot_report_num_success_dict[task] = dict() #csv report
        echo_report_num_success_dict[task] = dict() #csv report

        bot_counter = Counter() #successful runs per bot strategy of this task
        for bot in percentages_echo_float[task].keys():
            echo_report_num_success_dict[task][bot] = dict() #csv report

            #start every strategy at 0 so strategies without a success are still reported
            bot_counter[bot] = 0
            echo_counter = Counter()
            for echo in percentages_echo_float[task][bot].keys():
                echo_counter[echo] = 0

            #the unpacked names must differ from the loop variables above, otherwise the
            #strategy comparison is always true and every run is counted for every strategy
            for run, run_bot, run_echo in pipeline_results[task]:
                if run.task != task or run_bot != bot:
                    continue
                if success_flags_dict[task][run.participant] == 'Flag':
                    echo_counter[run_echo] += 1 #increase echo count
                    bot_counter[bot] += 1 #increase bot count

            #create stats_list for echo
            echo_stat_parts = []
            for echo_label, num_success in echo_counter.most_common():
                echo_stat_parts.append(f'{echo_label} : {num_success}')
                echo_report_num_success_dict[task][bot][echo_label] = num_success #csv report

            echo_success_json = {
                "hierarchyLevel": "bot",
                "statType": "echo_success_rate",
                "statSubtype": "echo_success_per_strategy",
                "identifier": f'{task}_{bot}',
                "statsList": '|'.join(echo_stat_parts).replace("'",""),
                "header": "Echo Task Completion Success Rates"
            }
            echo_success_rate_json_list.append(echo_success_json)

        #create stats_list for bot
        bot_stat_parts = []
        for bot_label, num_success in bot_counter.most_common():
            bot_stat_parts.append(f'{bot_label} : {num_success}')
            bot_report_num_success_dict[task][bot_label] = num_success #csv report

        bot_success_json = {
            "hierarchyLevel": "task",
            #NOTE(review): "bot_succes_rate" is misspelled but left unchanged, consumers of the JSON may match on it
            "statType": "bot_succes_rate",
            "statSubtype": "bot_success_per_task",
            "identifier": f'{task}',
            "statsList": '|'.join(bot_stat_parts).replace("'",""),
            "header": "BoT Task Completion Success Rates"
        }
        bot_success_rate_json_list.append(bot_success_json)
                        

In [72]:
if save:
    #one record per task and subtask level, most frequent subtasks first
    term_freq_json_list = []
    for task, freqs_by_length in term_freqs.items():
        for length, subtask_counter in freqs_by_length.items():
            stats_list = '|'.join(f'{name}:{count}' for name, count in subtask_counter.most_common())
            term_freq_json_list.append({
                "hierarchyLevel": "task",
                "statType": "term_frequency_subtask",
                "statSubtype": f"st{length}",
                "identifier": task,
                "statsList": stats_list.replace("'",""),
                "header": "Subtask Term Frequency"
            })

In [73]:
if save:
    #one record per task and subtask level, subtasks used in the most runs first
    doc_freq_json_list = []
    for task, freqs_by_length in doc_freqs.items():
        for length, subtask_counter in freqs_by_length.items():
            stats_list = '|'.join(f'{name}:{count}' for name, count in subtask_counter.most_common())
            doc_freq_json_list.append({
                "hierarchyLevel": "task",
                "statType": "doc_frequency_subtask",
                "statSubtype": f"st{length}",
                "identifier": task,
                "statsList": stats_list.replace("'",""),
                "header": "Subtask Document Frequency"
            })

In [74]:
#allowed actions inside of bot for task
if save:
    task_allowed_actions_json_list = []
    for task in tasks:
        #"|"-separated list of the actions this task was clustered on
        stats_list = '|'.join(f'{action}' for action in allowed_actions[task])
        task_allowed_actions_json_list.append({
            "hierarchyLevel": "task",
            "statType": "actions_in_bot",
            "statSubtype": "allowed_actions_to_cluster_on",
            "identifier": f'{task}',
            "statsList": stats_list.replace("'",""),
            "header": "Actions That BoT Strategies Were Clustered On"
        })

## Actions Within Strategies

In [75]:
if save:
    action_occurence_json_list = []
    term_strat_bot_report_dict = [] #csv report: one row (list) per task/BoT strategy

    for task in tasks:
        for bot in np.sort(list(percentages_bot_float[task].keys())):
            #get runs in this bot strategy
            #(the unpacked names must differ from the loop variable `bot`, otherwise the
            # comparison is always true and `bot` ends up holding the last run's label)
            bot_runs = [
                run
                for run, run_bot, run_echo in pipeline_results[task]
                if run.task == task and run_bot == bot
            ]

            #number of runs in this strategy that used each action
            bot_counter = Counter()
            for run in bot_runs:
                #an action counts at most once per run; python3 is folded into python
                actions_in_run = dict.fromkeys(
                    'python' if action == 'python3' else action for action in run.raw_run
                )
                bot_counter.update(actions_in_run.keys())

            #create stats_list
            num_participants_in_bot = len(bot_runs)
            stat_parts = []

            strat_num = bot #csv report
            terms_strat = '' #csv report
            other_terms = '' #csv report
            num_users = num_participants_in_bot
            num_completed = bot_report_num_success_dict[task][bot]

            for action_name, num_runs_with_action in bot_counter.most_common():
                percentage = (num_runs_with_action/num_participants_in_bot)*100
                stat_parts.append(f'{action_name} : {percentage:.2f}%')

                #csv report
                #rules
                #1. only terms with more than 33.33% are kept for BoT with more than or equal to 3 users.
                #2. only terms with 100% are kept for BoT with 2 or less users
                #3. An "~" designates that large majority (80% or more) of the users utilized the term.
                #4. If less than 80%, the term is added to other_terms
                if num_participants_in_bot >= 3:
                    if percentage > 33.3:
                        if percentage == 100.0:
                            terms_strat += f'{action_name}, '
                        elif percentage >= 80:
                            terms_strat += f'~{action_name}, '
                        else:
                            other_terms += f'{action_name}, '
                else: #less than 3 -> 2 or less
                    if percentage == 100.0:
                        terms_strat += f'{action_name}, '

            if other_terms == '': #ensures that if there are none, it shows up as none
                other_terms = 'N/A  ' #two padding characters, removed again by the [:-2] below

            term_strat_bot_report_dict.append([task, strat_num, terms_strat[:-2], other_terms[:-2], num_users, num_completed]) #csv report

            action_occurence_json = {
                "hierarchyLevel": "bot",
                "statType": "action_frequncy",
                "statSubtype": "bot_action_occurences_per_strategy",
                "identifier": f'{task}_{bot}',
                "statsList": '|'.join(stat_parts).replace("'",""),
                "header": "BoT Action Occurences Per Strategy"
            }

            action_occurence_json_list.append(action_occurence_json)
                
                


## Save Statistics to Artifacts

In [76]:
if save:
    #build a NEW list: extending bot_success_rate_json_list in place would grow that list
    #every time this cell is re-run and write duplicated statistics
    #the order below also tells us which statistic comes first in the task explorer app
    stats_json_list = [
        *bot_success_rate_json_list,
        *echo_success_rate_json_list,
        *task_allowed_actions_json_list,
        *action_occurence_json_list,
        *percentage_bot_json_list,
        *percentage_echo_json_list,
        *term_freq_json_list,
        *doc_freq_json_list,
    ]

    with open(f'{directory_to_save_json_objects}statistics.json', 'w', encoding='utf-8') as f:
        json.dump(stats_json_list, f, ensure_ascii=False, indent=4)

## Save Report CSV

In [77]:
#prints the allowed actions of every task as one comma-separated line, ready to copy-paste into a report
#(intentionally not written to the CSV report: it would not be a table, so copying from here is just as easy)
if save_report:
    divider = '----------------------'
    for task in tasks:
        print(task)
        print(', '.join(f'{action}' for action in allowed_actions[task]))
        print(divider)

In [78]:
if save_report:
        def longest_common_subsequence_multiple(lists):
                """Return a subsequence that is common to every list in ``lists``.

                The result is built greedily: start from the first list and repeatedly
                replace the running result with its longest common subsequence against
                the next list (via ``longest_common_subsequence_two``).

                Known limitation: because only ONE longest common subsequence is carried
                forward at each step, a shorter subsequence that is shared by all lists
                can be missed.
                e.g. [[1,2,5,6,3,4], [5,1,2,3,4,6], [5,6]] returns [] because the first
                two lists reduce to [1,2,3,4], even though [5,6] occurs in all three.

                Parameters:
                        lists: a sequence of sequences to compare.

                Returns:
                        A new list holding the common subsequence. An empty input gives [],
                        and a single list gives a copy of that list.
                """
                if len(lists) == 0: #nothing to compare
                        return []

                current_lcs = list(lists[0]) #a single list is its own longest common subsequence

                for other in lists[1:]: #whittle the running LCS down against every remaining list
                        if not current_lcs: #already empty, it cannot grow back so stop early
                                break
                        current_lcs = longest_common_subsequence_two(current_lcs, other)

                return current_lcs

        def longest_common_subsequence_two(list1, list2):
                """Return the longest common subsequence of ``list1`` and ``list2`` as a list.

                Uses the classic dynamic-programming table where ``table[r][c]`` is the LCS
                length of the first ``r`` items of ``list1`` and the first ``c`` items of
                ``list2``, then walks the table backwards to recover the items themselves.
                """
                rows, cols = len(list1), len(list2)
                table = [[0] * (cols + 1) for _ in range(rows + 1)] #row 0 / column 0 stay 0 (empty prefix)

                #fill the table of LCS lengths
                for r, left in enumerate(list1, start=1):
                        for c, right in enumerate(list2, start=1):
                                if left == right: #matching items extend the LCS of both shorter prefixes
                                        table[r][c] = table[r - 1][c - 1] + 1
                                else: #otherwise carry over the better of dropping one item from either list
                                        table[r][c] = max(table[r - 1][c], table[r][c - 1])

                #walk back from the bottom-right corner, collecting matched items (in reverse order)
                collected = []
                r, c = rows, cols
                while r > 0 and c > 0:
                        left, right = list1[r - 1], list2[c - 1]
                        if left == right: #item is part of the LCS, step diagonally
                                collected.append(left)
                                r, c = r - 1, c - 1
                        elif table[r - 1][c] > table[r][c - 1]: #strictly better above: move up
                                r -= 1
                        else: #ties and better-left both move left
                                c -= 1

                collected.reverse() #items were gathered back-to-front
                return collected
        

        #one row per echo strategy: [task, bot strategy, echo strategy, term strategy string, num users, num completed]
        echo_strats_matrix = []
        for task in tasks:
                for bot in np.sort(list(percentages_echo_float[task].keys())):
                        for echo in np.sort(list(percentages_echo_float[task][bot].keys())):
                                if echo == -1: #-1 means no strategy, only real echo strategies are reported
                                        continue

                                #get the raw runs that belong to this (bot, echo) strategy.
                                #the per-run labels need their own names: re-using bot/echo here would
                                #overwrite the loop variables, making the filter always true and the
                                #lookups below use the labels of whichever run came last
                                echo_runs = [
                                        run.raw_run
                                        for run, run_bot, run_echo in pipeline_results[task]
                                        if run.task == task and run_bot == bot and run_echo == echo
                                ]

                                #get the LCS of the runs; with fewer than two runs there is nothing to compare,
                                #so a single run is its own LCS and no runs gives an empty LCS
                                if len(echo_runs) >= 2:
                                        lcs = longest_common_subsequence_multiple(echo_runs)
                                elif len(echo_runs) == 1:
                                        lcs = list(echo_runs[0])
                                else:
                                        lcs = []

                                #make it into a pretty comma-separated string we can save to a file (and then copy into a report)
                                lcs_string = ', '.join(f'{term}' for term in lcs)

                                #save the LCS into the matrix of echo term strategies for each echo strategy
                                num_users = len(echo_runs)
                                num_completed = echo_report_num_success_dict[task][bot][echo]
                                echo_strats_matrix.append([task, bot, echo, lcs_string, num_users, num_completed])


In [79]:
if save_report:
    #maps n-gram length -> list of report rows, 10 rows per task (padded with N/A when a task has fewer)
    #length 1 rows:  [task, term, doc freq]
    #length >1 rows: [task, subtask name, doc freq, term list of the subtask]
    top_10_subtasks_dict = dict()
    for task in doc_freqs.keys():
        for length in doc_freqs[task].keys():
            rows_for_length = top_10_subtasks_dict.setdefault(length, []) #first time we see this length: start an empty list

            top_subtasks = doc_freqs[task][length].most_common(10) #top 10 (or top n if fewer than 10) as (name, count) pairs
            for subtask_name, doc_count in top_subtasks:
                if length != 1: #longer than 1: also look up the actual term list for the subtask
                    rows_for_length.append([task, subtask_name, doc_count, myFuncs.get_subtask_ngram(subtask_name, subtasks)])
                else: #otherwise length is 1 and we can just add the single term and count
                    rows_for_length.append([task, subtask_name, doc_count])

            #pad to exactly 10 rows per task. the padding must have the same number of fields as the real
            #rows of this length (3 for length 1, 4 otherwise), or building a DataFrame from them fails
            padding_row = [task, 'N/A', 'N/A'] if length == 1 else [task, 'N/A', 'N/A', 'N/A']
            for _ in range(10 - len(top_subtasks)):
                rows_for_length.append(list(padding_row))

In [80]:
if save_report:

    #this part creates CSVs that can be copied into a Word table from Excel for a report, automating most of the manual labor I would otherwise do
    #all files use ';' as the delimiter because the values themselves contain commas

    #directory to save the csv report (the trailing '' keeps a trailing path separator on report_dir)
    report_dir = os.path.join(directory_to_save_json_objects, 'report_CSVs', '')
    os.makedirs(report_dir, exist_ok=True) #creates missing parent directories too, no error if it already exists

    #save csv reports

    #BoTs
    bots_report_df = pd.DataFrame(term_strat_bot_report_dict, columns=['Task', 'Strat_Num', 'Term_Strat', 'Other_Terms', 'Num_Users', 'Num_Completed'])
    bots_report_df.to_csv(os.path.join(report_dir, 'bot_report.csv'), sep=';', index=False)

    #echos
    echo_report_df = pd.DataFrame(echo_strats_matrix, columns=['Task', 'BoT_Strat', 'Echo_Strat', 'Term_Strat', 'Num_Users', 'Num_Completed'])
    echo_report_df.to_csv(os.path.join(report_dir, 'echo_report.csv'), sep=';', index=False)

    #subtasks: start from the uni-gram rows, then add three columns per longer n-gram size
    top_10_subtasks_df = pd.DataFrame(top_10_subtasks_dict[1], columns=['Task', 'Uni-gram', 'Doc_Freq'])

    for length, ngram_label in ((2, 'Bi-gram'), (3, 'Tri-gram'), (4, 'Quad-gram')):
        length_rows = top_10_subtasks_dict[length] #rows are [task, subtask name, doc freq, term list]
        top_10_subtasks_df[f'st{length}'] = [row[1] for row in length_rows]
        top_10_subtasks_df[ngram_label] = [row[3] for row in length_rows]
        top_10_subtasks_df[f'Doc_Freq_{length}'] = [row[2] for row in length_rows]

    top_10_subtasks_df.to_csv(os.path.join(report_dir, 'subtasks_report.csv'), sep=';', index=False)

## Runs

In [81]:
if save:
    #one JSON object per run: who did it, the raw actions, its BoT/echo labels and a readable summary
    runs_json_list = []

    for task in tasks:
        for run, bot, echo in pipeline_results[task]:
            seen_names = set() #every distinct size-1 subtask is described only once per run
            described_actions = []
            side_effect_parts = []
            for subtask in run.encoded_run[1]:
                if subtask.name in seen_names:
                    continue
                seen_names.add(subtask.name)

                if subtask.name in action_descriptions:
                    described_actions.append(f'{action_descriptions[subtask.name]}')
                    side_effect_parts.append(f'{action_side_effects[subtask.name]}')
                else: #no description known for this action, mark it as not available
                    described_actions.append(f'{subtask.name}-NA')
                    side_effect_parts.append(f'{subtask.name}-NA')

            #join with ', ' so the text carries no dangling separator at the end
            description = 'user attempted to...'
            if described_actions:
                description += ' ' + ', '.join(described_actions)
            side_effects = ', '.join(side_effect_parts)

            run_json = {
                "participant": run.participant,
                "task": task,
                "rawRun": f'{run.raw_run}'.replace("[","").replace("]","").replace("'",""), #strip list brackets and quotes from the printed list
                "bot": f'{bot}',
                "echo": f'{echo}',
                "description": description,
                "sideEffects": side_effects
            }
            runs_json_list.append(run_json)

    with open(f'{directory_to_save_json_objects}runs.json', 'w', encoding='utf-8') as f:
        json.dump(runs_json_list, f, ensure_ascii=False, indent=4)

## Visuals

In [82]:
#spider (radar) plot per task: one axis per BoT strategy, radius = percentage of users in that strategy
if save:
    for task in tasks:
        bot_percentages = percentages_bot_float[task]
        percentages_df = pd.DataFrame({
            'r': list(bot_percentages.values()),
            'theta': [str(bot_label) for bot_label in bot_percentages.keys()], #strategy labels as text for the angular axis
        })

        #centered title with its own (smaller) font size
        title_settings = {
            'text': f'Percentages Of Users In BoT Strategies Used In Task {task}',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 15},
        }

        fig = px.line_polar(percentages_df, r='r', theta='theta', line_close=True, range_r=[0,50])
        fig.update_traces(fill='toself')
        fig.update_layout(
            autosize=False,
            width=800,
            height=800,
            font={'size': 25},
            title=title_settings,
        )
        #the figure is only written to disk, not shown inline
        fig.write_image(f'{directory_to_save_json_objects}images/spider_{task}.png')

In [83]:
#spider (radar) plot per task and BoT strategy: one axis per echo strategy, radius = percentage of users in it
if save:
    for task in tasks:
        for bot, echo_percentages in percentages_echo_float[task].items():
            percentages_df = pd.DataFrame({
                'r': list(echo_percentages.values()),
                'theta': [str(echo_label) for echo_label in echo_percentages.keys()], #strategy labels as text for the angular axis
            })

            #centered title with its own (smaller) font size
            title_settings = {
                'text': f'Percentages Of Users in Echo Strategies Used In BoT Strategy {bot}',
                'x': 0.5,
                'xanchor': 'center',
                'font': {'size': 15},
            }

            fig = px.line_polar(percentages_df, r='r', theta='theta', line_close=True, range_r=[0,50])
            fig.update_traces(fill='toself')
            fig.update_layout(
                autosize=False,
                width=800,
                height=800,
                font={'size': 25},
                title=title_settings,
            )
            #the figure is only written to disk, not shown inline
            fig.write_image(f'{directory_to_save_json_objects}images/spider_{task}_{bot}.png')

In [84]:
#saves one image per run in every task, showing the subtasks found in the run as labelled horizontal segments
if save:
    text_offset = .05 #vertical gap between a segment and its label
    raw_run_font_size = 5 #the raw run can be long, so it is drawn in a small font
    colors = cm.rainbow(np.linspace(0, 1, len(subtasks.keys())+1)) #get amount of colors that correspond to number of ngram sizes
    for task in tasks:
        for run, FA_label, echo_dist_label in pipeline_results[task]:
            fig, ax = plt.subplots()
            for i, ngram_size in enumerate(run.encoded_run.keys()):
                coin = .1 #vertical offset inside this ngram-size band, grows so segments do not overlap
                for subtask in run.encoded_run[ngram_size]:
                    y = ngram_size + coin
                    ax.plot((subtask.start, subtask.end), (y, y), marker='|', color=colors[i])
                    ax.text(subtask.start, y + text_offset, subtask.name)
                    coin += .1

            #pass the font size directly instead of changing matplotlib.rcParams: the global setting
            #would leak into every later figure and was being force-reset to 10 regardless of its prior value
            ax.text(0, .9, f'raw run: {run.raw_run}', fontsize=raw_run_font_size)
            ax.set_ylim([0.9, 5])
            ax.set_xlim([-.5, 10])
            ax.axis('off')
            title = f'tokenized run for {chosen_event} {task} {run.participant} FA: {FA_label} echo_dist: {echo_dist_label}'
            ax.set_title(title)
            fig.savefig(f'{directory_to_save_json_objects}images/{run.participant}_{task}.png')
            plt.close(fig) #close this figure explicitly so figures do not pile up in memory

# Data as Pandas

## Subtasks Dataframe

In [85]:
#one row per known subtask of size 2, 3 and 4: its size, its name and the raw actions it stands for
n_sizes = [2, 3, 4]
columns = ['size', 'name', 'raw_actions']

rows = []
for n in n_sizes:
    #subtasks[n] maps the raw actions (key) to the subtask name (value)
    rows.extend([n, name, raw] for raw, name in subtasks[n].items())

subtasks_df = pd.DataFrame(rows, columns=columns)
    


## Results Dataframe With Encoded Runs Objects

In [86]:
#one row per run, keeping the encoded run as the original object next to its BoT / echo labels
columns = ['task', 'participant', 'raw_run', 'tokenized_run', 'encoded_run', 'bot_cluster', 'echo_cluster']

rows = []
for task in tasks:
    rows.extend(
        [run.task, run.participant, run.raw_run, run.tokenized_run, run.encoded_run, bot_label, echo_label]
        for run, bot_label, echo_label in pipeline_results[task]
    )

results_object_encoded_df = pd.DataFrame(rows, columns=columns)
        

## Results Dataframe With Text "Encoded Runs" That Can Be Parsed

In [87]:
#one row per run, with the encoded run written as parseable text: one column per subtask size
n_sizes = [1, 2, 3, 4]
columns = ['task', 'participant', 'raw_run', 'tokenized_run', 'st1', 'st2', 'st3', 'st4', 'bot_cluster', 'echo_cluster']

rows = []
for task in tasks:
    for run, bot, echo in pipeline_results[task]:
        #every subtask becomes subtask_name-start_position-end_position (ex. "st21-0-3");
        #subtasks of the same size are separated by '|' (empty string when there are none)
        encoded_run_text = {
            n: '|'.join(f'{subtask.name}-{subtask.start}-{subtask.end}' for subtask in run.encoded_run[n])
            for n in n_sizes
        }

        rows.append([
            run.task, run.participant, run.raw_run, run.tokenized_run,
            encoded_run_text[1], encoded_run_text[2], encoded_run_text[3], encoded_run_text[4],
            bot, echo,
        ])

results_text_encoded_df = pd.DataFrame(rows, columns=columns)
        

## Results Dataframe With Text Subtasks Of Each Size

In [88]:
#one row per run, listing only the NAMES of the subtasks found in the run: one column per subtask size
rows = []
n_sizes = [1,2,3,4]
for task in tasks:
    for run, bot, echo in pipeline_results[task]:
        #this cell keeps its own dict: it must not write into (or read from) a variable
        #that only exists because another cell happened to run first
        run_text = dict()
        for n in n_sizes:
            #subtask names of this size separated by '|' (empty string when there are none)
            run_text[n] = '|'.join(f'{subtask.name}' for subtask in run.encoded_run[n])

        row = [run.task, run.participant, run.raw_run, run.tokenized_run, run_text[1], run_text[2], run_text[3], run_text[4], bot, echo]
        rows.append(row)

columns = ['task', 'participant', 'raw_run', 'tokenized_run', 'st1', 'st2', 'st3', 'st4', 'bot_cluster', 'echo_cluster']
results_text_subtasks_df = pd.DataFrame(rows, columns=columns)
        