## Import libraries

In [54]:
import ast
import os
import sys
import random
import itertools
sys.path.append('../')

import pandas as pd
import numpy as np
import json
from configs.config import cache as cache_path 

## Defining global variables for df of each category

## Read input data

In [55]:
# File and folder names used below; all paths are resolved relative to cache_path.
activities_file_name = "activities.csv"
variants_file_name = "variants.csv"
filtered_folder_name = "filtered"
chained_filter_params_file_name = "chained_filter_params.json"

# Load the unfiltered activities and variants tables (in that order) from the cache.
activities, variants = (
    pd.read_csv(os.path.join(cache_path, input_file_name))
    for input_file_name in (activities_file_name, variants_file_name)
)

## Define chaining of filters

In [56]:
def apply_filter_chain(param_chain):
    """Run a sequence of filter steps in order.

    Every entry of ``param_chain`` is a dict with two keys: ``"function"``
    (the callable to run) and ``"params"`` (its keyword arguments). Each
    step after the first is called with ``is_filtered=True`` unless the
    entry already provides its own ``is_filtered`` value.

    Args:
        param_chain: List of ``{"function": callable, "params": dict}``
            entries. The entries are not modified.

    Returns:
        A list with one ``{"function": name, "params": dict}`` entry per
        executed step, where ``name`` is the callable's ``__name__`` (or
        ``str(callable)`` if it has none) and ``params`` are the keyword
        arguments that were actually passed.

    Raises:
        KeyError: If an entry lacks the ``"function"`` or ``"params"`` key.
            This is detected before any step is run.
    """
    # Resolve all steps first so a malformed entry fails before any filter runs.
    steps = []
    for position, chain_param in enumerate(param_chain):
        # Copy the params: setting the default on the caller's dict would leak
        # ``is_filtered=True`` into later, differently ordered runs.
        params = dict(chain_param["params"])
        if position > 0:
            params.setdefault('is_filtered', True)
        steps.append((chain_param["function"], params))

    for function, params in steps:
        function(**params)

    return [
        {'function': getattr(function, '__name__', str(function)), 'params': params}
        for function, params in steps
    ]

## Filter activities according to the new variants

In [57]:
def get_filtered_activities(variants_list, activities):
    """Return the activities whose task appears in any of the given variants.

    Args:
        variants_list: Iterable (typically a ``task_list`` Series) with one
            entry per variant. Each entry is either a list of task names or
            the string form of such a list, as produced when the column is
            read back from CSV.
        activities: DataFrame with a ``task`` column.

    Returns:
        The rows of ``activities`` whose ``task`` is in at least one variant.

    Raises:
        ValueError, SyntaxError: If a string entry is not a valid Python
            literal.
    """
    unique_tasks = set()
    for task_list in variants_list:
        # A string entry must be parsed first; iterating it directly would
        # collect single characters instead of task names.
        if isinstance(task_list, str):
            task_list = ast.literal_eval(task_list)
        unique_tasks.update(task_list)

    filtered_activities = activities[activities['task'].isin(unique_tasks)]

    return filtered_activities

## Filter top variants and activities by variant/case counts

In [58]:
def get_top_k_variants(variants, k):
    """Return the ``k`` rows of ``variants`` with the largest ``count``.

    The result is ordered by ``count`` from highest to lowest; fewer than
    ``k`` rows are returned when ``variants`` is shorter than ``k``.
    """
    ranked_by_count = variants.sort_values(by=['count'], ascending=False)
    return ranked_by_count.head(k)

def get_tasks_from_top_k_variants(top_variants_series):
    """Collect the distinct task names used by the given variants.

    Args:
        top_variants_series: Iterable (typically a ``task_list`` Series) with
            one entry per variant. Each entry is either a list of task names
            or the string form of such a list.

    Returns:
        A list of the unique task names, in no particular order.

    Raises:
        ValueError, SyntaxError: If a string entry is not a valid Python
            literal.
    """
    unique_tasks = set()
    for task_list in top_variants_series:
        # Parse with literal_eval rather than splitting on ", ": task names
        # may themselves contain commas or quote characters.
        if isinstance(task_list, str):
            task_list = ast.literal_eval(task_list)
        unique_tasks.update(task_list)

    return list(unique_tasks)

def get_activities_from_top_k_variants(top_variants_series, activities):
    """Return the rows of ``activities`` whose task occurs in the given variants.

    The task names are obtained from ``get_tasks_from_top_k_variants`` and
    matched against the ``task`` column of ``activities``.
    """
    wanted_tasks = get_tasks_from_top_k_variants(top_variants_series)

    # Boolean mask: True for every activity row whose task is wanted.
    is_wanted = activities['task'].isin(wanted_tasks)
    return activities[is_wanted]

def filter_top_k_variants(top_variant_count, is_filtered=False, variants=variants, activities=activities):
    """Keep the most frequent variants and the activities they use.

    The results are stored in the module-level globals ``top_k_variants`` and
    ``top_k_activity_cases`` and written as CSV files into the filtered
    folder under ``cache_path``.

    Args:
        top_variant_count: Number of variants to keep, ranked by ``count``.
        is_filtered: When True, the input is read from the CSV files in the
            filtered folder and the ``variants``/``activities`` arguments are
            ignored.
        variants: Variants DataFrame; the default is the module-level frame
            as it was when this function was defined.
        activities: Activities DataFrame; same defaulting as ``variants``.
    """
    global top_k_variants
    global top_k_activity_cases

    filtered_dir = os.path.join(cache_path, filtered_folder_name)
    variants_out_path = os.path.join(filtered_dir, variants_file_name)
    activities_out_path = os.path.join(filtered_dir, activities_file_name)

    if is_filtered:
        variants = pd.read_csv(variants_out_path)
        activities = pd.read_csv(activities_out_path)

    top_k_variants = get_top_k_variants(variants, k=top_variant_count)
    top_k_activity_cases = get_activities_from_top_k_variants(top_k_variants['task_list'], activities)

    # to_csv does not create missing parent folders, so make sure it exists.
    os.makedirs(filtered_dir, exist_ok=True)
    top_k_variants.to_csv(variants_out_path, index=False)
    top_k_activity_cases.to_csv(activities_out_path, index=False)

## Filter variants and activities having min number of associated tasks

In [59]:
def get_variants_with_min_tasks(variants, min_tasks_per_variant):
    """Return the variants that consist of at least a given number of tasks.

    Args:
        variants: DataFrame with a ``task_list`` column whose values are
            lists of task names or the string form of such lists. The frame
            is not modified.
        min_tasks_per_variant: Minimum length of ``task_list`` to keep a row.

    Returns:
        A new DataFrame with the qualifying rows; its ``task_list`` column
        holds parsed lists.

    Raises:
        ValueError, SyntaxError: If a string ``task_list`` value is not a
            valid Python literal.
    """
    # Work on a copy: parsing in place would change the caller's frame and
    # make a second call fail on the already-parsed lists.
    parsed_variants = variants.copy()
    parsed_variants['task_list'] = parsed_variants['task_list'].apply(
        lambda value: ast.literal_eval(value) if isinstance(value, str) else value
    )

    has_min_tasks = parsed_variants['task_list'].apply(len) >= min_tasks_per_variant
    variants_with_min_tasks = parsed_variants[has_min_tasks]
    return variants_with_min_tasks

def filter_variants_with_min_tasks(min_tasks_per_variant,is_filtered=False, variants=variants, activities=activities):
    """Keep the variants with a minimum number of tasks and their activities.

    The results are stored in the module-level globals
    ``variants_with_min_tasks`` and ``activities_with_min_tasks`` and written
    as CSV files into the filtered folder under ``cache_path``.

    Args:
        min_tasks_per_variant: Minimum number of tasks a variant must have.
        is_filtered: When True, the input is read from the CSV files in the
            filtered folder and the ``variants``/``activities`` arguments are
            ignored.
        variants: Variants DataFrame; the default is the module-level frame
            as it was when this function was defined.
        activities: Activities DataFrame; same defaulting as ``variants``.
    """
    global variants_with_min_tasks
    global activities_with_min_tasks

    filtered_dir = os.path.join(cache_path, filtered_folder_name)
    variants_out_path = os.path.join(filtered_dir, variants_file_name)
    activities_out_path = os.path.join(filtered_dir, activities_file_name)

    if is_filtered:
        variants = pd.read_csv(variants_out_path)
        activities = pd.read_csv(activities_out_path)

    variants_with_min_tasks = get_variants_with_min_tasks(variants, min_tasks_per_variant)
    activities_with_min_tasks = get_filtered_activities(variants_with_min_tasks['task_list'], activities)

    # to_csv does not create missing parent folders, so make sure it exists.
    os.makedirs(filtered_dir, exist_ok=True)
    variants_with_min_tasks.to_csv(variants_out_path, index=False)
    activities_with_min_tasks.to_csv(activities_out_path, index=False)

## Filter variants and activities by min number of case counts

In [60]:
def get_variants_with_min_case_counts(variants, min_cases_per_variant):
    """Return the variants that occur in at least a given number of cases.

    Args:
        variants: DataFrame with a ``count`` column and a ``task_list``
            column whose values are lists of task names or the string form
            of such lists. The frame is not modified.
        min_cases_per_variant: Minimum ``count`` to keep a row.

    Returns:
        A new DataFrame with the qualifying rows; its ``task_list`` column
        holds parsed lists.

    Raises:
        ValueError, SyntaxError: If a string ``task_list`` value is not a
            valid Python literal.
    """
    # Work on a copy: parsing in place would change the caller's frame and
    # make a second call fail on the already-parsed lists.
    parsed_variants = variants.copy()
    parsed_variants['task_list'] = parsed_variants['task_list'].apply(
        lambda value: ast.literal_eval(value) if isinstance(value, str) else value
    )

    variants_with_min_case_counts = parsed_variants[parsed_variants['count'] >= min_cases_per_variant]
    return variants_with_min_case_counts

def filter_variants_with_min_case_counts(min_cases_per_variant, is_filtered=False, variants=variants, activities=activities):
    """Keep the variants with a minimum case count and their activities.

    The results are stored in the module-level globals
    ``variants_with_min_case_counts`` and ``activities_with_min_case_counts``
    and written as CSV files into the filtered folder under ``cache_path``.

    Args:
        min_cases_per_variant: Minimum ``count`` a variant must have.
        is_filtered: When True, the input is read from the CSV files in the
            filtered folder and the ``variants``/``activities`` arguments are
            ignored.
        variants: Variants DataFrame; the default is the module-level frame
            as it was when this function was defined.
        activities: Activities DataFrame; same defaulting as ``variants``.
    """
    global variants_with_min_case_counts
    global activities_with_min_case_counts

    filtered_dir = os.path.join(cache_path, filtered_folder_name)
    variants_out_path = os.path.join(filtered_dir, variants_file_name)
    activities_out_path = os.path.join(filtered_dir, activities_file_name)

    if is_filtered:
        variants = pd.read_csv(variants_out_path)
        activities = pd.read_csv(activities_out_path)

    variants_with_min_case_counts = get_variants_with_min_case_counts(variants, min_cases_per_variant)
    activities_with_min_case_counts = get_filtered_activities(variants_with_min_case_counts['task_list'], activities)

    # to_csv does not create missing parent folders, so make sure it exists.
    os.makedirs(filtered_dir, exist_ok=True)
    variants_with_min_case_counts.to_csv(variants_out_path, index=False)
    activities_with_min_case_counts.to_csv(activities_out_path, index=False)


## Filter variants and activities based on variantID

In [61]:
def get_variants_with_variant_id(variants, variant_id):
    """Return the variants whose ``variant_ID`` matches the given id(s).

    Args:
        variants: DataFrame with a ``variant_ID`` column.
        variant_id: Either a single id, compared with ``==``, or a list-like
            collection of ids (list, tuple, set, ...), matched with ``isin``.

    Returns:
        The matching rows of ``variants``.
    """
    # is_list_like is False for plain strings and scalars, so a single id of
    # any scalar type still takes the equality branch.
    if pd.api.types.is_list_like(variant_id):
        matches = variants['variant_ID'].isin(variant_id)
    else:
        matches = variants['variant_ID'] == variant_id

    variants_with_variant_id = variants[matches]
    return variants_with_variant_id

def filter_variants_with_variant_id(variant_id, is_filtered=False, variants=variants, activities=activities):
    """Keep the variants with the given id(s) and their activities.

    The results are stored in the module-level globals
    ``variants_with_variant_id`` and ``activities_with_variant_id`` and
    written as CSV files into the filtered folder under ``cache_path``.

    Args:
        variant_id: Id or collection of ids, passed on to
            ``get_variants_with_variant_id``.
        is_filtered: When True, the input is read from the CSV files in the
            filtered folder and the ``variants``/``activities`` arguments are
            ignored.
        variants: Variants DataFrame; the default is the module-level frame
            as it was when this function was defined.
        activities: Activities DataFrame; same defaulting as ``variants``.
    """
    global variants_with_variant_id
    global activities_with_variant_id

    filtered_dir = os.path.join(cache_path, filtered_folder_name)
    variants_out_path = os.path.join(filtered_dir, variants_file_name)
    activities_out_path = os.path.join(filtered_dir, activities_file_name)

    if is_filtered:
        variants = pd.read_csv(variants_out_path)
        activities = pd.read_csv(activities_out_path)

    variants_with_variant_id = get_variants_with_variant_id(variants, variant_id)
    activities_with_variant_id = get_filtered_activities(variants_with_variant_id['task_list'], activities)

    # to_csv does not create missing parent folders, so make sure it exists.
    os.makedirs(filtered_dir, exist_ok=True)
    variants_with_variant_id.to_csv(variants_out_path, index=False)
    activities_with_variant_id.to_csv(activities_out_path, index=False)

## Filter activities that have a given task in them

In [62]:
def get_variants_with_task(variants, task):
    """Return the variants whose task list contains all of the given tasks.

    Args:
        variants: DataFrame with a ``task_list`` column whose values are
            lists of task names or the string form of such lists. The frame
            is not modified and the column is returned as it was passed in.
        task: A single task name (string) or an iterable of task names that
            must all be present in a variant.

    Returns:
        The rows of ``variants`` that contain every requested task.

    Raises:
        ValueError, SyntaxError: If a string ``task_list`` value is not a
            valid Python literal.
    """
    # A bare string is one task name, not a sequence of characters.
    required_tasks = [task] if isinstance(task, str) else list(task)

    def contains_required_tasks(task_list):
        # Parse the string form so membership is tested against whole task
        # names rather than substrings of the list's text.
        if isinstance(task_list, str):
            task_list = ast.literal_eval(task_list)
        return all(required in task_list for required in required_tasks)

    # astype(bool) keeps the mask boolean even when there are no rows.
    has_tasks = variants['task_list'].apply(contains_required_tasks).astype(bool)
    variants_with_task = variants[has_tasks]

    return variants_with_task

def filter_variants_with_task(task, is_filtered=False, variants=variants, activities=activities):
    """Keep the variants that contain the given task(s) and their activities.

    The results are stored in the module-level globals ``variants_with_task``
    and ``activities_with_task`` and written as CSV files into the filtered
    folder under ``cache_path``.

    Args:
        task: Task selection, passed on to ``get_variants_with_task``.
        is_filtered: When True, the input is read from the CSV files in the
            filtered folder and the ``variants``/``activities`` arguments are
            ignored.
        variants: Variants DataFrame; the default is the module-level frame
            as it was when this function was defined.
        activities: Activities DataFrame; same defaulting as ``variants``.
    """
    global variants_with_task
    global activities_with_task

    filtered_dir = os.path.join(cache_path, filtered_folder_name)
    variants_out_path = os.path.join(filtered_dir, variants_file_name)
    activities_out_path = os.path.join(filtered_dir, activities_file_name)

    if is_filtered:
        variants = pd.read_csv(variants_out_path)
        activities = pd.read_csv(activities_out_path)

    variants_with_task = get_variants_with_task(variants, task)
    activities_with_task = get_filtered_activities(variants_with_task['task_list'], activities)

    # to_csv does not create missing parent folders, so make sure it exists.
    os.makedirs(filtered_dir, exist_ok=True)
    variants_with_task.to_csv(variants_out_path, index=False)
    activities_with_task.to_csv(activities_out_path, index=False)

## Filter variants and activities that cover k% of the cases

In [63]:
def get_variants_covering_k_percent_cases(variants, cases):
    """Return the most frequent variants within a share of all cases.

    Variants are ordered by how many rows share their ``variant_ID``
    (ascending) and, within that, by ``count`` (descending). Rows are kept
    while the running total of ``count`` stays at or below ``cases`` percent
    of the overall total.

    Args:
        variants: DataFrame with ``variant_ID`` and ``count`` columns. The
            frame is not modified.
        cases: Percentage (0-100) of the total case count to cover.

    Returns:
        The selected rows with an additional ``cum_sum`` column holding the
        running total of ``count``.
    """
    # Total number of cases, and the (truncated) number of cases to cover.
    total_cases = variants['count'].sum()
    cases_to_cover = int(total_cases * (cases / 100))

    # mergesort is stable, unlike the default sort, so ties keep their order.
    sorted_df = variants.sort_values(by='count', ascending=False, kind='mergesort')

    # Order by how often each variant_ID occurs. The sort must be stable so
    # the descending-count order from above survives among equal
    # occurrences; otherwise the cumulative sum below runs over an
    # arbitrary order.
    sorted_df['variant_ID_count'] = sorted_df['variant_ID'].map(variants['variant_ID'].value_counts())
    sorted_df = sorted_df.sort_values(by='variant_ID_count', kind='mergesort')

    # Drop the temporary column used only for ordering.
    sorted_df = sorted_df.drop(columns='variant_ID_count')

    sorted_df['cum_sum'] = sorted_df['count'].cumsum()

    # NOTE(review): the variant that crosses the threshold is excluded, so the
    # result can cover less than the requested share (or be empty when the
    # first variant alone exceeds it) - confirm this is intended.
    variants_covering_cases = sorted_df[sorted_df['cum_sum'] <= cases_to_cover]

    return variants_covering_cases

def filter_variants_covering_k_percent_cases(percent_cases, is_filtered=False, variants=variants, activities=activities):
    """Keep the variants selected for a share of all cases and their activities.

    The results are stored in the module-level globals
    ``variants_covering_cases`` and ``activities_covering_cases`` and written
    as CSV files into the filtered folder under ``cache_path``.

    Args:
        percent_cases: Percentage of cases, passed on to
            ``get_variants_covering_k_percent_cases``.
        is_filtered: When True, the input is read from the CSV files in the
            filtered folder and the ``variants``/``activities`` arguments are
            ignored.
        variants: Variants DataFrame; the default is the module-level frame
            as it was when this function was defined.
        activities: Activities DataFrame; same defaulting as ``variants``.
    """
    global variants_covering_cases
    global activities_covering_cases

    filtered_dir = os.path.join(cache_path, filtered_folder_name)
    variants_out_path = os.path.join(filtered_dir, variants_file_name)
    activities_out_path = os.path.join(filtered_dir, activities_file_name)

    if is_filtered:
        variants = pd.read_csv(variants_out_path)
        activities = pd.read_csv(activities_out_path)

    variants_covering_cases = get_variants_covering_k_percent_cases(variants, percent_cases)
    activities_covering_cases = get_filtered_activities(variants_covering_cases['task_list'], activities)

    # to_csv does not create missing parent folders, so make sure it exists.
    os.makedirs(filtered_dir, exist_ok=True)
    variants_covering_cases.to_csv(variants_out_path, index=False)
    activities_covering_cases.to_csv(activities_out_path, index=False)

## Apply the filters

In [64]:
# top_variant_count = 10
# filter_top_k_variants(top_variant_count, is_filtered=False)

# top_k_variants.head()
# top_k_activity_cases.head()

In [65]:
# min_tasks_per_variant = 5
# filter_variants_with_min_tasks(min_tasks_per_variant, is_filtered=False)

# variants_with_min_tasks.head()
# activities_with_min_tasks.head()

In [66]:
# min_cases_per_variant = 5
# filter_variants_with_min_case_counts(min_cases_per_variant, is_filtered=False)

# variants_with_min_case_counts.head()
# activities_with_min_case_counts.head()

In [67]:
# selected_variant_id = 9
# filter_variants_with_variant_id(selected_variant_id, is_filtered=False)

# variants_with_variant_id.head()
# activities_with_variant_id.head()

In [68]:
# selected_task = 'Attachments'
# filter_variants_with_task(selected_task, is_filtered=False)

# variants_with_task.head()
# activities_with_task.head()

In [69]:
# cover_percent_cases = 80
# filter_variants_covering_k_percent_cases(cover_percent_cases, is_filtered=False)

# variants_covering_cases.head()
# activities_covering_cases.head()

## Apply chain of filters

In [70]:
# Steps are executed in list order: first keep the 10 most frequent variants,
# then keep only those of them that have at least 2 tasks.
chained_filter_params = [
    dict(function=filter_top_k_variants, params=dict(top_variant_count=10)),
    dict(function=filter_variants_with_min_tasks, params=dict(min_tasks_per_variant=2)),
]

apply_filter_chain(chained_filter_params)