## Extract Variants



| rule    | variant_ID | tasks_list |     variant_name       | count | avg_TAT | avg_PT |
| ------- | :---------:|:----------:|:----------------------:|:-----:|:-------:|:------:|
| Default |     1      |  [T1,T2]   |     Variant T1 T2      |   2   |   20    |   10   |
| Default |     2      | [T1,T2,T4] |     Variant T1 T2 T4   |   1   |   40    |   20   |
| Default |     2      | [T2,T1,T4] |     Variant T1 T2 T4   |   1   |   50    |   11   |
| Default |     3      | [T1,T2,T3] |     Variant T1 T2 T3   |   2   |   10    |    9   |
| Default |     3      | [T2,T1,T3] |     Variant T1 T2 T3   |   1   |   30    |   17   |

**Notes**
* Each variant set is based on some Rule. 
* For now, the "default" rule is: the first occurrence of each task instance defines the order of tasks in the variant
* Variant ID is unique for each variant set within a rule
* Tasks are ordered based on the order of their occurrence in the variant
* Variant name is the name of the variant set. If possible it could be different for sub variants
* We may use an LLM to generate variant names
* count is the number of cases in the variant set
* avg_TAT is the average TAT of the cases in the variant set
* avg_PT is the average Processing time of the cases in the variant set 
* Whether a task counts as defined can vary depending on the rule: a rule can define when a task is complete, and only complete tasks appear in the task_list for the variant.

## Setup

In [None]:
!pip install langchain
!pip install openai

## Import libraries

In [None]:
import os
import sys
import random
sys.path.append('../')

import pandas as pd
import numpy as np
import json
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI, AzureOpenAI
from langchain.chat_models import AzureChatOpenAI
from langchain.chains import LLMChain
from configs.config import cache as cache_path 

## Read input data

In [None]:
# Timestamp layout string; not referenced by any of the visible cells.
date_format = '%Y-%m-%d %H:%M:%S'

# File names are resolved relative to `cache_path` (imported from configs.config).
in_file_name = "features.csv"
out_file_name = "variants.csv"
# JSON file used by the naming cells to persist the variant-name mapping.
variants_name_file = "variants_names.json"
# Full path of the exported variants table (plain string concatenation).
out_path = cache_path + out_file_name
# Load the feature table that the variant extraction below consumes.
features_data = pd.read_csv(cache_path + in_file_name)

In [None]:
def add_variant_name(data):
    """Add a default ``variant_name`` column and convert task sequences to lists.

    The default name is the prefix ``'variant '`` followed by the distinct
    tasks of the row, sorted and separated by single spaces.

    Args:
        data: DataFrame with a ``task_list`` column holding task sequences.
            The frame is modified in place.

    Returns:
        The same DataFrame, with ``variant_name`` added and every
        ``task_list`` value turned into a ``list``.
    """
    def _sorted_unique_tasks(tasks):
        # De-duplicate and sort so rows that differ only in task order share a name.
        return ' '.join(sorted(set(tasks)))

    joined_tasks = data['task_list'].apply(_sorted_unique_tasks)
    data['variant_name'] = 'variant ' + joined_tasks

    # Task sequences arrive as tuples; expose them as plain lists.
    data['task_list'] = data['task_list'].apply(list)

    return data

def aggregate_unique_variants_count(data):
    """Collapse per-case rows into one row per (variant_ID, task sequence).

    Args:
        data: DataFrame with ``variant_ID``, ``task``, ``case_TAT`` and
            ``processing_time`` columns.

    Returns:
        DataFrame with columns ``variant_ID``, ``task_list``, ``count``
        (number of rows in the group), ``avg_TAT`` (mean of ``case_TAT``)
        and ``avg_PT`` (mean of ``processing_time``).
    """
    grouped = data.groupby(['variant_ID', 'task'])

    # One statistic per source column: group size, mean TAT, mean processing time.
    summary = grouped.agg({
        'task': ['size'],
        'case_TAT': ['mean'],
        'processing_time': ['mean'],
    })
    summary = summary.reset_index()

    # Replace the two-level column labels produced by agg() with flat names.
    summary.columns = ['variant_ID', 'task_list', 'count', 'avg_TAT', 'avg_PT']

    return summary

def get_variant_ID(data, rule):
    """Assign a 1-based ``variant_ID`` to every row based on its tasks.

    Rows whose ``task`` sequences contain the same tasks (ignoring order)
    receive the same ID. IDs are handed out in order of first appearance.

    Args:
        data: DataFrame with a ``task`` column of task sequences. The frame
            is modified in place.
        rule: Name of the variant rule. Currently unused; kept so callers
            can pass it.

    Returns:
        The same DataFrame with an integer ``variant_ID`` column added.
    """
    # Key on the sorted task tuple itself rather than on hash(...) of it:
    # distinct tuples can share a hash value (e.g. (-1,) and (-2,)), which
    # would silently merge two different variants under one ID.
    ids_by_task_set = {}
    variant_ids = []
    for tasks in data['task']:
        key = tuple(sorted(tasks))
        if key not in ids_by_task_set:
            ids_by_task_set[key] = len(ids_by_task_set) + 1
        variant_ids.append(ids_by_task_set[key])

    # Build the column on the frame's own index so assignment is positional
    # and the dtype stays integer even when the frame is empty.
    data['variant_ID'] = pd.Series(variant_ids, index=data.index, dtype='int64')

    return data

def aggregate_task_as_list(data):
    """Summarise the feature rows into one row per case.

    Args:
        data: DataFrame with ``case``, ``task``, ``first_TS``, ``last_TS``
            and ``processing_time`` columns. ``first_TS`` and ``last_TS``
            are converted to datetimes in place on the passed frame.

    Returns:
        DataFrame with one row per case (ordered by case, the ``case``
        column itself dropped) and the columns:

        * ``task`` - tuple of the case's tasks ordered by ``first_TS``
        * ``processing_time`` - sum of the case's processing times
        * ``case_TAT`` - seconds between the case's earliest ``first_TS``
          and its latest ``last_TS``
    """
    # Parse the timestamp columns so they sort and subtract as datetimes.
    data[['first_TS', 'last_TS']] = data[['first_TS', 'last_TS']].apply(pd.to_datetime)

    # Order rows by start time within each case so the task tuple reflects
    # the order in which the tasks began.
    ordered = data.sort_values(by=['case', 'first_TS'])

    # Compute everything in a single aggregation. This keeps the turnaround
    # time on the same row as the task tuple it belongs to, instead of
    # attaching the result of a second groupby positionally, and avoids a
    # Python-level groupby.apply over the grouping column.
    per_case = ordered.groupby('case').agg({
        'task': tuple,
        'processing_time': 'sum',
        'first_TS': 'min',
        'last_TS': 'max',
    })

    # Turnaround time: latest end minus earliest start, in seconds.
    per_case['case_TAT'] = (per_case['last_TS'] - per_case['first_TS']).dt.total_seconds()

    # Drop the helper timestamp columns and the case identifier.
    return per_case[['task', 'processing_time', 'case_TAT']].reset_index(drop=True)

def create_variants_table(rule, features_data):
    """Build the variants table for ``rule`` from the raw feature rows.

    Runs the extraction steps in sequence: per-case task aggregation,
    variant ID assignment, aggregation into unique variants, and default
    variant naming.

    Args:
        rule: Name of the rule; stored in the leading ``rule`` column and
            passed on to the ID assignment step.
        features_data: Feature DataFrame consumed by
            ``aggregate_task_as_list``.

    Returns:
        DataFrame of variants whose first column is ``rule``.
    """
    # One row per case: ordered task tuple, total processing time, TAT.
    per_case = aggregate_task_as_list(features_data)

    # Tag every case with the ID of the variant it belongs to.
    with_ids = get_variant_ID(per_case, rule)

    # Collapse cases into unique variants, then attach the default names.
    variants = add_variant_name(aggregate_unique_variants_count(with_ids))

    # The rule goes in as the left-most column.
    variants.insert(0, 'rule', rule)

    return variants

## Create variants according to a rule

In [None]:
def get_rule(rule = 'default'):
    """Return the name of the rule used to build variants.

    Currently a pass-through: the argument (``'default'`` when omitted) is
    returned unchanged.
    """
    return rule

# Build the variants table for the selected rule from the loaded feature data.
rule = get_rule()
variants_data = create_variants_table(rule, features_data)

## Update the Variant Names using ChatGPT

### Define ChatGPT API creds

In [None]:
# Connection settings are read from environment variables; os.getenv returns
# None for any variable that is not set.
api_endpoint = os.getenv("AZURE_OPENAI_GPT4_ENDPOINT") # os.getenv("AZURE_OPENAI_ENDPOINT")
api_key = os.getenv("AZURE_OPENAI_GPT4_KEY") # os.getenv("AZURE_OPENAI_KEY")
# NOTE(review): api_type and max_tokens are not referenced by the visible cells.
api_type = 'azure'
model_deployment_name=os.getenv("AZURE_OPENAI_GPT4_DNAME") # os.getenv("AZURE_OPENAI_DNAME")
model_api_version='2023-05-15'
max_tokens = 1000
# Sampling temperature passed to the chat model in set_llm().
chat_temperature = 0.8

### Define functions for calling API

In [None]:
def get_prompt_template():
    """Return the prompt text used to ask the model for variant names.

    The text contains a single ``{variants_list}`` placeholder and asks for
    a Python dictionary mapping each original variant name to a new name.
    """
    return """
    In the variants_list delimited by <>, a list of variant names is given.
    Each variant name is a sequence of task names separated by a whitespace present in that variant. 
    Do the following:
    1. For each variant name, generate a meaningful name with max 6 words for the variant, ignore the task ordering, 
    generate the most meaningful name.
    2. Place each of this new variant name in the variants_list in the same order as the original variant names.
    3. Return a python dictionary with the key as the original variant name and the value as the new variant name.
    4. Omit any kind of other information or explanation in the output.

    variants_list: <{variants_list}>
    """

def set_llm(prompt_input_variable='variants_list'):
    """Create the LLM chain that generates variant names.

    Args:
        prompt_input_variable: Name of the placeholder in the prompt
            template that receives the list of variant names.

    Returns:
        An ``LLMChain`` combining the naming prompt with an
        ``AzureChatOpenAI`` model configured from the module-level API
        settings.
    """
    naming_prompt = PromptTemplate(
        template=get_prompt_template(),
        input_variables=[prompt_input_variable],
        output_parser=CommaSeparatedListOutputParser(),
    )

    # Chat model built from the module-level deployment, endpoint and key.
    chat_model = AzureChatOpenAI(
        deployment_name=model_deployment_name,
        openai_api_version=model_api_version,
        azure_endpoint=api_endpoint,
        openai_api_key=api_key,
        temperature=chat_temperature,
        streaming=True,
        verbose=True,
    )

    return LLMChain(prompt=naming_prompt, llm=chat_model)

### Generate random names and send back the name to the dataframe

In [None]:
def generate_variant_names(llm_chain, variant_names, batch_size=15):
    """Ask the LLM chain for new variant names, one batch at a time.

    Args:
        llm_chain: Object exposing ``predict(variants_list=...)`` that
            returns the text of a Python dictionary literal mapping each
            original variant name to its new name.
        variant_names: List of original variant names.
        batch_size: Number of names sent per request.

    Returns:
        Dictionary mapping original variant names to generated names,
        merged across all batches.

    Raises:
        ValueError: If a reply parses to something other than a dictionary.
        SyntaxError: If a reply is not a valid Python literal (raised by
            ``ast.literal_eval``).
    """
    import ast  # local import: only needed to parse the model's reply

    list_dict = {}

    # Split variant_names into consecutive slices of at most batch_size items.
    batches = [variant_names[i:i + batch_size] for i in range(0, len(variant_names), batch_size)]

    print(f"Total batches to process: {len(batches)}")

    for batch_number, batch in enumerate(batches):
        print(f"Processing batch {batch_number}...")
        reply = llm_chain.predict(variants_list=batch)

        # SECURITY: the reply is model-generated text. It used to be passed
        # to eval(), which executes arbitrary expressions; literal_eval only
        # accepts Python literals such as the requested dictionary.
        result = ast.literal_eval(reply.strip())
        if not isinstance(result, dict):
            raise ValueError(
                f"Expected a dictionary of variant names for batch {batch_number}, "
                f"got {type(result).__name__}"
            )

        list_dict.update(result)

    return list_dict
    

def replace_variant_names(llm_chain, variant_names, new_variants_names):
    """Map each original variant name to its generated name.

    Previously generated names are reused when their keys match the
    requested names exactly; otherwise new names are generated through
    ``llm_chain`` and written to the variant-names JSON file.

    Args:
        llm_chain: Chain passed to ``generate_variant_names`` when new
            names are needed.
        variant_names: Series of original variant names.
        new_variants_names: Previously generated mapping of original name
            to new name, or a falsy value when none is available.

    Returns:
        List of new names in the same order as ``variant_names``.
    """
    requested = variant_names.values.tolist()

    if new_variants_names:
        print("Generated variant names found, reading names from file")
        # The stored mapping is only reused when it covers exactly the requested names.
        if set(requested) == set(new_variants_names.keys()):
            return [new_variants_names.get(name) for name in requested]
        print("Variant names mismatch, generating new variants names")
    else:
        print("Generated variant names not found, generating new variants names")

    # No usable stored mapping: generate a fresh one and persist it.
    generated = generate_variant_names(llm_chain, requested)
    renamed = [generated.get(name) for name in requested]
    with open(cache_path + variants_name_file, 'w') as outfile:
        json.dump(generated, outfile)

    return renamed


def run_variant_names_generation_pipeline(variants_data, variants_id, use_chatgpt=False):
    """Produce the final variant names, with or without the LLM.

    Args:
        variants_data: Series of original variant names.
        variants_id: Series of variant IDs aligned with ``variants_data``.
        use_chatgpt: When false, names are ``"variant <ID>"`` placeholders;
            when true, names come from ``replace_variant_names``.

    Returns:
        Series of placeholder names when ``use_chatgpt`` is false,
        otherwise the list returned by ``replace_variant_names``.
    """
    names_path = cache_path + variants_name_file

    if use_chatgpt:
        # Reuse a previously stored mapping when the JSON file exists.
        stored_names = None
        if os.path.exists(names_path):
            with open(names_path) as json_file:
                stored_names = json.load(json_file)
        return replace_variant_names(set_llm(), variants_data, stored_names)

    # NOTE(review): the placeholder mapping is written to the same JSON file
    # that the ChatGPT path reads back, so a later run with use_chatgpt=True
    # and unchanged variant names will reuse these placeholders instead of
    # calling the model - confirm whether that is intended.
    placeholder_names = "variant " + variants_id.astype(str)
    name_map = dict(zip(variants_data, placeholder_names))
    with open(names_path, 'w') as outfile:
        json.dump(name_map, outfile)
    return placeholder_names

### Run the generation pipeline

In [None]:
# Keep the default task-based name as 'old_variant_name'; it is the input
# (and lookup key) for the naming pipeline.
variants_data.rename(columns={'variant_name': 'old_variant_name'}, inplace=True)
# With use_chatgpt=False the new names are "variant <variant_ID>" placeholders.
variants_data['variant_name'] = run_variant_names_generation_pipeline(variants_data['old_variant_name'], variants_data['variant_ID'], use_chatgpt=False)

## Export variants to variants.csv file

In [None]:
# Save the variants table as CSV under cache_path, without the DataFrame index.
variants_data.to_csv(cache_path + out_file_name, index=False)