In [None]:
# ! pip install tiktoken
# ! pip install python-dotenv
# ! pip install pandas
# ! pip install pandasql
# ! pip install openai
# ! pip install langchain
# ! pip install langchain\[all\]
# ! pip install pyspark
# ! pip install findspark
# ! pip install faker

In [1]:
import os
import openai
import tiktoken
from dotenv import load_dotenv, find_dotenv

import pandas as pd
import warnings
import time
import random

warnings.filterwarnings('ignore')

In [3]:
# fix this
_ = load_dotenv(find_dotenv()) # read local .env 
openai.api_key  =  os.getenv("AZURE_OPENAI_KEY")

### Define some questions to ask

In [None]:
# question_1 = f"""Who has the longest average processing time?"""
# question_2 = f"""Who has the shortest average processing time?"""
# question_3 = f"""Who is the most productive participant based on the number of tasks completed?"""
# question_4 = f"""Who uses the least number of keystrokes on average?"""
# question_5 = f"""Name of the participant that uses the most number of long cut keys on average?""" # An invalid question, to check hallucination
# exception_question = f"""How was the day?"""

### Define hyperparams/Global variables

In [None]:
MODEL = "gpt-3.5-turbo"

DELIMITER_START = "<"
DELIMITER_END = ">"

EXCEPTION_OUTPUT = "Sorry, I'm not intelligent enough to answer your question now. Please try again or with a different question!"

### Define the Schema

In [None]:
def get_schema():
    # Specify the path to the text file
    file_path = '../data/sample/schema0.txt'

    # Read the contents of the text file
    with open(file_path, 'r') as file:
        file_contents = file.read()

    return file_contents

### Utils

In [None]:
def get_token_dict(response):
    token_dict = {
        'prompt_tokens':response['usage']['prompt_tokens'],
        'completion_tokens':response['usage']['completion_tokens'],
        'total_tokens':response['usage']['total_tokens'],
    }

    return token_dict

### Define role based prompts

In [None]:
def define_prompts(schema):
    system_prompt = f"""You are a SQL code generator.
    You will be provided with a user question delimited by {DELIMITER_START}{DELIMITER_END}.
    Perform the following actions:
    1. Understand the user question. 
    2. Understand the schema delimited by {DELIMITER_START}{DELIMITER_END}.
    3. Use the schema to generate SQL query answering the question.
    5. If the information required by the user question cannot be derived from the schema, \
        output a string saying, {EXCEPTION_OUTPUT}
    6. Output computationally most efficient SQL query.
    7. Omit any explanations.

    Use the following format:
        Output: {DELIMITER_START}valid output{DELIMITER_END}

    Schema: {DELIMITER_START}{schema}{DELIMITER_END}
    """
    
    return system_prompt



### Using ChatGPT API for returning answers 

In [None]:
def get_completion(user_prompt, system_prompt, model=MODEL):
    # messages = [{
    #     "role": "user", 
    #     "content": prompt
    #     }]
    
    messages=[
        {"role": "system", "content": system_prompt},
        #{"role": "assistant", "content": "Who's there?"},
        {"role": "user", "content": f"{DELIMITER_START}{user_prompt}{DELIMITER_END}"},
    ]

    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0, # this is the degree of randomness of the model's output
    )

    content = response.choices[0].message["content"]

    token_dict = get_token_dict(response)
    return content, token_dict

In [None]:
schema = get_schema()

In [None]:

system_prompt = define_prompts(schema)

print("User question 1:", question_1)
start = time.time()
response_1, token_dict_1 = get_completion(question_5, system_prompt)
end = time.time()

print("Time taken for prompt_1:", end-start)
print("\nQuestion 1 output:")
print(response_1)
print("\nToken dict for question 1:")
print(token_dict_1)

### Using LangChain for returning answers

In [4]:
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

In [5]:
import os
import yaml
import json

from langchain.llms.openai import OpenAI
from langchain.chat_models import ChatOpenAI

from langchain.document_loaders import CSVLoader
from langchain.document_loaders.text import TextLoader

In [6]:
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [7]:
from langchain.chat_models import AzureChatOpenAI
from langchain.llms import AzureOpenAI
from langchain.vectorstores import FAISS

In [None]:
fallback_message = "Sorry, I'm not intelligent enough to answer your question now. Please try again or with a different question!"

#### Define Azure keys

In [8]:
api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
api_key = os.getenv("AZURE_OPENAI_KEY")
api_type = 'azure'
model_deployment_name=os.getenv("AZURE_OPENAI_DNAME")
embeddings_deployment_name= os.getenv("AZURE_OPENAI_EMBED_NAME") # os.getenv("AZURE_OPENAI_EMBED_NAME")
embedding_api_version='2022-12-01'
model_api_version='2023-05-15'

max_tokens = 1000
chat_temperature = 0
 #This will correspond to the custom name you chose for your deployment when you deployed a model. 

#### Read schema in JSON format
Why only JSON?

In [9]:
file0 = '../askskan/data/sample/schema0.txt'
file1 = '../askskan/data/original/schema/schema.csv'
file3 = '../askskan/data/sample/schema0.csv'

data_file1 = '../askskan/data/sample/data0.csv'
definition_text_file = '../askskan/data/original/definitions/definitions.txt'
fake_data_file = '../askskan/data/original/sample_data.csv'

#### Generate fake data based on original schema

In [None]:
from faker import Faker
import pandas as pd

fake = Faker()

# Generate 10 unique rows of data
data = []
unique_values = set()

while len(data) < 20:
    row = {
        "event_id": fake.uuid4(),
        "sequence_id": fake.random_int(min=10000, max=99999),
        "event_time": fake.iso8601(),
        "persona_name": fake.name(),
        "app_name": fake.word(),
        "agent_type": fake.random_int(min=0, max=200),
        "clipboard": fake.random_int(min=0, max=10),
        "participant_name": fake.name(),
        "title": fake.sentence(),
        "event_date": fake.date(),
        "navigation_key_count": fake.random_int(min=0, max=10),
        "number_key_count": fake.random_int(min=0, max=10),
        "mouse_count": fake.random_int(min=0, max=10),
        "mouse_wheel": fake.random_int(min=0, max=10),
        "alpha_key_count": fake.random_int(min=0, max=10),
        "active_time": fake.random_int(min=0, max=100),
        "idle_time": random.uniform(0, 2000)/100.0, # fake.random_number(max=2000) / 100.0,
        "wait_time": random.uniform(0, 2000)/100.0,
        "processing_time": random.uniform(0, 1), # fake.random_number(max=1),
        "TaT_event": random.uniform(0, 1),
        "session_switch": fake.random_int(min=0, max=1),
        "app_switch": fake.random_int(min=0, max=1),
        "case_switch": fake.random_int(min=0, max=1),
        "activity_id": fake.uuid4(),
        "activity_abstraction_level_id": fake.uuid4(),
        "activity_abstraction_level_name": fake.word(),
        "parent_activity_id": fake.uuid4(),
        "activity_discovered_name": fake.sentence(),
        "activity_alias_name": fake.sentence(),
        "activity_instance_id": fake.uuid4(),
        "activity_instance_abstraction_level_alias_name": fake.word(),
        "activity_instance_original_end_time": fake.iso8601(),
        "activity_instance_end_time": fake.iso8601(),
        "activity_instance_event_count": fake.random_int(min=1, max=10),
        "activity_instance_start_time": fake.iso8601(),
        "case_id_name": fake.word(),
        "case_id_value": fake.word(),
        "url": fake.url(),
        "is_pruned": fake.random_int(min=0, max=1),
        "source": fake.random_element(["UA", "AA", "EA"]),
        "event_control_type": fake.random_int(min=10000, max=99999),
        "event_input_type": fake.random_int(min=100, max=999),
    }
    
    # Check if the generated row is unique
    row_tuple = tuple(row.values())
    if row_tuple not in unique_values:
        data.append(row)
        unique_values.add(row_tuple)

# Create a DataFrame from the data
df = pd.DataFrame(data)

# Print the dataset
# print(df.head())

# Convert DataFrame to CSV
df.to_csv('../data/original/sample_data.csv', index=False)



#### Convert csv to text file

In [None]:
import csv

csv_file = definition_file
text_file = '../data/original/definitions.txt'

with open(csv_file, 'r') as file:
    reader = csv.DictReader(file)
    terms_definitions = []
    for row in reader:
        term = row['Term'].strip()
        definition = row['Definition'].strip()
        terms_definitions.append(f'{term}:{definition}')

with open(text_file, 'w') as file:
    file.write('\n'.join(terms_definitions))


#### Convert text to csv file

In [None]:
import csv

# Specify the input text file path
input_file = file0

# Specify the output CSV file path
output_file = '../data/sample/schema0.csv'

# Define the field names for the CSV file
field_names = [
    'case_id',
    'case_status',
    'case_type',
    'task_name',
    'participant_id',
    'process_name',
    'process_variant',
    'applications_used',
    'processing_time',
    'wait_time',
    'turnaround_time',
    'start_time',
    'no_of_keystrokes',
    'no_of_shortcut_keys'
]

# Open the input file for reading
with open(input_file, 'r') as file:
    # Read the lines from the input file
    lines = file.readlines()

# Create a list to store the data rows for the CSV
data_rows = []

# Parse each line and extract the values
for line in lines:
    # Remove leading/trailing whitespaces and split the line by ':'
    line = line.strip()
    parts = line.split(':')

    # Extract the field name and value
    field_name = parts[0].strip()
    field_value = ':'.join(parts[1:]).strip()

    # Append the extracted values to the data row
    data_row = field_value
    data_rows.append(data_row)

# Write the data rows to the output CSV file
with open(output_file, 'w', newline='') as file:
    writer = csv.writer(file)
    
    # Write the header row
    writer.writerow(field_names)
    
    # Write the data rows
    writer.writerow(data_rows)


In [None]:
# Read the Excel file and skip the first row
df = pd.read_excel(file2, skiprows=1)

# Drop the desired column
column_to_drop = ['Source', 'Comments']
df = df.drop(column_to_drop, axis=1)

# Convert DataFrame to CSV
df.to_csv('../data/original/schema1.csv', index=False)


#### Load documents

In [10]:
loader_text = TextLoader(file_path=file0)
docs_text = loader_text.load()


loader_csv = CSVLoader(file_path=file1)
docs_csv = loader_csv.load()

#### Split the documents into chunks

In [11]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=7, chunk_overlap=6)
documents = text_splitter.split_documents(docs_csv)

In [None]:
print(documents)

[Document(lc_kwargs={'page_content': 'Column', 'metadata': {'source': '../askskan/data/original/schema/schema.csv', 'row': 0}}, page_content='Column', metadata={'source': '../askskan/data/original/schema/schema.csv', 'row': 0}), Document(lc_kwargs={'page_content': 'Name:', 'metadata': {'source': '../askskan/data/original/schema/schema.csv', 'row': 0}}, page_content='Name:', metadata={'source': '../askskan/data/original/schema/schema.csv', 'row': 0}), Document(lc_kwargs={'page_content': 'event_', 'metadata': {'source': '../askskan/data/original/schema/schema.csv', 'row': 0}}, page_content='event_', metadata={'source': '../askskan/data/original/schema/schema.csv', 'row': 0}), Document(lc_kwargs={'page_content': 'event_i', 'metadata': {'source': '../askskan/data/original/schema/schema.csv', 'row': 0}}, page_content='event_i', metadata={'source': '../askskan/data/original/schema/schema.csv', 'row': 0}), Document(lc_kwargs={'page_content': 'vent_id', 'metadata': {'source': '../askskan/data/

In [None]:
import pandas as pd
from pprint import pprint
  
df = pd.read_csv(file3)
  

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   case_id              1 non-null      object
 1   case_status          1 non-null      object
 2   case_type            1 non-null      object
 3   task_name            1 non-null      object
 4   participant_id       1 non-null      object
 5   process_name         1 non-null      object
 6   process_variant      1 non-null      object
 7   applications_used    1 non-null      object
 8   processing_time      1 non-null      object
 9   wait_time            1 non-null      object
 10  turnaround_time      1 non-null      object
 11  start_time           1 non-null      object
 12  no_of_keystrokes     1 non-null      object
 13  no_of_shortcut_keys  1 non-null      object
dtypes: object(14)
memory usage: 240.0+ bytes


In [12]:
# embeddings = OpenAIEmbeddings(model="text-embedding-ada-002",
#                               chunk_size=1)
# embeddings = OpenAIEmbeddings()
print(type(embeddings_deployment_name))
embeddings = OpenAIEmbeddings(
                deployment=embeddings_deployment_name,
                openai_api_key=api_key,
                openai_api_base=api_base,
                openai_api_type=api_type,
                openai_api_version=embedding_api_version,
                chunk_size=16,
            )
#print(embeddings)

#vectorstore = DocArrayInMemorySearch.from_documents(documents, embeddings)
vectorstore = FAISS.from_documents(documents, embeddings)


<class 'str'>


Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Embeddings_Create Operation under Azure OpenAI API version 2022-12-01 have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 6 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Embeddings_Create Operation under Azure OpenAI API version 2022-12-01 have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 2 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Embeddings_Cr

In [None]:
retriever = vectorstore.as_retriever()

vectorstore=<langchain.vectorstores.faiss.FAISS object at 0x1106773a0> search_type='similarity' search_kwargs={}


#### Create template to provide as an input to the chain

In [None]:
schema_template = """
You are a super SQL code generator.
Perform the following actions:

1. Understand the input by human delimited by <>
2. Understand the schema delimited by <>.
3. Based on the question and the schema, first seek to clarify any ambiguities.
    - Specifically first summarise a list of super short bullets of areas that need clarification.
    - Then pick one clarifying question, and wait for an answer from the user before moving to the next question. \
        Do this for all the questions one by one. 
    - Generate the SQL query only after all the clarifying questions have been answered by the user.
    - Do not assume the answer to any of these clarifying questions.
4. Do not assume any schema attributes unless stated explicitly.
5. Use the schema to generate SQL query answering the question.
6. Output computationally most efficient SQL query.
7. Omit any explanations.

Use the following format:
-If there is a clarifying question:
    Output: <clarifying questions generated> 
- Once all the clarifying questions are answered:
    Output query: <valid output>

Schema: <{schema}>

Current conversation:
{history}   
Human: <{input}>
Skan Bot: 
"""

In [None]:
new_schema_template = """
You are a super SQL code generator.
Perform the following actions:

1. Understand the input by human delimited by <>
2. Understand the schema delimited by <>.
3. Based on the question and the schema, first seek to clarify any ambiguities.
    - Specifically first summarise a list of super short bullets of areas that need clarification.
    - Then pick one clarification point, and wait for an answer from the human before moving to the next point.
4. Only human can answer these clarification points.
5. Do not assume any schema attributes unless stated explicitly.
5. Only after all the clarification points are answered:
    - Use the schema to generate a final Spark SQL query answering the original question.
    - Output computationally most efficient Spark SQL query.
7. Omit any explanations.

Schema: <{context}>

If there are clarification points and no Spark SQL query, use this format:
    - Doubt: <clarification point>

If all the clarification points are answered by the human and there's a Spark SQL query, use this format:
    - Query: <valid output in following JSON format with the Spark SQL query in the Query field, 
    All the previous clarification points in the Doubts field, \
    The original question in the Question field.>

Current conversation:
{chat_history}   
Human: <{question}>
Skan Bot: 

8. Output a string saying, {fallback_message} if:
    - more than 7 clarification points are needed.
    - a new schema or schema definitions are needed.
"""

Give the output in JSON format delimited by <> as per the following rules:    
    1. If there are clarification points and no Spark SQL query:
        - Doubt: <clarification point>

<The JSON should have the following fields:
                1. "Query": The generated Spark SQL query
                2. "Code": Python code generated to run the Spark SQL query
                3. "Skan Bot": The final answer printed by the python code in a friendly tone. Place the answer between ##


 {{
    "Query": "SELECT * FROM table",
    "Code": "df = "import pandas as pd\nfrom pyspark.sql import SparkSession\n\n\
        # Create Spark session\nspark = SparkSession.builder.appName('ActiveTimeWindow').getOrCreate()\
        \n\nspark.read.csv('data.csv', header=True)\ndf.createOrReplaceTempView('table')\n\
        spark.sql('SELECT * FROM table').show()",
    "Skan Bot": "The table has the columns #code output#"
}}

SyntaxError: invalid syntax (991682111.py, line 37)

In [None]:
template_with_code = """
You are a super smart code generator.
Perform the following actions:

1. Understand the input by human, schema and schema definitions each delimited by <>
2. Based on the question, the schema and the schema definitions, first seek to clarify any ambiguities.
    - Assume human does not know anything about the schema, so never ask details about the schema or schema \
        definitions or technical details in the clarification points.
    - First try to find the answer to the ambiguities from the schema and schema definitions yourself.
    - If the ambiguities still persist then: 
        1. Without mentioning about the schema or schema definitions think of a list of super short bullets of \
            areas that need clarification in simple terms. 
        2. Then pick one clarification point, and wait for an answer from the human before moving to the next point.
3. Never ask the human to explain the schema or schema definitions.
4. Only human can answer these clarification points.
5. Do not assume any schema attributes unless stated explicitly.
6. Only after all the clarification points are answered:
    - Use only columns in schema to generate a final Spark SQL query answering the original question. Refer to the \
        schema definitions only for more clarity about terms in the schema.
    - Output computationally most efficient Spark SQL query.
7. You can ask maximum 7 clarification points.
9. I have a pandas csv table in data table delimited by <> that contains the data to be queried.
    - Output a valid python code to execute the generated Spark SQL query on the data file ending with a print statement \
        showing the final answer in a friendly tone.
10. Omit any explanations.

Schema: <{context}>
Schema definitions: <{schema_definitions}>
Data table: <{data_table}>


If there are clarification points and no Spark SQL query, ask the clarification question in this format:
    Doubt: <clarification point>

After all the clarification points are answered by the human and there's a Spark SQL query generated, use this format:
    Output: <valid output in following JSON format with the Spark SQL query in the Query field, 
    Python code generated to run the Spark SQL query in the Code field, \
    The final answer printed by the python code in a friendly tone in the Skan Bot field. The answer should be delimited by ##.>

Current conversation:
{chat_history}   
Human: <{question}>
Skan Bot: 
"""

In [None]:
new_template_with_code = """
You are a super smart code generator.
Perform the following actions:

1. Understand the question by human, schema, schema definitions and context each delimited by <>.
2. Based on the question, the schema, the schema definitions and the context, first seek to clarify any ambiguities.
    - Assume human does not know anything about the schema, so never ask details about the data table, schema or schema \
        definitions or technical details in the clarification points.
    - First try to find the answer to the ambiguities from the schema and schema definitions yourself.
    - If the ambiguities still persist then: 
        1. Without mentioning about the schema or schema definitions only think of a list of super short bullets of \
            areas that need clarification in simple terms. 
        3. Then pick one clarification point and ask in the following format: 
                Doubt: <clarification point>, 
            and wait for an answer from the human before moving to the next clarification point.
4. Only human can answer these clarification points.
5. Do not assume any schema attributes unless stated explicitly.
6. Once all the clarification points are answered:
    - Use columns the in schema to generate a final Spark SQL query answering the original question. Refer to the \
        schema definitions only for more clarity about terms in the schema.
    - Output computationally most efficient Spark SQL query.
7. You can ask maximum 5 clarification points.
8. I have a pandas csv table in data table delimited by <> that contains the data to be queried.
    - Output a valid python code to execute the generated Spark SQL query on the data file ending with a print statement \
        showing the final answer. 
9. Omit any explanations.

Schema: <{context}>
Schema definitions: <{schema_definitions}>
Data table: <{data_table}>


Only After all the clarification points are answered by the human and there's a Spark SQL query generated, 
use only the following JSON schema format stricly, refer to the example output:
Output: {{
    "Query": The generated Spark SQL query,
    "Code": Python code generated to run the Spark SQL query,
    "Skan Bot": The final answer printed by the python code in a friendly tone. Delimit the answer obtained by python code in <>.
}}

Following is an example of the output:
Output: {{
    "Query": "SELECT app_name, COUNT(*) AS count FROM clipboard GROUP BY app_name ORDER BY count DESC LIMIT 1",
    "Code": "import pandas as pd\nfrom pyspark.sql import SparkSession\n\nspark = SparkSession.builder.appName('example').\
        getOrCreate()\ndata = spark.read.csv('../askskan/data/original/sample_data.csv', header=True, inferSchema=True)\ndata.\
        createOrReplaceTempView('clipboard')\n\nquery = 'SELECT app_name, COUNT(*) AS count FROM clipboard GROUP BY app_name \
        ORDER BY count DESC LIMIT 1'\nresult = spark.sql(query)\nresult.show()\n\
            print('The most used application is + result.collect()[0][0].)",
    "Skan Bot": "The most used application is: <application name>."
}}
    

Current conversation:
{chat_history}   
Human: <{question}>
Skan Bot: 
"""

In [None]:
prompt_without_followups = """
You are a super smart code generator.
Perform the following actions:

1. Understand the question by human, schema, schema definitions and context each delimited by <>.
2. Do not assume any schema attributes unless stated explicitly.
3. Use columns in the schema to generate a final Spark SQL query from the table in data table name delimited by <>\
    answering the original question. The generated query should for data from the start date to end date\
        each delimited by <>. Refer to the schema definitions only for more clarity about terms in the schema.
4. Output computationally most efficient Spark SQL query.
5. Refer to the orginal question again and then select the correct column from the generated Spark SQL query.
5. I have a pandas csv table in data table delimited by <> that contains the data to be queried.
    - Output a valid python code to execute the generated Spark SQL query on the data file.
6. Omit any explanations.

Schema: <{context}>
Schema definitions: <{schema_definitions}>
Data table: <{data_table}>
Data table name: <{data_table_name}>
Start date: <{start_date}>
End date: <{end_date}>


Once the Spark SQL query generated use only the following JSON schema format stricly, refer to the example output:
Output: {{
    "Query": The generated Spark SQL query from the start date to end date,
    "Column": The correct extracted column from the Spark SQL query,
    "Code": Python code generated to run the Spark SQL query. The sql query within this python code should be \
       only be within single quotes e.g. query='SELECT * FROM table' ,
    "Skan Bot": The final answer printed by the python code in a friendly tone. Delimit the answer obtained by \
        python code in ##.
}}

Following is an example of the output:
Output: {{
    "Query": "SELECT app_name, COUNT(*) AS count FROM clipboard WHERE event_date >= '2023-04-01' AND event_date <= '2023-04-30'" \
        GROUP BY app_name ORDER BY count DESC LIMIT 1',
    "Column": "app_name", 
    "Code": "import pandas as pd\nfrom pyspark.sql import SparkSession\n\nspark = SparkSession.builder.appName('schema').\
        getOrCreate()\ndata = spark.read.csv('../askskan/data/original/sample_data.csv', header=True, inferSchema=True)\ndata.\
        createOrReplaceTempView('clipboard')\n\nquery = 'SELECT app_name, COUNT(*) AS count FROM clipboard GROUP BY app_name \
        ORDER BY count DESC LIMIT 1'\nresult = spark.sql(query)\n\n spark.stop()\n",
    "Skan Bot": "The most used application is #result#."
}}

nsure the Output can be parsed by Python json.loads
    

Current conversation:
{chat_history}   
Human: <{question}>
Skan Bot: 
"""

#### Prompts with CoT format

In [None]:
instruction_prompt_template_no_code = """
You are a super smart code generator.
Perform the following actions:

1. Carefully understand the question by human, schema, schema definitions and context each delimited by <>.
2. Do not assume or create any schema attributes not given in the schema unless stated explicitly.
3. Do not assume any facts from the question unless stated explicitly.
4. Do not assume any values of the schema attributes that are not present in the schema or in the question. 
5. Strictly follow the following instructions:
    - The schema has names of the schema columns in Column Name, the descriptions of the columns in Description \
        and an example value of the schema column in Examples.
    - Use only the Column Name in the schema and the question to generate the final Spark SQL query from the table \
        in data table name delimited by <> answering the original question. 
    - Schema definitions and Description in schema can be referred for providing more clarity about Columns in the schema. \
        Do not use values from schema definitions as column names or column values to generate the query.
    - The query result should be LIMITED by 10 rows.
    - The generated query should be from the start date to end date each delimited by <>. 
    
6. Output the correct Spark SQL query.
7. Select all the appropriate column(s) from the generated Spark SQL query by critically referring to the \
    requirements asked in the original question.
8. Write the answer in a friendly tone in response to the question.
9. Stricly use the following JSON schema format for the output, refer to the example output below. Never include any\
extra text.
10. Omit any explanations.
Let's Think Step by Step.

Schema: <{context}>
Schema definitions: <{schema_definitions}>
Data table: <{data_table}>
Data table name: <{data_table_name}>
Start date: <{start_date}>
End date: <{end_date}>


Once the Spark SQL query generated use only the following JSON schema format stricly, refer to the example output:
Output: {{
    "Query": The generated Spark SQL query from the start date to end date. Limit the results to maximum 10 rows for queries \
        with more than 1 row,
    "Column": The correct extracted column(s) from the Spark SQL query in a list,
    "Skan Bot": The final answer in a friendly tone. The answer from the SQL query should be delimited by ##.
}}

Following are examples of the context in the user question along with the user question(s) and corresponding desired output:
Question Context: Whenever the user question does not specify about the number of results needed, then return maximum 10 rows.
1.  Question: What are the events that happened?
    Output: {{
        "Query": "SELECT event_id WHERE event_date >= '2023-04-01' AND event_date <= '2023-04-30' DESC LIMIT 10",
        "Column": ["event_id"], 
        "Skan Bot": "The events are #event_id#."
    }}

2. Question: What are applications used?
    Output: {{
        "Query": "SELECT app_name, COUNT(*) AS count FROM unum_askskan.events_delta_tb WHERE event_date >= '2023-04-01' \
            AND event_date <= '2023-04-30' GROUP BY app_name ORDER BY count DESC LIMIT 10",
        "Column": ["app_name"], 
        "Skan Bot": "The applications used are #app_name#."
    }}

Question Context: Whenever most used or top is asked, it means the use LIMIT 1 for giving 1 row in the result.
1.  Question: What is the most used application?
    Output: {{
        "Query": "SELECT app_name, COUNT(*) AS count FROM unum_askskan.events_delta_t WHERE event_date >= '2023-04-01' AND event_date <= '2023-04-30'" \
            GROUP BY app_name ORDER BY count DESC LIMIT 1",
        "Column": ["app_name"], 
        "Skan Bot": "The most used application is #app_name#."
    }}

Question Context: Whenever total time spent or average time spent is asked in  questions regarding applications, then use the active_time column in the schema. 
1.  Question: What is the total time spent on process applications for this Do Not Use persona?
    Output: {{
        "Query": "SELECT SUM(active_time) AS total_time FROM unum_askskan.events_delta_tb WHERE agent_type != 0 AND persona_name = 'Do Not Use' AND \
            event_date >= '2023-04-01' AND event_date <= '2023-04-30' LIMIT 1",
        "Column": ["total_time"],
        "Skan Bot": "The total time spent on process applications for this Do Not Use persona is #total_time#."
    }}

2.  Question: What is the total time spent on non process applications?
    Output: {{
        "Query": "SELECT SUM(active_time) AS total_time FROM unum_askskan.events_delta_tb WHERE agent_type == 0 AND \
            event_date >= '2023-04-01' AND event_date <= '2023-04-30' LIMIT 1",
        "Column": ["total_time"],
        "Skan Bot": "The total time spent on non process applications is #total_time#."
    }}

3.  Question: What is the average time spent on process applications for this Do Not Use persona?
    Output: {{
        "Query": "SELECT AVG(active_time) AS avg_time FROM unum_askskan.events_delta_tb WHERE agent_type != 0 AND persona_name = 'Do Not Use' AND \
                event_date >= '2023-04-01' AND event_date <= '2023-04-30' LIMIT 1",
        "Column": ["avg_time"],
        "Skan Bot": "The average time spent on process applications for this Do Not Use persona is #avg_time#."
    }}

Question Context: Whenever total average time per day is asked in  questions regarding applications, then first sum the active_time and divide by COUNT(DISTINCT event_date).
1.  Question: What is the average time spent in non process applications per day?
    Output: {{
        "Query": "SELECT SUM(active_time)/COUNT(DISTINCT event_date) AS avg_time_per_day FROM unum_askskan.events_delta_tb WHERE agent_type==0 AND \
            event_date >= '2023-04-01' AND event_date <= '2023-04-30' LIMIT 1",
        "Column": ["avg_time_per_day"],
        "Skan Bot": "The average time spent on non process applications in non process applications per day is #avg_time_per_day#.
    }}

2.  Question: What is the average time spent in process applications per day?
    Output: {{
        "Query": "SELECT SUM(active_time)/COUNT(DISTINCT event_date) AS avg_time_per_day FROM unum_askskan.events_delta_tb WHERE agent_type!=0 AND \
            event_date >= '2023-04-01' AND event_date <= '2023-04-30' LIMIT 1",
        "Column": ["avg_time_per_day"],
        "Skan Bot": "The average time spent on non process applications in process applications per day is #avg_time_per_day#.
    }}

Question Context: Whenever asked about number of events then use the event_id column in the schema. Whenever asked about \
per day then sum(column)/COUNT(DISTINCT event_date).
1.  Question: what is the average number of events in a day?
    Output: {{
        "Query": "SELECT COUNT(event_id)/COUNT(DISTINCT event_date) AS avg_events_per_day FROM unum_askskan.events_delta_tb WHERE event_date >= '2023-04-01' \
            AND event_date <= '2023-04-30' LIMIT 1",
        "Column": ["avg_events_per_day"],
        "Skan Bot": "The average number of events in a day is #avg_events_per_day#.
    }}

    
Question Context: Whenever asked about average of a column per case then sum(column)/count(unique case_id_value). DO NOT \
    DO AVG(column) as it will give wrong results.
1.  Question: which participant  has the least  average processing time per case?
    Output: {{
        "Query": "SELECT participant_name as participant,SUM(processing_time)/count(DISTINCT case_id_value)  AS avg_processing_per_case FROM unum_askskan.events_delta_tb WHERE event_date >= '2023-04-01' \
            AND event_date <= '2023-04-30' GROUP BY participant ORDER BY avg_processing_per_case LIMIT ASC 1",
        "Column": ["avg_processing_per_case"],
        "Skan Bot": "The average processing time per case is #avg_processing_per_case#.
    }}

Question Context: Whenever asked about average of a column is then do sum(column)/COUNT(DISTINCT event_date). DO NOT \
    DO AVG(column) as it will give wrong results.
1.  Question: What is the average case switch?
    Output: {{
        "Query": "SELECT SUM(CAST(case_switch AS INT))/count(DISTINCT event_date) AS avg_case_switch FROM unum_askskan.events_delta_tb WHERE event_date >= '2023-04-01' \
            AND event_date <= '2023-04-30' LIMIT 1",
        "Column": ["avg_case_switch"],
        "Skan Bot": "the average case switch is #avg_processing_per_case#.
    }}


Question Context: Whenever asked about last event or first event for the day use \
    SELECT time as tm,COUNT(date) as cnt FROM(SELECT event_date as date,MIN(SUBSTRING(event_time,12,8)) as time. DO NOT USE \
activity_instance_original_end_time or activity_instance_original_start_time as it will give wrong results.
1.  Question: When is first event for the day?
    Output: {{
        "Query": "SELECT time as first_event_time,COUNT(date) as count FROM(SELECT event_date as date,MIN(SUBSTRING(event_time,12,8)) as time FROM unum_askskan.events_delta_tb \
            WHERE event_date >= '2023-04-01' AND event_date <='2023-04-30' GROUP BY date ORDER BY time) GROUP BY first_event_time ORDER BY count DESC LIMIT 1",
        "Column": ["first_event_time"],
        "Skan Bot": "The first event occurs at #first_event_time#.
    }}

2.  Question: When is last event for the day? 
    Output: {{
        "Query": "SELECT time as last_event_time,COUNT(date) as count FROM(SELECT event_date as date,MAX(SUBSTRING(event_time,12,8)) as time FROM unum_askskan.events_delta_tb \
            WHERE event_date >= '2023-04-01' AND event_date <='2023-04-30' GROUP BY date ORDER BY time) GROUP BY last_event_time ORDER BY count DESC LIMIT 1",
        "Column": ["last_event_time"],
        "Skan Bot": "The last event occurs at #last_event_time#.
    }}

Question Context: Whenever asked about  average Count of users on leave  then use the event_date column in the schema.
1.  Question:what is the average count of users on leave for this LES persona?
    Output: {{
        "Query": "SELECT SUM(30-active_days)/30 AS avg_user_on_leave FROM (SELECT participant_name as participant,COUNT(DISTINCT event_date) AS active_days \
            FROM unum_askskan.events_delta_tb WHERE event_date >= '2023-04-01' AND event_date <= '2023-04-30' GROUP BY participant ORDER BY active_days) LIMIT 1",
        "Column": ["event_date"],
        "Skan Bot": "Average User on leave is  #event_date#.
    }}

Question Context: Whenever asked about users on leave then use event_date column.
1.  Question:Which pariticipant was on leave most days in this month ?
    Output: {{
        "Query": "SELECT participant_name, COUNT(DISTINCT event_date) AS leave_days FROM unum_askskan.events_delta_tb WHERE event_date >= '2023-04-01' \
            AND event_date <= '2023-04-30' GROUP BY participant_name ORDER BY leave_days ASC LIMIT 1",
        "Column": ["event_date"],
        "Skan Bot": "Average User on leave is  #event_date#.
    }}


Question Context: persona_name = 'Do Not Use' should NOT BE USED unless explicitly mentioned in the question.
1.  Question: which is the most used application by the user which is not used for business operation?
    Output: {{
        "Query": "SELECT app_name, COUNT(*) AS count FROM unum_askskan.events_delta_tb WHERE agent_type = 0 AND \
            app_name IS NOT NULL AND event_date >= '2023-04-01' AND event_date <= '2023-04-30' GROUP BY app_name ORDER BY count DESC LIMIT 1",
        "Column": ["app_name"],
        "Skan Bot": "The most used  application by the user which is not used for business operation is #app_name#.
    }}

Question Context: Whenever asked about activity then definitely use activity_id column in the schema.
1.  Question: Which is the most common activity?
    Output: {{
        "Query": "SELECT activity_id, COUNT(*) AS count FROM unum_askskan.events_delta_tb WHERE event_date >= '2023-04-01' AND \
            event_date <= '2023-04-30' GROUP BY activity_id ORDER BY count DESC LIMIT 1",
        "Column": ["activity_id"],
        "Skan Bot": "The most common activity is #activity_id#.
    }}

Question Context: Whenever asked clipboard then always use SUM(clipboard).
1.  Question: which participant has the highest no of clipboard count?
    Output: {{
        "Query": "SELECT participant_name, SUM(clipboard) as count from unum.data Where event_date>='2023-04-01' and event_date<='2023-04-30' \
            GROUP BY participant_name ORDER BY count DESC LIMIT 1",
        "Column": ["participant_name"],
        "Skan Bot": "The participant with the highest clipboard count is #participant_name#.
    }}

    
Please use "double quotes" for json keys and ensure the Output can be parsed by Python json.loads
    

Current conversation:
{chat_history}   
Human: <{question}>
Skan Bot: 
"""

#### Questions to be asked by the user

In [None]:
question_1 = "Total time spent on process application?"
question_2 = "Total time spent of non process application?"
question_3 = "Is there an outstanding performer in CES persona?"
question_4 = "Which is the most used application?"
question_5 = "Which is the most active time window during the day?"
question_6 = "What is the average case effort per persona?"
question_7 = "What is the average utilization per persona?"
question_8 = "Which week had highest productivity in CES?"
question_9 = "What is the distribution of time spent in processing application per day?"
question_10 = "What is the average number of participants per case?"
question_11 = "What is the frequency of cases per participant per day?"
question_12 = "Who will be most productive next week?"
question_13 = "Any patterns observed that give an insight on inefficiencies?"
question_14 = "Would taking breaks improve the efficiency in performance?"

question_15 = "What is the standard deviation of time spent on process application?"
question_16 = "What is the standard deviation of time spent on non process application?"

# question_1 = f"""Who has the longest average processing time?"""
# question_2 = f"""Who has the shortest average processing time?"""
# question_3 = f"""Who is the most productive participant based on the number of tasks completed?"""
# question_4 = f"""Who uses the least number of keystrokes on average?"""
# question_5 = f"""Name of the participant that uses the most number of long cut keys on average?""" # An invalid question, to check hallucination
exception_question = f"""How was the day?"""

#### Replacing schema in the template with the actual Schema string
This has to be replaced by either:
1. A custom chain that can take in 2 inputs as currently no Chain can take in multiple inputs in LangChain
2. Provide schema as an embedding -> start with this as this can select out relevant parts of the schema for the query reducing the input token size.

In [None]:
filled_template = schema_template.replace("{schema}", schema)

#### LLM and prompt template

In [None]:
llm = ChatOpenAI(temperature=0.6)
prompt = PromptTemplate(input_variables=["history", "input"], template=filled_template)

NameError: name 'filled_template' is not defined

#### Conversation Chain

In [None]:
conversation = ConversationChain(
    prompt=prompt,
    llm=llm,
    verbose=True,
    memory=ConversationBufferMemory(),
)

NameError: name 'prompt' is not defined

#### Outputs from the LLM

In [None]:
print(conversation.predict(input=question_3))

In [None]:
conversation.predict(input="Consider all tasks")

In [None]:
print(conversation.predict(input="The returned code is wrong"))

#### Experimenting with Conversational Retrieval QA

In [None]:
# Specify the path to the text file
file_path = definition_text_file

# Read the contents of the text file
with open(file_path, 'r') as file:
    schema_definitions = file.read()


#### Providing output format

In [None]:
from pydantic import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser

class FormatCodeOutput(BaseModel):
    Query: str = Field(description="The generated Spark SQL query")
    Code: str = Field(description="Python code generated to run the Spark SQL query")
    Skan_Bot: str = Field(description="The final answer printed by the python code in a friendly tone. Place the answer between ##")

In [None]:
class FormatDoubtOutput(BaseModel):
    Doubt: str = Field(description="The clarification point asked by the bot")

#### The model

In [None]:
def get_chat_history(inputs) -> str:
    print(inputs)
    res = []
    for human, ai in inputs:
        res.append(f"Human:{human}\nAI:{ai}")
    return "\n".join(res)

#### Azure LLM

In [None]:
# Initialize LangChain with Azure OpenAI
chat_llm = AzureChatOpenAI(
    deployment_name=model_deployment_name,
    openai_api_version=model_api_version,
    openai_api_base=api_base,
    openai_api_key=api_key,
    # max_tokens=max_tokens,
    temperature=0.3,
    streaming=True,
    verbose=True
)

#### Personal Key LLM

In [None]:

llm = ChatOpenAI(temperature=0.3, streaming=True)

In [None]:


# Set up a parser + inject instructions into the prompt template.
# code_parser = PydanticOutputParser(pydantic_object=FormatCodeOutput)
# doubt_parser = PydanticOutputParser(pydantic_object=FormatDoubtOutput)

# new_prompt = PromptTemplate(
#     template=new_template_with_code, 
#     input_variables=["context", "question", "chat_history"] , #"fallback_message"]
#     partial_variables={"schema_definitions": schema_definitions,
#                        "data_table": fake_data_file}
# )

new_prompt = PromptTemplate(
    template=prompt_without_followups,
    input_variables=[
        "context",
        "question",
        "chat_history",
    ],
    partial_variables={"schema_definitions": schema_definitions,
                        "data_table": fake_data_file,
                        "data_table_name": "hive_metastore.unum_askskan.events",
                        "start_date": "2023-04-01",
                        "end_date": "2023-04-30"}
)

chain_type_kwargs = {"prompt": new_prompt}


In [None]:
buffer_memory = ConversationBufferMemory(memory_key="chat_history", input_key="question")
qa = ConversationalRetrievalChain.from_llm(
    llm = chat_llm, 
    retriever = retriever, 
    memory=buffer_memory,
    combine_docs_chain_kwargs=chain_type_kwargs,
    verbose=False,
    get_chat_history=lambda h:h
    )

#### Obtaining results

In [None]:
# result = qa({"context": vectorstore.similarity_search(question_9, k=5),
#              "question": question_9})
chat_history = []
result = qa({"question": question_1})
print(result['answer'])

{
    "Query": "SELECT SUM(processing_time) AS total_time FROM hive_metastore.unum_askskan.events WHERE agent_type != 0 AND event_date >= '2023-04-01' AND event_date <= '2023-04-30'",
    "Column": "total_time",
    "Code": "import pandas as pd\nfrom pyspark.sql import SparkSession\n\nspark = SparkSession.builder.appName('schema').getOrCreate()\ndata = spark.read.csv('../askskan/data/original/sample_data.csv', header=True, inferSchema=True)\ndata.createOrReplaceTempView('hive_metastore.unum_askskan.events')\n\nquery = 'SELECT SUM(processing_time) AS total_time FROM hive_metastore.unum_askskan.events WHERE agent_type != 0 AND event_date >= '2023-04-01' AND event_date <= '2023-04-30''\nresult = spark.sql(query)\n\nspark.stop()\n",
    "Skan Bot": "The total time spent on process applications is #result#."
}


In [None]:
result = qa({"question": "Check yourself in the schema"})
print(result['answer'])

I cannot generate the Spark SQL query without the name of the column that contains the total time spent on non-process applications. Please provide the name of the column.


In [None]:
result = qa({"question": "Just give me the output"})
print(result['answer'])

I am looking for the total time spent on non-process applications.


In [None]:
result = qa({"question": "Give me the final output."})
print(result['answer'])

{
    "Query": "SELECT activity_instance_original_end_time, SUM(active_time) AS total_active_time FROM data_table GROUP BY activity_instance_original_end_time ORDER BY total_active_time DESC LIMIT 1",
    "Code": "import pandas as pd\nfrom pyspark.sql import SparkSession\n\nspark = SparkSession.builder.appName('example').getOrCreate()\ndata = spark.read.csv('../askskan/data/original/sample_data.csv', header=True, inferSchema=True)\ndata.createOrReplaceTempView('data_table')\n\nquery = \"SELECT activity_instance_original_end_time, SUM(active_time) AS total_active_time FROM data_table GROUP BY activity_instance_original_end_time ORDER BY total_active_time DESC LIMIT 1\"\nresult = spark.sql(query)\nresult.show()\n",
    "Skan Bot": "The most active time window during the day is from <start time> to <end time>."
}


In [None]:
result = qa({"question": question_6})
print(result['answer'])

Doubt: Can you please clarify what you mean by "case effort"? Are you asking for the average time spent on each case for each persona?


In [None]:
result = qa({"question": "Yes"})
print(result['answer'])

The average time spent on each case for each persona is as follows:

Output: {
    "Query": "SELECT persona_name, AVG(processing_time) AS average_case_effort FROM data_table WHERE agent_type != 0 GROUP BY persona_name",
    "Code": "import pandas as pd\nfrom pyspark.sql import SparkSession\n\nspark = SparkSession.builder.getOrCreate()\ndata = pd.read_csv('../askskan/data/original/sample_data.csv')\ndata_table = spark.createDataFrame(data)\ndata_table.createOrReplaceTempView('data_table')\n\nquery = \"SELECT persona_name, AVG(processing_time) AS average_case_effort FROM data_table WHERE agent_type != 0 GROUP BY persona_name\"\nresult = spark.sql(query)\nresult.show()",
    "Skan Bot": "The average time spent on each case for each persona is shown in the table below:\n\n+------------+------------------+\n|persona_name|average_case_effort|\n+------------+------------------+\n|         CES|             10.23|\n|         ABC|             15.67|\n|         XYZ|             12.45|\n+---------

In [None]:
result = qa({"question": "Referring to a specific time range within a day"})
print(result['answer'])

Doubt: What is the specific time range for the most active time window during the day across all participants, referring to a specific time range within a day?


In [None]:
code = parser.parse(result['answer']).Code
print(code)

OutputParserException: Failed to parse FormatOutput from completion {"Query": "SELECT SUM(processing_time) FROM data_table WHERE agent_type = 0", "Code": "import pandas as pd\n\n# Read the CSV file\ndata = pd.read_csv('../askskan/data/original/sample_data.csv')\n\n# Calculate the total time spent on non-process applications\ntotal_time = data[data['agent_type'] == 0]['processing_time'].sum()\n\n# Print the result\nprint('The total time spent on non-process applications is:', total_time)", "Skan Bot": "The total time spent on non-process applications is: #total_time#"}. Got: 1 validation error for FormatOutput
Skan_Bot
  field required (type=value_error.missing)

In [None]:
result = qa({"question": "Where is the Spark SQL query?"})
print(result['answer'])

Error in on_chain_start callback: 'name'


Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:
Human: Total time spent on process application?
AI: Doubt: Are you asking for the total time spent on all process applications combined or for a specific process application?
Human: total time spent on all process applications combined
AI: The total time spent on all process applications combined can be calculated by summing up the processing_time for all process applications. Let me generate the Spark SQL query to calculate it.
Follow Up Input: Where is the Spark SQL query?
Standalone question:[0m


Error in on_chain_start callback: 'name'
Error in on_chain_start callback: 'name'



[1m> Finished chain.[0m
Prompt after formatting:
[32;1m[1;3m
You are a super smart code generator.
Perform the following actions:

1. Understand the input by human, schema and schema definitions each delimited by <>
2. Based on the question, the schema and the schema definitions, first seek to clarify any ambiguities.
    - Assume human does not know anything about the schema, so never ask details about the schema or schema         definitions or technical details in the clarification points.
    - First try to find the answer to the ambiguities from the schema and schema definitions yourself.
    - If the ambiguities still persist then: 
        1. Without mentioning about the schema or schema definitions only think of a list of super short bullets of             areas that need clarification in simple terms. 
        2. Then pick one clarification point, and wait for an answer from the human before moving to the next point.
3. Never ask the human to explain the schema or schema 

In [None]:
result = qa({"question": "Within a specific day."})
print(result['answer'])

Error in on_chain_start callback: 'name'


Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:
Human: Which is the most active time window during the day?
AI: Doubt: Can you please clarify what you mean by "active time window"? Are you referring to the time of day with the highest active time?
Human: Yes.
AI: Doubt: Can you please clarify what you mean by "highest level of activity"? Are you referring to the time period with the highest total active time?
Human: Yes.
AI: Doubt: Can you please clarify the time range for the active time window? Are you looking for the most active time window within a specific day or across multiple days?
Follow Up Input: Within a specific day.
Standalone question:[0m


Error in on_chain_start callback: 'name'
Error in on_chain_start callback: 'name'



[1m> Finished chain.[0m
Prompt after formatting:
[32;1m[1;3m
You are a super smart code generator.
Perform the following actions:

1. Understand the input by human, schema and schema definitions each delimited by <>
2. Based on the question, the schema and the schema definitions, first seek to clarify any ambiguities.
    - Assume human does not know anything about the schema, so never ask details about the schema or schema         definitions or technical details in the clarification points.
    - First try to find the answer to the ambiguities from the schema and schema definitions yourself.
    - If the ambiguities still persist then: 
        1. Without mentioning about the schema or schema definitions only think of a list of super short bullets of             areas that need clarification in simple terms. 
        2. Then pick one clarification point, and wait for an answer from the human before moving to the next point.
3. Never ask the human to explain the schema or schema 

In [None]:
code = parser.parse(result['answer']).Code
print(code)

import pandas as pd
from pyspark.sql import SparkSession

# Create Spark session
spark = SparkSession.builder.appName('ActiveTimeWindow').getOrCreate()

# Read CSV file
df = spark.read.format('csv').option('header', 'true').load('../askskan/data/original/sample_data.csv')

# Register DataFrame as a temporary table
df.createOrReplaceTempView('data_table')

# Generate Spark SQL query
query = "SELECT MAX(active_time) AS max_active_time, CONCAT(HOUR(event_time), ':', MINUTE(event_time)) AS active_time_window FROM data_table GROUP BY CONCAT(HOUR(event_time), ':', MINUTE(event_time)) ORDER BY max_active_time DESC LIMIT 1"

# Execute query
result = spark.sql(query)

# Convert result to Pandas DataFrame
result_df = result.toPandas()

# Print the final answer
print('The most active time window during the day is', result_df['active_time_window'][0], 'with a total active time of', result_df['max_active_time'][0], 'minutes.')


In [None]:
from langchain.output_parsers import RetryWithErrorOutputParser
retry_parser = RetryWithErrorOutputParser.from_llm(
    parser=parser, llm=OpenAI(temperature=0)
)

In [None]:
retry_parser.parse_with_prompt(code, new_prompt)

AttributeError: 'PromptTemplate' object has no attribute 'to_string'

In [None]:
result = qa({"question": "I want the output to be in another format."})
print(result['answer'])

Error in on_chain_start callback: 'name'


Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:
Human: Which is the most used application?
AI: Doubt: Can you please clarify what you mean by "most used application"? Are you asking for the application with the highest active time or the application that was switched to the most frequently?
Human: Appliction with highest active time.
AI: Doubt: Are you asking for the application with the highest active time in general or for a specific time period?
Human: In general
AI: Output: The output should be formatted as a JSON instance that conforms to the JSON schema below.

```
{
  "Query": "SELECT application, MAX(active_time) AS highest_active_time FROM data_table GROUP BY application ORDER BY highest_active_time DESC LIMIT 1",
  "Code": "import pandas as pd\nfrom pyspark.sql import SparkSession\n\nspark = SparkSession.builder.getOrCreate()\n

Error in on_chain_start callback: 'name'
Error in on_chain_start callback: 'name'



[1m> Finished chain.[0m
Prompt after formatting:
[32;1m[1;3m
You are a super smart code generator.
Perform the following actions:

1. Understand the input by human, schema and schema definitions each delimited by <>
2. Based on the question, the schema and the schema definitions, first seek to clarify any ambiguities.
    - Assume human does not know anything about the schema, so never ask details about the schema or schema         definitions or technical details in the clarification points.
    - First try to find the answer to the ambiguities from the schema and schema definitions yourself.
    - If the ambiguities still persist then: 
        1. Without mentioning about the schema or schema definitions only think of a list of super short bullets of             areas that need clarification in simple terms. 
        2. Then pick one clarification point, and wait for an answer from the human before moving to the next point.
3. Never ask the human to explain the schema or schema 

In [None]:
result = qa({"question": "I want the output to be in hours and minutes (HH:MM)."})
print(result['answer'])

Error in on_chain_start callback: 'name'


Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:
Human: Which is the most used application?
AI: Doubt: Can you please clarify what you mean by "most used application"? Are you asking for the application with the highest active time or the application that was switched to the most frequently?
Human: Appliction with highest active time.
AI: Doubt: Are you asking for the application with the highest active time in general or for a specific time period?
Human: In general
AI: Output: The output should be formatted as a JSON instance that conforms to the JSON schema below.

```
{
  "Query": "SELECT application, MAX(active_time) AS highest_active_time FROM data_table GROUP BY application ORDER BY highest_active_time DESC LIMIT 1",
  "Code": "import pandas as pd\nfrom pyspark.sql import SparkSession\n\nspark = SparkSession.builder.getOrCreate()\n

Error in on_chain_start callback: 'name'
Error in on_chain_start callback: 'name'



[1m> Finished chain.[0m
Prompt after formatting:
[32;1m[1;3m
You are a super smart code generator.
Perform the following actions:

1. Understand the input by human, schema and schema definitions each delimited by <>
2. Based on the question, the schema and the schema definitions, first seek to clarify any ambiguities.
    - Assume human does not know anything about the schema, so never ask details about the schema or schema         definitions or technical details in the clarification points.
    - First try to find the answer to the ambiguities from the schema and schema definitions yourself.
    - If the ambiguities still persist then: 
        1. Without mentioning about the schema or schema definitions only think of a list of super short bullets of             areas that need clarification in simple terms. 
        2. Then pick one clarification point, and wait for an answer from the human before moving to the next point.
3. Never ask the human to explain the schema or schema 

In [None]:
code = parser.parse(result['answer']).Code
print(code)

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
data = pd.read_csv('../askskan/data/original/sample_data.csv')
data_table = spark.createDataFrame(data)
data_table.createOrReplaceTempView('data_table')

query = "SELECT application, MAX(active_time) AS highest_active_time FROM data_table GROUP BY application ORDER BY highest_active_time DESC LIMIT 1"
result = spark.sql(query)
result.show()



In [None]:
# result = qa({"question": "processing_time field represents the time taken by the total time taken by a participant for all their tasks"})
# print(result['answer'])

#### Clean the output string to look like JSON

In [None]:
string = result['answer']

# Find and remove the first occurrence of "Query" in the string
stripped_string = string.replace('Output:', '', 1).strip()

print(stripped_string)


#### Extract the SQL Query

In [None]:
import json

json_string = result['answer'] # stripped_string

# Remove invalid escape sequences from the JSON string
#json_string = json_string.encode('utf-8').decode('unicode_escape')

# Convert the modified JSON string to a JSON object
json_object = json.loads(json_string, strict=False)

# Extract the "Query" field
query = json_object['Query']

# Print the extracted query
print(query)


SELECT SUM(processing_time) FROM data_table WHERE agent_type = 0


#### Extract the Python code and answer string

In [None]:
string = """
{"Query": ["SELECT SUM(processing_time) AS total_time FROM data_table WHERE agent_type != 0"],
"Code": ["import pandas as pd\nfrom pyspark.sql import SparkSession\n\nspark = \
    SparkSession.builder.appName('schema').getOrCreate()\ndata = spark.read.csv('data/sample/data0.csv', \
        header=True, inferSchema=True)\ndata.createOrReplaceTempView('data_table')\n\nquery = \
            "SELECT SUM(processing_time) AS total_time FROM data_table WHERE agent_type != 0"\nresult = \
                spark.sql(query)\nresult.show()\nprint('The total time spent on process applications is:', \
                    result.collect()[0][0])"],
"Skan Bot": ["The total time spent on process applications is: #total_time#."],
}"""

In [None]:
string1 = """
{"Code": "import pandas as pd\nfrom pyspark.sql import SparkSession\n\nspark = \
SparkSession.builder.appName('schema').getOrCreate()\ndata = spark.read.csv('data/sample/data0.csv', \
header=True, inferSchema=True)\ndata.createOrReplaceTempView('data_table')\n\nquery = \
'SELECT SUM(processing_time) AS total_time FROM data_table WHERE agent_type != 0'\nresult = \
spark.sql(query)\nresult.show()\nprint('The total time spent on process applications is:', \
result.collect()[0][0])"
}"""

In [None]:
import json

json_string = string1 # result['answer']

# Remove invalid escape sequences from the JSON string
json_string = json_string.encode('utf-8').decode('unicode_escape')

# Find and remove the first occurrence of "Query" in the JSON string
#json_string = json_string.replace('Query', '', 1).strip()

# Convert the modified JSON string to a JSON object
json_object = json.loads(json_string, strict=False)

# Extract the "Query" field
code = json_object['Code']
#answer = json_object['Skan Bot']

# Print the extracted query
print(code)
#print(answer)

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('schema').getOrCreate()
data = spark.read.csv('data/sample/data0.csv', header=True, inferSchema=True)
data.createOrReplaceTempView('data_table')

query = 'SELECT SUM(processing_time) AS total_time FROM data_table WHERE agent_type != 0'
result = spark.sql(query)
result.show()
print('The total time spent on process applications is:', result.collect()[0][0])


In [None]:
try:
    exec(code)
except Exception as e:
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import sum

    # Create a SparkSession
    spark = SparkSession.builder.getOrCreate()

    # Load the 'data' table into a DataFrame
    data = spark.table('data')

    # Apply the filter condition
    filtered_data = data.filter(data.agent_type != 0)

    # Calculate the sum of processing_time
    result = filtered_data.select(sum('processing_time').alias('total_time'))

    # Show the result
    result.show()


In [None]:
import findspark
findspark.init() 

from pyspark.sql import SparkSession

from pyspark import SparkConf, SparkContext

#spark = SparkSession.builder.appName('Query').getOrCreate()

# conf = SparkConf().setAppName("YourAppName").setMaster("local").set("spark.driver.bindAddress", "127.0.0.1")
# spark = SparkContext(conf=conf)
spark = SparkSession.builder.appName("YourAppName").master("local").config("spark.driver.bindAddress", "127.0.0.1").getOrCreate()

data = spark.read.format('csv').option('header', 'true').load('../data/sample/data0.csv')
data.createOrReplaceTempView('data0')

query = 'SELECT AVG(processing_time) AS avg_processing_time FROM data'
avg_processing_time = spark.sql(query).collect()[0]['avg_processing_time']

print('The longest average processing time overall is: {} seconds.'.format(avg_processing_time))

In [None]:
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Query').getOrCreate()
data = spark.read.format('csv').option('header', 'true').load('../data/sample/data0.csv')
data.createOrReplaceTempView('data0')

query = 'SELECT participant_id, AVG(processing_time) AS avg_processing_time \
FROM data0 GROUP BY participant_id \
ORDER BY avg_processing_time \
DESC LIMIT 1'

result_df = spark.sql(query).toPandas()
print('The participant with the longest average processing time is:', result_df['participant_id'][0])

In [None]:
exec(code)

In [None]:
from azure.identity import DefaultAzureCredential 
from azure.storage.blob import BlobClient

# Storage Account name
account_name = os.environ['DATALAKE_ACCOUNT_NAME']
# Storage Account key
account_key = os.environ['DATALAKE_ACCOUNT_KEY']

# Name of the container where the blob is stored
container_name = "model-training"

# Name of the blob you want to fetch
blob_name = "DataShare/data1/2023-03-01-to-2023-03-31/events/events.csv"

file_path = "../data/original/"
file_name = "events.csv"


account_url = "https://{account_key}.blob.core.windows.net"
# Create a BlobClient object with data transfer options for download
blob_client = BlobClient(
    account_url=account_url, 
    container_name=container_name, 
    blob_name=blob_name,
    credential=account_key, #DefaultAzureCredential(),
    max_single_get_size=1024*1024*32, # 32 MiB
    max_chunk_get_size=1024*1024*4 # 4 MiB
)

with open(file=os.path.join(r'../askskan/data/original/', 'events.csv'), mode="wb") as sample_blob:
    download_stream = blob_client.download_blob(max_concurrency=2)
    sample_blob.write(download_stream.readall())

ServiceRequestError: <urllib3.connection.HTTPSConnection object at 0x140512820>: Failed to resolve 'model-training' ([Errno 8] nodename nor servname provided, or not known)