In [11]:
from memoization import get_gpt

In [1]:
# Read the raw worksheet definition (YAML text) that the prompt builders parse.
# The encoding is pinned so the result does not depend on the platform default.
with open('./dataset/worksheet.yml', 'r', encoding='utf-8') as f:
    yml_schema = f.read()

In [133]:
import yaml
import json
import re


def select_json_block(text: str):
    """Extract and decode the first ```json fenced block found in *text*.

    The fence is matched leniently: any whitespace (including ``\\r\\n`` or
    none at all) may separate the ```json marker and the closing fence from
    the payload, and the ``json`` tag is matched case-insensitively.

    Args:
        text: Free-form text expected to contain a ```json ... ``` block.

    Returns:
        The decoded JSON value (dict, list, ...).

    Raises:
        ValueError: If no ```json block is present. ``json.JSONDecodeError``
            (a ``ValueError`` subclass) is raised when the block is not
            valid JSON.
    """
    # ``\b`` keeps tags such as ```json5 / ```jsonc from being treated as json.
    match = re.search(r"```json\b\s*([\s\S]*?)\s*```", text, flags=re.IGNORECASE)
    if match is None:
        raise ValueError("No valid JSON data found in the string.")

    return json.loads(match.group(1))


def build_schema(yaml_content):
    """Convert the worksheet YAML into ``{subject: {field_name: schema}}``.

    Every field schema carries ``type``; ``enum`` and ``format`` are copied
    over only when the YAML declares them for that field.
    """
    optional_keys = ('enum', 'format')
    document = yaml.safe_load(yaml_content)

    result = {}
    for subject_name, subject_body in document['subjects'].items():
        per_field = {}
        for entry in subject_body['fields']:
            declared = entry['schema']
            compact = {"type": declared['type']}
            compact.update({key: declared[key] for key in optional_keys if key in declared})
            per_field[entry['name']] = compact
        result[subject_name] = per_field

    return result



def build_field_prompt(field):
    """Render one worksheet field as an indented, multi-line prompt fragment.

    Args:
        field: Mapping with ``name``, ``description`` and a ``schema`` mapping
            that holds ``type`` and optionally ``format`` / ``enum``.

    Returns:
        A string of two or three lines: the ``- name: type`` header (with the
        format in parentheses when declared), the description, and the list
        of possible values when an enum is declared.
    """
    schema = field['schema']

    header = f"  - {field['name']}: {schema['type']}"
    if 'format' in schema:
        header += f" ({schema['format']} format)"

    lines = [header, f"    description: {field['description']}"]

    if 'enum' in schema:
        # Enum members decoded from YAML are not guaranteed to be strings
        # (numbers, booleans); str.join only accepts strings.
        choices = ', '.join(str(choice) for choice in schema['enum'])
        lines.append(f"    possible values: {choices}")

    return '\n'.join(lines)
    

def build_schema_prompt(yaml_content, subject: str = None):
    """Render the worksheet YAML as a plain-text description for a prompt.

    Args:
        yaml_content: YAML text with a top-level ``subjects`` mapping.
        subject: Optional subject name. When given, only that subject is
            rendered; an unknown name yields an empty string.

    Returns:
        One ``name: description`` line per subject, each followed by the
        rendered lines of its fields, joined with newlines.
    """
    subjects = yaml.safe_load(yaml_content)['subjects']

    if subject:
        subjects = {subject: subjects[subject]} if subject in subjects else {}

    instructions = []
    for subject_name, content in subjects.items():
        instructions.append(f"{subject_name}: {content['description']}")
        # Per-field rendering is shared with build_field_prompt so the two
        # prompt builders cannot drift apart.
        instructions.extend(build_field_prompt(field) for field in content['fields'])

    return "\n".join(instructions)


def define_worksheet_fields(subject, subject_fields: dict, conversation_history, **kwargs):
    """Ask the model to extract worksheet field values from a conversation.

    The prompt lists the subject (``name`` / ``description``), its fields and
    the whole conversation (each message rendered as ``from: text``), and asks
    for incomplete-field errors. The call uses model ``gpt-4o`` at
    temperature 0; extra keyword arguments are forwarded to ``tget_gpt``.

    Returns:
        The JSON value decoded from the ```json block of the model reply.
    """
    subject_line = f"{subject['name']}: {subject['description']}"
    field_lines = '\n'.join(build_prompt_from_subject_fields(subject_fields))
    transcript = '\n'.join(
        f"{turn['from']}: {turn['text']}" for turn in conversation_history
    )

    prompt = f"""
Please find entities at speech of Latin America candidate on security guy position.

Here is possible fields:
{subject_line}
{field_lines}

Here is conversation:
{transcript}

When analyzing the conversation, follow these guidelines:

	1.	Track which fields have been asked about in the conversation.
	2.	Collect and compile all relevant field information, even if it spans multiple steps.
	3.	Finalize the field only when all required details have been provided.
	4.	Only include incomplete field errors for fields that have been asked about but not fully completed.
	5.	Do not include errors for fields that have not yet been asked.

incomplete_field_error: (array)
    field_name: <field name>
    current_value: <current field value>
    required_details: <required details>

    
Please put result info json format, like this ```json```.
"""

    model_reply = tget_gpt(prompt, model='gpt-4o', temperature=0, **kwargs)
    return select_json_block(model_reply)


def build_prompt_from_subject_fields(fields):
    """Render worksheet fields as a flat list of prompt lines.

    Args:
        fields: Iterable of field mappings, each with ``name``,
            ``description`` and a ``schema`` mapping holding ``type`` and
            optionally ``format`` / ``enum``.

    Returns:
        A list of strings (not joined): for every field a ``- name: type``
        header, a description line and, when an enum is declared, a
        ``possible values`` line.
    """
    lines = []

    for field in fields:
        schema = field['schema']

        header = f"  - {field['name']}: {schema['type']}"
        if 'format' in schema:
            header += f" ({schema['format']} format)"
        lines.append(header)

        lines.append(f"    description: {field['description']}")

        if 'enum' in schema:
            # Enum members may be non-strings (numbers, booleans) once decoded
            # from YAML; str.join only accepts strings.
            rendered = ', '.join(str(option) for option in schema['enum'])
            lines.append(f"    possible values: {rendered}")

    return lines

In [146]:
def clarify_field_details(field_name: str, current_value: str, validation_error: str, **kwargs):
    """Ask the model for a follow-up question about one incomplete field.

    The field name, its current value and the missing details are embedded in
    a recruiter-style instruction prompt. Extra keyword arguments are
    forwarded to ``tget_gpt``.

    Returns:
        The JSON value decoded from the ```json block of the reply; the
        prompt requests an object with a ``text`` key.
    """
    problem_summary = '\n'.join((
        f'field: {field_name}',
        f'current value: {current_value}',
        f'required details: {validation_error}',
    ))

    prompt = f"""
You are recruiter. Ensure the accurate and complete collection of candidate information by reasking questions 
and clarifying details to complete the worksheet field information.

**Instructions:**

1. **Identify Incomplete Fields:**
   - Review the provided list of fields that have incomplete or missing details.

2. **Addressing Incomplete Information:**
   - For each field with incomplete information, identify the missing details.
   - Politely ask the candidate to provide the missing information to complete the field.

3. **Sample Prompt Structure:**
   - Use a polite and clear approach to reask for the missing details.
   - Ensure the candidate understands what specific information is needed.

4. **Handling Specific Fields:**
   - **Field Name:** Birth Date
     - **Current Value:** September 12
     - **Required Details:** Year of birth
     - **Clarification Prompt:** "Could you please provide the year of your birth to complete your birth date?"

5. **General Tips:**
   - Be patient and understanding if the candidate is unsure or needs time to find the information.
   - Confirm with the candidate once the information is provided to ensure accuracy.

Please ask for correction by the following field:

{problem_summary}

Put the response into ```json``` with schema:

```
{{
    "text": string // Text message
}}
    """

    raw_reply = tget_gpt(prompt, **kwargs)
    return select_json_block(raw_reply)

In [135]:
def next_dialog_step(subject, subject_fields, worksheet_fields: dict, **kwargs):
    """Ask the model for the recruiter's next question.

    Args:
        subject: Mapping with the subject's ``name`` and ``description``.
        subject_fields: Field definitions of the subject (rendered through
            ``build_prompt_from_subject_fields``).
        worksheet_fields: Answers collected so far, shaped as
            ``{subject_name: {field_name: value}}``.
        **kwargs: Forwarded to ``tget_gpt``.

    Returns:
        The JSON value decoded from the ```json block of the reply; the
        prompt requests an object with a ``message`` key.
    """
    subject_str = f"{subject['name']}: {subject['description']}"
    fields_str = '\n'.join(build_prompt_from_subject_fields(subject_fields))

    # Render the collected answers from the argument. The loop variables use
    # their own names so they neither depend on a notebook global nor shadow
    # the ``subject`` parameter.
    filled_lines = []
    for stored_subject, stored_fields in worksheet_fields.items():
        filled_lines.append(f'{stored_subject}:')
        for field, value in stored_fields.items():
            filled_lines.append(f'   - {field}: {value}')
        filled_lines.append('')
    filled_fields_str = '\n'.join(filled_lines)

    response = tget_gpt(f"""
You are a recruiter in Latin America, hiring candidates for a security guard position in a store. 
For employment, you need to make a call with the candidate and gather all the necessary information to fill out the application form.

Your task is to ask the question needed to fill in the next field based on the previous answers.
The examples of questions:
- Can you please tell me your first names?
- Great, thank you. Now, can you tell me your last names?
- Thank you. What is your gender?
- Perfect. And your ID number?
- Alright. Now, your date of birth, please.
- Thank you. Can you also tell me your place of birth?

Here is already filled fields by user:
{filled_fields_str}

Here is possible fields:
{subject_str}
{fields_str}

Put the response into ```json``` with schema:

```
{{
    "message": string // Text message
}}

""", **kwargs)

    return select_json_block(response)
    

In [None]:
subject['fields']

In [136]:
def find_discussed_fields(subject_fields, conversation_history, **kwargs):
    """Ask the model which field(s) the latest candidate message answers.

    Args:
        subject_fields: Field definitions (rendered through
            ``build_prompt_from_subject_fields``).
        conversation_history: Messages with ``from`` and ``text`` keys; each
            is rendered as ``from: text``.
        **kwargs: Forwarded to ``tget_gpt``.

    Returns:
        The JSON value decoded from the ```json block of the reply; the
        prompt requests a list of ``{field_name, value}`` objects.
    """
    # Plain string: wrapping the join in braces would build a one-element set
    # whose repr (with escaped newlines) would end up in the prompt.
    subject_fields_str = '\n'.join(build_prompt_from_subject_fields(subject_fields))
    conversation_history_str = '\n'.join(
        f"{message['from']}: {message['text']}" for message in conversation_history
    )

    response = tget_gpt(f"""
Please find entities at speech of Latin America candidate on security guy position.

Here is possible fields:
{subject_fields_str}

Here is conversation:
{conversation_history_str}

Please recognize which field or field mentioned at latest message from security guard.

    
Please put result info json format, like this ```json```. For instance:
[{{
    'field_name': <field_name>,
    'value': <field value>
}}]

""", **kwargs)

    return select_json_block(response)

In [173]:
def validate_fields(fields_rules, fields, **kwargs):
    """Ask the model to validate extracted field values against their rules.

    Args:
        fields_rules: Field definitions (mappings with ``name``,
            ``description``, ``schema``). Only definitions whose name occurs
            in *fields* are sent to the model.
        fields: Extracted values, each a mapping with ``field_name`` and
            ``value``.
        **kwargs: Forwarded to ``tget_gpt``.

    Returns:
        The JSON value decoded from the ```json block of the reply; the
        prompt requests a list of ``{field_name, validation_error}`` objects.
    """
    field_names = {f['field_name'] for f in fields}
    # Use the rules handed in by the caller instead of a notebook global.
    matched_fields = [rule for rule in fields_rules if rule['name'] in field_names]

    rules_str = '\n'.join(build_field_prompt(rule) for rule in matched_fields)
    actual_fields = '\n'.join(f"{f['field_name']}: {f['value']}" for f in fields)

    response = tget_gpt(f"""
Please validate the field values, focusing on their meanings rather than their formatting, character case, or similar aspects. 
This is needed for a worksheet that might be filled out via voice, which can introduce format variations.

The rules:
{rules_str}

The actual field values:
{actual_fields}

Please put response into ```json``` structure, for example:
[{{
    'field_name': <field_name>,
    'validation_error': <field validation error or null>
}}]
    """, **kwargs)

    return select_json_block(response)

```json
{
    "message": "Great, thank you. Now, can you tell me your last names?"
}
```

In [75]:
validated_fields

[{'field_name': 'first_names', 'validation_error': None}]

In [None]:
filled_fields = [r['field_name'] for r in json_data]

In [80]:
# Parse the worksheet definition and select the subject the dialog works on.
parsed_yaml = yaml.safe_load(yml_schema)

subject_name = 'personal_info'
subject = parsed_yaml['subjects'][subject_name]

# Name/description pair in the shape the prompt builders read.
subject_details = dict(name=subject_name, description=subject['description'])


In [158]:
from uuid import uuid4
import uuid
from IPython.display import display, JSON  # For displaying JSON in Jupyter


class Trace:
    """In-memory store of call records grouped by trace id.

    Records are plain dicts kept in lists under their trace id. Ids are
    normalised to strings for storage and lookup, so a ``uuid.UUID`` and its
    string form address the same list and the result of ``get_all_traces``
    has string keys only.
    """

    def __init__(self):
        # {str(trace_id): [record, ...]}
        self.trace_storage = {}

    @staticmethod
    def _key(trace_id):
        """Return the storage key for *trace_id* (its string form)."""
        return str(trace_id)

    def relate_with_parent(self, parent_id, obj: dict):
        """Append *obj* to the records stored under *parent_id*."""
        self.trace_storage.setdefault(self._key(parent_id), []).append(obj)

    def get_id(self):
        """Return a fresh random ``uuid.UUID``."""
        return uuid.uuid4()

    def create_trace(self, obj: dict):
        """Start a new trace holding *obj* and return its id."""
        trace_id = self.get_id()
        self.trace_storage[self._key(trace_id)] = [obj]
        return trace_id

    def get_trace(self, trace_id):
        """Return the records of *trace_id*, or ``{}`` when it is unknown."""
        return self._build_hierarchy(trace_id)

    def _build_hierarchy(self, trace_id):
        """Return copies of the records under *trace_id* with children resolved.

        A record carrying a ``parent_trace_id`` key has that key replaced by
        ``children``: the resolved records stored under the referenced id.
        Unknown ids resolve to ``{}``.
        """
        key = self._key(trace_id)
        if key not in self.trace_storage:
            return {}

        result = []
        for item in self.trace_storage[key]:
            # Shallow copy so the stored record keeps its original keys.
            item_copy = item.copy()
            if 'parent_trace_id' in item_copy:
                child_trace_id = item_copy.pop('parent_trace_id')
                item_copy['children'] = self._build_hierarchy(child_trace_id)
            result.append(item_copy)
        return result

    def get_all_traces(self):
        """Return ``{trace_id: resolved records}`` for every stored trace."""
        return {
            trace_id: self._build_hierarchy(trace_id)
            for trace_id in self.trace_storage
        }


def _record_trace(trace_data, parent_trace_id):
    """Store one call record under its parent trace, or open a new trace."""
    if parent_trace_id:
        trace.relate_with_parent(parent_trace_id, trace_data)
    else:
        new_trace_id = trace.create_trace(trace_data)
        print(f"New trace created with ID: {new_trace_id}")


def tget_gpt(content, model="gpt-4o", temperature=0, max_tokens=1000, stream=True, **kwargs):
    """Call ``get_gpt`` and record the call in the global ``trace`` store.

    Args:
        content: Prompt text passed to ``get_gpt``.
        model, temperature, max_tokens, stream: Passed positionally to
            ``get_gpt`` and stored in the record.
        **kwargs: Only ``parent_trace_id`` is read. When it is set the record
            is attached to that trace; otherwise a new trace is created and
            its id printed.

    Returns:
        Whatever ``get_gpt`` returns.

    Raises:
        Exception: Any error raised by ``get_gpt`` is printed, recorded under
            an ``error`` key and re-raised unchanged.
    """
    parent_trace_id = kwargs.get('parent_trace_id', None)

    def build_record(outcome_key, outcome):
        # Key order mirrors what a reader sees when a trace is displayed.
        return {
            'content': content,
            outcome_key: outcome,
            'model': model,
            'temperature': temperature,
            'max_tokens': max_tokens,
            'stream': stream,
        }

    # Only the model call is guarded, so a failure while recording is not
    # reported (and recorded a second time) as a model error.
    try:
        response = get_gpt(content, model, temperature, max_tokens, stream)
    except Exception as e:
        print(f"An error occurred: {e}")
        _record_trace(build_record('error', str(e)), parent_trace_id)
        raise

    _record_trace(build_record('response', response), parent_trace_id)
    return response


def build_pretty_trace(trace_data, indent=0):
    """Render trace records as an HTML fragment.

    Args:
        trace_data: Iterable of record dicts. ``content`` is required;
            ``response``, ``error`` and ``children`` (a nested list of
            records) are rendered when present.
        indent: Nesting depth; each level prefixes lines with two spaces.

    Returns:
        A string with a heading and the text for every prompt, response and
        error. Text is HTML-escaped and its newlines become ``<br>`` followed
        by the current indentation. Children are rendered one level deeper.
    """
    import html

    indent_str = "  " * indent

    def render(text):
        # Prompts and replies contain markup-like text such as <field_name>;
        # without escaping it would be interpreted as HTML tags.
        return html.escape(str(text), quote=False).replace('\n', '<br>' + indent_str)

    parts = []
    for item in trace_data:
        parts.append(f"{indent_str}<h2>Prompt:</h2>\n{indent_str}{render(item['content'])}\n")
        if 'response' in item:
            parts.append(f"{indent_str}<h4>Response:</h4>\n{indent_str}{render(item['response'])}\n")
        if 'error' in item:
            parts.append(f"{indent_str}<b>Error:</b>\n{indent_str}{render(item['error'])}\n")
        if item.get('children'):
            parts.append(build_pretty_trace(item['children'], indent + 1))
    return "".join(parts)


def display_pretty_trace(trace_id):
    """Render the trace stored under *trace_id* and show it in the notebook.

    The records are fetched from the global ``trace`` store, formatted with
    ``build_pretty_trace`` and displayed inside a ``<pre>`` element.
    """
    # The notebook's top-level import only brings in ``display`` and ``JSON``;
    # ``HTML`` is imported here so the function does not raise NameError.
    from IPython.display import HTML, display

    trace_data = trace.get_trace(trace_id)
    pretty_trace = build_pretty_trace(trace_data)
    display(HTML(f"<pre>{pretty_trace}</pre>"))

In [118]:
trace = Trace()

In [151]:
from dataclasses import dataclass
from typing import Optional, List
from uuid import uuid4


class FromType:
    """Speaker labels stored under the ``from`` key of conversation messages."""

    # The label text is what gets rendered into prompts as ``<label>: <text>``.
    Recruiter: str = 'recruiter'
    SecurityGuard: str = 'security guard'


def handle_candidate_messages(conversation_history, worskheet_field_storage):
    """Process the conversation in a single extraction call; return the reply.

    The model extracts the field values of the global subject, completed
    values are merged into *worskheet_field_storage*, and the reply is either
    a clarification for the first incomplete field or the next question.

    Args:
        conversation_history: Messages with ``from`` and ``text`` keys.
        worskheet_field_storage: ``{subject_name: {field_name: value}}``
            mapping, updated in place.

    Returns:
        The recruiter's next message as a string.
    """
    worsheet_result = define_worksheet_fields(subject_details, subject['fields'], conversation_history)

    append_keys_to_storage(subject_name, worsheet_result, worskheet_field_storage)

    incomplete_errors = worsheet_result.get('incomplete_field_error') or []
    if incomplete_errors:
        first_error = incomplete_errors[0]
        # clarify_field_details takes the three details separately; the keys
        # are the ones requested in the extraction prompt.
        return clarify_field_details(
            first_error['field_name'],
            first_error.get('current_value'),
            first_error.get('required_details'),
        )['text']

    # next_dialog_step expects the subject's name/description mapping and its
    # field definitions.
    return next_dialog_step(subject_details, subject['fields'], worskheet_field_storage)['message']


def merge_fields(js_found_fields: List[dict], validated_fields: List[dict]) -> "List[Field]":
    """Combine extracted values with their validation results.

    Args:
        js_found_fields: Extracted fields, each with ``field_name`` and
            ``value``. A later duplicate of a name replaces the earlier one.
        validated_fields: Validation results, each with ``field_name`` and
            ``validation_error``. Results for names that were not extracted
            are ignored.

    Returns:
        One ``Field`` per extracted name, in first-seen order, with
        ``validation_error`` filled in where a result exists.
    """
    # The return annotation is a string because ``Field`` is defined further
    # down in the same cell and would not exist yet when this ``def`` runs.
    field_map = {
        found['field_name']: Field(name=found['field_name'], value=found['value'])
        for found in js_found_fields
    }

    for v_field in validated_fields:
        target = field_map.get(v_field['field_name'])
        if target is not None:
            target.validation_error = v_field.get('validation_error')

    return list(field_map.values())


def handle_candidate_messages_v2(conversation_history, worskheet_field_storage): 
    """Process the conversation in separate model calls; return the reply.

    Steps: find the fields addressed by the latest candidate message,
    validate them, store the valid ones in *worskheet_field_storage*, then
    either ask to clarify the first invalid field or ask the next question.
    All model calls are recorded under one parent trace id, which is printed.

    Args:
        conversation_history: Messages with ``from`` and ``text`` keys.
        worskheet_field_storage: ``{subject_name: {field_name: value}}``
            mapping, updated in place.

    Returns:
        The recruiter's next message as a string.
    """
    trace_id = trace.get_id()
    print(f'Parent trace id: {trace_id}')
    kwargs = {'parent_trace_id': str(trace_id)}

    js_found_fields = find_discussed_fields(subject['fields'], conversation_history, **kwargs)
    validated_fields = validate_fields(subject['fields'], js_found_fields, **kwargs)

    found_fields = merge_fields(js_found_fields, validated_fields)

    append_keys_to_storage_v2(subject_name, found_fields, worskheet_field_storage)

    # The validator may report a field that was never extracted; such entries
    # are skipped instead of failing the lookup of the field's value.
    values_by_name = {f.name: f.value for f in found_fields}
    invalid_field = next(
        (
            v for v in validated_fields
            if v.get('validation_error') and v['field_name'] in values_by_name
        ),
        None,
    )
    if invalid_field:
        response = clarify_field_details(
            invalid_field['field_name'],
            values_by_name[invalid_field['field_name']],
            invalid_field['validation_error'],
            **kwargs,
        )['text']
    else:
        response = next_dialog_step(subject_details, subject['fields'], worskheet_field_storage, **kwargs)['message']

    return response


def append_keys_to_storage(subject_name: str, worsheet_result, worskheet_field_storage: dict):
    """Merge newly extracted, complete field values into the storage.

    Args:
        subject_name: Key of the subject in both *worsheet_result* and the
            storage.
        worsheet_result: Mapping that may hold ``{subject_name: {field:
            value}}`` and an ``incomplete_field_error`` list of mappings with
            a ``field_name`` key. Either part may be absent.
        worskheet_field_storage: ``{subject_name: {field_name: value}}``
            mapping, updated in place.

    A value is stored only when the field is not stored yet and is not listed
    as incomplete. Nothing is created when the result holds no fields for the
    subject.
    """
    subject_worksheet_fields = worsheet_result.get(subject_name) or {}
    if not subject_worksheet_fields:
        return

    # Both are the same for every field, so they are computed once.
    incomplete_fields = {
        error['field_name']
        for error in worsheet_result.get('incomplete_field_error') or []
    }
    if not worskheet_field_storage.get(subject_name):
        worskheet_field_storage[subject_name] = {}
    stored = worskheet_field_storage[subject_name]

    for field_name, value in subject_worksheet_fields.items():
        if field_name not in stored and field_name not in incomplete_fields:
            stored[field_name] = value


@dataclass
class Field:
    """One extracted worksheet field together with its validation outcome."""

    name: str
    value: str
    # Left as None until a validation result is attached.
    validation_error: Optional[str] = None
    

def append_keys_to_storage_v2(subject_name: str, fields_info: List[Field], worskheet_field_storage: dict):
    """Store the values of valid, not-yet-stored fields under *subject_name*.

    The subject's entry in the storage is (re)created as an empty dict when
    it is missing or falsy. A field is skipped when it is already stored or
    when it carries a validation error.
    """
    if not worskheet_field_storage.get(subject_name):
        worskheet_field_storage[subject_name] = {}
    bucket = worskheet_field_storage[subject_name]

    rejected = {item.name for item in fields_info if item.validation_error}

    for item in fields_info:
        if item.name in bucket or item.name in rejected:
            continue
        bucket[item.name] = item.value
        

In [8]:
worskheet_field_storage = {}
conversation_history = []

## Test

In [90]:
conversation_history = [
    {
        'from': FromType.Recruiter,
        'text': 'Please answer what your first name'
    },
    {
        'from': FromType.SecurityGuard,
        'text': 'My name is Jaez'
    },
    {
        'from': FromType.Recruiter,
        'text': 'Alright! Please say your birth date.'
    },
    {
        'from': FromType.SecurityGuard,
        'text': 'September 12'
    }
]

In [170]:
conversation_history = [
    {
        'from': FromType.SecurityGuard,
        'text': 'My name is Carlos Amigos'
    },
    {
        'from': FromType.Recruiter,
        'text': "Thank you for providing your last name. However, 'Amigos' is not a typical last name for a security guard. Could you please confirm if 'Amigos' is indeed your legal last name, or if there might have been a mistake? If it is a mistake, could you please provide your correct last name?"
    },
    {
        'from': FromType.SecurityGuard,
        'text': "It's correct"
    },
    {
        'from': FromType.Recruiter,
        'text': "Thank you, Carlos Amigos. What is your gender?"
    },
    {
        'from': FromType.SecurityGuard,
        'text': 'male'
    }
]

In [13]:
handle_candidate_messages(conversation_history, worskheet_field_storage)

```json
{
  "personal_info": {
    "first_names": "Jaez",
    "birth_date": "September 12"
  },
  "incomplete_field_error": [
    {
      "field_name": "birth_date",
      "current_value": "September 12",
      "required_details": "Year of birth"
    }
  ]
}
```

'Could you please provide the year of your birth to complete your birth date? Currently, we have September 12, but we need the full date including the year. Thank you!'

In [174]:
handle_candidate_messages_v2(conversation_history, worskheet_field_storage)

Parent trace id: 1b28b915-f6d5-45ad-ab83-2f3510a9933a
```json
[
    {
        "field_name": "first_names",
        "validation_error": null
    },
    {
        "field_name": "last_names",
        "validation_error": null
    },
    {
        "field_name": "gender",
        "validation_error": null
    }
]
```

'Perfect. And your ID number?'

In [148]:
display(JSON(trace.get_all_traces()))

<IPython.core.display.JSON object>

In [175]:
display_pretty_trace('1b28b915-f6d5-45ad-ab83-2f3510a9933a')

## Interface

In [92]:
worskheet_field_storage = {}
conversation_history = []

In [93]:
import gradio as gr


hist = None
def echo(message, history):
    """Chat callback: log the candidate's message, compute and log the reply.

    *history* is not used; the conversation is kept in the global
    ``conversation_history`` list, which receives the candidate's message
    before the handler runs and the recruiter's reply afterwards.
    """
    candidate_turn = {'from': FromType.SecurityGuard, 'text': message}
    conversation_history.append(candidate_turn)

    response = handle_candidate_messages_v2(conversation_history, worskheet_field_storage)

    recruiter_turn = {'from': FromType.Recruiter, 'text': response}
    conversation_history.append(recruiter_turn)
    return response
    

# Wrap `echo` in a Gradio chat interface and start it.
# NOTE(review): the "Echo Bot" title and the greeting examples look like
# leftovers from a template — confirm they are intended for this recruiter bot.
demo = gr.ChatInterface(fn=echo, examples=["hello", "hola", "merhaba"], title="Echo Bot", chatbot=gr.Chatbot(height=300))
demo.launch()

Running on local URL:  http://127.0.0.1:7887

To create a public link, set `share=True` in `launch()`.




```json
[
    {
        "field_name": "first_names",
        "value": "Carlos"
    },
    {
        "field_name": "last_names",
        "value": "Amigos"
    }
]
``````json
[
    {
        "field_name": "first_names",
        "validation_error": null
    },
    {
        "field_name": "last_names",
        "validation_error": "Last name 'Amigos' is not a typical last name for a security guard."
    }
]
``````json
{
    "text": "Thank you for providing your last name. However, 'Amigos' is not a typical last name for a security guard. Could you please confirm if 'Amigos' is indeed your legal last name, or if there might have been a mistake? If it is a mistake, could you please provide your correct last name?"
}
``````json
[
    {
        "field_name": "last_names",
        "value": "Amigos"
    }
]
``````json
[{
    "field_name": "last_names",
    "validation_error": null
}]
``````json
{
    "message": "Thank you, Carlos Amigos. What is your gender?"
}
``````json
[
    {
        "field_n

In [None]:
## TODO: split the prompts so that one classifies the field and the other writes the message

## Test Another prompt

In [48]:
# Scratch experiment: send a field-recognition prompt with a hard-coded
# conversation straight to `get_gpt` (no tracing), using the field list of
# the global `subject`.
response = get_gpt(f"""
Please find entities at speech of Latin America candidate on security guy position.

Here is possible fields:
{'\n'.join(build_prompt_from_subject_fields(subject['fields']))}

Here is conversation:
security guard: What to do?
security guard: My name is Carlos Alberto
recruiter: Could you please provide the last name of the security guard to complete the 'last_names' field?
security guard: García Fernández
recruiter: Alright. Say your birth day
security guard: Sept 14

Please recognize which field or field mentioned at latest message from security guard.

    
Please put result info json format, like this ```json```. For instance:
[{{
    'field_name': <field_name>,
    'value': <field value>
}}]

""")

# Decode the ```json block of the reply.
json_data = select_json_block(response)

# Names of the fields the model recognised; shown as the cell output.
filled_fields = [r['field_name'] for r in json_data]
filled_fields

['birth_date']

In [61]:
actual_fields = '\n'.join([ f'{f['field_name']}: {f['value']}' for f in json_data])
print(actual_fields)

birth_date: Sept 14


In [51]:
matched_fields = [f for f in subject['fields'] if f['name'] in filled_fields]
matched_fields

[{'name': 'birth_date',
  'description': 'Date of birth',
  'required': True,
  'schema': {'type': 'string'}}]

In [56]:
fields_rules = '\n'.join([build_field_prompt(f) for f in matched_fields])
fields_rules

'  - birth_date: string\n    description: Date of birth'

In [65]:
def validate_fields(fields_rules, actual_fields):
    """Ask the model to validate field values given pre-rendered prompt text.

    Both arguments are interpolated into the prompt as-is, and the call goes
    straight to ``get_gpt`` (no tracing).

    Returns:
        The JSON value decoded from the ```json block of the reply.
    """
    # NOTE(review): this shares its name with the traced validate_fields
    # defined in an earlier cell, which takes (fields_rules, fields, **kwargs);
    # whichever cell runs last wins — confirm which one is meant to stay.
    prompt = f"""
Please validate the field value. Validate by value, not by field format.

The rules:
{fields_rules}

The actual field values:
{actual_fields}

Please put response into ```json``` structure, for example:
[{{
    'field_name': <field_name>,
    'validation_error': <field validation error or null>
}}]
    """

    model_reply = get_gpt(prompt)
    return select_json_block(model_reply)

In [None]:
validate_fields()

In [32]:
fields_str = '\n'.join(build_prompt_from_subject_fields(subject['fields']))

print(fields_str)

  - first_names: string
    description: First names (can be multiple in Hispanic cultures)
  - last_names: string
    description: Last name of security guard
  - gender: string
    description: Gender (Male or Female)
  - id_number: string
    description: ID number. Format ###-########-#, last digit is a checksum
  - has_drivers_license: boolean
    description: Do you have a driver's license?
  - license_category: string
    description: Driver's license category
    possible values: A, B, C, D, E, F
  - birth_date: string
    description: Date of birth
  - birth_place: string
    description: Place of birth
  - nationality: string
    description: Nationality (non-citizens cannot be security guards)
  - marital_status: string
    description: Marital status: Married, Single, Civil union
    possible values: Married, Single, Civil union


In [43]:
conversation_history_str = '\n'.join([f"{message['from']}: {message['text']}" for message in conversation_history])

conversation_history_str += f'\n{FromType.SecurityGuard}: García Fernández'
conversation_history_str += f'\n{FromType.Recruiter}: Alright. Say your birth day'
conversation_history_str += f'\n{FromType.SecurityGuard}: Sept 14'


print(conversation_history_str)

security guard: What to do?
security guard: My name is Carlos Alberto
recruiter: Could you please provide the last name of the security guard to complete the 'last_names' field?
security guard: García Fernández
recruiter: Alright. Say your birth day
security guard: Sept 14
