In [1]:
import json

def load_json(file_path):
    """
    Load a JSON file and return its content.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding, and a leading UTF-8 byte-order mark (as written by some Windows
    editors) is skipped instead of causing a JSON decode error.

    :param file_path: Path to the JSON file
    :return: Parsed JSON data
    :raises OSError: If the file cannot be opened
    :raises json.JSONDecodeError: If the content is not valid JSON
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged and strips a BOM when present.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        data = json.load(file)
    return data

In [2]:
# NOTE(review): absolute, machine-specific Windows path -- this cell only runs
# on a machine where that exact file exists.
data = load_json(file_path=r'D:\Desktop\AlphaReportChatbot\src\notebook\research\arb_datasets.json')

In [3]:

def parse_ner_data(data):
    """
    Collect every 'ner_data' entry from the input into one flat list.

    Items that have no 'ner_data' key are skipped. For the remaining items,
    the elements of each 'ner_data' sequence are appended in order, so the
    result is a single flattened list rather than a list of lists.

    :param data: List of dictionaries, some or all containing 'ner_data'
    :return: Flat list of the elements found under 'ner_data'
    """
    return [
        record
        for item in data
        if 'ner_data' in item
        for record in item['ner_data']
    ]

# Example usage: flatten the 'ner_data' entries of the loaded dataset into one list
parsed_data = parse_ner_data(data)

In [4]:
def find_entity_index(query, entity):
    """
    Locate the first occurrence of an entity inside a query.

    :param query: The query string to search in
    :param entity: The entity string to look for
    :return: A two-element list [start, end] where end is exclusive
             (query[start:end] == entity), or an empty list if the entity
             does not occur in the query
    """
    start = query.find(entity)  # str.find yields -1 when there is no match
    return [] if start < 0 else [start, start + len(entity)]

In [9]:
from tqdm import tqdm

def adjust_index(data):
    """
    Recompute from_index and to_index for each entity from its text.

    For every record, the 'entity' string of each entry in 'entities' is
    searched for in the record's 'query' via find_entity_index, and the
    entity's 'from_index' / 'to_index' are overwritten with the span found.
    The records are modified in place; the returned list is the same object
    that was passed in.

    If an entity's text does not occur in the query, its existing indices are
    left untouched and a warning is issued, instead of aborting the whole run
    with an unpacking error.

    :param data: List of dictionaries containing 'query' and 'entities'
    :return: The same list, with indices corrected wherever the entity was found
    """
    import warnings  # local import: only used on the not-found path

    for p in tqdm(data, desc="Adjusting indices", bar_format="{l_bar}{bar:20}{r_bar}{bar:-10b}", colour='green'):
        query = p['query']
        for entity in p['entities']:
            entity_text = entity['entity']
            # NOTE: find_entity_index uses str.find, so the FIRST occurrence of
            # the text is used even if the entity appears more than once.
            corrected_indices = find_entity_index(query, entity_text)
            if not corrected_indices:
                # Empty result means "not found"; unpacking it would raise
                # ValueError, so skip and keep whatever indices were there.
                warnings.warn(
                    f"Entity {entity_text!r} not found in query {query!r}; "
                    "indices left unchanged."
                )
                continue
            entity['from_index'], entity['to_index'] = corrected_indices

    return data

In [10]:
# adjust_index updates the entity dicts in place and returns its argument, so
# adjust_parsed_date refers to the same list object as parsed_data.
adjust_parsed_date = adjust_index(parsed_data)

Adjusting indices: 100%|████████████████████| 21/21 [00:00<?, ?it/s]


In [11]:
# Inspect the first record of the flattened list.
parsed_data[0]

{'query': 'Can you generate a Win/Loss report for ABCTeam, specifically for the user ABCTeam2345, who is an active member within the Sportsbook Product with details during the period from 10/05/2023 to 15/06/2023?',
 'entities': [{'entity': 'ABCTeam',
   'from_index': 39,
   'to_index': 46,
   'label': 'user'},
  {'entity': 'Sportsbook Product',
   'from_index': 122,
   'to_index': 140,
   'label': 'product_detail'},
  {'entity': '10/05/2023 to 15/06/2023',
   'from_index': 177,
   'to_index': 201,
   'label': 'date_range'},
  {'entity': 'ABCTeam2345',
   'from_index': 74,
   'to_index': 85,
   'label': 'user'}]}

In [14]:
# Displays every adjusted record; NOTE(review): consider showing only a slice
# (e.g. the first few records) to keep the cell output short.
adjust_parsed_date

[{'query': 'Can you generate a Win/Loss report for ABCTeam, specifically for the user ABCTeam2345, who is an active member within the Sportsbook Product with details during the period from 10/05/2023 to 15/06/2023?',
  'entities': [{'entity': 'ABCTeam',
    'from_index': 39,
    'to_index': 46,
    'label': 'user'},
   {'entity': 'Sportsbook Product',
    'from_index': 122,
    'to_index': 140,
    'label': 'product_detail'},
   {'entity': '10/05/2023 to 15/06/2023',
    'from_index': 177,
    'to_index': 201,
    'label': 'date_range'},
   {'entity': 'ABCTeam2345',
    'from_index': 74,
    'to_index': 85,
    'label': 'user'}]},
 {'query': 'I need a Win/Loss report for the user ABCTeam2345 in the Sportsbook Product covering activities from February to April 2023.',
  'entities': [{'entity': 'ABCTeam2345',
    'from_index': 38,
    'to_index': 49,
    'label': 'user'},
   {'entity': 'Sportsbook Product',
    'from_index': 57,
    'to_index': 75,
    'label': 'product_detail'},
   {'en

In [15]:
# Sanity check: slicing the first query with the second entity's stored
# from_index/to_index should give back that entity's text.
adjust_parsed_date[0]['query'][adjust_parsed_date[0]['entities'][1]['from_index'] : adjust_parsed_date[0]['entities'][1]['to_index']]

'Sportsbook Product'

In [16]:
def save_json(data, file_path):
    """
    Serialize data as JSON and write it to a file.

    The target file is opened in write mode, so existing content is replaced.
    The output is pretty-printed with a four-space indent.

    :param data: Object to serialize (e.g., a dictionary or list)
    :param file_path: Destination path of the JSON file
    """
    with open(file_path, mode="w") as handle:
        json.dump(data, fp=handle, indent=4)

In [17]:
# Write the adjusted records to a new file; the path is relative to the
# current working directory.
save_json(adjust_parsed_date, './correct_arb_datasets.json')