#### Append the final label (`InjurySource`) to the reasoning text (`Chain of Thought`)

In [5]:
import json

# Load the raw incidents (a JSON array of objects).
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default; JSON text is interchanged as UTF-8.
with open('/workspaces/source-of-injury/data/synthec_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Append the label to the end of each incident's reasoning text, then drop the
# separate label field so the label only appears inside "Chain of Thought".
# Raises KeyError if an incident lacks either key.
for incident in data:
    incident["Chain of Thought"] += f" - InjurySource: {incident['InjurySource']}"
    del incident["InjurySource"]

# Persist the modified incidents next to the original file
with open('/workspaces/source-of-injury/data/synthec_data_modified.json', 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4)

# Print the modified data
print(json.dumps(data, indent=4))


[
    {
        "IncidentDescription": "While organizing the warehouse, I was stacking large crates onto a pallet. One crate slipped from the top and landed on my arm, causing severe bruising. I reported the incident immediately.",
        "Chain of Thought": "The task involved handling large crates, and the injury occurred due to a crate falling. The injury source is clearly related to the crates being moved. - InjurySource: Boxes, crates, cartons"
    },
    {
        "IncidentDescription": "I was tasked with unloading boxes from a delivery truck. As I was lifting a particularly heavy box, it slipped and fell onto my foot, resulting in a fracture.",
        "Chain of Thought": "The incident description centers around unloading boxes and the injury was directly caused by a falling box. The injury source is therefore the boxes being handled. - InjurySource: Boxes, crates, cartons"
    },
    {
        "IncidentDescription": "While arranging the storage room, I tried to carry too many c

#### Convert JSON to JSONL

In [6]:
import json

# Source (JSON array) and destination (JSON Lines) locations
input_file_path = '/workspaces/source-of-injury/data/synthec_data_modified.json'
output_file_path = '/workspaces/source-of-injury/data/synthec_data_modified.jsonl'

# System turn shared verbatim by every conversation record
system_prompt = "Workers Compensation Board of Manitoba manages claims by reviewing incident descriptions submitted by workers. Claim coders review the incident description and populate a database with reasoning towards determining the source of injury (InjurySource). You are an assistant that takes an incident description and reason towards determining the InjurySource."

# Read the list of incidents from the modified JSON file
with open(input_file_path, 'r') as src:
    data = json.load(src)

# Build one record per incident: the system prompt, the incident description
# as the human turn, and the chain-of-thought text as the gpt turn.
jsonl_data = [
    {
        "conversations": [
            {"from": "system", "value": system_prompt},
            {"from": "human", "value": f"\n\nIncidentDescription: {entry['IncidentDescription']}"},
            {"from": "gpt", "value": f"\n{entry['Chain of Thought']}"},
        ]
    }
    for entry in data
]

# Emit each record as a single JSON document on its own line
with open(output_file_path, 'w') as dst:
    dst.writelines(json.dumps(record) + '\n' for record in jsonl_data)

print(f"Data successfully converted to JSONL and saved to {output_file_path}")


Data successfully converted to JSONL and saved to /workspaces/source-of-injury/data/synthec_data_modified.jsonl


# Shuffle the data

In [1]:
import json
import pandas as pd

# Load the original incidents (the file that still has a separate
# 'InjurySource' field) and view them as a table.
# The encoding is explicit so decoding does not depend on the platform's
# locale default.
with open('/workspaces/source-of-injury/data/synthec_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

df = pd.DataFrame(data)
df.head(5)

In [3]:
# Shuffle all rows into a random order and renumber the index from 0.
# Note: sample(frac=1) permutes the whole frame uniformly; it does not group,
# sort, or stratify by 'InjurySource'.
# A fixed seed keeps the shuffled order identical across re-runs.
SHUFFLE_SEED = 42
shuffled_df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

shuffled_df.head(30)