In [34]:
import os
import random
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element, SubElement, tostring

import numpy as np
import pandas as pd

from tqdm import tqdm

# Creating XML Datasets

### Single-record XML example:

```
<records>
   <employee_record>
      <id>1</id>
      <name>Rick</name>
      <salary>623.3</salary>
      <state>MA</state>
      <position>Production Technician</position>
      <performance_score date="2017-01-01">Fully Meets</performance_score>
      <performance_score date="2022-05-08">Exceeds</performance_score>
   </employee_record>
</records>
```

We want to create two datasets to benchmark:
1. A single very large XML file  
2. Many small XML files  

The goal is to read the XML files, convert them to JSON format, and then save the JSON files.


In [3]:
# Load the seed HR dataset from the CSV file into a DataFrame.
df = pd.read_csv("hr_seed_data.csv")

In [4]:
# Preview the first rows of the seed data.
df.head()

Unnamed: 0,Employee_Name,EmpID,Salary,PositionID,Position,State,DateofHire,PerformanceScore,LastPerformanceReview_Date
0,"Adinolfi, Wilson K",10026,62506,19,Production Technician I,MA,7/5/2011,Exceeds,1/17/2019
1,"Ait Sidi, Karthikeyan",10084,104437,27,Sr. DBA,MA,3/30/2015,Fully Meets,2/24/2016
2,"Akinkuolie, Sarah",10196,64955,20,Production Technician II,MA,7/5/2011,Fully Meets,5/15/2012
3,"Alagbe,Trina",10088,64991,19,Production Technician I,MA,1/7/2008,Fully Meets,1/3/2019
4,"Anderson, Carol",10069,50825,19,Production Technician I,MA,7/11/2011,Fully Meets,2/1/2016


In [29]:
# Seed both random number generators used by this notebook so that the
# generated benchmark datasets are identical on every
# "Restart Kernel -> Run All".
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Distinct text values taken from the seed dataset; these are the pools the
# synthetic records are drawn from.
employee_names = df['Employee_Name'].unique()
positions = df['Position'].unique()
states = df['State'].unique()
performance_scores = df['PerformanceScore'].unique()


# Total number of synthetic employee records to generate.
num_xml_records = 1_000_000

# One pre-sampled value per synthetic record (each array has
# num_xml_records entries, drawn with replacement from the pools above).
xml_names = np.random.choice(employee_names, num_xml_records)
xml_positions = np.random.choice(positions, num_xml_records)
xml_states = np.random.choice(states, num_xml_records)
# Uniform random salaries in [0, 100_000).
xml_salary = np.random.rand(num_xml_records) * 100_000
xml_performance_scores = np.random.choice(performance_scores, num_xml_records)




In [35]:
# --- write one big file ---
# Build a single <records> tree holding num_xml_records <employee_record>
# elements and serialize it to large_dataset.xml.

root = Element('records')

base_date = np.datetime64('2019-01-01')
# Day offsets added to base_date to produce a random review date.
ext_days = np.arange(0, 1000)


for record_idx in tqdm(range(0, num_xml_records)):

    empl_record = SubElement(root, "employee_record")
    ids = SubElement(empl_record, "id")
    name = SubElement(empl_record, "name")
    state = SubElement(empl_record, "state")
    salary = SubElement(empl_record, "salary")
    position = SubElement(empl_record, "position")

    # The xml_* arrays already hold one randomly sampled value per record,
    # so index them by record instead of re-sampling. Each field now gets
    # the matching kind of value: <id> is a 1-based record number (not a
    # name) and <name> is an employee name (not a position).
    ids.text = str(record_idx + 1)
    name.text = str(xml_names[record_idx])
    state.text = str(xml_states[record_idx])
    salary.text = str(int(xml_salary[record_idx]))
    position.text = str(xml_positions[record_idx])

    # 0 to 4 historical performance scores (randrange excludes its upper
    # bound). `_` is used so the record index is not shadowed.
    for _ in range(0, random.randrange(5)):
        performance_score = SubElement(empl_record, "performance_score")

        random_date = base_date + np.random.choice(ext_days)
        performance_score.set("date", str(random_date))
        performance_score.text = str(np.random.choice(xml_performance_scores))


tree = ET.ElementTree(root)
ET.indent(tree, '  ')
tree.write("large_dataset.xml", encoding="utf-8", xml_declaration=True)

100%|██████████████████████████████████████████████████████████████████████| 1000000/1000000 [01:52<00:00, 8920.29it/s]


In [38]:
# --- write a lot of little files ---
# Split the synthetic records across many XML files, each holding
# num_records_per_file <employee_record> elements under its own <records>
# root. Any remainder that does not fill a whole file is not written.
num_records_per_file = 1000
total_files = num_xml_records // num_records_per_file

base_date = np.datetime64('2019-01-01')
# Day offsets added to base_date to produce a random review date.
ext_days = np.arange(0, 1000)

# tree.write() does not create missing directories, so make sure the output
# folder exists before the first file is written.
output_dir = "xml_micro"
os.makedirs(output_dir, exist_ok=True)


for file_idx in tqdm(range(0, total_files)):
    root = Element('records')

    for offset in range(0, num_records_per_file):
        # Position of this record in the overall dataset; used both as the
        # index into the pre-sampled xml_* arrays and for a unique id.
        record_idx = file_idx * num_records_per_file + offset

        empl_record = SubElement(root, "employee_record")
        ids = SubElement(empl_record, "id")
        name = SubElement(empl_record, "name")
        state = SubElement(empl_record, "state")
        salary = SubElement(empl_record, "salary")
        position = SubElement(empl_record, "position")

        # Each field gets the matching kind of value: <id> is a 1-based
        # record number (not a name) and <name> is an employee name (not a
        # position).
        ids.text = str(record_idx + 1)
        name.text = str(xml_names[record_idx])
        state.text = str(xml_states[record_idx])
        salary.text = str(int(xml_salary[record_idx]))
        position.text = str(xml_positions[record_idx])

        # 0 to 4 historical performance scores (randrange excludes its upper
        # bound). `_` is used so the record offset is not shadowed.
        for _ in range(0, random.randrange(5)):
            performance_score = SubElement(empl_record, "performance_score")

            random_date = base_date + np.random.choice(ext_days)
            performance_score.set("date", str(random_date))
            performance_score.text = str(np.random.choice(xml_performance_scores))


    tree = ET.ElementTree(root)
    ET.indent(tree, '  ')
    tree.write(f"{output_dir}/small_dataset_{file_idx}.xml", encoding="utf-8", xml_declaration=True)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:31<00:00,  6.59it/s]
