In [1]:
# v2: fixed role hierarchy; asset source column removed
# TODO: update mapping files

In [2]:
import pandas as pd
from faker import Faker
import random

In [3]:
# Generate the roles and sub-roles
def subrolegen(num_levels, role, num_subroles, roles, subroles, levels):
    """Recursively expand ``role`` into a tree of sub-roles.

    For every level, ``num_subroles`` children named ``"<parent>_<n>"``
    (n = 1..num_subroles) are created and the triple
    (parent, child, remaining depth) is appended to the three accumulator
    lists, depth-first.

    Args:
        num_levels: Remaining depth to generate. Values <= 0 generate nothing.
        role: Identifier of the parent role; converted to ``str`` for naming.
        num_subroles: Number of children created under each parent.
        roles: Accumulator list receiving the parent id of each new row.
        subroles: Accumulator list receiving the child id of each new row.
        levels: Accumulator list receiving ``num_levels`` at creation time
            (counts down towards the leaves).

    Returns:
        The tuple ``(roles, subroles, levels)``; these are the same list
        objects that were passed in, mutated in place.
    """
    # `<= 0` instead of `!= 0`: a negative depth must stop immediately rather
    # than recurse until RecursionError.
    if num_levels <= 0:
        return (roles, subroles, levels)

    parent = f"{role}"
    for index in range(1, num_subroles + 1):
        child = f"{parent}_{index}"
        roles.append(parent)
        subroles.append(child)
        levels.append(num_levels)
        subrolegen(num_levels - 1, child, num_subroles, roles, subroles, levels)
    return (roles, subroles, levels)

In [4]:
# role hierarchy dataset
def role_hierarchy(num_levels, num_sub_roles, prefix="", total_roles=200000):
    """Build a fake role-hierarchy table.

    The hierarchy is produced in three parts:

    1. ``subrolegen`` generates levels 1 .. ``num_levels - 2`` with
       ``num_sub_roles`` children per parent.
    2. The last level (``num_levels - 1``) is filled so that parts 1 and 2
       together contain roughly ``total_roles`` rows: the remaining row budget
       is divided evenly across the second-to-last-level roles and rounded.
    3. One level-0 row (with no parent) is added for every distinct parent of
       a level-1 role.

    Args:
        num_levels: Total number of levels including level 0. Must be >= 3.
        num_sub_roles: Children per parent for the intermediate levels (>= 1).
        prefix: String prepended to every ``Role`` and ``ParentRole`` id.
        total_roles: Approximate target row count before the level-0 rows are
            added (defaults to the previously hard-coded 200000).

    Returns:
        DataFrame with columns ``Role, ParentRole, Level, EntityNum,
        Description, RoleStatus, Criticality``. ``ParentRole`` is missing
        (NaN) for level-0 rows.

    Raises:
        ValueError: If ``num_levels < 3`` or ``num_sub_roles < 1``; such
            settings yield no second-to-last level to attach leaves to.
    """
    if num_levels < 3 or num_sub_roles < 1:
        raise ValueError("role_hierarchy requires num_levels >= 3 and num_sub_roles >= 1")

    # Generate the roles and sub-roles for all but the last level
    roles_out, subroles_out, levels_out = subrolegen(num_levels - 2, 1, num_sub_roles, [], [], [])
    df = pd.DataFrame({"ParentRole": roles_out, "Role": subroles_out, "Level": levels_out})

    # subrolegen counts levels down towards the leaves; flip so Level grows with depth
    df['Level'] = (num_levels - 1) - df['Level']

    # Spread the remaining row budget evenly over the second-to-last level
    secondlast_roles = df.loc[df['Level'] == (num_levels - 2), 'Role']
    total_last_level_subroles = total_roles - len(df)
    roles_per_parent = round(total_last_level_subroles / len(secondlast_roles))

    lastlevel_parents = []
    lastlevel_roles = []
    for parent in secondlast_roles:
        for i in range(1, roles_per_parent + 1):
            lastlevel_parents.append(f"{parent}")
            lastlevel_roles.append(f"{parent}_{i}")
    lastlevel_df = pd.DataFrame({
        "ParentRole": lastlevel_parents,
        "Role": lastlevel_roles,
        "Level": [num_levels - 1] * len(lastlevel_roles),
    })

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    df = pd.concat([df, lastlevel_df], ignore_index=True)

    # Insert level 0 parents (no records for these yet). Built as one frame
    # instead of appending row by row; ParentRole is left missing on purpose.
    level0_parents = df.loc[df['Level'] == 1, 'ParentRole'].unique()
    level0_df = pd.DataFrame({"Role": level0_parents, "Level": 0})
    df = pd.concat([df, level0_df], ignore_index=True)

    # Add prefix to role and parent-role ids (missing parents stay missing)
    df['ParentRole'] = prefix + df['ParentRole']
    df['Role'] = prefix + df['Role']

    # Fake attributes, drawn row by row in a fixed order
    fake = Faker()
    fake_enum_list = []
    fake_description_list = []
    fake_role_status_list = []
    fake_criticality_list = []
    for _ in range(len(df.index)):
        fake_enum_list.append(f"{fake.ssn()}")
        fake_description_list.append(f"{fake.paragraph(nb_sentences= 1)}")
        fake_role_status_list.append(
            f"{fake.word(ext_word_list=['Active','Deprecated','Suspended','Planned'])}"
        )
        fake_criticality_list.append(random.randint(1, 10))

    df['EntityNum'] = fake_enum_list
    df['Description'] = fake_description_list
    df['RoleStatus'] = fake_role_status_list
    df['Criticality'] = fake_criticality_list

    # Final column ordering
    return df[['Role', 'ParentRole', 'Level', 'EntityNum',
               'Description', 'RoleStatus', 'Criticality']]

In [5]:
#spatial hierarchy dataset
#not an ideal structure of data but helpful to sort data to create and connect to other associated sources per TH spec of test data
def spatial_hierarchy(facilities, facility_areas, buildings, levels, rooms, interior_areas):
    """Build a fake spatial containment table.

    Every combination of the given name fragments is concatenated into a
    nested id (facility + area + building + level + room + interior area).
    One row is emitted per (container, contained) pair, and each interior
    area additionally gets a terminal row whose ``ContainsSpace`` is ``None``.

    Args:
        facilities: Iterable of facility id strings.
        facility_areas: Iterable of facility-area suffix strings.
        buildings: Iterable of building suffix strings.
        levels: Iterable of building-level suffix strings.
        rooms: Iterable of room suffix strings.
        interior_areas: Iterable of interior-area suffix strings.

    Returns:
        DataFrame with columns ``Space`` (the container), ``ContainsSpace``
        (the contained space, or None), ``Facility``, ``SpaceType`` (type of
        ``Space``), plus fake ``SpaceNum`` and ``Description`` attributes.
    """
    data = []
    for facility in facilities:
        for area in facility_areas:
            area_id = facility + area
            data.append([facility, area_id, facility, "Facility"])
            for building in buildings:
                building_id = area_id + building
                data.append([area_id, building_id, facility, "FacilityArea"])
                for level in levels:
                    level_id = building_id + level
                    data.append([building_id, level_id, facility, "Building"])
                    for room in rooms:
                        room_id = level_id + room
                        data.append([level_id, room_id, facility, "Level"])
                        # Iterate the *parameter*; the previous code read the
                        # notebook-global `int_areas` and ignored the argument.
                        for int_area in interior_areas:
                            int_area_id = room_id + int_area
                            data.append([room_id, int_area_id, facility, "Room"])
                            data.append([int_area_id, None, facility, "InteriorArea"])
    df_space = pd.DataFrame(data, columns=['Space', 'ContainsSpace', 'Facility', 'SpaceType'])

    # Fake attributes, one (number, description) pair per row
    fake = Faker()
    fake_spacenum_list = []
    fake_spacedesc_list = []
    for _ in range(len(df_space.index)):
        fake_spacenum_list.append(f"{fake.ean(length=8)}")
        fake_spacedesc_list.append(f"{fake.paragraph(nb_sentences= 1)}")
    df_space['SpaceNum'] = fake_spacenum_list
    df_space['Description'] = fake_spacedesc_list
    return df_space

In [6]:
#asset dataset
def asset_builder(uid,num_assets,special_class,num_special_class,special_property):
    """Build a fake asset table with a guaranteed number of "special" assets.

    Columns produced:
      * ``Asset``      - procedurally generated id ``asset_<uid>_<n>`` (n from 1).
      * ``AssetNum``   - ``fake.bothify("A0####")``; not de-duplicated.
      * ``SerialNum``  - ``fake.bothify("se-rial-num-ber-####")``; not de-duplicated.
      * ``AssetClass`` - FAMO asset URI ending in a random capitalised noun,
        with some rows overwritten by ``special_class`` (see below).
      * ``State``      - one of 'Active Service', 'Retired', 'Removed', 'On Site'.
      * ``<special_property>`` - random integer in [10, 1500] for rows whose
        class is ``special_class``; missing for all other rows.

    Args:
        uid: Label embedded in every asset id.
        num_assets: Number of asset rows to generate.
        special_class: Asset-class value to inject (e.g. the Motor class URI).
        num_special_class: Desired number of rows carrying ``special_class``.
            Capped at ``num_assets``.
        special_property: Name of the extra column populated for special rows.

    Returns:
        DataFrame with one row per asset.
    """
    fake = Faker()
    asset_list = []
    assetnum_list = []
    serial_num_list = []
    asset_class_list = []
    state_list = []

    for i in range(1, num_assets + 1):
        asset_list.append(f"asset_{uid}_{i}")
        assetnum_list.append(fake.bothify(text="A0####"))
        serial_num_list.append(fake.bothify(text="se-rial-num-ber-####"))
        asset_class_list.append(
            f"http://ontology.eil.utoronto.ca/FAMO/assets/{fake.word(part_of_speech='noun').capitalize()}"
        )
        state_list.append(f"{fake.word(ext_word_list=['Active Service','Retired','Removed','On Site'])}")

    # Top up to num_special_class special rows. Sampling WITHOUT replacement
    # from the rows that are not special yet guarantees the exact count; the
    # previous randint-based draw could hit the same row twice and fall short.
    candidates = [idx for idx, cls in enumerate(asset_class_list) if cls != special_class]
    already_special = len(asset_class_list) - len(candidates)
    num_to_add = min(max(num_special_class - already_special, 0), len(candidates))
    for idx in random.sample(candidates, num_to_add):
        asset_class_list[idx] = special_class

    asset_df = pd.DataFrame({
        "Asset": asset_list,
        "AssetNum": assetnum_list,
        "SerialNum": serial_num_list,
        "AssetClass": asset_class_list,
        "State": state_list,
    })

    # Random size value for every special-class row only
    is_special = asset_df['AssetClass'] == special_class
    if is_special.any():
        asset_df.loc[is_special, special_property] = [
            random.randint(10, 1500) for _ in range(int(is_special.sum()))
        ]
    else:
        # Keep the column present even when nothing qualifies
        asset_df[special_property] = float("nan")

    return asset_df

In [9]:
#generate spatial data
# building blocks for the spatial hierarchy
facilities = ['A','B','C','D']
facility_areas = ['A1','A2','A3','A4']
buildings = ['B1', 'B2', 'B3', 'B4', 'B5']
levels = ['l1','l2','l3']
rooms = ["r" + str(i) for i in range(1, 101)]
int_areas = ['ia1', 'ia2', 'ia3', 'ia4']

dfspace=spatial_hierarchy(facilities, facility_areas, buildings, levels, rooms, int_areas)

#generate role hierarchy with distinct labels (one for each facility)
#role hierarchy parameters
role_levels=9
subroles=4
leaf_level = role_levels - 1  # deepest role level; only these roles get a space and an asset
motor_class = "http://ontology.eil.utoronto.ca/FAMO/assets/Motor"

# The per-facility steps used to be copy-pasted four times (A-D); they are
# now driven by the `facilities` list. Steps run phase by phase, in the same
# order as before.
role_dfs = {
    facility: role_hierarchy(role_levels, subroles, f"role_{facility}_")
    for facility in facilities
}

#relate roles to spatial data
#for each facility, draw one interior area per leaf role (uniform sampling with
#replacement, fixed random_state) and store it in the role table's Space column
interiorarea_samples = {}
for facility, roledf in role_dfs.items():
    is_leaf = roledf['Level'] == leaf_level
    facility_interior_areas = dfspace.loc[
        (dfspace['Facility'] == facility) & (dfspace['SpaceType'] == "InteriorArea"), 'Space'
    ]
    sampled = facility_interior_areas.sample(
        n=int(is_leaf.sum()), random_state=1, replace=True, ignore_index=True
    )
    interiorarea_samples[facility] = sampled
    # .values: assign positionally instead of aligning on the sample's fresh 0..n-1 index
    roledf.loc[is_leaf, "Space"] = sampled.values

#generate asset data for each set of role hierarchies (one asset per leaf role)
asset_dfs = {}
for facility, roledf in role_dfs.items():
    leaf_roles = roledf.loc[roledf['Level'] == leaf_level, "Role"]
    assetdf = asset_builder(facility, len(leaf_roles), motor_class, 2000, "MotorSize")
    #relate asset data to roles: "serving" column links each asset to a leaf role
    # .values: positional assignment; the two frames do not share an index
    assetdf['ServingRole'] = leaf_roles.values
    asset_dfs[facility] = assetdf

# Legacy per-facility names, kept because later cells refer to them
roledf_A, roledf_B, roledf_C, roledf_D = (role_dfs[f] for f in "ABCD")
assetdf_A, assetdf_B, assetdf_C, assetdf_D = (asset_dfs[f] for f in "ABCD")
df_interiorarea_A, df_interiorarea_B, df_interiorarea_C, df_interiorarea_D = (
    interiorarea_samples[f] for f in "ABCD"
)

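**TODO:** double-check the code for other assignments between differently indexed DataFrames/Series where `.values` should be used, so that values are assigned by position rather than aligned on index labels.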

In [10]:
#convert all to csv
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the per-facility
# frames the same way (original row indices are kept, as before).
pd.concat([roledf_A, roledf_B, roledf_C, roledf_D]).to_csv("01-tw-fakeroles.csv")
pd.concat([assetdf_A, assetdf_B, assetdf_C, assetdf_D]).to_csv("01-tw-fakeassets.csv")
dfspace.to_csv("01-tw-fakespaces.csv")

In [13]:
# Show the generated spatial hierarchy table as the cell output
dfspace

Unnamed: 0,Space,ContainsSpace,Facility,SpaceType,SpaceNum,Description
0,A,AA1,A,Facility,95562889,Physical help fund moment.
1,AA1,AA1B1,A,FacilityArea,05486373,Ground people project.
2,AA1B1,AA1B1l1,A,Building,48774826,Provide scene tell agency fill.
3,AA1B1l1,AA1B1l1r1,A,Level,73692812,Manager need final history without toward side...
4,AA1B1l1r1,AA1B1l1r1ia1,A,Room,07735523,Everyone attorney interview.
...,...,...,...,...,...,...
216331,DA4B5l3r100ia2,,D,InteriorArea,86078634,Difference mission join PM.
216332,DA4B5l3r100,DA4B5l3r100ia3,D,Room,70419603,How hair society forget southern.
216333,DA4B5l3r100ia3,,D,InteriorArea,86943215,Enough feel administration mission upon good h...
216334,DA4B5l3r100,DA4B5l3r100ia4,D,Room,75147556,Resource job great center because.


**TODO** 
* improve fake data types