# Data wrangling

This notebook is just to capture the workflow of combining all the data from the asynchronous and synchronous gremlin runs done on Summnit.  The resulting CSV file will then be added to our (not) FOGA git repo.  I will have a separate note book there for doing analytics and visualizations.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# First process the asynchronous data

In [3]:
cd /Users/may/Projects/data/AV/gremlin/runs/2021_UKCI/1051001_async/

/Users/may/Projects/data/AV/gremlin/runs/2021_UKCI/1051001_async


In [4]:
def add_columns(job_id, run_type, df):
    """ A convenience for adding the Summit job ID and run_type (sync vs. async)"""
    df['job_id'] = job_id
    df['run_type'] = run_type
    return df

In [5]:
# Could have used a list comprehension, but I wanted to echo the files for sanity checking
async_data = []
for csv_file in Path('.').glob('*ind*csv'):
    print(f'reading {csv_file}')
    async_data.append(pd.read_csv(str(csv_file)))

reading 3_1051001_issue_45_individuals.csv
reading 0_1051001_issue_45_individuals.csv
reading 4_1051001_issue_45_individuals.csv
reading 2_1051001_issue_45_individuals.csv
reading 1_1051001_issue_45_individuals.csv


In [6]:
async_data = [add_columns('1051001', 'async', x) for x in async_data]

In [7]:
async_df = pd.concat(async_data)

In [8]:
# Since we're going to later merge in the by-generation stuff that has a 'generation' column, we need to 
# add one to the async, but put in NaNs to indicate that is not relevant for the async stuff.
async_df['generation'] = np.nan

In [9]:
async_df.head() # sanity check

Unnamed: 0,run,hostname,pid,uuid,birth_id,scenario,cloudiness,wetness,precipitation,precipitation_deposits,...,fog_density,fog_distance,sun_azimuth_angle,sun_altitude_angle,start_eval_time,stop_eval_time,fitness,job_id,run_type,generation
0,3,d36n08,105620,eaa6d360-751a-4536-9ee7-6ecceffaf759,26,14,8,49,50,50,...,12,1.56,123,10,1622667000.0,1622667000.0,18.441713,1051001,async,
1,3,d36n01,24482,3403baac-6418-4c4a-a063-b478c5f98fd9,42,13,89,16,25,75,...,37,719.57594,306,-42,1622667000.0,1622667000.0,63.508626,1051001,async,
2,3,d36n07,170330,b01e7cac-eb6c-4d37-9805-c44298aa5706,37,36,49,20,50,75,...,21,15.777216,77,17,1622667000.0,1622667000.0,99.302382,1051001,async,
3,3,d36n01,24466,102e48ba-c110-4f6e-bdbf-3cd8c563c8c5,43,25,60,46,50,0,...,38,9.48576,318,72,1622667000.0,1622667000.0,47.844071,1051001,async,
4,3,d36n02,45849,8bb41025-7261-43c7-8b56-37bf88afb26d,8,38,51,34,50,0,...,46,1843.674407,336,43,1622667000.0,1622667000.0,77.942178,1051001,async,


In [20]:
async_df.to_csv('all_async.csv')

# Now to do the same thing for the synchronous (by-generation) data

In [14]:
cd /Users/may/Projects/data/AV/gremlin/runs/2021_UKCI/1064207_sync

/Users/may/Projects/data/AV/gremlin/runs/2021_UKCI/1064207_sync


In [16]:
sync_data = []
for csv_file in Path('.').glob('*pop*csv'):
    print(f'reading {csv_file}')
    sync_data.append(pd.read_csv(str(csv_file)))

reading 2_1064207_issue_46_pop.csv
reading 3_1064207_issue_46_pop.csv
reading 1_1064207_issue_46_pop.csv
reading 4_1064207_issue_46_pop.csv
reading 0_1064207_issue_46_pop.csv


In [18]:
sync_data = [add_columns('1064207', 'sync', x) for x in sync_data]

In [19]:
sync_df = pd.concat(sync_data)

In [21]:
sync_df.to_csv('all_sync.csv')

In [22]:
all_dfs = pd.concat([async_df, sync_df])

In [23]:
cd ..


/Users/may/Projects/data/AV/gremlin/runs/2021_UKCI


In [24]:
all_dfs.to_csv('gremlin_async_and_sync.csv', na_rep='NA', index=False)

In [25]:
all_dfs.head()

Unnamed: 0,run,hostname,pid,uuid,birth_id,scenario,cloudiness,wetness,precipitation,precipitation_deposits,...,fog_density,fog_distance,sun_azimuth_angle,sun_altitude_angle,start_eval_time,stop_eval_time,fitness,job_id,run_type,generation
0,3,d36n08,105620,eaa6d360-751a-4536-9ee7-6ecceffaf759,26,14,8,49,50,50,...,12,1.56,123,10,1622667000.0,1622667000.0,18.441713,1051001,async,
1,3,d36n01,24482,3403baac-6418-4c4a-a063-b478c5f98fd9,42,13,89,16,25,75,...,37,719.57594,306,-42,1622667000.0,1622667000.0,63.508626,1051001,async,
2,3,d36n07,170330,b01e7cac-eb6c-4d37-9805-c44298aa5706,37,36,49,20,50,75,...,21,15.777216,77,17,1622667000.0,1622667000.0,99.302382,1051001,async,
3,3,d36n01,24466,102e48ba-c110-4f6e-bdbf-3cd8c563c8c5,43,25,60,46,50,0,...,38,9.48576,318,72,1622667000.0,1622667000.0,47.844071,1051001,async,
4,3,d36n02,45849,8bb41025-7261-43c7-8b56-37bf88afb26d,8,38,51,34,50,0,...,46,1843.674407,336,43,1622667000.0,1622667000.0,77.942178,1051001,async,


In [26]:
all_dfs.tail()

Unnamed: 0,run,hostname,pid,uuid,birth_id,scenario,cloudiness,wetness,precipitation,precipitation_deposits,...,fog_density,fog_distance,sun_azimuth_angle,sun_altitude_angle,start_eval_time,stop_eval_time,fitness,job_id,run_type,generation
595,0,g17n18,26350,c526bd23-be9d-4386-b194-01d0c90715d8,595,4,82,32,25,0,...,80,41.949673,318,-57,1623170000.0,1623170000.0,10.88019,1064207,sync,9.0
596,0,g18n02,163154,7ce91453-e742-4ee0-85d1-8a1ba89a081e,596,5,100,69,0,100,...,62,1151.921505,132,15,1623170000.0,1623170000.0,15.307622,1064207,sync,9.0
597,0,g18n02,163158,d14983f6-c7c0-4879-af27-7784b34428fa,597,4,82,32,50,0,...,80,41.949673,318,-56,1623170000.0,1623170000.0,11.266799,1064207,sync,9.0
598,0,g18n02,163150,ade423a4-54cc-4cca-b79e-ab9d7d97c54a,598,4,100,67,0,100,...,62,719.57594,132,15,1623170000.0,1623170000.0,11.213852,1064207,sync,9.0
599,0,g18n02,163162,65e8d933-ac5a-446c-8bde-cb41de9c7b6d,599,4,82,32,50,0,...,80,41.949673,318,-56,1623170000.0,1623170000.0,10.895991,1064207,sync,9.0


At this point, the data is written out to a CSV file that was committed to our share repository.  So, the purpose of this specific notebook is done.