# Prepare data downloaded from BigQuery

The .xlsx file only had air sensor data in it for the PFC-EDU evaporation test that was run between Jan 15 and Feb 15. The raw data (with the exception of 'boot' and 'status' heart beat messages) have been pulled from BigQuery for the dates of the pilot and for the evaporation test.

##### *NOTE*

Not all 20 of the machines used in the evaporation test were also present in the pilot. A few of the BeagleBone boards that are the brains of the PFC-EDU were damaged beyond repair by the time the units were returned to us. while the 'brain board's from these machines (the circuit boards that have the LEDs and sensors on them) were re-used, the BeagleBones were replaced, generating new IDs in our back end.

##### BQ CSV format
The two CSV files that have been downloaded from BigQuery contain rows in the following format:
`device`,`report_time`,`var`,`name`,`value`,`values`

`var` is the actual value being measured/represented, `name` is the name of the sensor being used, `value` is a singular value while `values` is for data that is stored as a `json` object.

In [1]:
evap_test_file = "../data/csv/raw_from_BQ/bq-results-pfc-20-test-jan_14-feb_15.csv"
pilot_data_file = "../data/csv/raw_from_BQ/bq-results-pfc-20-pilot-data-oct_11_2018-jan_1_2019.csv"
save_path = "../data/csv/raw_from_BQ/split_datas"

In [2]:
def genfilename(dev_id, varname):
    return dev_id + "_" + varname[0] + "_" + varname[1] + ".csv"

device_ids = []
current_device_id = ""
sensor_files = {}
with open(evap_test_file, 'r') as infile:
    for line in infile.readlines():
        linesplit = line.split(',')
        dev_id = linesplit[0].strip()
        if dev_id != "device":
            if dev_id not in device_ids:
                device_ids.append(dev_id)
            report_time = linesplit[1].strip()
            varname = linesplit[2:4]
            value = linesplit[4].strip()
            values_json = (",".join(linesplit[5:])).strip()
            if genfilename(dev_id, varname) not in sensor_files.keys():
                sensor_files[genfilename(dev_id, varname)] = open(save_path+"/evap_test/"+genfilename(dev_id, varname),'w')
                sensor_files[genfilename(dev_id, varname)].write("variable,sensor_name,timestamp_utc,value,value_json")
            else:
                newline = ",".join(varname + [report_time,value,values_json])
                sensor_files[genfilename(dev_id, varname)].write(newline+"\n")
            
for f in sensor_files.values():
    f.close()

In [3]:
sensor_files = {}
with open(pilot_data_file, 'r') as infile:
    for line in infile.readlines():
        linesplit = line.split(',')
        dev_id = linesplit[0].strip()
        if dev_id != "device":
            if dev_id not in device_ids:
                print("deviceID Found: " + dev_id)
                device_ids.append(dev_id)
            report_time = linesplit[1].strip()
            varname = linesplit[2:4]
            value = linesplit[4].strip()
            values_json = (",".join(linesplit[5:])).strip()

            if genfilename(dev_id, varname) not in sensor_files.keys():
                sensor_files[genfilename(dev_id, varname)] = open(save_path+"/pilot_data/"+genfilename(dev_id, varname),'w')
                sensor_files[genfilename(dev_id, varname)].write("variable,sensor_name,timestamp_utc,value,value_json\n")
            else:
                newline = ",".join(varname + [report_time,value,values_json])
                sensor_files[genfilename(dev_id, varname)].write(newline+"\n")
            
for f in sensor_files.values():
    f.close()

deviceID Found: EDU-27B1A1C6-f4-5e-ab-3b-35-dd


In [4]:
device_ids

['EDU-30A77B2D-f4-5e-ab-64-50-92',
 'EDU-30EB6274-f4-5e-ab-66-6f-05',
 'EDU-32B65C51-50-65-83-d0-fb-bf',
 'EDU-429A28CD-f4-5e-ab-3c-f3-d2',
 'EDU-46CF2F54-f4-5e-ab-5d-fd-81',
 'EDU-4EF485F5-f4-5e-ab-5b-10-cd',
 'EDU-56F786AB-f4-5e-ab-5c-33-b2',
 'EDU-90DB5116-50-65-83-e6-7d-b0',
 'EDU-A3F734CE-f4-5e-ab-59-ac-e3',
 'EDU-B373ACFF-f4-5e-ab-66-4b-6a',
 'EDU-B4092A13-f4-5e-ab-5a-4d-f9',
 'EDU-C9B9F1A0-f4-5e-ab-5b-4f-d2',
 'EDU-D473FCE2-f4-5e-ab-65-8d-62',
 'EDU-D834D808-f4-5e-ab-fa-82-e8',
 'EDU-DC266DD8-f4-5e-ab-60-49-b2',
 'EDU-E074D2DE-f4-5e-ab-3d-d0-61',
 'EDU-F86DC2E3-50-65-83-d5-51-e5',
 'EDU-FACAD357-f4-5e-ab-67-5d-92',
 'EDU-27B1A1C6-f4-5e-ab-3b-35-dd']

In [5]:
import pandas as pd

In [6]:
raw_data_suffixes = {"air_co2":         "air_carbon_dioxide_ppm_T6713-Top.csv",
             "air_RH":          "air_humidity_percent_SHT25-Top.csv",
             "air_temp_C":      "air_temperature_celcius_SHT25-Top.csv",
             "water_ec_ms_cm":  "water_electrical_conductivity_ms_cm_AtlasEC-Reservoir.csv",
             "water_pH":        "water_potential_hydrogen_AtlasPH-Reservoir.csv",
             "water_temp_C":    "water_temperature_celcius_AtlasTemp-Reservoir.csv"}

In [7]:
pilot_path = save_path + "/pilot_data/" + device_ids[0] + "_" + raw_data_suffixes["air_co2"]

In [8]:
pilot_path

'../data/csv/raw_from_BQ/split_datas/pilot_data/EDU-30A77B2D-f4-5e-ab-64-50-92_air_carbon_dioxide_ppm_T6713-Top.csv'

### Calculate Date ranges
Read in the pilot data, and figure out the date ranges of the sensor readings for each bot

In [9]:
import os
dfs = []
max_min_dates = {}
for dev_id in device_ids:
    for suffix in raw_data_suffixes.keys():
        filename = save_path + "/pilot_data/"+dev_id + "_" + raw_data_suffixes[suffix]
        if os.path.exists(filename):
            # print(filename)
            df = pd.read_csv(filename)
            df['timestamp_utc'] = pd.to_datetime(df['timestamp_utc'])
            dfs.append(df)
    max_min_dates[dev_id] = {'max': dfs[0]['timestamp_utc'].max(),
                             'min': dfs[0]['timestamp_utc'].min()}
    for i in range(1, len(dfs)):
        if dfs[i]['timestamp_utc'].max() > max_min_dates[dev_id]['max']:
            max_min_dates[dev_id]['max'] = dfs[i]['timestamp_utc'].max()
        if dfs[i]['timestamp_utc'].min() > max_min_dates[dev_id]['min']:
            max_min_dates[dev_id]['min'] = dfs[i]['timestamp_utc'].min()

In [10]:
lines = []
for dev_id in max_min_dates.keys():
    lines.append(dev_id + ": "+str(max_min_dates[dev_id]['min'])+" to "+str(max_min_dates[dev_id]['max']) + "\n")
with open('../data/dates_of_data_found_in_pilot_timeframe_Oct_2018-Dec_2018.txt','w') as f:
    f.writelines(lines)
    

In [11]:
for line in lines:
    print(line)

EDU-30A77B2D-f4-5e-ab-64-50-92: 2018-11-29 14:40:52+00:00 to 2018-12-07 05:03:23+00:00

EDU-30EB6274-f4-5e-ab-66-6f-05: 2018-11-29 21:06:45+00:00 to 2018-12-31 23:57:39+00:00

EDU-32B65C51-50-65-83-d0-fb-bf: 2018-11-30 10:11:46+00:00 to 2018-12-31 23:57:39+00:00

EDU-429A28CD-f4-5e-ab-3c-f3-d2: 2018-11-30 10:11:46+00:00 to 2018-12-31 23:57:39+00:00

EDU-46CF2F54-f4-5e-ab-5d-fd-81: 2018-12-06 12:17:37+00:00 to 2018-12-31 23:57:39+00:00

EDU-4EF485F5-f4-5e-ab-5b-10-cd: 2018-12-06 12:17:37+00:00 to 2018-12-31 23:57:39+00:00

EDU-56F786AB-f4-5e-ab-5c-33-b2: 2018-12-06 12:17:37+00:00 to 2018-12-31 23:57:40+00:00

EDU-90DB5116-50-65-83-e6-7d-b0: 2018-12-06 12:17:37+00:00 to 2018-12-31 23:57:40+00:00

EDU-A3F734CE-f4-5e-ab-59-ac-e3: 2018-12-06 12:17:37+00:00 to 2018-12-31 23:57:40+00:00

EDU-B373ACFF-f4-5e-ab-66-4b-6a: 2018-12-06 12:17:37+00:00 to 2018-12-31 23:57:40+00:00

EDU-B4092A13-f4-5e-ab-5a-4d-f9: 2018-12-06 12:17:37+00:00 to 2018-12-31 23:57:40+00:00

EDU-C9B9F1A0-f4-5e-ab-5b-4f-d2: 

### Some data stats on each bot in Evap Test (PFC-20 Test)

In [15]:

stats_dfs = {}
for dev_id in device_ids:
    dfs = {}
    for suffix in raw_data_suffixes.keys():
        filename = save_path + "/evap_test/"+dev_id + "_" + raw_data_suffixes[suffix]
        if os.path.exists(filename):
            # print(filename)
            df = pd.read_csv(filename)
            df['timestamp_utc'] = pd.to_datetime(df['timestamp_utc'])
            df.set_index('timestamp_utc', inplace=True)
            dfs[suffix] = df.copy()
    # Do the stats here
    stat_column_names = ["Sensor", "Count", "Max", "Min", "Mean", "Median", "Standard Deviation"]
    stat_rows = []
    total_count = 0
    for key in raw_data_suffixes.keys():
        if key in dfs.keys():
            stats_row = [key,
                         dfs[key]["value"].count(),
                         dfs[key]["value"].max(),
                         dfs[key]["value"].min(),
                         dfs[key]["value"].mean(),
                        dfs[key]["value"].median(),
                        dfs[key]["value"].std()]
            stat_rows.append(stats_row)
            total_count += dfs[key]["value"].count()
    stat_rows.append(["Total Readings", total_count,0,0,0,0,0])
    stats_df = pd.DataFrame(stat_rows,columns=stat_column_names)
    stats_dfs[dev_id] = stats_df.copy()

In [17]:
summary_path = "../data/evap_test_summary_by_bot/"
for key in stats_dfs.keys():
    with open(summary_path + key + "_summary.csv",'w') as outfile:
        outfile.write(stats_dfs[key].to_csv())

In [19]:
stats_dfs['EDU-FACAD357-f4-5e-ab-67-5d-92']



Unnamed: 0,Sensor,Count,Max,Min,Mean,Median,Standard Deviation
0,air_co2,6551,573.0,0.0,143.17234,139.0,80.315757
1,air_RH,6564,45.0,7.0,20.193632,18.0,7.937771
2,air_temp_C,6564,40.0,13.0,24.94546,25.0,4.329392
3,water_ec_ms_cm,6555,1.65,0.0,0.923399,0.91,0.259811
4,water_pH,6555,7.46,3.6,7.147216,7.17,0.213877
5,water_temp_C,6556,28.92,-1023.0,18.577454,20.94,48.307948
6,Total Readings,39345,0.0,0.0,0.0,0.0,0.0
