# Library and Data Imports

In [1]:
# Library Imports
import os

import pandas as pd
import numpy as np

In [2]:
# Data Import and observing head
data = pd.read_csv('all_sites_2020-04-01.csv')
print(data.shape)
data = pd.concat([data, pd.read_csv('all_sites_2020-04-02.csv')])
print(data.shape)
data.head()

(3569212, 8)
(7213284, 8)


Unnamed: 0.1,Unnamed: 0,equipRef,groupRef,navName,siteRef,typeRef,unit,value
0,2020-04-01 00:00:01+00:00,AHU-01,Allard Hall Air Systems,Wifi Count,Allard Hall,Wifi Count,omit,6.0
1,2020-04-01 00:00:01+00:00,AHU-07,Allard Hall Air Systems,Rm 122 Wifi Count,Allard Hall,Rm 122 Wifi Count,omit,1.0
2,2020-04-01 00:00:01+00:00,AHU-08,Allard Hall Air Systems,Rm 121 Wifi Count,Allard Hall,Rm 121 Wifi Count,omit,6.0
3,2020-04-01 00:00:01+00:00,AHU-10,Allard Hall Air Systems,Rm 114 Wifi Count,Allard Hall,Rm 114 Wifi Count,omit,1.0
4,2020-04-01 00:00:01+00:00,AHU-13,Allard Hall Air Systems,Rm 111 Wifi Count,Allard Hall,Rm 111 Wifi Count,omit,2.0


***
# Pharmacy Data Exploration

In [3]:
# Filtering for just Pharmacy data
pharmacy = data.loc[data['siteRef']=='Pharmacy',:]
pharmacy.head()

Unnamed: 0.1,Unnamed: 0,equipRef,groupRef,navName,siteRef,typeRef,unit,value
7,2020-04-01 00:00:01+00:00,Cooling Plant,Pharmacy Hydronic Systems,CHWP_P5ABC_DP,Pharmacy,CHWP_P5ABC_DP,kPa,295.344025
10,2020-04-01 00:00:01+00:00,Heating Plant SBLR-1,Pharmacy Hydronic Systems,SB1_2_FWT_T,Pharmacy,SB1_2_FWT_T,°C,42.126369
11,2020-04-01 00:00:01+00:00,Heating Plant SBLR-2,Pharmacy Hydronic Systems,SB2_FIRE_RATE,Pharmacy,SB2_FIRE_RATE,Pa,58.159836
20,2020-04-01 00:00:01+00:00,Rm 3122 VAV-3S001,Pharmacy Floor 3,Discharge Air Flow,Pharmacy,VAV_3S001_FLW,L/s,156.490616
22,2020-04-01 00:00:01+00:00,Rm 3340 VAV-3S004,Pharmacy Floor 3,Zone Temp Effective Sp,Pharmacy,VAV_3S004_RT_SP,°C,22.0


In [4]:
# Extracting unique sensors in the Pharmacy building
unique_sensors = pharmacy.drop_duplicates(subset=['equipRef','groupRef','navName','siteRef','typeRef'])

In [5]:
# Observing info about the dataset
print(unique_sensors.shape)
unique_sensors.describe()

(4587, 8)


Unnamed: 0.1,Unnamed: 0,equipRef,groupRef,navName,siteRef,typeRef,unit,value
count,4587,4587,4587,4587,4587,4587,4587,4587.0
unique,1253,862,11,858,1,4558,16,2645.0
top,2020-04-01 00:15:00+00:00,Cooling Plant,Pharmacy Air Systems,Zone Temp,Pharmacy,Power OPC,°C,0.0
freq,1137,32,666,578,4587,4,1721,412.0


In [6]:
# Writing list of unique sensors to csv for easy observation
unique_sensors.to_csv('test.csv')

### Checking if Pharmacy Units Match SkySpark Metadata Units

In [7]:
# Reading in the SkySpark Metadata
pharmacy_skyspark = pd.read_csv('PharmacyQuery.csv')
pharmacy_skyspark.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,ahu,ahuMode,air,alarm,avg,bacnetConnRef,bacnetCur,bacnetHis,bacnetObjectId,...,utilityPoint,valve,vavMode,voltage,volume,water,wifi,wifiId,zone,mod
0,@p:ubcv:r:236c8a88-c9742760 Pharmacy Heating P...,,,,,,@p:ubcv:r:236bb200-36a98126 PHARMA PHARM_HX2_F...,,TL56,,...,,,,,,,,,,2020-04-09T19:02:32.597Z UTC
1,@p:ubcv:r:236c8a1e-5e449855 Pharmacy Elec Subm...,,,,,,@p:ubcv:r:236bb200-138f9a04 PHARMA PHARM_LEED_...,,TL59,,...,,,,,,,,,,2020-04-09T19:02:34.939Z UTC
2,@p:ubcv:r:2391be0a-02e78149 Pharmacy Rm Corr F...,,,✓,,,@p:ubcv:r:2391a56a-94071b95 PHARMA PHARM_FCU_5...,,TL1,,...,,,,,,,,,,2020-04-09T19:02:31.877Z UTC
3,@p:ubcv:r:2391c062-351afea3 Pharmacy Rm 1420 F...,,,✓,,,@p:ubcv:r:2391a768-9f8e3d61 PHARMA PHARM_FCU_1...,,TL1,,...,,,,,,,,,,2020-04-09T19:02:32.736Z UTC
4,@p:ubcv:r:236c8a38-ed027843 Pharmacy Elec Subm...,,,,,,@p:ubcv:r:236bb200-64c2f7d8 PHARMA PHARM_LEED_...,,TL208,,...,,,,,,,,,,2020-04-09T19:02:34.67Z UTC


In [8]:
# Extracting metadata required to generate a unique ID that conforms to the queried data format
### format of unique== siteRef+" "+equipRef+" "+typeRef+" "+groupRef+" "+navName
# Extracting siteRef, equipRef, and typeRef from the id column
pharmacy_skyspark['unique'] = pharmacy_skyspark['id'].apply(lambda x: x.split(" ",1)[1])
# Extracting the groupRef from the groupRef column
pharmacy_skyspark['unique'] += " "+pharmacy_skyspark['groupRef'].apply(lambda x: x.split(" ",1)[1] if type(x)==str else "" )
# Extracting the navName form the naveName column
pharmacy_skyspark['unique'] += " "+pharmacy_skyspark['navName']

pharmacy_skyspark.head()

Unnamed: 0,id,ahu,ahuMode,air,alarm,avg,bacnetConnRef,bacnetCur,bacnetHis,bacnetObjectId,...,valve,vavMode,voltage,volume,water,wifi,wifiId,zone,mod,unique
0,@p:ubcv:r:236c8a88-c9742760 Pharmacy Heating P...,,,,,,@p:ubcv:r:236bb200-36a98126 PHARMA PHARM_HX2_F...,,TL56,,...,,,,,,,,,2020-04-09T19:02:32.597Z UTC,Pharmacy Heating Plant HX-2 P-HX2A HX2_PHX2A_V...
1,@p:ubcv:r:236c8a1e-5e449855 Pharmacy Elec Subm...,,,,,,@p:ubcv:r:236bb200-138f9a04 PHARMA PHARM_LEED_...,,TL59,,...,,,,,,,,,2020-04-09T19:02:34.939Z UTC,Pharmacy Elec Submeters LEED-2N1PC3 2N1PC3_Cur...
2,@p:ubcv:r:2391be0a-02e78149 Pharmacy Rm Corr F...,,,✓,,,@p:ubcv:r:2391a56a-94071b95 PHARMA PHARM_FCU_5...,,TL1,,...,,,,,,,,,2020-04-09T19:02:31.877Z UTC,Pharmacy Rm Corr FC-513 FCU_513_S Pharmacy Flo...
3,@p:ubcv:r:2391c062-351afea3 Pharmacy Rm 1420 F...,,,✓,,,@p:ubcv:r:2391a768-9f8e3d61 PHARMA PHARM_FCU_1...,,TL1,,...,,,,,,,,,2020-04-09T19:02:32.736Z UTC,Pharmacy Rm 1420 FC-111 FCU_111_S Pharmacy Flo...
4,@p:ubcv:r:236c8a38-ed027843 Pharmacy Elec Subm...,,,,,,@p:ubcv:r:236bb200-64c2f7d8 PHARMA PHARM_LEED_...,,TL208,,...,,,,,,,,,2020-04-09T19:02:34.67Z UTC,Pharmacy Elec Submeters LEED-6ETLE1 6ETLE1_Cur...


In [9]:
# Generating unique column for the queried data with the same fomat as above
pharmacy['unique'] =  pharmacy['siteRef']+" "+pharmacy['equipRef']+" "+pharmacy['typeRef']+" "+pharmacy['groupRef']+" "+pharmacy['navName']
pharmacy.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0.1,Unnamed: 0,equipRef,groupRef,navName,siteRef,typeRef,unit,value,unique
7,2020-04-01 00:00:01+00:00,Cooling Plant,Pharmacy Hydronic Systems,CHWP_P5ABC_DP,Pharmacy,CHWP_P5ABC_DP,kPa,295.344025,Pharmacy Cooling Plant CHWP_P5ABC_DP Pharmacy ...
10,2020-04-01 00:00:01+00:00,Heating Plant SBLR-1,Pharmacy Hydronic Systems,SB1_2_FWT_T,Pharmacy,SB1_2_FWT_T,°C,42.126369,Pharmacy Heating Plant SBLR-1 SB1_2_FWT_T Phar...
11,2020-04-01 00:00:01+00:00,Heating Plant SBLR-2,Pharmacy Hydronic Systems,SB2_FIRE_RATE,Pharmacy,SB2_FIRE_RATE,Pa,58.159836,Pharmacy Heating Plant SBLR-2 SB2_FIRE_RATE Ph...
20,2020-04-01 00:00:01+00:00,Rm 3122 VAV-3S001,Pharmacy Floor 3,Discharge Air Flow,Pharmacy,VAV_3S001_FLW,L/s,156.490616,Pharmacy Rm 3122 VAV-3S001 VAV_3S001_FLW Pharm...
22,2020-04-01 00:00:01+00:00,Rm 3340 VAV-3S004,Pharmacy Floor 3,Zone Temp Effective Sp,Pharmacy,VAV_3S004_RT_SP,°C,22.0,Pharmacy Rm 3340 VAV-3S004 VAV_3S004_RT_SP Pha...


In [10]:
# Getting dataframe of just units and unique for both the metadata and queried data and dropping duplicate values
queried = pd.DataFrame(columns = ['unique', 'queried_unit'])
queried['unique'] = pharmacy['unique']
queried['queried_unit'] = pharmacy['unit']
print(queried.shape)
queried = queried.drop_duplicates()
queried.reset_index(drop=True, inplace=True)
print(queried.shape)

skyspark = pd.DataFrame(columns = ['unique', 'skyspark_unit'])
skyspark['unique'] = pharmacy_skyspark['unique']
skyspark['skyspark_unit'] = pharmacy_skyspark['unit']
print(skyspark.shape)
skyspark = skyspark.drop_duplicates()
print(skyspark.shape)

(1163384, 2)
(4644, 2)
(7938, 2)
(7832, 2)


In [11]:
# Merging the queried and skyspark dataframes for easy unit comparison 
compare = skyspark.merge(queried, on='unique', how="outer")
print(compare.shape)
compare.head()

(8218, 3)


Unnamed: 0,unique,skyspark_unit,queried_unit
0,Pharmacy Heating Plant HX-2 P-HX2A HX2_PHX2A_V...,kWh,°C
1,Pharmacy Elec Submeters LEED-2N1PC3 2N1PC3_Cur...,A,A
2,Pharmacy Rm Corr FC-513 FCU_513_S Pharmacy Flo...,A,A
3,Pharmacy Rm 1420 FC-111 FCU_111_S Pharmacy Flo...,A,A
4,Pharmacy Elec Submeters LEED-6ETLE1 6ETLE1_Cur...,A,A


In [12]:
# Creating Columns for easily filtering when the units are the same or have a NaN value in either of the unit columns (indicating the sensor doesn't exist in one)
compare['isSame'] = compare.apply(lambda x: True if x[1]==x[2] else False, axis=1)
compare['hasNaN'] = compare.apply(lambda x: True if (x[1]!=x[1] or x[2]!=x[2]) else False, axis=1)
compare.head()

Unnamed: 0,unique,skyspark_unit,queried_unit,isSame,hasNaN
0,Pharmacy Heating Plant HX-2 P-HX2A HX2_PHX2A_V...,kWh,°C,False,False
1,Pharmacy Elec Submeters LEED-2N1PC3 2N1PC3_Cur...,A,A,True,False
2,Pharmacy Rm Corr FC-513 FCU_513_S Pharmacy Flo...,A,A,True,False
3,Pharmacy Rm 1420 FC-111 FCU_111_S Pharmacy Flo...,A,A,True,False
4,Pharmacy Elec Submeters LEED-6ETLE1 6ETLE1_Cur...,A,A,True,False


In [13]:
# Filtering for when the units are differnt and there are no NaN's
dif_units = compare[compare['isSame']==False]
dif_units_noNaN = dif_units[dif_units['hasNaN']==False]
# Saving the result as a csv
dif_units_noNaN.to_csv('queried_spark_comp.csv')
dif_units_noNaN

Unnamed: 0,unique,skyspark_unit,queried_unit,isSame,hasNaN
0,Pharmacy Heating Plant HX-2 P-HX2A HX2_PHX2A_V...,kWh,°C,False,False
14,Pharmacy LEF-3 EF3_HR_INLET_T Pharmacy Air Sys...,°C,Pa,False,False
17,Pharmacy Elec Submeters LEED-2N1PC3 2N1PC3_Dem...,W,A,False,False
22,Pharmacy LEF-1 EF-1C EF1_F3_VFD_PWR(kWh) Pharm...,kWh,°C,False,False
57,Pharmacy Water Submeters WM-9 WM9_BLDG_DAY Pha...,L,_,False,False
...,...,...,...,...,...
7839,Pharmacy AHU-16 AHU16_SF_VFD_TSP(Pa) Pharmacy ...,Pa,°C,False,False
7849,Pharmacy HW Submeters FM-9 FM9_BTU_CRAH4~6_FLO...,L/s,°C,False,False
7857,Pharmacy Rm 6118 VAV-6S068 VAV_6S068_DMP Pharm...,%,ppm,False,False
7873,Pharmacy Elec Submeters LEED-6N3LO1 6N3LO1_Ins...,W,A,False,False


In [14]:
# Filtering when the skyspark units are NaN but the queried units aren't (i.e. exists in the queried data but not in the skyspark data)
in_queried = compare[compare['queried_unit'].notnull()]
in_queried_notIn_skyspark = in_queried[in_queried['skyspark_unit'].isnull()]
in_queried_notIn_skyspark

Unnamed: 0,unique,skyspark_unit,queried_unit,isSame,hasNaN
25,Pharmacy Rm 2340 VAV-2S028 VAV_2S028_Dmp_Close...,,omit,False,True
28,Pharmacy FF-103 FF_103_SCHED_BV Pharmacy Floor...,,omit,False,True
38,Pharmacy Rm 6321 VAV-6S074 VAV_6S074_Dmp_Open ...,,omit,False,True
40,Pharmacy Rm 3304 VAV-3S048 VAV_3S048_Dmp_Open ...,,omit,False,True
44,Pharmacy LEF-2 EF-2A EF2_F1_ENAB Pharmacy Air ...,,%RH,False,True
...,...,...,...,...,...
8213,Pharmacy Cooling Plant Chiller System CHLR_PCH...,,°C,False,True
8214,Pharmacy Cooling Plant Chiller System CHLR_PCH...,,°C,False,True
8215,Pharmacy AHU-13 AHU13_PFIL Pharmacy Air System...,,°C,False,True
8216,Pharmacy AHU-13 AHU13_OAD_ES Pharmacy Air Syst...,,°C,False,True


***
# Exploration of All Buildings

In [15]:
# Generateing Full list of Unique UOMs and Observing Unique UOM's
unique_uom = data['unit'].unique()
print(len(unique_uom))
unique_uom

40


array(['omit', 'kPa', 'Pa', '°C', '%', 'L/s', 'ppm', 'A', 'psi', 'ft/min',
       'Hz', 'h', 'gal/min', 'min', 'L/min', 'L', 'N', '%RH', '_', 'kW',
       'cfm', 'MW', 'kWh', 'cm', 'mA', 'mV', 'V', 'inH₂O', 'MWh', 'rpm',
       'm³/h', 'in', 'm³', 'mm', 'W/m²', 'deg', 'km', 'm/s', '°daysC',
       'K'], dtype=object)

## Exploring Non-Numeric Values

In [16]:
# Function to return True if a value is not a number and False if it is a number
def is_not_number(val):
    try:
        float(val)
        return False
    except:
        return True

# Generating column identifying which values are not numbers
data['is_not_num'] = data['value'].apply(is_not_number)

# Filtering for non-number values
not_nums = data.loc[data['is_not_num'],:]
not_nums.head()

Unnamed: 0.1,Unnamed: 0,equipRef,groupRef,navName,siteRef,typeRef,unit,value,is_not_num
13,2020-04-01 00:00:01+00:00,Rm 141 VAVS-M11S,Brimacombe-QMI Floor-01 Ampel,Operation Mode,Brimacombe-QMI,VAVS_M11S_FLW_OPERATION_MODE,omit,OCCUPIED,True
17,2020-04-01 00:00:01+00:00,Rm 263A VAVE-2-01,Brimacombe-QMI Floor-02 QMI,Operation Mode,Brimacombe-QMI,RM_263A_VAVE_2_01_FLW_OPERATION_MODE,omit,OCCUPIED,True
18,2020-04-01 00:00:01+00:00,Rm 273 VAVS-2-14,Brimacombe-QMI Floor-02 QMI,Operation Mode,Brimacombe-QMI,RM_273_VAVS_2_14_FLW_OPERATION_MODE,omit,OCCUPIED,True
39,2020-04-01 00:00:02+00:00,Rm 188 VAVS-1-08,Brimacombe-QMI Floor-01 QMI,Operation Mode,Brimacombe-QMI,RM_188_VAVS_1_08_FLW_OPERATION_MODE,omit,OCCUPIED,True
47,2020-04-01 00:00:02+00:00,Rm 263A VAVS-2-17,Brimacombe-QMI Floor-02 QMI,Operation Mode,Brimacombe-QMI,RM_263A_VAVS_2_17_FLW_OPERATION_MODE,omit,OCCUPIED,True


In [17]:
# Observing the summary of the non-numeric dataset
not_nums.describe()

Unnamed: 0.1,Unnamed: 0,equipRef,groupRef,navName,siteRef,typeRef,unit,value,is_not_num
count,395955,395864,395955,395864,395864,395955,395955,395955,395955
unique,143315,1139,63,554,7,2176,3,37,1
top,2020-04-02 13:00:00+00:00,FHE-15,Pharmacy Floor 3,Operation Mode,Brimacombe-QMI,ESB_FHE_15_S_BV,omit,OCCUPIED,True
freq,403,10380,37877,172855,182916,10380,395564,158419,395955


In [18]:
# Generating pivot table of non-numeric values and viewing it
not_nums_pivot = pd.pivot_table(not_nums, index=['equipRef','groupRef','navName','siteRef','typeRef','value'], aggfunc='count')
not_nums_pivot.to_csv('pivot_test.csv')
not_nums_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 0,is_not_num,unit
equipRef,groupRef,navName,siteRef,typeRef,value,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AHU-01,Brimacombe-QMI Air Systems,Discharge Air Damper Cmd,Brimacombe-QMI,AHU01_DIS_DMP,False,2,2,2
AHU-01,Brimacombe-QMI Air Systems,Discharge Air Damper Cmd,Brimacombe-QMI,AHU01_DIS_DMP,True,2,2,2
AHU-01,CIRS Air Systems,CIRS_AHU1_P1_S_BV,CIRS,CIRS_AHU1_P1_S_BV,False,19,19,19
AHU-01,CIRS Air Systems,CIRS_AHU1_P1_S_BV,CIRS,CIRS_AHU1_P1_S_BV,True,19,19,19
AHU-01,CIRS Air Systems,CIRS_AHU1_PUMP_SP_6_C,CIRS,CIRS_AHU1_PUMP_SP_6_C,False,24,24,24
...,...,...,...,...,...,...,...,...
stair 1 FF-01,CIRS Basement,Exhaust Fan Run,CIRS,CIRS_FF01_FAN_S,True,41,41,41
stair 2 FF-02,CIRS Floor 1,Discharge Fan Run Cmd,CIRS,CIRS_FF02_FAN_C,False,21,21,21
stair 2 FF-02,CIRS Floor 1,Discharge Fan Run Cmd,CIRS,CIRS_FF02_FAN_C,True,21,21,21
stair 2 FF-02,CIRS Floor 1,Exhaust Fan Run,CIRS,CIRS_FF02_FAN_S,False,21,21,21


## Checking for Inconsistent Datatypes Within the Same Sensor

In [19]:
# Function to assign datatypes (as strings) to each observations (for checking if datatypes change between the same piece of equipment)
def get_dtypes(x):
    try:
        float(x)
        return 'num'
    except:
        if x=='True' or x=='False':
            return 'bool'
        else:
            return 'str'
        
# Generating Column of Datatypes
data['dtype'] = data['value'].apply(get_dtypes)

In [20]:
# Reading in all of the files from the pCloud folder, combining them by month and removing duplicates ()
### (Couldn't store all of the data in RAM so had to save as csv as an intermediate step)
path = '../../data_files/'
isFirst = True
for folder in os.listdir(path):
    if len(folder)==3:
        print(folder)
        for file in os.listdir(path+folder):
            if file[0]!=".":
                if isFirst==False:
                    new_data = pd.read_csv(path+folder+"/"+file)
                    new_data['dtype'] = new_data['value'].apply(get_dtypes)            
                    all_data_dtype = pd.concat([all_data_dtype, new_data.drop_duplicates(subset=['equipRef','groupRef','navName','siteRef','typeRef','dtype'])])
                    all_data = pd.concat([all_data, new_data.drop_duplicates(subset=['equipRef','groupRef','navName','siteRef','typeRef'])])
                else:
                    all_data = pd.read_csv(path+folder+"/"+file)
                    all_data['dtype'] = all_data['value'].apply(get_dtypes)
                    all_data_dtype = all_data.drop_duplicates(subset=['equipRef','groupRef','navName','siteRef','typeRef','dtype'])
                    all_data = all_data.drop_duplicates(subset=['equipRef','groupRef','navName','siteRef','typeRef'])
                    isFirst = False
        all_data.to_csv("all_data_"+folder+".csv")
        all_data_dtype.to_csv("all_data_dtype_"+folder+".csv")
        isFirst = True

FileNotFoundError: [Errno 2] No such file or directory: '../../data_files/'

In [None]:
# Reading in the files generated from the above chunk and combining them
path_all = 'all_data'
path_dtype = 'all_data_dtype'

# all
isFirst = True
for file in os.listdir(path_all):
    if isFirst==False:
        if file[0]!=".":
            if isFirst==False:
                all_data = pd.concat([all_data, pd.read_csv(path_all+"/"+file)])
            else:
                all_data = pd.read_csv(path_all+"/"+file)
                isFirst = False
# dtype
isFirst = True
for file in os.listdir(path_dtype):
    if isFirst==False:
        if file[0]!=".":
            if isFirst==False:
                all_data_dtype = pd.concat([all_data_dtype, pd.read_csv(path_dtype+"/"+file)])
            else:
                all_data_dtype = pd.read_csv(path_dtype+"/"+file)
                isFirst = False

In [None]:
# Simple check to see if all instruments have the same datatype 
### (If there are different datatypes for any instrument then dropping duplicates while using datatype as one of the keys will have a larger number of observations that dropping without datatype as one of the keys)
dtype_no_dupes = all_data_dtype.drop_duplicates(subset=['equipRef','groupRef','navName','siteRef','typeRef','dtype'])
all_no_dupes = all_data.drop_duplicates(subset=['equipRef','groupRef','navName','siteRef','typeRef'])
print("Observations when Dropping with dtypes: "+str(len(dtype_no_dupes)))
print("Observations when Dropping without dtypes: "+str(len(all_no_dupes)))
if len(dtype_no_dupes)==len(all_no_dupes):
    print("Datatypes ARE consistent")
else:
    print("Datatypes are NOT consistent")

In [None]:
# Filtering for duplicated values (meaning sensors with measurements of two different datatypes)
dtype_no_dupes[dtype_no_dupes.duplicated(subset=['equipRef','groupRef','navName','siteRef','typeRef'])]

## Checking for Unit Inconsistencies

In [None]:
# Data Import
df = pd.read_csv('all_sites_2020-04-01.csv')
df = pd.concat([df, pd.read_csv('all_sites_2020-04-02.csv')])

# Dropping weatherRef items (not part of this investigation)
df = df[df['groupRef']!='weatherRef']
# Creating a unique identifier
df['unique'] =  df['equipRef']+df['groupRef']+df['navName']+df['siteRef']+df['typeRef']
# Dropping Duplicate values once including units once not including units to see if there is a difference
no_dupes_w_unit = df.drop_duplicates(subset=['unique','unit'])
no_dupes_wo_unit = df.drop_duplicates(subset=['unique'])
# Comparing the number of observations after the two drops
print("# of observations when including units in the drop keys:" +str(len(no_dupes_w_unit)))
print("# of observations when omitting units from the drop keys:" +str(len(no_dupes_wo_unit)))
# Extracting the instruments with inconsistent units
units_incons = no_dupes_w_unit[no_dupes_w_unit.duplicated(subset=['unique'], keep=False)]
print("# of items in the duplicates list: "+str(len(units_incons)))
# Storing the list of instruments with inconsistent units in a csv
units_incons.to_csv("units_incons.csv")

In [None]:
# Used for exploring individual items with issues
uniqueID = 'Thermanex HeaderESB Hydronic SystemsOutside Air TempESBESB_TMX_BUILDING_OAT'
test = df.loc[df['unique']==uniqueID]
# Storing the first two UOMs
uom1 = test['unit'].unique()[0]
uom2 = test['unit'].unique()[1]
# Displaying the different UOMs the given sensor has
print(test['unit'].unique())
# Defining the number of observations to display (the smaller of 5 or the number of available observations for the given sensor)
endIdx = 5
if (len(test['value'].unique())<5):
    endIdx=len(test['value'].unique())
print(str(test['value'].unique()[0:endIdx]))
# Try Except statements for displaying the max and min observed values for the given sensor (or that the data is not numeric if not numeric)
try:
    print("Max: "+str(max(test['value'].apply(lambda x: float(x))))+"\tMin: "+str(min(test['value'].apply(lambda x: float(x)))))
except:
    print("Data not numbers")
# Displaying the observations for the first unit, and how many observations have that unit
print("Num obs: "+str(len(test.loc[test['unit']==uom1])))
test.loc[test['unit']==uom1]
#test

In [None]:
# Displaying the observations for the second unit, and how many observations have that unit
test2 = df.loc[df['unique']==uniqueID]
print("Num obs: "+str(len(test.loc[test['unit']==uom2])))
test2.loc[test2['unit']==uom2]

***
# Exploring other columns

In [None]:
# Observing items with weatherRef as the grouRef
data.loc[data['groupRef']=='weatherRef']

In [None]:
# Checking for null values
print(data.loc[data['equipRef'].isnull()]['groupRef'].unique())
print(data.loc[data['groupRef'].isnull()]['groupRef'].unique())
print(data.loc[data['navName'].isnull()]['groupRef'].unique())
print(data.loc[data['siteRef'].isnull()]['groupRef'].unique())
print(data.loc[data['typeRef'].isnull()]['groupRef'].unique())

In [None]:
# Checking for equipRef issues
no_equipRef = data.drop_duplicates(subset=['groupRef','navName','siteRef','typeRef'])
all_no_dupes = data.drop_duplicates(subset=['equipRef','groupRef','navName','siteRef','typeRef'])
print("All Observations when Dropping without equipRef: "+str(len(no_equipRef)))
print("All Observations after Dropping all columns: "+str(len(all_no_dupes)))

In [None]:
# Checking for groupRef issues
no_groupRef = data.drop_duplicates(subset=['equipRef','navName','siteRef','typeRef'])
all_no_dupes = data.drop_duplicates(subset=['equipRef','groupRef','navName','siteRef','typeRef'])
print("All Observations when Dropping without groupRef: "+str(len(no_groupRef)))
print("All Observations after Dropping all columns: "+str(len(all_no_dupes)))

In [None]:
# Checking for navName issues
no_navName = data.drop_duplicates(subset=['equipRef','groupRef','siteRef','typeRef'])
all_no_dupes = data.drop_duplicates(subset=['equipRef','groupRef','navName','siteRef','typeRef'])
print("All Observations when Dropping without navName: "+str(len(no_navName)))
print("All Observations after Dropping all columns: "+str(len(all_no_dupes)))

In [None]:
# Checking for siteRef issues
no_siteRef = data.drop_duplicates(subset=['equipRef','groupRef','navName','typeRef'])
all_no_dupes = data.drop_duplicates(subset=['equipRef','groupRef','navName','siteRef','typeRef'])
print("All Observations when Dropping without siteRef: "+str(len(no_siteRef)))
print("All Observations after Dropping all columns: "+str(len(all_no_dupes)))

In [None]:
# equipRef and navName didn't have the same number within both so saving data to csvs for further investigation
pd.concat([all_no_dupes,no_equipRef]).drop_duplicates(keep=False).to_csv('equipRef.csv')
pd.concat([all_no_dupes,no_navName]).drop_duplicates(keep=False).to_csv('navName.csv')

In [None]:
# Checking if can uniquely identify sensors without navName or groupRef
no_siteRef = data.drop_duplicates(subset=['equipRef','siteRef','typeRef'])
all_no_dupes = data.drop_duplicates(subset=['equipRef','groupRef','navName','siteRef','typeRef'])
print("All Observations when Dropping without siteRef: "+str(len(no_siteRef)))
print("All Observations after Dropping all columns: "+str(len(all_no_dupes)))