## STEP 2

In [3]:
def create_unique_id(df, metadata=False, indexes=['equipRef', 'groupRef', 'navName', 'siteRef', 'typeRef', 'bmsName']):
    """
    Function to add uniqueIds for sensors to a dataframe
    Args:
        df (dataframe): any pandas dataframe
        metadata (bool): False if data is not metadata
        indexes (list): list of field names
    Returns:
        df (dataframe): original dataframe with sensor uniqueId
    """
    if metadata==False:
        # concatenates the 5 fields
        uniqueid=df[indexes[0]].fillna('')+' '+df[indexes[1]].fillna('')+' '+df[indexes[2]].fillna('')+' '+df[indexes[3]].fillna('')+' '+df[indexes[4]].fillna('')
        # removes Pharmacy from uniqueId
        uniqueid=uniqueid.str.replace('Pharmacy ', '')
        # moves uniqueId to the front of df
        df.insert(0, 'uniqueId', uniqueid)
        return df
    elif metadata==True:
        # removes the database id from equipRef, groupRef, siteRef
        df[indexes[0]]=df[indexes[0]].str.extract('[^ ]* (.*)', expand=True)
        df[indexes[1]]=df[indexes[1]].str.extract('[^ ]* (.*)', expand=True)
        df[indexes[3]]=df[indexes[3]].str.extract('[^ ]* (.*)', expand=True)
        # concatenates the 5 fields
        uniqueid=df[indexes[0]].fillna('')+' '+df[indexes[1]].fillna('')+' '+df[indexes[2]].fillna('')+' '+df[indexes[3]].fillna('')+' '+df[indexes[5]].fillna('')
        # removes Pharmacy from uniqueId
        uniqueid=uniqueid.str.replace('Pharmacy ', '')
        # moves uniqueId to the front of df
        df.insert(0, 'uniqueId', uniqueid)
        return df

In [19]:
temp_df2.head()

Unnamed: 0,uniqueId,date,hour,unit,value
1877,Elec Submeters LEED-6N6LE1 Utilities 6N6LE1_En...,2020-05-01,0,kWh,302290.5625
1910,Elec Submeters LEED-CH-4 Utilities CH-4_Energy...,2020-05-01,0,kWh,5314.479004
1934,Elec Submeters LEED-6N6LW1 Utilities 6N6LW1_En...,2020-05-01,0,kWh,3829.905762
1951,Elec Submeters LEED-6N4LE1 Utilities 6N4LE1_En...,2020-05-01,0,kWh,254711.53125
1987,Elec Submeters LEED-6N4LE1 Utilities 6N4LE1_En...,2020-05-01,0,kWh,254711.53125


In [22]:
    import data_preparation
    import clustering
    import aggregation
    import pandas as pd
    from sklearn.linear_model import Ridge, RidgeCV
    

    #0) Set Constants (remember, constants are named in all caps with underscores between words)
    #################
    
    # TODO: write code to create a proper list of each day in the decided upon date-range store as DATELIST
    
    DATELIST = ["2020-03-16","2020-05-01"] # These dates are in the test_data folder so this is just here for testing purposes

    SENSOR_ID_TAGS = [1,2,3,4,5,6] # order is ["groupRef","equipRef","navName","siteRef","typeRef","unit"] #NOTE: Including "unit" here means that we WILL have inconsistent units after aggregations unless we address them in the for loop BEFORE running agg_all, it's fine for now but this will need to be addressed
                                 # Contiued from above: including "unit" causes issues when there are duplicate items with mixed units (need to run the code to fix the units during this for loop or ignore units in the clustering phase)

    last_idx_as_cols = False
    is_first_iter = True
    cnt=1
    for day in DATELIST:
        # Querying and preping data for aggregations
        temp_df2 = data_preparation.query_csv(client=None, date=day, site=None)
        if temp_df2 is None:
            continue
        temp_df2 = aggregation.split_datetime(temp_df2)
        # Filter for EC data, this step will be done in the query 
        temp_df2=temp_df2[temp_df2['unit']=='kWh']
        # Creating uniqueId
        temp_df2=create_unique_id(temp_df2)
        # Filtering dataframe for only relevant fields
        temp_df2=temp_df2[['uniqueId', 'date', 'hour', 'unit', 'value']]
        if is_first_iter:
            # Creating a low memory dataframe for the append_agg function before the structure is changed by agg_all
            struct_df2 = temp_df2.head(1)
            # Aggregating the first date's data
            ec_data1=aggregation.agg_numeric_by_col(temp_df2, col_idx=[0,1,2,3], how='mean')
    #    b) Also create second DF by aggregating further just using sensor ID fields (end result=1row per sensor)
            ec_data2=aggregation.agg_numeric_by_col(temp_df2, col_idx=[0,3], how='all')
            is_first_iter = False
        else:
            # Aggregating the current date's data and aggregate it with the current running total
            temp_df2a=aggregation.agg_numeric_by_col(temp_df2, col_idx=[0,1,2,3], how='mean')
            temp_df2b=aggregation.agg_numeric_by_col(temp_df2, col_idx=[0,3], how='all')
            ec_data1=aggregation.append_agg(df1=temp_df2a, df2=ec_data1, struct_df=struct_df2, col_idx=[0,1,2,3])
            ec_data2=aggregation.append_agg(df1=temp_df2b, df2=ec_data2, struct_df=struct_df2, col_idx=[0,3])
        cnt += 1
    # Freeing up some memory
    #temp_df2 = None
    #temp_df2a = None
    #temp_df2b = None
    # Calculating the update rate
    ec_data2["update_rate"] = ec_data2["count"] / (24*cnt)
    ec_data2.drop("count", inplace=True, axis=1)
     
    # Resetting index columns
    ec_data1=ec_data1.reset_index()
    ec_data2=ec_data2.reset_index()
    
    # Renaming column
    ec_data1=ec_data1.rename(columns={"mean":"EC_mean_value"})
    
    # Dataframe with unique sensor ids
    uniqueSensors=ec_data2['uniqueId'].unique()
    
    ### Scaling EC data 
    ec_data1['EC_mean_value']=data_preparation.scale_continuous(ec_data1, indexes=[4])

#     reading cluster csv just for now
    nc_data=pd.read_csv('sample_cluster_output.csv')

  
    ### Scaling Cluster data
    for i in range(2,len(nc_data.columns)):
        nc_data.iloc[:,i]=data_preparation.scale_continuous(nc_data, indexes=[i])

    #    c) For each unique EC sensorID (i.e. row in 2b_EC_data_df), create LASSO model using 2a_EC_data_df and 
    #       step1_output_NC_data_df. Model is basically: Y=EC response and Xn=NC data
   
    ### Will store each ridge output into a list and append all the dataframes 
    coefficients_list=[]

    ### Creating individual data frames for each sensor and implementing lasso 
    for sensor in uniqueSensors:

        ## Create data frame for only that relevant sensor
        new_df=ec_data1[ec_data1['uniqueId']==sensor]
        ######## Changing EC data types for merging later. Might not need depending on step 1 output types
        new_df = new_df.astype({"date": object, "hour": int})
        new_df.loc[:,'date']=new_df['date'].apply(lambda x: str(x)[0:10])
        
        ## Merge specific sensor to cluster data
        new_merged=pd.merge(nc_data, new_df, how='inner', left_on=['date','hour'], right_on=['date','hour']) 
        ## Ridge does not allow NANs, seems like some sensors are not 'on' during specific hours
        new_merged=new_merged.dropna()

        ## All NC predictor variables
        X=new_merged.iloc[:,2:22]

        ## Mean value of EC data
        Y=new_merged['EC_mean_value']
        Y=Y.to_numpy().reshape(len(Y),1)
        
        #Ridge CV to find optimal alpha value
        alphas=[0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8]
        reg=RidgeCV(alphas=alphas, store_cv_values=True)
        reg.fit(X, Y)
        alpha_best=reg.alpha_

         ## Ridge model using optimal alpha value found in step above
        ridge_test=Ridge(alpha=alpha_best, tol=.01, max_iter=10e7,normalize=True)
        ridge_test.fit(X,Y)
        coef=ridge_test.coef_

        ## Store coefficients into a dataframe
        new=pd.DataFrame(data=coef.reshape(1,20))

        ## Add uniqueId to the dataframe
        new['uniqueId']=sensor

        ## Store each sensorID's ridge coefficients into a list
        coefficients_list.append(new)

    ### Append all ridge coefficients for all sensors into a single dataframe
    for df in uniqueSensors:
        final_df = pd.concat(coefficients_list)   

  return runner(coro)


ValueError: could not convert string to float: '2020-05-01'

In [23]:
ec_data2.head()

Unnamed: 0,uniqueId,mean,max,min,std,update_rate
0,AHU-01 SF Air Systems Energy AHU1_SF_VFD_PWR(kWh),7418.680922,7451.669507,7385.707108,19.39108,1.763889
1,AHU-02 SF Air Systems Energy AHU2_SF_VFD_PWR(kWh),6478.359204,6511.913609,6444.804865,19.722132,1.763889
2,AHU-03 SF Air Systems Energy AHU3_SF_VFD_PWR(kWh),29664.693313,29755.81979,29573.807856,53.595036,1.763889
3,AHU-04 SF Air Systems Energy AHU4_SF_VFD_PWR(kWh),29497.462076,29587.241311,29407.896761,52.83236,1.763889
4,AHU-05 SF Air Systems Energy AHU5_SF_VFD_PWR(kWh),0.0,0.0,0.0,0.0,1.763889


## STEP 4

In [29]:
    #     4) Prep EC data for classification model
    #     ########################################
    #         a) Load metadata and join with 2b_EC_data_df
    metadata=pd.read_csv('~/data-599-capstone-ubc-urban-data-lab/code/test_data/PharmacyQuery.csv')
    # Make uniqueIDs 
    metadata=data_preparation.create_unique_id(metadata, metadata=True)
    # Drop duplicates
    metadata=metadata.sort_values('lastSynced').drop_duplicates('uniqueId',keep='last')
    # Choose relevant fields
    metadata=metadata[['uniqueId','kind', 'energy','power', 'sensor', 'unit', 'water']]
    ### Changing boolean to easily identify during encoding process
    metadata['energy']=metadata['energy'].apply(lambda x: 'yes_energy' if x=='✓' else 'no_energy')
    metadata['power']=metadata['power'].apply(lambda x: 'yes_power' if x=='✓' else 'no_power')
    metadata['sensor']=metadata['sensor'].apply(lambda x: 'yes_sensor' if x=='✓' else 'no_sensor')
    metadata['water']=metadata['water'].apply(lambda x: 'yes_water' if x=='✓' else 'no_water')
    metadata['unit']=metadata['unit'].apply(lambda x: 'omit' if x=='_' else x)
    # inner join metadata and 2b_EC_data_df 
    merged_inner=pd.merge(ec_data2, metadata, left_on='uniqueId', right_on='uniqueId', how='inner')

    #         b) Apply feature selection function(s) to the joined EC+metadata
    # load NRCan classifications training data
    nrcan_labels=pd.read_csv('~/data-599-capstone-ubc-urban-data-lab/data/PharmacyEnergyConsumption-secondtry - PharmacyEnergyConsumption-secondtry.csv')

    # make uniqueId
    nrcan_labels['siteRef']='Pharmacy'
    nrcan_labels=data_preparation.create_unique_id(nrcan_labels)

    # rename columns to fix unit of measurements
    nrcan_labels.rename(columns={'UBC_EWS.firstValue':'value'}, inplace=True)
    # run correct_df_units function
    data_preparation.correct_df_units(nrcan_labels)

    # TRAINING DATA CLEANING (maybe its own module with metadata?)
    # can change ? to 0 since uom fixed 
    nrcan_labels['isGas']=training.isGas.apply(lambda x: '0' if x=='?' else x)
    # changing boolean for more descriptive encoding 
    nrcan_labels['isGas']=training.isGas.apply(lambda x: 'no_gas' if x=='0' else 'yes_gas')

    # selecting relevant training data fields 
    nrcan_labels=nrcan_labels[['uniqueId', 'isGas', 'equipRef', 'groupRef', 'navName','ALEX-NRCanLabelGuess']]
    nrcan_labels=nrcan_labels.drop_duplicates()
    merged_outer=pd.merge(left=merged_inner, right=nrcan_labels, how='outer', left_on='uniqueId', right_on='uniqueId')
    # make equipRef and navName into smaller categories for feature engineering
    merged_outer['equipNew']=merged_outer.equipRef.apply(lambda x: data_preparation.equip_label(x))
    merged_outer['navNew']=merged_outer.navName.apply(lambda x: data_preparation.nav_label(x))

    #         c) Encode and scale the EC+metadata
    # encoding after feature selection
    merged_outer['groupRef_Air_Systems']=merged_outer.groupRef.apply(lambda x: 1 if x=='Air Systems' else 0)
    merged_outer['groupRef_Hydronic_Systems']=merged_outer.groupRef.apply(lambda x: 1 if x=='Hydronic Systems' else 0)
    merged_outer['groupRef_Utilities']=merged_outer.groupRef.apply(lambda x: 1 if x=='Utilities' else 0)
    merged_outer['influxDB_units_kWh']=merged_outer.unit.apply(lambda x: 1 if x=='kWh' else 0)
    merged_outer['isGas_yes_gas']=merged_outer.isGas.apply(lambda x: 1 if x=='yes_gas' else 0)
    merged_outer['energy_no_energy']=merged_outer.energy.apply(lambda x: 1 if x=='no_energy' else 0)
    merged_outer['energy_yes_energy']=merged_outer.energy.apply(lambda x: 1 if x=='yes_energy' else 0)
    merged_outer['sensor_no_sensor']=merged_outer.sensor.apply(lambda x: 1 if x=='no_sensor' else 0)
    merged_outer['sensor_yes_sensor']=merged_outer.sensor.apply(lambda x: 1 if x=='yes_sensor' else 0)
    merged_outer['equipRef_Air_Equip']=merged_outer.equipNew.apply(lambda x: 1 if x=='Air_Equip' else 0)
    merged_outer['equipRef_Cooling']=merged_outer.equipNew.apply(lambda x: 1 if x=='Cooling' else 0)
    merged_outer['equipRef_Heating']=merged_outer.equipNew.apply(lambda x: 1 if x=='Heating' else 0)
    merged_outer['equipRef_LEED']=merged_outer.equipNew.apply(lambda x: 1 if x=='LEED' else 0)
    merged_outer['equipRef_OPC(TV)']=merged_outer.equipNew.apply(lambda x: 1 if x=='OPC(TV)' else 0)
    merged_outer['navName_Energy']=merged_outer.navNew.apply(lambda x: 1 if x=='Energy' else 0)

    # scaling after feature selection
    for i in range(1,6):
        merged_outer.iloc[:,i]=data_preparation.scale_continuous(merged_outer, indexes=[i])
    #         d) Join the model coeffecients from step2 output to the EC+metadata
    # left join model coefficients and ec+metadata 
    data = pd.merge(merged_outer, final_df, left_on='uniqueId', right_on='uniqueId', how='outer')
    # dropping unnessary columns to feed into classification
    data = data.drop(['kind', 'energy', 'power', 'sensor', 'water', 'isGas', 'equipRef', 'groupRef', 'navName', 'equipNew', 'navNew', 'unit'], axis=1)
    #         OUTPUT OF STEP = dataframe with EC sensor ID fields, selected EC features, model coeffecients

  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [30]:
data

Unnamed: 0,uniqueId,min,std,max,mean,update_rate,ALEX-NRCanLabelGuess,groupRef_Air_Systems,groupRef_Hydronic_Systems,groupRef_Utilities,...,10,11,12,13,14,15,16,17,18,19
0,AHU-01 SF Air Systems Energy AHU1_SF_VFD_PWR(kWh),0.000176,0.004027,0.000177,0.000177,1.0,4_Auxillary_Motors,0.0,0.0,0.0,...,0.0,-8.044641e-07,-1.642989e-05,-0.000002,-0.000068,7.067823e-06,-1.203895e-05,-1.863212e-05,4.729968e-06,-8.005030e-06
1,AHU-02 SF Air Systems Energy AHU2_SF_VFD_PWR(kWh),0.000153,0.004096,0.000155,0.000154,1.0,4_Auxillary_Motors,0.0,0.0,0.0,...,0.0,-8.235311e-07,-1.739682e-05,-0.000002,-0.000071,7.475479e-06,-1.273178e-05,-1.969023e-05,4.983808e-06,-8.524592e-06
2,AHU-03 SF Air Systems Energy AHU3_SF_VFD_PWR(kWh),0.000704,0.011131,0.000708,0.000706,1.0,4_Auxillary_Motors,0.0,0.0,0.0,...,0.0,-1.928326e-06,-4.859430e-05,-0.000006,-0.000199,2.034183e-05,-3.553179e-05,-5.434261e-05,1.325928e-05,-2.411629e-05
3,AHU-04 SF Air Systems Energy AHU4_SF_VFD_PWR(kWh),0.000700,0.010972,0.000704,0.000702,1.0,4_Auxillary_Motors,0.0,0.0,0.0,...,0.0,-1.905828e-06,-4.783331e-05,-0.000006,-0.000195,2.004111e-05,-3.495969e-05,-5.350754e-05,1.306443e-05,-2.373670e-05
4,AHU-05 SF Air Systems Energy AHU5_SF_VFD_PWR(kWh),0.000000,0.000000,0.000000,0.000000,1.0,4_Auxillary_Motors,0.0,0.0,0.0,...,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,HW Submeters FM-8 Utilities Power MV_PHARM_FM8...,,,,,,0_NOT_ENERGY_CONSUMPTION,0.0,0.0,0.0,...,,,,,,,,,,
700,HW Submeters FM-9 Utilities Power FM9_BTU_CRAH...,,,,,,0_NOT_ENERGY_CONSUMPTION,0.0,0.0,0.0,...,,,,,,,,,,
701,Cooling Plant CT-1A Hydronic Systems Energy CT...,,,,,,,,,,...,0.0,0.000000e+00,6.837714e-10,0.000000,0.000020,9.368307e-09,2.895755e-09,-2.042393e-09,-3.916348e-09,1.984014e-09
702,Cooling Plant CT-1B Hydronic Systems Energy CT...,,,,,,,,,,...,0.0,0.000000e+00,-7.573049e-11,0.000000,0.000005,-1.042308e-10,-1.681848e-11,4.058360e-11,-2.229658e-10,4.178189e-11
