In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
# Preprocessing
from sklearn.preprocessing import MinMaxScaler
# Algorithms
from minisom import MiniSom # This is for the unsupervised clustering
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [2]:
def read_data():
    path = r'C:\\Users\\jesse\\Downloads\\clases\\electrical data\\london data' # The path were all the .csv are kept"
    all_files = glob.glob(path + "/*.csv")
    
    li = [] # list for the .csv
    
    for filename in all_files:
        df = pd.read_csv(filename, index_col=None, header=0, na_values = 0)
        li.append(df) # append all .csv to dataframe
        
    frame = pd.concat(li, axis=0, ignore_index=True)
    return frame

In [3]:
df = read_data() # get the data

In [4]:
df

Unnamed: 0,LCLid,stdorToU,DateTime,KWH/hh (per half hour),Acorn,Acorn_grouped
0,MAC000002,Std,10/12/2012 0:30,,ACORN-A,Affluent
1,MAC000002,Std,10/12/2012 1:00,,ACORN-A,Affluent
2,MAC000002,Std,10/12/2012 1:30,,ACORN-A,Affluent
3,MAC000002,Std,10/12/2012 2:00,,ACORN-A,Affluent
4,MAC000002,Std,10/12/2012 2:30,,ACORN-A,Affluent
...,...,...,...,...,...,...
167932469,MAC004221,Std,2013-08-06 04:30:00.0000000,0.017,ACORN-E,Affluent
167932470,MAC004221,Std,2013-08-06 05:00:00.0000000,0.017,ACORN-E,Affluent
167932471,MAC004221,Std,2013-08-06 05:30:00.0000000,0.017,ACORN-E,Affluent
167932472,MAC004221,Std,2013-08-06 06:00:00.0000000,0.017,ACORN-E,Affluent


In [5]:
df.rename(columns={'KWH/hh (per half hour) ' : 'KWH/hh'}, inplace=True) # rename column to make easier to read.

In [6]:
# Missing data.
df.fillna(0.0, inplace=True)
df['KWH/hh'].replace('Null', 0, inplace=True) # ValueError: could not convert string to float: 'Null'
df['KWH/hh'] = df['KWH/hh'].astype(float) 

In [7]:
df['DateTime'] = pd.to_datetime(df['DateTime']) # convert to datetime datetype

In [8]:
df.rename(columns={'Acorn_grouped_ACORN-' : 'Acorn_grouped_ACORN'}, inplace=True) # rename these col so I can drop them.
df.rename(columns={'Acorn_grouped_ACORN-U' : 'Acorn_grouped_ACORNU'}, inplace=True) # rename these col so I can drop them.

In [9]:
# drop the rows that have one of the bewlow values in 'Acorn_grouped'
df = df.drop(df[(df.Acorn_grouped == "ACORN-U") | (df.Acorn_grouped == "ACORN-")].index)

In [10]:
df

Unnamed: 0,LCLid,stdorToU,DateTime,KWH/hh,Acorn,Acorn_grouped
0,MAC000002,Std,2012-10-12 00:30:00,0.000,ACORN-A,Affluent
1,MAC000002,Std,2012-10-12 01:00:00,0.000,ACORN-A,Affluent
2,MAC000002,Std,2012-10-12 01:30:00,0.000,ACORN-A,Affluent
3,MAC000002,Std,2012-10-12 02:00:00,0.000,ACORN-A,Affluent
4,MAC000002,Std,2012-10-12 02:30:00,0.000,ACORN-A,Affluent
...,...,...,...,...,...,...
167932469,MAC004221,Std,2013-08-06 04:30:00,0.017,ACORN-E,Affluent
167932470,MAC004221,Std,2013-08-06 05:00:00,0.017,ACORN-E,Affluent
167932471,MAC004221,Std,2013-08-06 05:30:00,0.017,ACORN-E,Affluent
167932472,MAC004221,Std,2013-08-06 06:00:00,0.017,ACORN-E,Affluent


In [11]:
gb1 = df.groupby('LCLid') # group by ID to be able to look at the start and end dates so as to figure out where to slice.

In [12]:
min(gb1.first().DateTime) # earlist start

Timestamp('2011-11-23 09:00:00')

In [13]:
 max(gb1.first().DateTime) # latest start

Timestamp('2013-10-29 18:23:02')

In [14]:
min(gb1.last().DateTime) # earlist stop

Timestamp('2012-05-01 21:00:00')

In [15]:
 max(gb1.last().DateTime) # latest stop

Timestamp('2014-02-28 00:00:00')

In [16]:
df2 = df.set_index(['DateTime'])
df2 = df2.loc['2013-01-01':'2014-01-01'] # 1 year of data.

In [17]:
df2

Unnamed: 0_level_0,LCLid,stdorToU,KWH/hh,Acorn,Acorn_grouped
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-01-01 00:00:00,MAC000002,Std,0.219,ACORN-A,Affluent
2013-01-01 00:30:00,MAC000002,Std,0.241,ACORN-A,Affluent
2013-01-01 01:00:00,MAC000002,Std,0.191,ACORN-A,Affluent
2013-01-01 01:30:00,MAC000002,Std,0.235,ACORN-A,Affluent
2013-01-01 02:00:00,MAC000002,Std,0.182,ACORN-A,Affluent
...,...,...,...,...,...
2013-08-06 04:30:00,MAC004221,Std,0.017,ACORN-E,Affluent
2013-08-06 05:00:00,MAC004221,Std,0.017,ACORN-E,Affluent
2013-08-06 05:30:00,MAC004221,Std,0.017,ACORN-E,Affluent
2013-08-06 06:00:00,MAC004221,Std,0.017,ACORN-E,Affluent


In [18]:
df2 = df2.reset_index() # Remove datetime from index.
gb2 = df2.groupby(['LCLid']).DateTime.count() # Group df by id and get count of the # of rows.
df3 = pd.DataFrame(gb2) # Put gb in df.
df3.rename(columns={'DateTime': '# of rows for each series'}, inplace=True)

In [19]:
df3

Unnamed: 0_level_0,# of rows for each series
LCLid,Unnamed: 1_level_1
MAC000002,17580
MAC000003,17578
MAC000004,17579
MAC000005,17579
MAC000006,17580
...,...
MAC005561,17575
MAC005562,17579
MAC005564,17571
MAC005566,17579


In [20]:
lst_for_good_ids = [17580] # Series whos # of rows equals 17,580.
# Filter df for only series of len 17,580.
df4 = pd.DataFrame(df3.loc[df3['# of rows for each series'].isin(lst_for_good_ids)])

In [21]:
df6 = pd.DataFrame(df2.loc[df2['LCLid'].isin(df4.index.to_list())])
df6

Unnamed: 0,DateTime,LCLid,stdorToU,KWH/hh,Acorn,Acorn_grouped
0,2013-01-01 00:00:00,MAC000002,Std,0.219,ACORN-A,Affluent
1,2013-01-01 00:30:00,MAC000002,Std,0.241,ACORN-A,Affluent
2,2013-01-01 01:00:00,MAC000002,Std,0.191,ACORN-A,Affluent
3,2013-01-01 01:30:00,MAC000002,Std,0.235,ACORN-A,Affluent
4,2013-01-01 02:00:00,MAC000002,Std,0.182,ACORN-A,Affluent
...,...,...,...,...,...,...
92477104,2014-01-01 21:30:00,MAC004219,Std,0.266,ACORN-E,Affluent
92477105,2014-01-01 22:00:00,MAC004219,Std,0.260,ACORN-E,Affluent
92477106,2014-01-01 22:30:00,MAC004219,Std,0.301,ACORN-E,Affluent
92477107,2014-01-01 23:00:00,MAC004219,Std,0.223,ACORN-E,Affluent


In [22]:
df6['Day'] = df6['DateTime'].dt.day_name() # Add the day of the week.

In [23]:
df_temp = pd.read_csv("data\\london 2013 temp.csv") # Load hourly temp data for London, UK in 2013.

In [24]:
df_temp
df_temp['DateTime'] = pd.to_datetime(df_temp['ob_time']) # convert to datetime datetype
df_temp.drop(['ob_time'], axis=1, inplace = True)

In [25]:
df7 = pd.merge(df6,df_temp ,on='DateTime',how='left')
df7['air_temperature'] = df7['air_temperature'].interpolate() # Fill in missing values for temp.
df7['air_temperature'] = df7['air_temperature'] * 1.8 + 32 # Convert to F degress.

In [26]:
df7

Unnamed: 0,DateTime,LCLid,stdorToU,KWH/hh,Acorn,Acorn_grouped,Day,air_temperature
0,2013-01-01 00:00:00,MAC000002,Std,0.219,ACORN-A,Affluent,Tuesday,48.02
1,2013-01-01 00:30:00,MAC000002,Std,0.241,ACORN-A,Affluent,Tuesday,47.84
2,2013-01-01 01:00:00,MAC000002,Std,0.191,ACORN-A,Affluent,Tuesday,47.66
3,2013-01-01 01:30:00,MAC000002,Std,0.235,ACORN-A,Affluent,Tuesday,47.39
4,2013-01-01 02:00:00,MAC000002,Std,0.182,ACORN-A,Affluent,Tuesday,47.12
...,...,...,...,...,...,...,...,...
19654435,2014-01-01 21:30:00,MAC004219,Std,0.266,ACORN-E,Affluent,Wednesday,42.80
19654436,2014-01-01 22:00:00,MAC004219,Std,0.260,ACORN-E,Affluent,Wednesday,42.80
19654437,2014-01-01 22:30:00,MAC004219,Std,0.301,ACORN-E,Affluent,Wednesday,42.80
19654438,2014-01-01 23:00:00,MAC004219,Std,0.223,ACORN-E,Affluent,Wednesday,42.80


In [27]:
df7['Month'] = df7['DateTime'].dt.strftime('%B')

In [28]:
df7

Unnamed: 0,DateTime,LCLid,stdorToU,KWH/hh,Acorn,Acorn_grouped,Day,air_temperature,Month
0,2013-01-01 00:00:00,MAC000002,Std,0.219,ACORN-A,Affluent,Tuesday,48.02,January
1,2013-01-01 00:30:00,MAC000002,Std,0.241,ACORN-A,Affluent,Tuesday,47.84,January
2,2013-01-01 01:00:00,MAC000002,Std,0.191,ACORN-A,Affluent,Tuesday,47.66,January
3,2013-01-01 01:30:00,MAC000002,Std,0.235,ACORN-A,Affluent,Tuesday,47.39,January
4,2013-01-01 02:00:00,MAC000002,Std,0.182,ACORN-A,Affluent,Tuesday,47.12,January
...,...,...,...,...,...,...,...,...,...
19654435,2014-01-01 21:30:00,MAC004219,Std,0.266,ACORN-E,Affluent,Wednesday,42.80,January
19654436,2014-01-01 22:00:00,MAC004219,Std,0.260,ACORN-E,Affluent,Wednesday,42.80,January
19654437,2014-01-01 22:30:00,MAC004219,Std,0.301,ACORN-E,Affluent,Wednesday,42.80,January
19654438,2014-01-01 23:00:00,MAC004219,Std,0.223,ACORN-E,Affluent,Wednesday,42.80,January


In [29]:
df7.to_csv("C:\\Users\\jesse\\Downloads\\clases\\electrical data\\london data\\data\\2013_01_01_2014_01_01_day_temp.csv", index = False) # full dataset

### This section is created to filter the df LCLid's down from there current amout to just 30 per group of acorn_group and std or tou.

In [30]:
import random

In [31]:
gb1 = df7.groupby(['Acorn_grouped', 'stdorToU']) # group by to do more EDA.
groups = dict(list(gb1)) # makes a dict of dataframs which can be accesed below. 

Adversity_Std = groups['Adversity', 'Std']
Adversity_ToU = groups['Adversity', 'ToU']
Affluent_Std = groups['Affluent', 'Std']
Affluent_ToU = groups['Affluent', 'ToU']
Comfortable_Std = groups['Comfortable', 'Std']
Comfortable_ToU = groups['Comfortable', 'ToU']

In [32]:
(gb1.size()/17580).round(0) # how many series belong to each group.

Acorn_grouped  stdorToU
Adversity      Std         280.0
               ToU          70.0
Affluent       Std         348.0
               ToU         106.0
Comfortable    Std         241.0
               ToU          73.0
dtype: float64

In [33]:
ad_st = pd.Series(Adversity_Std['LCLid']).unique()
ad_t = pd.Series(Adversity_ToU['LCLid']).unique()
af_st = pd.Series(Affluent_Std['LCLid']).unique()
af_t = pd.Series(Affluent_ToU['LCLid']).unique()
c_st = pd.Series(Comfortable_Std['LCLid']).unique()
c_t = pd.Series(Comfortable_ToU['LCLid']).unique()

In [34]:
ad_st_lst = random.sample(list(ad_st), 30)
ad_t_lst = random.sample(list(ad_t), 30)
af_st_lst = random.sample(list(af_st), 30)
af_t_lst = random.sample(list(af_t), 30)
c_st_lst = random.sample(list(c_st), 30)
c_t_lst = random.sample(list(c_t), 30)

In [35]:
import itertools
filter_list =  []
for i in itertools.chain(ad_st_lst, ad_t_lst, af_st_lst, af_t_lst, c_st_lst, c_t_lst):
    print(i)
    filter_list.append(i)

print(len(filter_list))

MAC002299
MAC003462
MAC000006
MAC004461
MAC002996
MAC000615
MAC001485
MAC001816
MAC003894
MAC001909
MAC001929
MAC001128
MAC001546
MAC001703
MAC004019
MAC003205
MAC004680
MAC005236
MAC003411
MAC003924
MAC001168
MAC005386
MAC003604
MAC002600
MAC005166
MAC000872
MAC000714
MAC003667
MAC002685
MAC001558
MAC000590
MAC005240
MAC001133
MAC004821
MAC002418
MAC000587
MAC001512
MAC003675
MAC004497
MAC003816
MAC000107
MAC002910
MAC000187
MAC000383
MAC001715
MAC001880
MAC005102
MAC002318
MAC003315
MAC002728
MAC004065
MAC004236
MAC003706
MAC004952
MAC000582
MAC002346
MAC001509
MAC000303
MAC005434
MAC005085
MAC004332
MAC000715
MAC003005
MAC001665
MAC004428
MAC000116
MAC002296
MAC000506
MAC005276
MAC000956
MAC005497
MAC005137
MAC000140
MAC004984
MAC004327
MAC003160
MAC002358
MAC000732
MAC002558
MAC005214
MAC004694
MAC003668
MAC001184
MAC000301
MAC005187
MAC000179
MAC002233
MAC005293
MAC000968
MAC004879
MAC003474
MAC002433
MAC002106
MAC005278
MAC000822
MAC001756
MAC000165
MAC004720
MAC000772
MAC001826


In [36]:
df8 = df7[df7['LCLid'].isin(filter_list)]

In [37]:
df8

Unnamed: 0,DateTime,LCLid,stdorToU,KWH/hh,Acorn,Acorn_grouped,Day,air_temperature,Month
17580,2013-01-01 00:00:00,MAC000006,Std,0.000,ACORN-Q,Adversity,Tuesday,48.02,January
17581,2013-01-01 00:30:00,MAC000006,Std,0.000,ACORN-Q,Adversity,Tuesday,47.84,January
17582,2013-01-01 01:00:00,MAC000006,Std,0.000,ACORN-Q,Adversity,Tuesday,47.66,January
17583,2013-01-01 01:30:00,MAC000006,Std,0.000,ACORN-Q,Adversity,Tuesday,47.39,January
17584,2013-01-01 02:00:00,MAC000006,Std,0.000,ACORN-Q,Adversity,Tuesday,47.12,January
...,...,...,...,...,...,...,...,...,...
19214935,2014-01-01 21:30:00,MAC004060,Std,0.160,ACORN-G,Comfortable,Wednesday,42.80,January
19214936,2014-01-01 22:00:00,MAC004060,Std,0.249,ACORN-G,Comfortable,Wednesday,42.80,January
19214937,2014-01-01 22:30:00,MAC004060,Std,0.070,ACORN-G,Comfortable,Wednesday,42.80,January
19214938,2014-01-01 23:00:00,MAC004060,Std,0.048,ACORN-G,Comfortable,Wednesday,42.80,January


In [38]:
gb8 = df8.groupby(['Acorn_grouped', 'stdorToU']) # group by to do more EDA.
(gb8.size()/17580).round(0) # how many series belong to each group.

Acorn_grouped  stdorToU
Adversity      Std         30.0
               ToU         30.0
Affluent       Std         30.0
               ToU         30.0
Comfortable    Std         30.0
               ToU         30.0
dtype: float64

In [39]:
df8.to_csv("C:\\Users\\jesse\\Downloads\\clases\\electrical data\\london data\\data\\2013_01_01_2014_01_01_day_temp_filtered.csv", index = False) # full dataset

In [40]:
# try to figure when the 4th quartile enrgy use is.
# on-peak hours that generally refer to the hours beginning at 7:00 a.m. 
# until #11:00 p.m. on weekdays, and off-peak hours that are between 11:00 p.m. 
# and 7:00 a.m. on weekdays and all day on Saturdays, Sundays, and holidays.

# https://www.eia.gov/todayinenergy/detail.php?id=42915

### End section

In [41]:
Adversity_Std = Adversity_Std.set_index(['DateTime'])
Adversity_ToU = Adversity_ToU.set_index(['DateTime'])
Affluent_Std = Affluent_Std.set_index(['DateTime'])
Affluent_ToU = Affluent_ToU.set_index(['DateTime'])
Comfortable_Std = Comfortable_Std.set_index(['DateTime'])
Comfortable_ToU = Comfortable_ToU.set_index(['DateTime'])

In [42]:
Adversity_Std

Unnamed: 0_level_0,LCLid,stdorToU,KWH/hh,Acorn,Acorn_grouped,Day,air_temperature,Month
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013-01-01 00:00:00,MAC000006,Std,0.000,ACORN-Q,Adversity,Tuesday,48.02,January
2013-01-01 00:30:00,MAC000006,Std,0.000,ACORN-Q,Adversity,Tuesday,47.84,January
2013-01-01 01:00:00,MAC000006,Std,0.000,ACORN-Q,Adversity,Tuesday,47.66,January
2013-01-01 01:30:00,MAC000006,Std,0.000,ACORN-Q,Adversity,Tuesday,47.39,January
2013-01-01 02:00:00,MAC000006,Std,0.000,ACORN-Q,Adversity,Tuesday,47.12,January
...,...,...,...,...,...,...,...,...
2014-01-01 21:30:00,MAC004208,Std,0.739,ACORN-Q,Adversity,Wednesday,42.80,January
2014-01-01 22:00:00,MAC004208,Std,0.485,ACORN-Q,Adversity,Wednesday,42.80,January
2014-01-01 22:30:00,MAC004208,Std,0.520,ACORN-Q,Adversity,Wednesday,42.80,January
2014-01-01 23:00:00,MAC004208,Std,0.390,ACORN-Q,Adversity,Wednesday,42.80,January


In [43]:
on_peakdays = ['Monday' , 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
off_peakdays = ['Saturday', 'Sunday']

In [44]:
on_peak_Adversity_Std = Adversity_Std.between_time('07:00','23:00')
on_peak_Adversity_ToU = Adversity_ToU.between_time('07:00','23:00')
on_peak_Affluent_Std = Affluent_Std.between_time('07:00','23:00')
on_peak_Affluent_ToU = Affluent_ToU.between_time('07:00','23:00')
on_peak_Comfortable_Std = Comfortable_Std.between_time('07:00','23:00')
on_peak_Comfortable_ToU = Comfortable_ToU.between_time('07:00','23:00')

In [45]:
on_peak_Adversity_Std = on_peak_Adversity_Std[on_peak_Adversity_Std['Day'].isin(on_peakdays)] 
on_peak_Adversity_ToU = on_peak_Adversity_ToU[on_peak_Adversity_ToU['Day'].isin(on_peakdays)] 
on_peak_Affluent_Std = on_peak_Affluent_Std[on_peak_Affluent_Std['Day'].isin(on_peakdays)] 
on_peak_Affluent_ToU = on_peak_Affluent_ToU[on_peak_Affluent_ToU['Day'].isin(on_peakdays)] 
on_peak_Comfortable_Std = on_peak_Comfortable_Std[on_peak_Comfortable_Std['Day'].isin(on_peakdays)] 
on_peak_Comfortable_ToU = on_peak_Comfortable_ToU[on_peak_Comfortable_ToU['Day'].isin(on_peakdays)] 

In [46]:
## off-peak

In [47]:
off_peak_Adversity_Std = Adversity_Std.between_time('23:30','06:30')
off_peak_Adversity_ToU = Adversity_ToU.between_time('23:30','06:30')
off_peak_Affluent_Std = Affluent_Std.between_time('23:30','06:30')
off_peak_Affluent_ToU = Affluent_ToU.between_time('23:30','06:30')
off_peak_Comfortable_Std = Comfortable_Std.between_time('23:30','06:30')
off_peak_Comfortable_ToU = Comfortable_ToU.between_time('23:30','06:30')

In [48]:
off_peak_Adversity_Std = off_peak_Adversity_Std[off_peak_Adversity_Std['Day'].isin(off_peakdays)] 
off_peak_Adversity_ToU = off_peak_Adversity_ToU[off_peak_Adversity_ToU['Day'].isin(off_peakdays)] 
off_peak_Affluent_Std = off_peak_Affluent_Std[off_peak_Affluent_Std['Day'].isin(off_peakdays)] 
off_peak_Affluent_ToU = off_peak_Affluent_ToU[off_peak_Affluent_ToU['Day'].isin(off_peakdays)] 
off_peak_Comfortable_Std = off_peak_Comfortable_Std[off_peak_Comfortable_Std['Day'].isin(off_peakdays)] 
off_peak_Comfortable_ToU = off_peak_Comfortable_ToU[off_peak_Comfortable_ToU['Day'].isin(off_peakdays)] 

In [49]:
# output all 12 dfs to:
on_peak_Adversity_Std.to_csv("C:\\Users\\jesse\\Downloads\\clases\\electrical data\\london data\\data\\peak\\on_peak_Adversity_Std.csv", index = True) # full dataset
on_peak_Adversity_ToU.to_csv("C:\\Users\\jesse\\Downloads\\clases\\electrical data\\london data\\data\\peak\\on_peak_Adversity_ToU.csv", index = True)
on_peak_Affluent_Std.to_csv("C:\\Users\\jesse\\Downloads\\clases\\electrical data\\london data\\data\\peak\\on_peak_Affluent_Std.csv", index = True)
on_peak_Affluent_ToU.to_csv("C:\\Users\\jesse\\Downloads\\clases\\electrical data\\london data\\data\\peak\\on_peak_Affluent_ToU.csv", index = True)
on_peak_Comfortable_Std.to_csv("C:\\Users\\jesse\\Downloads\\clases\\electrical data\\london data\\data\\peak\\on_peak_Comfortable_Std.csv", index = True)
on_peak_Comfortable_ToU.to_csv("C:\\Users\\jesse\\Downloads\\clases\\electrical data\\london data\\data\\peak\\on_peak_Comfortable_ToU.csv", index = True)

###
off_peak_Adversity_Std.to_csv("C:\\Users\\jesse\\Downloads\\clases\\electrical data\\london data\\data\\peak\\off_peak_Adversity_Std.csv", index = True)
off_peak_Adversity_ToU.to_csv("C:\\Users\\jesse\\Downloads\\clases\\electrical data\\london data\\data\\peak\\off_peak_Adversity_ToU.csv", index = True)
off_peak_Affluent_Std.to_csv("C:\\Users\\jesse\\Downloads\\clases\\electrical data\\london data\\data\\peak\\off_peak_Affluent_Std.csv", index = True)
off_peak_Affluent_ToU.to_csv("C:\\Users\\jesse\\Downloads\\clases\\electrical data\\london data\\data\\peak\\off_peak_Affluent_ToU.csv", index = True)
off_peak_Comfortable_Std.to_csv("C:\\Users\\jesse\\Downloads\\clases\\electrical data\\london data\\data\\peak\\off_peak_Comfortable_Std.csv", index = True)
off_peak_Comfortable_ToU.to_csv("C:\\Users\\jesse\\Downloads\\clases\\electrical data\\london data\\data\\peak\\off_peak_Comfortable_ToU.csv", index = True)

In [50]:
on_peak_Adversity_Std.describe().round(3)

Unnamed: 0,KWH/hh,air_temperature
count,2420880.0,2420880.0
mean,0.207,53.627
std,0.237,12.57
min,0.0,25.88
25%,0.071,43.7
50%,0.137,52.7
75%,0.248,62.78
max,5.67,92.66


In [51]:
off_peak_Adversity_Std.describe().round(3)

Unnamed: 0,KWH/hh,air_temperature
count,437920.0,437920.0
mean,0.132,47.395
std,0.234,9.971
min,0.0,27.14
25%,0.043,38.66
50%,0.077,46.94
75%,0.139,55.76
max,4.887,73.85


In [52]:
Adversity_Std_df = pd.DataFrame(Adversity_Std.groupby('LCLid')['KWH/hh'].apply(list).to_dict()).mean(axis=1)
Adversity_ToU_df = pd.DataFrame(Adversity_ToU.groupby('LCLid')['KWH/hh'].apply(list).to_dict()).mean(axis=1)
Affluent_Std_df = pd.DataFrame(Affluent_Std.groupby('LCLid')['KWH/hh'].apply(list).to_dict()).mean(axis=1)
Affluent_ToU_df = pd.DataFrame(Adversity_Std.groupby('LCLid')['KWH/hh'].apply(list).to_dict()).mean(axis=1)
Comfortable_Std_df = pd.DataFrame(Adversity_Std.groupby('LCLid')['KWH/hh'].apply(list).to_dict()).mean(axis=1)
Comfortable_ToU_df = pd.DataFrame(Adversity_Std.groupby('LCLid')['KWH/hh'].apply(list).to_dict()).mean(axis=1)

In [53]:
# output all 12 dfs to:
on_peak_Adversity_Std_avg = pd.DataFrame(on_peak_Adversity_Std.groupby('LCLid')['KWH/hh'].apply(list).to_dict()).mean(axis=1)
on_peak_Adversity_ToU_avg = pd.DataFrame(on_peak_Adversity_ToU.groupby('LCLid')['KWH/hh'].apply(list).to_dict()).mean(axis=1)
on_peak_Affluent_Std_avg = pd.DataFrame(on_peak_Affluent_Std.groupby('LCLid')['KWH/hh'].apply(list).to_dict()).mean(axis=1)
on_peak_Affluent_ToU_avg = pd.DataFrame(on_peak_Affluent_ToU.groupby('LCLid')['KWH/hh'].apply(list).to_dict()).mean(axis=1)
on_peak_Comfortable_Std_avg = pd.DataFrame(on_peak_Comfortable_Std.groupby('LCLid')['KWH/hh'].apply(list).to_dict()).mean(axis=1)
on_peak_Comfortable_ToU_avg = pd.DataFrame(on_peak_Comfortable_ToU.groupby('LCLid')['KWH/hh'].apply(list).to_dict()).mean(axis=1)

###
off_peak_Adversity_Std_avg = pd.DataFrame(off_peak_Adversity_Std.groupby('LCLid')['KWH/hh'].apply(list).to_dict()).mean(axis=1)
off_peak_Adversity_ToU_avg = pd.DataFrame(off_peak_Adversity_ToU.groupby('LCLid')['KWH/hh'].apply(list).to_dict()).mean(axis=1)
off_peak_Affluent_Std_avg = pd.DataFrame(off_peak_Affluent_Std.groupby('LCLid')['KWH/hh'].apply(list).to_dict()).mean(axis=1)
off_peak_Affluent_ToU_avg = pd.DataFrame(off_peak_Affluent_ToU.groupby('LCLid')['KWH/hh'].apply(list).to_dict()).mean(axis=1)
off_peak_Comfortable_Std_avg = pd.DataFrame(off_peak_Comfortable_Std.groupby('LCLid')['KWH/hh'].apply(list).to_dict()).mean(axis=1)
off_peak_Comfortable_ToU_avg = pd.DataFrame(off_peak_Comfortable_ToU.groupby('LCLid')['KWH/hh'].apply(list).to_dict()).mean(axis=1)

In [54]:
off_peak_Adversity_Std_avg.describe().round(3)

count    1564.000
mean        0.132
std         0.039
min         0.081
25%         0.102
50%         0.122
75%         0.147
max         0.279
dtype: float64

In [55]:
on_peak_Adversity_Std_avg.describe().round(3)

count    8646.000
mean        0.207
std         0.053
min         0.117
25%         0.166
50%         0.196
75%         0.233
max         0.406
dtype: float64

## Discretizing

In [56]:
df7

Unnamed: 0,DateTime,LCLid,stdorToU,KWH/hh,Acorn,Acorn_grouped,Day,air_temperature,Month
0,2013-01-01 00:00:00,MAC000002,Std,0.219,ACORN-A,Affluent,Tuesday,48.02,January
1,2013-01-01 00:30:00,MAC000002,Std,0.241,ACORN-A,Affluent,Tuesday,47.84,January
2,2013-01-01 01:00:00,MAC000002,Std,0.191,ACORN-A,Affluent,Tuesday,47.66,January
3,2013-01-01 01:30:00,MAC000002,Std,0.235,ACORN-A,Affluent,Tuesday,47.39,January
4,2013-01-01 02:00:00,MAC000002,Std,0.182,ACORN-A,Affluent,Tuesday,47.12,January
...,...,...,...,...,...,...,...,...,...
19654435,2014-01-01 21:30:00,MAC004219,Std,0.266,ACORN-E,Affluent,Wednesday,42.80,January
19654436,2014-01-01 22:00:00,MAC004219,Std,0.260,ACORN-E,Affluent,Wednesday,42.80,January
19654437,2014-01-01 22:30:00,MAC004219,Std,0.301,ACORN-E,Affluent,Wednesday,42.80,January
19654438,2014-01-01 23:00:00,MAC004219,Std,0.223,ACORN-E,Affluent,Wednesday,42.80,January


In [57]:
df7['KWH/hh_group'] = pd.cut(df7['KWH/hh'], [0,1.561,10], include_lowest=True, labels=['< 99th per', '99th per'])
df7.sort_values('KWH/hh_group')

Unnamed: 0,DateTime,LCLid,stdorToU,KWH/hh,Acorn,Acorn_grouped,Day,air_temperature,Month,KWH/hh_group
0,2013-01-01 00:00:00,MAC000002,Std,0.219,ACORN-A,Affluent,Tuesday,48.02,January,< 99th per
13071176,2013-08-07 17:00:00,MAC000162,Std,0.129,ACORN-Q,Adversity,Wednesday,71.60,August,< 99th per
13071175,2013-08-07 16:30:00,MAC000162,Std,0.257,ACORN-Q,Adversity,Wednesday,71.15,August,< 99th per
13071174,2013-08-07 16:00:00,MAC000162,Std,0.184,ACORN-Q,Adversity,Wednesday,70.70,August,< 99th per
13071173,2013-08-07 15:30:00,MAC000162,Std,0.334,ACORN-Q,Adversity,Wednesday,70.97,August,< 99th per
...,...,...,...,...,...,...,...,...,...,...
9566656,2013-03-07 07:00:00,MAC000049,Std,2.383,ACORN-E,Affluent,Thursday,46.22,March,99th per
5155055,2013-03-27 16:00:00,MAC000834,ToU,1.572,ACORN-F,Comfortable,Wednesday,37.76,March,99th per
5155056,2013-03-27 16:30:00,MAC000834,ToU,1.581,ACORN-F,Comfortable,Wednesday,37.49,March,99th per
5155044,2013-03-27 10:30:00,MAC000834,ToU,1.575,ACORN-F,Comfortable,Wednesday,37.31,March,99th per


In [58]:
df7['KWH/hh_group'].value_counts()

< 99th per    19457722
99th per        196718
Name: KWH/hh_group, dtype: int64

In [59]:
df7['KWH/hh'].quantile(0.99)

1.562

In [60]:
gb9 = df7.groupby(['KWH/hh_group']) # group by to do more EDA.

In [61]:
gb9.size()

KWH/hh_group
< 99th per    19457722
99th per        196718
dtype: int64

In [62]:
df_99th = gb9.get_group('99th per')
#df_less_99th = gb9.get_group('< 99th per')

In [63]:
df_99th['Day'].value_counts()

Sunday       30985
Saturday     28444
Friday       27916
Monday       27790
Wednesday    27651
Tuesday      27363
Thursday     26569
Name: Day, dtype: int64

In [64]:
df_99th['Month'].value_counts()

January      31012
March        27060
February     25898
December     24155
November     18816
April        16795
October      11884
May          11065
September     9477
June          8196
August        6236
July          6124
Name: Month, dtype: int64

In [65]:
gb5 = df_99th.groupby(['Acorn_grouped', 'stdorToU'])

In [83]:
gb5.size()#/17580

Acorn_grouped  stdorToU
Adversity      Std          22900
               ToU           4345
Affluent       Std         116630
               ToU          10132
Comfortable    Std          35795
               ToU           6916
dtype: int64

In [71]:
arr1 = df_99th['LCLid'].unique()
arr2 = df7['LCLid'].unique()

In [73]:
diff = np.setdiff1d(arr2, arr1, assume_unique=False)

In [79]:
arr2.shape # all metrers ids

(1118,)

In [80]:
diff.shape # number of meters that dont have values in the top 1% over 1.5

(341,)

In [81]:
arr1.shape # # number of meters that do have values in the top 1% over 1.5

(777,)

In [84]:
df_99th

Unnamed: 0,DateTime,LCLid,stdorToU,KWH/hh,Acorn,Acorn_grouped,Day,air_temperature,Month,KWH/hh_group
1241,2013-01-26 20:00:00,MAC000002,Std,1.603,ACORN-A,Affluent,Saturday,45.32,January,99th per
2488,2013-02-21 19:00:00,MAC000002,Std,2.887,ACORN-A,Affluent,Thursday,34.34,February,99th per
2489,2013-02-21 19:30:00,MAC000002,Std,1.699,ACORN-A,Affluent,Thursday,34.07,February,99th per
2490,2013-02-21 20:00:00,MAC000002,Std,1.665,ACORN-A,Affluent,Thursday,33.80,February,99th per
2491,2013-02-21 20:30:00,MAC000002,Std,1.730,ACORN-A,Affluent,Thursday,33.62,February,99th per
...,...,...,...,...,...,...,...,...,...,...
19600489,2013-12-07 19:00:00,MAC004208,Std,1.590,ACORN-Q,Adversity,Saturday,46.04,December,99th per
19601201,2013-12-22 15:00:00,MAC004208,Std,1.618,ACORN-Q,Adversity,Sunday,48.74,December,99th per
19601211,2013-12-22 20:00:00,MAC004208,Std,1.622,ACORN-Q,Adversity,Sunday,42.80,December,99th per
19601636,2013-12-31 16:00:00,MAC004208,Std,1.593,ACORN-Q,Adversity,Tuesday,44.96,December,99th per


In [91]:
df_99th_filtered = df7[df7['LCLid'].isin(arr1)] 
df_less_99th_filtered = df7[~df7['LCLid'].isin(arr1)] 

In [90]:
df_99th_filtered

Unnamed: 0,DateTime,LCLid,stdorToU,KWH/hh,Acorn,Acorn_grouped,Day,air_temperature,Month,KWH/hh_group
0,2013-01-01 00:00:00,MAC000002,Std,0.219,ACORN-A,Affluent,Tuesday,48.02,January,< 99th per
1,2013-01-01 00:30:00,MAC000002,Std,0.241,ACORN-A,Affluent,Tuesday,47.84,January,< 99th per
2,2013-01-01 01:00:00,MAC000002,Std,0.191,ACORN-A,Affluent,Tuesday,47.66,January,< 99th per
3,2013-01-01 01:30:00,MAC000002,Std,0.235,ACORN-A,Affluent,Tuesday,47.39,January,< 99th per
4,2013-01-01 02:00:00,MAC000002,Std,0.182,ACORN-A,Affluent,Tuesday,47.12,January,< 99th per
...,...,...,...,...,...,...,...,...,...,...
19636855,2014-01-01 21:30:00,MAC004213,Std,0.329,ACORN-E,Affluent,Wednesday,42.80,January,< 99th per
19636856,2014-01-01 22:00:00,MAC004213,Std,0.242,ACORN-E,Affluent,Wednesday,42.80,January,< 99th per
19636857,2014-01-01 22:30:00,MAC004213,Std,0.268,ACORN-E,Affluent,Wednesday,42.80,January,< 99th per
19636858,2014-01-01 23:00:00,MAC004213,Std,0.364,ACORN-E,Affluent,Wednesday,42.80,January,< 99th per


In [94]:
df_less_99th_filtered['KWH/hh_group'].value_counts()

< 99th per    5994780
99th per            0
Name: KWH/hh_group, dtype: int64

In [95]:
df_99th_filtered['KWH/hh_group'].value_counts()

< 99th per    13462942
99th per        196718
Name: KWH/hh_group, dtype: int64

In [96]:
df_99th_filtered.to_csv("C:\\Users\\jesse\\Downloads\\clases\\electrical data\\london data\\data\\peak\\df_99th_filtered.csv", index = False) # full dataset
df_less_99th_filtered.to_csv("C:\\Users\\jesse\\Downloads\\clases\\electrical data\\london data\\data\\peak\\df_less_99th_filtered.csv", index = False)

In [None]:
weekdays = ['Monday' , 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
weekends = ['Saturday', 'Sunday']