# Preprocess for House Dataset

In [1]:
import codecs
import numpy as np
import pandas as pd
import pickle as pkl
from tqdm import tqdm

# Load original data

`household_power_consumption.txt` is downloaded from https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption .

In [2]:
df = pd.read_csv('../house_dataset/household_power_consumption.txt', sep='\t', delimiter=';', low_memory=False)

# Process the original data

## Replace NaN to 0

In [3]:
df = df.replace('?', 0)
df = df.replace(np.nan, 0)

In [4]:
df.isnull().any()

Date                     False
Time                     False
Global_active_power      False
Global_reactive_power    False
Voltage                  False
Global_intensity         False
Sub_metering_1           False
Sub_metering_2           False
Sub_metering_3           False
dtype: bool

## Use Sub_metering

In [5]:
df_date = pd.to_datetime(df['Date'], format="%d/%m/%Y")
df_time = df['Time']

df_sub = df[[
    'Sub_metering_1',
    'Sub_metering_2',
    'Sub_metering_3'
]].astype(float)

## Add sorted indices

In [6]:
times = sorted(list(set(df_time)))

In [7]:
dates = sorted(list(set(df_date)))

In [8]:
print(len(dates))
print(len(times))

1442
1440


In [9]:
def sort_time(x):
    return times.index(x)

df_sub['time(sort)'] = df_time.apply(sort_time)

In [10]:
def sort_date(x):
    return dates.index(x)

df_sub['date(sort)'] = df_date.apply(sort_date)

In [11]:
df_sub.head()

Unnamed: 0,Sub_metering_1,Sub_metering_2,Sub_metering_3,time(sort),date(sort)
0,0.0,1.0,17.0,1044,0
1,0.0,1.0,16.0,1045,0
2,0.0,2.0,17.0,1046,0
3,0.0,1.0,17.0,1047,0
4,0.0,1.0,17.0,1048,0


# Create clustering dataset

In [12]:
def aggregate_time(sep):
    
    XX = np.zeros([int(len(times) / sep), len(dates), 3])
    
    for i in tqdm(range(int(len(df_sub)))):
        rec = df_sub.iloc[i]
        XX[
            int(rec['time(sort)'] // sep),
            int(rec['date(sort)']),
            :
        ] += rec[['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']]

    return XX

In [13]:
XX_pre = aggregate_time(sep=5)

100%|██████████| 2075259/2075259 [20:58<00:00, 1648.51it/s]


In [14]:
XX = []
for t in range(96):
    XX.append(
        np.r_[XX_pre[3 * t], XX_pre[3 * t + 1], XX_pre[3 * t + 2]]
    )
XX = np.array(XX)
XX = np.log(1 + XX)

In [15]:
with open('../house_dataset/house_XX.pkl', 'wb') as f:
    pkl.dump(XX, f)