In [0]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The call is interactive: it prompts for an authorization code before mounting.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
 cd drive/My\ Drive/New\ Problem\ Statement\ for\ GC\ Data\ Analytics

/content/drive/My Drive/New Problem Statement for GC Data Analytics


In [0]:
ls

'Evaluation Metric example.xlsx'   sample_submission.csv   train.csv
'Problem Statement.docx'           test.csv


In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import minimize
# from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from math import sqrt


In [0]:
# Load the raw training set. The path is relative, so this relies on the
# working directory having been set by the earlier `cd` cell.
data = pd.read_csv("train.csv")
# Last expression of the cell: displays the frame's (rows, columns).
data.shape

(132000, 5)

In [0]:
# Rows of building 4 only, renumbered from 0, without the building id column.
bui_4 = (
    data.loc[data['building_number'] == 4]
    .reset_index(drop=True)
    .drop(columns=['building_number'])
)

In [0]:
# Number of observations per seasonal cycle used by the Holt-Winters model.
# NOTE(review): 672 = 7 * 24 * 4, i.e. presumably one week of 15-minute
# readings — TODO confirm against the data's sampling interval.
season_len = 672
# Number of folds passed to TimeSeriesSplit during parameter tuning.
n_split = 14

In [0]:
class HoltWinters:
    """Scikit-learn like interface for the additive Holt-Winters method.

    Triple exponential smoothing with an additive trend and additive
    seasonal components.

    Parameters
    ----------
    season_len : int
        Number of observations in one seasonal cycle.
    alpha : float, default 0.9
        Smoothing factor applied to the level.
    beta : float, default 0.9
        Smoothing factor applied to the trend.
    gamma : float, default 0.5
        Smoothing factor applied to the seasonal components.

    Attributes
    ----------
    trend_, smooth_ : float
        Trend and level after the last observation (set by ``fit``).
    seasonals_ : numpy.ndarray of shape (season_len,)
        Seasonal components after the last observation (set by ``fit``).
    predictions_ : list of float
        One value per observation of the fitted series (set by ``fit``).
        ``predict`` never modifies it.
    """

    def __init__(self, season_len, alpha=0.9, beta=0.9, gamma=0.5):
        self.beta = beta
        self.alpha = alpha
        self.gamma = gamma
        self.season_len = season_len

    def fit(self, series):
        """Run the smoothing recursion over ``series`` with the stored parameters.

        Unlike scikit-learn's ``fit``, this does not learn the optimal
        alpha, beta and gamma; it uses whatever values were given to the
        constructor and produces the in-sample predictions.

        Parameters
        ----------
        series : array-like of float, 1-D
            Observations in time order. Must contain at least two full
            seasons, because the initial trend compares the first season
            with the second one.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``season_len`` is smaller than 1 or ``series`` is shorter
            than ``2 * season_len``.
        """
        # Work on a plain float array so that all indexing below is positional,
        # whatever container (list, pandas Series, ndarray) the caller passed.
        series = np.asarray(series, dtype=float)
        season_len = self.season_len
        if season_len < 1:
            raise ValueError("season_len must be a positive integer")
        if len(series) < 2 * season_len:
            raise ValueError(
                "series must contain at least two full seasons "
                "({} observations), got {}".format(2 * season_len, len(series))
            )

        beta = self.beta
        alpha = self.alpha
        gamma = self.gamma
        seasonals = self._initial_seasonal(series)

        # initial values: the level starts at the first observation
        smooth = series[0]
        trend = self._initial_trend(series)
        predictions = [smooth]

        for i in range(1, len(series)):
            value = series[i]
            previous_smooth = smooth
            slot = i % season_len
            seasonal = seasonals[slot]
            smooth = alpha * (value - seasonal) + (1 - alpha) * (previous_smooth + trend)
            trend = beta * (smooth - previous_smooth) + (1 - beta) * trend
            seasonals[slot] = gamma * (value - smooth) + (1 - gamma) * seasonal
            predictions.append(smooth + trend + seasonals[slot])

        self.trend_ = trend
        self.smooth_ = smooth
        self.seasonals_ = seasonals
        self.predictions_ = predictions
        return self

    def _initial_trend(self, series):
        """Average per-step change between the first and the second season."""
        season_len = self.season_len
        total = 0.0
        for i in range(season_len):
            total += (series[i + season_len] - series[i]) / season_len

        trend = total / season_len
        return trend

    def _initial_seasonal(self, series):
        """Initial seasonal components.

        For every position within the season, average the deviation of the
        observation from the mean of the season it belongs to, over all
        complete seasons in ``series``.
        """
        season_len = self.season_len
        n_seasons = len(series) // season_len

        # mean of each complete season
        season_averages = np.zeros(n_seasons)
        for j in range(n_seasons):
            start_index = season_len * j
            end_index = start_index + season_len
            season_averages[j] = np.sum(series[start_index:end_index]) / season_len

        seasonals = np.zeros(season_len)
        # index[k] is the position where season k starts
        index = np.arange(n_seasons) * season_len
        for i in range(season_len):
            seasonals[i] = np.sum(series[index + i] - season_averages) / n_seasons

        return seasonals

    def predict(self, n_preds=10):
        """Return the in-sample predictions followed by ``n_preds`` forecasts.

        Parameters
        ----------
        n_preds: int, default 10
            Predictions horizon. e.g. If the original input time series to the .fit
            method has a length of 50, then specifying n_preds = 10, will generate
            predictions for the next 10 steps. Resulting in a prediction length of 60.

        Returns
        -------
        list of float
            A new list on every call; the fitted state (``predictions_``) is
            left untouched, so repeated calls give identical results.
        """
        fitted = self.predictions_
        n_obs = len(fitted)
        # Copy instead of appending to self.predictions_: extending the stored
        # list would shift the forecast origin on every later call.
        predictions = list(fitted)
        for i in range(n_obs, n_obs + n_preds):
            m = i - n_obs + 1  # steps ahead of the last observation
            prediction = self.smooth_ + m * self.trend_ + self.seasonals_[i % self.season_len]
            predictions.append(prediction)

        return predictions

In [0]:
def timeseries_cv_score(params, series, loss_function, season_len=season_len, n_splits=n_split):
    """
    Mean out-of-sample loss of a Holt-Winters model over time-ordered folds.

    ``params`` unpacks to (alpha, beta, gamma). For every split produced by
    TimeSeriesSplit a fresh model is fitted on the training indices of
    ``series``, asked to forecast as many steps as the fold's test set has,
    and scored with ``loss_function(actual, forecast)``. The average of the
    per-fold scores is returned.
    """
    alpha, beta, gamma = params
    splitter = TimeSeriesSplit(n_splits=n_splits)

    fold_errors = []
    for train_idx, test_idx in splitter.split(series):
        fitted_model = HoltWinters(season_len, alpha, beta, gamma)
        fitted_model.fit(series[train_idx])

        # predict() returns in-sample values followed by the forecast;
        # only the trailing forecast part is compared with the test fold
        full_path = fitted_model.predict(n_preds=len(test_idx))
        forecast = full_path[-len(test_idx):]
        fold_errors.append(loss_function(series[test_idx], forecast))

    return np.mean(fold_errors)

In [0]:
def rmse(y_true, y_pred):
    """Root of the mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return sqrt(mse)

In [0]:
def _train(data, n_preds=10325):
  """Tune, fit and forecast a Holt-Winters model for one series.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame whose second column (position 1) holds the series to model.
  n_preds : int, default 10325
      Number of steps to forecast past the end of the series. The default
      keeps the horizon that used to be hard-coded here.

  Returns
  -------
  model : HoltWinters
      Model fitted on the whole series with the tuned alpha, beta, gamma.
  forecast : list
      The out-of-sample part of the predictions, i.e. everything after the
      last training observation.

  Notes
  -----
  The season length and the number of CV folds come from the module-level
  ``season_len`` and ``n_split``. Season averages and initial seasonal
  components are printed for inspection only; the model recomputes its own.
  """
  train = data.iloc[:, 1]
  train_values = train.values
  n_obs = len(train_values)

  # given the season length, figure out how many complete seasons the
  # series contains and compute the average value of each of them
  n_seasons = n_obs // season_len

  season_averages = np.zeros(n_seasons)
  for j in range(n_seasons):
    start_index = season_len * j
    end_index = start_index + season_len
    season_averages[j] = np.sum(train[start_index:end_index]) / season_len

  print("Season Averages:")
  print(season_averages)

  # estimate the initial seasonal components
  seasonals = np.zeros(season_len)
  index = np.arange(n_seasons) * season_len
  for i in range(season_len):
    seasonals[i] = np.sum(train[index + i] - season_averages) / n_seasons

  print("Seasonals:")
  print(seasonals)

  # starting point for (alpha, beta, gamma); each is bounded to [0, 1]
  x = [0, 0, 0]

  print("Optimizing parameters")
  opt = minimize(timeseries_cv_score, x0=x,
                 args=(train_values, mean_squared_error, season_len, n_split),
                 method='TNC', bounds=((0, 1), (0, 1), (0, 1)))

  print('Original parameters: {}'.format(str(x)))
  print('Best parameters: {}'.format(str(opt.x)))

  alpha_final, beta_final, gamma_final = opt.x
  model = HoltWinters(season_len, alpha_final, beta_final, gamma_final)
  model.fit(train_values)
  predictions = model.predict(n_preds=n_preds)

  print('original series length: ', len(train))
  print('prediction length: ', len(predictions))

  # Slice by the actual series length rather than a fixed offset so that
  # only the forecast is returned whatever the size of the input.
  return model, predictions[n_obs:]

In [0]:
# Fit one model per meter for every building in the list.
# mod_bui[b] is the list of fitted models of building b; pred_bui[b] is an
# array with one row per forecast step and one column per meter.
mod_bui = []
pred_bui = []
for building in [bui_4]:
  models = []
  pred = []
  for meter in ['main_meter', 'sub_meter_1', 'sub_meter_2']:
    # Deliberately not named `data`: rebinding that global would replace the
    # raw training frame and break re-running the cells that filter it.
    meter_frame = building[['timestamp', meter]]
    fitted_model, forecast = _train(meter_frame)
    models.append(fitted_model)
    pred.append(forecast)

  mod_bui.append(models)
  pred_bui.append(np.array(pred).transpose())

Season Averages:
[ 5327.61683093  5332.36316654  4565.07368171  4795.79646386
  4670.04406244  4885.05832634  4727.93195541  4492.16334154
  4621.57339022  5967.12835681  7543.66531935 10250.26758842
 11434.92211199  9939.79552698 11310.68157917 10191.91561087
 10997.8351897  12044.75144899 12328.41309988  9223.64616509
  9485.14443808  9920.20160001  7698.03438352  7581.90070331
  6936.16380331  6136.69434639  6225.12399861  6080.29034722
  6115.64684998  6067.21992311  5836.01616841  5919.23436646
  5697.6478409   6262.99261466  6628.29001019  6979.21344669
  7181.0323039   8445.04815246  6740.57225501]
Seasonals:
[-3115.07473241 -3142.41963397 -3145.43560997 -3212.21146062
 -3164.75162955 -3228.57248332 -3194.66324682 -3221.61745454
 -3261.95247835 -3235.23087107 -3173.80552306 -3459.51221668
 -3377.41224025 -3432.47597543 -3559.34435828 -3472.76890521
 -3204.98252119 -3304.81112023 -3258.6171158  -3151.87884184
 -3123.73665283 -3258.64850279 -3213.24167354 -2881.84454963
 -1729.497

In [0]:
# NOTE(review): `test` and `predictions` are not assigned at top level in any
# cell visible above this one (`predictions` only exists inside `_train`), so
# this cell looks like a leftover and would presumably raise NameError on a
# fresh Restart & Run All — confirm, then remove or rework it.
error = rmse(test, predictions[20544:])
print('Rmse: {0:.4f}'.format(error))

In [0]:
# Takes the last 5856 rows as a hold-out set and drops the timestamp column.
# NOTE(review): `bui_1` is not defined in any visible cell (only `bui_4` is);
# presumably a leftover from a run on building 1 — confirm which frame is meant.
test = bui_1[-5856:].drop(['timestamp'],axis=1)
test.values.shape

In [0]:
# NOTE(review): `pred` is the list left over from the training loop: one
# forecast per meter for the steps AFTER the end of the training series
# (10325 steps each, per the shape shown below), while `test` is a 5856-row
# slice of observed data. The two presumably neither align in time nor match
# in shape — confirm what this comparison is meant to measure.
rmse(test.values,pred)

In [0]:
# Shape of the forecasts for the first (and only) building in the loop:
# one row per forecast step, one column per meter.
pred_bui[0].shape

(10325, 3)

In [0]:
# Write the forecasts to CSV; columns are unnamed and appear as 0, 1, 2.
# NOTE(review): pred_bui[0] was produced from `bui_4`, yet the file is named
# "bui_1.csv" — confirm the intended file name before relying on this output.
pd.DataFrame(pred_bui[0]).to_csv("bui_1.csv",index=False)