In [1]:
import os
os.chdir('..')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import torch
from torch import nn

import sklearn as sk
from sklearn.ensemble import RandomForestRegressor


from Energy_ds.dataset import DataPrep, DataModule, EnergyDataset
from Energy_ds.config import SEASON, REGION, DatasetConfig

from neural.train import LightningTrainer
from neural.module import LightningModel, LightningWrapper
from neural.mlp import MlpBlock, RMLP, MLPLayer
from neural.config import TrainConfig


In [2]:
path = "Hourly_Energy_Consumption/AEP_hourly.csv"


In [3]:
# Dataset configuration for the training split: past_window=24 over years
# (2006, 2010). Presumably past_window is the number of past readings used as
# features — confirm against DatasetConfig.
train_config = DatasetConfig(
    past_window=24,
    years=(2006, 2010)
)
# Dataset configuration for the test split: same past_window, years (2010, 2014).
# NOTE(review): both ranges name 2010; if DatasetConfig treats `years` as
# inclusive on both ends, the train and test splits share that year — confirm.
test_config = DatasetConfig(
    past_window=24,
    years=(2010, 2014)
)

In [4]:
# Build the train and test datasets from the same CSV, one per config.
train_ds = EnergyDataset(path, train_config)
test_ds = EnergyDataset(path, test_config)
# NOTE(review): the shapes printed for this cell appear identical for both
# datasets (121249 rows each) although the configs request different year
# ranges — verify that EnergyDataset actually applies the configured years.

# Column names of the training feature frame.
columns = train_ds.features_df.columns

(121249,)
(121249, 30)
(121249,)
(121249,)
(121249, 30)
(121249,)


In [None]:
# Seed the forest so its bootstrap sampling and feature sub-sampling give the
# same model after a kernel restart; without it every run yields different scores.
rf = RandomForestRegressor(random_state=42)


In [None]:
class Experiment:
    """Pairs an estimator with a training dataset and a test dataset.

    The estimator must expose ``fit(X, y)`` and ``predict(X)``. Both datasets
    must expose ``X``; the training dataset must also expose ``y``.
    """

    def __init__(self, model, train_ds, test_ds):
        """Store the estimator and both datasets unchanged."""
        self.model, self.train_ds, self.test_ds = model, train_ds, test_ds

    def train(self):
        """Fit the estimator on the training features and targets."""
        train_data = self.train_ds
        self.model.fit(train_data.X, train_data.y)

    def test(self):
        """Return the estimator's predictions for the test features."""
        return self.model.predict(self.test_ds.X)
    



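# OLD — legacy experiments kept for reference (earlier `EnergyDataset(path, past_hours=..., condition=...)` API; cells were run out of order)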

In [2]:
# Training/runtime options passed to the trainer and the DataModule below.
# Commented-out entries are not passed (presumably TrainConfig supplies
# defaults for them — confirm).
settings = TrainConfig(
        # loss_fn = 'bce', # str
        # optimizer = 'adam', # str
        device = f"gpu", # str (the f-prefix is redundant: no placeholders)
        log = False, # bool
        logs_dir = "logs/", # str
        num_epochs = 30, # int
        checkpoints = 'test.pt', # str
        early_stopping = None, # int or None (None is passed here)
        log_every = 1, # int
        timeout = "00:12:00:00", # str, not int; presumably DD:HH:MM:SS — confirm
        # learning_rate = 0.001, # float
        # weight_decay = 1e-06, # float
        batch_size = 2048, # int
        shuffle = False, # bool
        num_workers = 9, # int
        # train_test_split = 0.5, # float
    )

In [16]:
path = "Hourly_Energy_Consumption/AEP_hourly.csv"
past_hours = 2

prper = DataPrep(path)
data = prper.data

# train_years = range(2004, 2016)
# val_years = range(2016, 2018)

def year_cond(start:int, end:int):
    """Build a row filter for rows whose ``Year`` lies between ``start`` and ``end``.

    The returned callable takes a DataFrame with a ``Year`` column and returns
    the boolean Series produced by ``Series.between(start, end)``.
    """
    def in_year_range(data):
        return data['Year'].between(start, end)
    return in_year_range

def season_cond(season:SEASON):
    """Build a row filter for rows whose ``Season`` equals ``season.value``.

    The returned callable takes a DataFrame with a ``Season`` column and
    returns a boolean Series.
    """
    def in_season(data):
        return data['Season'] == season.value
    return in_season

def month_cond(month:int):
    """Build a row filter for rows whose ``Month`` equals ``month``.

    The returned callable takes a DataFrame with a ``Month`` column and
    returns a boolean Series.
    """
    def in_month(data):
        return data['Month'] == month
    return in_month

def cond_and(conds:list):
    """Combine row filters with logical AND.

    Args:
        conds: callables that each take a DataFrame and return a boolean mask
            over its rows.

    Returns:
        A callable mapping a DataFrame to a boolean Series, indexed like the
        DataFrame, that is True only where every filter in ``conds`` is True.
        With an empty ``conds`` every row is selected.
    """
    def cond(data):
        # Start from an all-True mask that shares ``data``'s index. A mask with
        # a default RangeIndex would be label-aligned against the filters'
        # masks by ``&``, producing a mask over the union of both indexes
        # whenever ``data`` is not indexed 0..n-1.
        mask = pd.Series(True, index=data.index, dtype=bool)
        for c in conds:
            mask = mask & c(data)
        return mask
    return cond

# Row filters per split: training uses WINTER rows from years 2006-2008,
# validation uses SUMMER rows from years 2009-2012.
# NOTE(review): the splits differ in both year range and season — confirm this
# distribution shift is intentional.
train_conds = [year_cond(2006, 2008), season_cond(SEASON.WINTER)]
val_conds = [year_cond(2009, 2012), season_cond(SEASON.SUMMER)]

# Each dataset receives the AND-combination of its split's filters.
train_ds = EnergyDataset(path, past_hours=past_hours, condition=cond_and(train_conds))
val_ds = EnergyDataset(path, past_hours=past_hours, condition=cond_and(val_conds))


# DataModule supplies the train/val dataloaders used in later cells; the
# loader options are taken from `settings`.
datamodule = DataModule(train_ds,
                        val_ds,
                        batch_size=settings.batch_size,
                        num_workers=settings.num_workers,
                        shuffle=settings.shuffle,
                        seed=42,
                        )


  self.data_df = features[mask].copy()
  data = data[condition]
  self.data_df = features[mask].copy()
  data = data[condition]


In [7]:
train_ds.labels_df

10964    14565.0
10965    14287.0
10966    13932.0
10967    13713.0
10968    13656.0
          ...   
37257    19192.0
37258    18670.0
37259    18319.0
37260    17963.0
37261    17546.0
Name: MW, Length: 6502, dtype: float64

In [8]:
train_ds.data_df

Unnamed: 0,Region,Year,Day,Hour,Week_Number,Season,MW_at_-1H,MW_at_-2H
10964,1,2006,1,2,52,4,14287.0,14565.0
10965,1,2006,1,3,52,4,13932.0,14287.0
10966,1,2006,1,4,52,4,13713.0,13932.0
10967,1,2006,1,5,52,4,13656.0,13713.0
10968,1,2006,1,6,52,4,13635.0,13656.0
...,...,...,...,...,...,...,...,...
37257,1,2008,31,21,1,4,18670.0,19192.0
37258,1,2008,31,22,1,4,18319.0,18670.0
37259,1,2008,31,23,1,4,17963.0,18319.0
37260,1,2009,1,0,1,4,17546.0,17963.0


In [9]:
train_ds[0]

(tensor([1.0000e+00, 2.0060e+03, 1.0000e+00, 2.0000e+00, 5.2000e+01, 4.0000e+00,
         1.4287e+04, 1.4565e+04]),
 tensor(14565.))

In [9]:
# datamodule.prepare_data()
dl = datamodule.train_dataloader()

In [10]:
batch, label = next(iter(dl))
n_features = batch.shape[-1]

In [None]:
batch.shape, label.shape

In [31]:
from torch import Tensor


class ResidMLP(LightningModel):
    """Linear regressor mapping ``input_size`` features to ``output_size`` outputs.

    Despite the name, the transformer-encoder, residual-MLP and plain-MLP
    variants tried earlier are not built: the network is a single
    ``nn.Linear`` layer, so ``hidden_size`` is accepted but unused.
    """

    def __init__(self, input_size, hidden_size, output_size=1, loss:nn.Module=nn.L1Loss()):
        """Hand the loss to the base class and create the output layer.

        Args:
            input_size: number of input features of the linear layer.
            hidden_size: unused by the current architecture.
            output_size: number of output features of the linear layer.
            loss: loss module passed to ``LightningModel.__init__``.
        """
        super().__init__(loss)
        self.out_layer = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return its output as is."""
        prediction = self.out_layer(x)
        return prediction

    def accuracy(self, preds: Tensor, labels: Tensor) -> Tensor:
        """Return the mean absolute difference between ``preds`` and ``labels``."""
        abs_error = (preds - labels).abs()
        return abs_error.mean()
    


In [None]:
in_size = n_features#*past_hours
rmlp = ResidMLP(input_size=in_size, hidden_size=[24, 16], output_size=1)

model = LightningWrapper(rmlp)
print(model)

In [None]:
# Create the trainer from `settings`.
# NOTE(review): the second argument "global_wheat_1" (presumably a run or
# experiment name) does not obviously relate to this energy-consumption data —
# confirm it is intended.
trainer = LightningTrainer(settings, "global_wheat_1")
# Loaders used for fitting; `train_dl` / `val_dl` are also read by later cells.
train_dl, val_dl = datamodule.train_dataloader(), datamodule.val_dataloader()
trainer.fit(model, dl_train=train_dl, dl_test=val_dl)


In [10]:
import sklearn as sk
from sklearn.ensemble import RandomForestRegressor

In [11]:
rf = RandomForestRegressor()


In [12]:
rf = rf.fit(train_ds.data, train_ds.labels)

In [13]:
def mae(preds, labels):
    """Return the mean of the element-wise absolute difference ``|preds - labels|``."""
    absolute_errors = np.abs(preds - labels)
    return absolute_errors.mean()

In [17]:
train_scores = mae(rf.predict(train_ds.data), train_ds.labels.numpy())
val_scores = mae(rf.predict(val_ds.data), val_ds.labels.numpy())

In [69]:

def validate(model:RandomForestRegressor, val_dl):
    """Collect the absolute prediction errors of ``model`` over a dataloader.

    Each ``(batch, label)`` pair yielded by ``val_dl`` is converted with
    ``.numpy()``; the flattened absolute errors of all batches are returned
    concatenated into one array.
    """
    per_batch_errors = [
        np.abs(model.predict(features.numpy()) - targets.numpy()).flatten()
        for features, targets in val_dl
    ]
    return np.concatenate(per_batch_errors)

train_scores = validate(rf, train_dl)
val_scores = validate(rf, val_dl)
# scores = []
# for batch, label in val_dl:
#     score = np.abs(rf.predict(batch.numpy()) - label.numpy()).mean()
#     scores.append(score)
# print(np.mean(scores))

In [18]:
print(f"train_score = {train_scores.mean()}")
print(f"val_score = {val_scores.mean()}")

train_score = 0.5609689326360994
val_score = 13.655823143115938


In [None]:
model(batch)