In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

file_path = '/content/drive/My Drive/DataScience/Data/cleaned_merged_data.csv'
df = pd.read_csv(file_path)

In [3]:
df

Unnamed: 0,group,Date,KWH,Next_3_Days_Energy_Total,cloud_cover,sunshine,global_radiation,max_temp,mean_temp,min_temp,precipitation,pressure,snow_depth
0,1,2012-09-28,920.374,959.118667,5.0,1.60,73.0,16.70,13.30,8.50,0.20,101300.0,0.0
1,1,2012-09-29,946.774,951.399333,1.0,8.70,146.0,18.10,12.70,8.60,0.00,101830.0,0.0
2,1,2012-09-30,997.614,919.995000,5.0,1.60,71.0,17.30,12.10,6.10,4.20,101870.0,0.0
3,1,2012-10-01,932.968,918.231000,6.0,2.70,85.0,17.90,15.20,13.00,2.60,101190.0,0.0
4,1,2012-10-02,923.616,950.022667,6.0,2.80,85.0,15.10,14.40,10.90,1.20,100970.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27983,55,2014-02-15,1124.513,1070.176000,3.6,3.72,54.8,10.94,7.38,4.44,0.32,101252.0,0.0
27984,55,2014-02-16,1165.853,1018.218000,1.0,8.40,101.0,10.80,6.30,1.90,0.20,100680.0,0.0
27985,55,2014-02-17,1036.378,1010.964667,5.0,0.10,29.0,11.10,6.80,2.70,2.20,100770.0,0.0
27986,55,2014-02-18,1008.297,993.217000,5.0,1.20,45.0,11.00,9.30,7.50,5.80,100860.0,0.0


In [4]:
missing_counts = df.isna().sum()
print(missing_counts)

group                       0
Date                        0
KWH                         0
Next_3_Days_Energy_Total    0
cloud_cover                 0
sunshine                    0
global_radiation            0
max_temp                    0
mean_temp                   0
min_temp                    0
precipitation               0
pressure                    0
snow_depth                  0
dtype: int64


#### cloud_cover processing

In [5]:
df['cloud_cover'].value_counts()

6.000000    4488
7.000000    3903
5.000000    3665
4.000000    3650
8.000000    3620
3.000000    2753
2.000000    2310
1.000000    1699
0.000000    1430
5.400000      54
5.600000      47
4.800000      41
4.600000      38
5.200000      37
5.800000      35
6.200000      30
4.400000      28
4.200000      26
6.600000      22
6.400000      22
3.800000      18
3.400000      13
6.800000      12
3.600000       9
2.800000       8
7.200000       8
3.200000       7
7.400000       5
4.748274       3
7.600000       3
2.400000       2
2.200000       1
7.800000       1
Name: cloud_cover, dtype: int64

In [6]:
condition = (df['cloud_cover'] % 1 != 0) | (df['cloud_cover'] > 8.0)

# 从DataFrame中去除满足条件的行
df = df[~condition]

In [7]:
df['cloud_cover'].value_counts()

6.0    4488
7.0    3903
5.0    3665
4.0    3650
8.0    3620
3.0    2753
2.0    2310
1.0    1699
0.0    1430
Name: cloud_cover, dtype: int64

#### change the sunshine into hours

In [8]:
df['sunshine'] = df['sunshine'] * 60

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sunshine'] = df['sunshine'] * 60


#### change snow depth into a category value

In [9]:
df['snow_depth'].value_counts()

0.0    27290
3.0      110
2.0       55
5.0       55
0.6        4
1.0        2
1.2        2
Name: snow_depth, dtype: int64

In [10]:
df.loc[df['snow_depth'] > 0, 'snow_depth'] = 1.0

In [11]:
df['snow_depth'].value_counts()

0.0    27290
1.0      228
Name: snow_depth, dtype: int64

#### change the precipitation into category value

In [12]:
df['precipitation'].value_counts()

0.00    11479
0.20     3790
0.40     1045
1.80      662
1.20      603
        ...  
2.44        1
1.28        1
2.48        1
2.32        1
3.32        1
Name: precipitation, Length: 142, dtype: int64

In [13]:
bins = [-float('inf'), 0.1, 2.5, 10, 50, float('inf')]
labels = [0, 1, 2, 3, 4]

df['precipitation'] = pd.cut(df['precipitation'], bins=bins, labels=labels, right=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['precipitation'] = pd.cut(df['precipitation'], bins=bins, labels=labels, right=False)


In [14]:
df['precipitation'].value_counts()

0    11482
1     9129
2     5702
3     1205
4        0
Name: precipitation, dtype: int64

#### one-hot encoding

In [15]:
df_encoded = pd.get_dummies(df['cloud_cover'], prefix='cloud_cover')

df = pd.concat([df, df_encoded], axis=1)

df.drop('cloud_cover', axis=1, inplace=True)

In [16]:
df_encoded = pd.get_dummies(df['precipitation'], prefix='precipitation')

df = pd.concat([df, df_encoded], axis=1)

df.drop('precipitation', axis=1, inplace=True)

In [17]:
df_encoded = pd.get_dummies(df['snow_depth'], prefix='snow_depth')

df = pd.concat([df, df_encoded], axis=1)

df.drop('snow_depth', axis=1, inplace=True)

In [18]:
df.head(5)

Unnamed: 0,group,Date,KWH,Next_3_Days_Energy_Total,sunshine,global_radiation,max_temp,mean_temp,min_temp,pressure,...,cloud_cover_6.0,cloud_cover_7.0,cloud_cover_8.0,precipitation_0,precipitation_1,precipitation_2,precipitation_3,precipitation_4,snow_depth_0.0,snow_depth_1.0
0,1,2012-09-28,920.374,959.118667,96.0,73.0,16.7,13.3,8.5,101300.0,...,0,0,0,0,1,0,0,0,1,0
1,1,2012-09-29,946.774,951.399333,522.0,146.0,18.1,12.7,8.6,101830.0,...,0,0,0,1,0,0,0,0,1,0
2,1,2012-09-30,997.614,919.995,96.0,71.0,17.3,12.1,6.1,101870.0,...,0,0,0,0,0,1,0,0,1,0
3,1,2012-10-01,932.968,918.231,162.0,85.0,17.9,15.2,13.0,101190.0,...,1,0,0,0,0,1,0,0,1,0
4,1,2012-10-02,923.616,950.022667,168.0,85.0,15.1,14.4,10.9,100970.0,...,1,0,0,0,1,0,0,0,1,0


#### delete the date

In [19]:
del df['Date']
del df['group']

In [20]:
df.head()

Unnamed: 0,KWH,Next_3_Days_Energy_Total,sunshine,global_radiation,max_temp,mean_temp,min_temp,pressure,cloud_cover_0.0,cloud_cover_1.0,...,cloud_cover_6.0,cloud_cover_7.0,cloud_cover_8.0,precipitation_0,precipitation_1,precipitation_2,precipitation_3,precipitation_4,snow_depth_0.0,snow_depth_1.0
0,920.374,959.118667,96.0,73.0,16.7,13.3,8.5,101300.0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,946.774,951.399333,522.0,146.0,18.1,12.7,8.6,101830.0,0,1,...,0,0,0,1,0,0,0,0,1,0
2,997.614,919.995,96.0,71.0,17.3,12.1,6.1,101870.0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,932.968,918.231,162.0,85.0,17.9,15.2,13.0,101190.0,0,0,...,1,0,0,0,0,1,0,0,1,0
4,923.616,950.022667,168.0,85.0,15.1,14.4,10.9,100970.0,0,0,...,1,0,0,0,1,0,0,0,1,0


#### normalization

In [21]:
from sklearn.preprocessing import MinMaxScaler

features_to_normalize = ['sunshine', 'global_radiation', 'max_temp', 'mean_temp', 'min_temp', 'pressure', 'KWH']

scaler = MinMaxScaler()


df[features_to_normalize] = scaler.fit_transform(df[features_to_normalize])

#### check if any INF or NAN

In [22]:
inf_check = np.isinf(df)
print(inf_check.any())

KWH                         False
Next_3_Days_Energy_Total    False
sunshine                    False
global_radiation            False
max_temp                    False
mean_temp                   False
min_temp                    False
pressure                    False
cloud_cover_0.0             False
cloud_cover_1.0             False
cloud_cover_2.0             False
cloud_cover_3.0             False
cloud_cover_4.0             False
cloud_cover_5.0             False
cloud_cover_6.0             False
cloud_cover_7.0             False
cloud_cover_8.0             False
precipitation_0             False
precipitation_1             False
precipitation_2             False
precipitation_3             False
precipitation_4             False
snow_depth_0.0              False
snow_depth_1.0              False
dtype: bool


In [23]:
inf_check = np.isnan(df)
print(inf_check.any())

KWH                         False
Next_3_Days_Energy_Total    False
sunshine                    False
global_radiation            False
max_temp                    False
mean_temp                   False
min_temp                    False
pressure                    False
cloud_cover_0.0             False
cloud_cover_1.0             False
cloud_cover_2.0             False
cloud_cover_3.0             False
cloud_cover_4.0             False
cloud_cover_5.0             False
cloud_cover_6.0             False
cloud_cover_7.0             False
cloud_cover_8.0             False
precipitation_0             False
precipitation_1             False
precipitation_2             False
precipitation_3             False
precipitation_4             False
snow_depth_0.0              False
snow_depth_1.0              False
dtype: bool


In [24]:
df.head(10)

Unnamed: 0,KWH,Next_3_Days_Energy_Total,sunshine,global_radiation,max_temp,mean_temp,min_temp,pressure,cloud_cover_0.0,cloud_cover_1.0,...,cloud_cover_6.0,cloud_cover_7.0,cloud_cover_8.0,precipitation_0,precipitation_1,precipitation_2,precipitation_3,precipitation_4,snow_depth_0.0,snow_depth_1.0
0,0.411248,959.118667,0.110345,0.190031,0.492711,0.556777,0.523438,0.473577,0,0,...,0,0,0,0,1,0,0,0,1,0
1,0.433385,951.399333,0.6,0.417445,0.533528,0.534799,0.527344,0.581301,0,1,...,0,0,0,1,0,0,0,0,1,0
2,0.476016,919.995,0.110345,0.183801,0.510204,0.512821,0.429688,0.589431,0,0,...,0,0,0,0,0,1,0,0,1,0
3,0.421808,918.231,0.186207,0.227414,0.527697,0.626374,0.699219,0.45122,0,0,...,1,0,0,0,0,1,0,0,1,0
4,0.413967,950.022667,0.193103,0.227414,0.446064,0.59707,0.617188,0.406504,0,0,...,1,0,0,0,1,0,0,0,1,0
5,0.397016,984.457,0.110345,0.17757,0.483965,0.549451,0.621094,0.323171,0,0,...,0,1,0,0,0,1,0,0,1,0
6,0.417371,1027.060333,0.517241,0.361371,0.466472,0.487179,0.441406,0.376016,0,0,...,0,0,0,0,0,1,0,0,1,0
7,0.493941,1014.076333,0.075862,0.149533,0.451895,0.553114,0.601562,0.351626,0,0,...,0,0,1,0,0,0,1,0,1,0
8,0.483638,997.224333,0.544828,0.361371,0.408163,0.487179,0.484375,0.473577,0,0,...,0,0,0,0,1,0,0,0,1,0
9,0.524543,966.853333,0.22069,0.224299,0.376093,0.410256,0.375,0.573171,0,0,...,0,0,0,1,0,0,0,0,1,0


In [25]:
from sklearn.model_selection import train_test_split

X = df.drop(['Next_3_Days_Energy_Total'], axis=1)
y = df[['Next_3_Days_Energy_Total']]

In [26]:
# 假设您的 DataFrame 名为 df，形状为 (100, 22)

# 将数据划分为通道
num_channels = 10  # 每个通道包含的行数
num_samples = X.shape[0] - num_channels + 1
channels = []
for i in range(num_samples):
    channel = X.iloc[i:i+num_channels, :]
    channels.append(channel)

# 重新组织形状为 (x, x, x)
x = 10  # 每个通道中的行数
X = np.array(channels).reshape(num_samples, x, X.shape[1])

# reshaped_df 的形状为 (num_samples, x, num_channels, num_features)

In [27]:
y = y[-num_samples:]

In [28]:
X.shape

(27509, 10, 23)

In [29]:
y.shape

(27509, 1)

In [30]:
from sklearn.model_selection import train_test_split

# 划分训练集为新的训练集和验证集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 划分训练集为新的训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [31]:
# 打印划分后的数据集大小
print("train dataset:", X_train.shape)
print("validation dataset:", X_val.shape)
print("test dataset:", X_test.shape)

train dataset: (17605, 10, 23)
validation dataset: (4402, 10, 23)
test dataset: (5502, 10, 23)


In [39]:
print("train target dataset:", y_train.shape)
print("validation target dataset:", y_val.shape)
print("test target dataset:", y_test.shape)

train target dataset: (17605, 1)
validation target dataset: (4402, 1)
test target dataset: (5502, 1)


In [32]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and being used")
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU instead")

GPU is available and being used


In [33]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

torch.manual_seed(42)  # set the random seed


X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).to(device)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).to(device)

#### Create a TensorDataset from the tensors

In [34]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Specify the batch size
batch_size = 64

# Create DataLoaders for training, validation, and testing
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

#### construct the TCN model

In [35]:
import torch
import torch.nn as nn
import torch.optim as optim

# 定义TCN的残差块
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation)
        self.activation1 = nn.ReLU()
        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation)
        self.activation2 = nn.ReLU()

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.activation1(out)
        out = self.conv2(out)
        out = self.activation2(out)
        out = torch.add(out, residual)  # 跳跃连接，将残差与原始输入相加
        return out

# 定义TCN
class TCN(nn.Module):
    def __init__(self, in_channels, num_residual_blocks, residual_channels, kernel_size, stride, padding):
        super(TCN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels, residual_channels, kernel_size, stride=stride, padding=padding)
        self.activation1 = nn.ReLU()
        self.residual_blocks = nn.ModuleList([
            ResidualBlock(residual_channels, residual_channels, kernel_size, stride, dilation * (kernel_size - 1) // 2, dilation)
            for dilation in [2**i for i in range(num_residual_blocks)]
        ])
        self.fc = nn.Linear(residual_channels, 1)  # 回归问题的输出是一个单一值，因此使用1个输出单元

    def forward(self, x):
        x = x.permute(0, 2, 1)  # 调整通道维度的顺序
        out = self.conv1(x)
        out = self.activation1(out)
        for residual_block in self.residual_blocks:
            out = residual_block(out)
        out = torch.mean(out, dim=2)  # 对时间维度进行平均池化
        out = self.fc(out)
        return out

In [38]:
import time
start_time = time.time()


in_channels = 23  # 输入通道数
num_residual_blocks = 4  # 残差块的数量
residual_channels = 500  # 残差块中卷积层的输出通道数
kernel_size = 5  # 卷积核大小
stride = 1  # 卷积步长
padding = 2  # 卷积填充大小

# 初始化TCN
model = TCN(in_channels, num_residual_blocks, residual_channels, kernel_size, stride, padding).to(device)

# 定义损失函数和优化器
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

# 训练模型
epochs = 500

for epoch in range(epochs):
    model.train()
    train_total_loss = 0.0

    for i, (X_batch, y_batch) in enumerate(train_loader):
        # Clear gradients from the previous iteration
        optimizer.zero_grad()

        # Forward pass: compute model predictions
        predictions = model(X_batch.to(device))
        #print('here is prediction：', predictions.shape)

        # Compute the loss
        loss = criterion(predictions, y_batch.to(device))
        #print('here is loss：', loss.item())

        # Backpropagation: compute gradients of the loss with respect to model parameters
        loss.backward()

        # Optimization: update model parameters using the gradients
        optimizer.step()

        # Accumulate the total loss for this epoch
        train_total_loss += loss.item()

    # Calculate the average loss for this epoch
    train_loss = train_total_loss / len(train_loader)
    #train_loss_history.append(train_loss)


    # 输出训练进度
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{epochs}], Loss: {train_loss:.4f}')

# 完成训练后，可以使用模型进行预测或进行评估。

# 使用训练好的模型进行预测
model.eval()
with torch.no_grad():
    val_total_loss = 0.0
    for batch_X, batch_y in val_loader:
        predictions = model(batch_X.to(device))
        loss = criterion(predictions, batch_y.to(device))
        val_total_loss += loss.item()
        validation_loss = val_total_loss / len(val_loader)
        #val_loss_history.append(validation_loss)
    print(validation_loss)

end_time = time.time()
execution_time = end_time - start_time
print(f"代码执行时间: {execution_time}秒")

Epoch [10/500], Loss: 4470.4591
Epoch [20/500], Loss: 3539.9113
Epoch [30/500], Loss: 3354.0271
Epoch [40/500], Loss: 3277.6374
Epoch [50/500], Loss: 3158.6988
Epoch [60/500], Loss: 2838.6767
Epoch [70/500], Loss: 2787.9330
Epoch [80/500], Loss: 2532.7103
Epoch [90/500], Loss: 2255.1553
Epoch [100/500], Loss: 2213.8003
Epoch [110/500], Loss: 2057.5552
Epoch [120/500], Loss: 2062.6905
Epoch [130/500], Loss: 2079.3139
Epoch [140/500], Loss: 2076.2204
Epoch [150/500], Loss: 2058.7552
Epoch [160/500], Loss: 1931.6010
Epoch [170/500], Loss: 1893.1689
Epoch [180/500], Loss: 1844.2199
Epoch [190/500], Loss: 1895.0010
Epoch [200/500], Loss: 1816.2157
Epoch [210/500], Loss: 1858.1806
Epoch [220/500], Loss: 1822.1011
Epoch [230/500], Loss: 1725.2045
Epoch [240/500], Loss: 1709.6305
Epoch [250/500], Loss: 1810.3148
Epoch [260/500], Loss: 1689.2335
Epoch [270/500], Loss: 1773.7897
Epoch [280/500], Loss: 1591.8745
Epoch [290/500], Loss: 1663.1901
Epoch [300/500], Loss: 1648.0208
Epoch [310/500], Lo

KeyboardInterrupt: ignored