In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import root_mean_squared_error

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from tqdm.auto import tqdm

In [None]:
# Load every training CSV into one time-ordered DataFrame
train_folder = 'Train/Train'

# One DataFrame per '.csv' file found directly inside the training folder,
# each indexed by its parsed 'Time' column
dfs = [
    pd.read_csv(os.path.join(train_folder, filename), index_col='Time', parse_dates=True)
    for filename in os.listdir(train_folder)
    if filename.endswith('.csv')
]

# Stack the frames row-wise, order the rows by timestamp, then turn the
# 'Time' index back into a regular column
data = pd.concat(dfs, axis=0).sort_index().reset_index()
print(len(data))

In [None]:
def extract_features(df):
    """Replace the 'Time' column with calendar feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed 'Time' column.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'Time' and with integer 'Year', 'Month', 'Day',
        'Hour' and 'Minute' columns appended (in that order).

    Notes
    -----
    The input frame is left untouched, so the calling cell can be re-run
    without hitting a KeyError on the already-dropped 'Time' column.
    """
    timestamps = df['Time'].dt
    return df.drop(columns=['Time']).assign(
        Year=timestamps.year,
        Month=timestamps.month,
        Day=timestamps.day,
        Hour=timestamps.hour,
        Minute=timestamps.minute,
    )

In [None]:
# Derive the Year/Month/Day/Hour/Minute columns from 'Time' for the training frame
processed_data = extract_features(data)

In [None]:
# Share of the rows used for training. At 1 every row is a training row and
# the hold-out split taken after `train_size` is empty.
train_fraction = 1
train_size = int(len(data) * train_fraction)

# Column the model is trained to forecast
target_column = 'System Production (W)'

In [None]:
# Convert the target column to a 2-D numpy array of shape (n_rows, 1)
dataset = processed_data.filter([target_column]).values
# Scale the data. The scaler is fitted on the training rows only so that
# hold-out values never influence the min/max used for scaling; the fitted
# transform is then applied to every row. Hold-out values can therefore fall
# slightly outside the (0, 1) range.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(dataset[:train_size])
scaled_data = scaler.transform(dataset)

In [None]:
# Number of past observations given to the model for each prediction
look_back = 24

# Positional split of the scaled series: the first `train_size` rows are the
# training part, everything after them is held out
train = scaled_data[:train_size, :]
test = scaled_data[train_size:, :]
print(len(train))
print(len(test))

In [None]:
def create_sequences(dataset, look_back=1):
    """Build sliding-window samples from the first column of `dataset`.

    Parameters
    ----------
    dataset : array-like, 2-D
        Series to window; only column 0 is read.
    look_back : int, default 1
        Number of consecutive rows in each input window.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, look_back)
        Row k holds dataset[k:k + look_back, 0].
    Y : numpy.ndarray, shape (n_samples,)
        Y[k] is dataset[k + look_back, 0], the value right after window k.

    Raises
    ------
    ValueError
        If `look_back` is negative.

    Notes
    -----
    n_samples is max(len(dataset) - look_back, 0). When it is zero, X still has
    shape (0, look_back) so callers can read X.shape[1] without an IndexError.
    """
    if look_back < 0:
        raise ValueError("look_back must be non-negative")
    values = np.asarray(dataset)
    n_samples = max(len(values) - look_back, 0)
    # Pre-allocate with the final shape so an empty result keeps both axes.
    X = np.empty((n_samples, look_back), dtype=values.dtype)
    Y = np.empty(n_samples, dtype=values.dtype)
    for row in range(n_samples):
        X[row] = values[row:row + look_back, 0]
        Y[row] = values[row + look_back, 0]
    return X, Y

In [None]:
# Build (window, next value) pairs from the scaled training rows
x_train, y_train = create_sequences(train, look_back)

# Insert a singleton middle axis: (samples, look_back) -> (samples, 1, look_back).
# NOTE(review): with batch_first LSTMs this layout means one time step carrying
# `look_back` features, not `look_back` time steps - confirm this is intended.
x_train = x_train.reshape(x_train.shape[0], 1, x_train.shape[1])

print(len(x_train))
print(x_train.shape)

# Wrap the float32 tensors in a dataset and serve them in shuffled mini-batches
train_dataset = TensorDataset(
    torch.tensor(x_train, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32),
)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [None]:
# Build (window, next value) pairs from the held-out scaled rows
x_test, y_test = create_sequences(test, look_back)

# reshape input to be [samples, time steps, features].
# The last axis is given as `look_back` instead of x_test.shape[1] so that an
# empty hold-out split (no windows at all) reshapes to (0, 1, look_back)
# instead of raising IndexError on a 1-D empty array.
x_test = np.asarray(x_test).reshape(len(x_test), 1, look_back)

print(len(x_test))
print(x_test.shape)

# Keep the original order (no shuffling) so predictions line up with y_test
test_dataset = TensorDataset(
    torch.tensor(x_test, dtype=torch.float32),
    torch.tensor(y_test, dtype=torch.float32),
)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

In [None]:
# Define the LSTM model
class LSTMModel(nn.Module):
    """Two stacked LSTM layers followed by a two-layer linear head.

    Parameters
    ----------
    input_size : int
        Number of features per time step of the input.
    hidden_size1 : int
        Hidden size of the first LSTM layer.
    hidden_size2 : int
        Hidden size of the second LSTM layer.
    output_size : int
        Number of values produced per sample.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super().__init__()
        # Both recurrent layers take input as (batch, time steps, features)
        self.lstm1 = nn.LSTM(input_size, hidden_size1, batch_first=True)
        self.lstm2 = nn.LSTM(hidden_size1, hidden_size2, batch_first=True)
        # Linear head: hidden_size2 -> 25 -> output_size, with no activation in between
        self.fc1 = nn.Linear(hidden_size2, 25)
        self.fc2 = nn.Linear(25, output_size)

    def forward(self, x):
        """Return a (batch, output_size) tensor for a (batch, time steps, features) input."""
        sequence_out, _ = self.lstm1(x)
        sequence_out, _ = self.lstm2(sequence_out)
        # Only the output at the final time step feeds the linear head
        last_step = sequence_out[:, -1, :]
        return self.fc2(self.fc1(last_step))

# Instantiate the model on the GPU when one is available, otherwise on the CPU,
# so the cell does not fail on machines without CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMModel(input_size=x_train.shape[2], hidden_size1=128, hidden_size2=64, output_size=1).to(device)

In [None]:
# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Send batches to whichever device the model's parameters live on
device = next(model.parameters()).device

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    # Sample-weighted sum of batch losses, kept on the device to avoid a
    # host sync on every batch
    running_loss = torch.zeros((), device=device)
    n_seen = 0
    for batch_x, batch_y in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_x)
        # Targets are (batch,); add an axis to match the (batch, 1) outputs
        loss = criterion(outputs, batch_y.unsqueeze(1))
        loss.backward()
        optimizer.step()
        running_loss += loss.detach() * batch_x.size(0)
        n_seen += batch_x.size(0)

    # Report the mean loss over the whole epoch rather than the loss of
    # whichever batch happened to come last
    epoch_loss = (running_loss / max(n_seen, 1)).item()
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {epoch_loss:.4f}')

In [None]:
model.eval()

# Send batches to whichever device the model's parameters live on
device = next(model.parameters()).device

# Scaled predictions for the hold-out windows, one (batch, 1) array per batch
predict = []

with torch.no_grad():
    for batch_x, _ in test_loader:
        predict.append(model(batch_x.to(device)).cpu().numpy())

if predict:
    # Map predictions and targets back to the original units. Both are passed
    # as (n, 1) columns, matching the single feature the scaler was fitted on.
    test_predict = scaler.inverse_transform(np.concatenate(predict, axis=0))
    # Stored under a new name so `y_test` keeps its scaled values and the cell
    # can be re-run without inverting twice
    y_test_actual = scaler.inverse_transform(np.asarray(y_test).reshape(-1, 1))

    # Calculate RMSE
    test_score = root_mean_squared_error(y_test_actual[:, 0], test_predict[:, 0])
    print(f'Test Score: {test_score:.2f} RMSE')
else:
    # An empty hold-out split yields no batches; there is nothing to score
    print('No hold-out samples available; skipping RMSE evaluation.')

In [None]:
test = pd.read_csv('Test.csv')
# Parse the day-first timestamps BEFORE sorting: sorting the raw
# 'dd/mm/YYYY HH:MM' strings orders rows lexicographically (by day of month
# first), which is not chronological.
# NOTE(review): the training CSVs are parsed with an inferred format
# (parse_dates=True) - confirm they use the same day-first layout.
test['Time'] = pd.to_datetime(test['Time'], format='%d/%m/%Y %H:%M')
# Stable sort keeps the file order for rows sharing a timestamp; 'id' is not a feature
test = test.sort_values(by='Time', kind='stable').drop(columns=['id'])

print(len(test))

In [None]:
# Derive the calendar feature columns for the test frame
processed_test = extract_features(test)
# Prepend the last `look_back` training rows (instead of a hard-coded 24) so
# the number of context rows always matches the window length, then drop the
# target column. A new frame is built rather than mutating in place.
concat_data = pd.concat([processed_data.iloc[-look_back:], processed_test]).drop(columns=[target_column])

In [None]:
# Display the combined context + test feature frame for a visual check
concat_data

In [None]:
def create_test_sequences(data, look_back):
    """Return sliding windows over the first column of a 2-D array.

    Window k holds data[k:k + look_back, 0]; one window is produced for every
    row index from `look_back` up to len(data) - 1, and the row at that index
    itself is not part of its window.
    """
    window_ends = range(look_back, len(data))
    return np.array([data[end - look_back:end, 0] for end in window_ends])

In [None]:
# Raw values of the combined feature frame as a 2-D array
test_numpy = concat_data.to_numpy()

# NOTE(review): the windows below are cut from column 0 of `concat_data`, a
# frame whose target column was dropped and whose values never pass through
# `scaler`. The model was trained on windows of the scaled target, so these
# inputs are a different quantity - confirm this is intended.
x_test = create_test_sequences(test_numpy, look_back)

# Insert the singleton middle axis: [samples, time steps, features]
x_test = x_test[:, np.newaxis, :]

print(len(x_test))
print(x_test.shape)

# Unshuffled loader with the default batch size of one sample per batch
test_dataset = TensorDataset(torch.FloatTensor(x_test))
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
model.eval()

# Send inputs to whichever device the model's parameters live on
device = next(model.parameters()).device

# The model was trained to map the previous `look_back` SCALED target values
# to the next one. The target is not available for the test rows, so forecast
# recursively: start from the last `look_back` scaled training targets and
# feed each prediction back in as the newest observation.
# NOTE(review): assumes the test rows directly follow the end of the training
# series at the same sampling interval - confirm.
window = scaled_data[-look_back:, 0].astype(np.float32)
n_steps = len(processed_test)  # one forecast per test row

# Scaled forecasts, one single-element row per step
predict = []

with torch.no_grad():
    for _ in range(n_steps):
        # Same (samples, 1, look_back) layout the model was trained with
        batch_x = torch.from_numpy(window).reshape(1, 1, look_back).to(device)
        next_scaled = model(batch_x).item()
        predict.append([next_scaled])
        # Slide the window: drop the oldest value, append the new forecast
        window = np.append(window[1:], np.float32(next_scaled))

# Back to the original units; shape (n_steps, 1)
test_predict = scaler.inverse_transform(np.asarray(predict, dtype=np.float64).reshape(-1, 1))

In [None]:
# Number of predictions produced; presumably this should match the row count
# of the sample submission - verify
len(test_predict)

In [None]:
submission = pd.read_csv('Sample-submission.csv')

# `test_predict` is an (n, 1) column; flatten it explicitly to one value per row
predictions = np.asarray(test_predict).reshape(-1)
if len(predictions) != len(submission):
    raise ValueError(
        f'Expected {len(submission)} predictions for the submission file, got {len(predictions)}'
    )

# Values are assigned by position, so prediction order must match the row
# order of the sample submission
submission['System Production (W)'] = predictions
submission.to_csv('submission.csv', index=False)

In [None]:
# Shell command: upload submission.csv to the competition through the kaggle
# CLI, with "LSTM" as the submission message. Runs on every full notebook run.
!kaggle competitions submit -c hackathon-online-solar-power-forecasting -f submission.csv -m "LSTM"