In [1]:
!pip install optuna



In [2]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.optim as optim
import os
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn.init as init
import torch.nn.functional as F
import math
import joblib
import gc
import optuna

In [3]:
def cleanup_memory():
    """Free memory that is no longer referenced.

    Runs Python's garbage collector first and then asks PyTorch to release
    the cached, currently unused blocks of its CUDA allocator.
    """
    for release in (gc.collect, torch.cuda.empty_cache):
        release()


cleanup_memory()

In [4]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

print(device)

cpu


In [5]:
# Release cached, unused CUDA allocator memory before the dataset is loaded.
torch.cuda.empty_cache()

In [6]:
# Load the weather table and replace the 'Homérséklet' column (presumably the
# temperature — confirm) with log(1 + x) of itself.
# NOTE(review): log1p is undefined for values <= -1; if the raw column can go
# that low the result will contain NaN/-inf — confirm against the data.
df = pd.read_excel(
    "../input/weather-cond4/4_year_df.xlsx", engine="openpyxl"
).assign(**{'Homérséklet': lambda frame: np.log1p(frame['Homérséklet'])})

In [7]:
# Preview the first rows after the log transform of the first column.
df.head()

Unnamed: 0,Homérséklet,Páratartalom,Légnyomás,minute_sin,month_cos,hour_sin,hour_cos,month_sin,day_sin,day_cos
0,0.530628,74,1029.7,0.8183028,0.866025,0.0,1.0,0.5,0.207912,0.978148
1,0.875469,75,1029.6,0.05322217,0.866025,0.0,1.0,0.5,0.207912,0.978148
2,0.955511,75,1029.8,-0.9996456,0.866025,0.0,1.0,0.5,0.207912,0.978148
3,1.064711,75,1029.7,-2.449294e-16,0.866025,0.0,1.0,0.5,0.207912,0.978148
4,0.993252,77,1029.8,0.8993121,0.866025,0.269797,0.962917,0.5,0.207912,0.978148


In [8]:
# (rows, columns) of the loaded table.
df.shape

(128829, 10)

In [9]:
# Rescale every column with a min-max scaler and write the fitted scaler to disk.
# NOTE(review): the scaler is fitted on the whole table, i.e. also on rows that
# later end up in the validation split — confirm this leakage is acceptable.
scaler = MinMaxScaler().fit(df)
scaled_df = scaler.transform(df)  # a NumPy array, despite the name

joblib.dump(scaler, '/kaggle/working/scaler.save')

scaled_df

array([[0.14483945, 0.675     , 0.70719178, ..., 0.75      , 0.60452846,
        0.9890738 ],
       [0.23896656, 0.6875    , 0.70547945, ..., 0.75      , 0.60452846,
        0.9890738 ],
       [0.26081489, 0.6875    , 0.70890411, ..., 0.75      , 0.60452846,
        0.9890738 ],
       ...,
       [0.61736728, 0.6625    , 0.42123288, ..., 0.5       , 0.60452846,
        0.9890738 ],
       [0.61162058, 0.675     , 0.41952055, ..., 0.5       , 0.60452846,
        0.9890738 ],
       [0.6027671 , 0.675     , 0.42123288, ..., 0.5       , 0.60452846,
        0.9890738 ]])

In [10]:
def create_sequences(data, src_len, tgt_len):
    """Cut a time-ordered array into (input window, target window) pairs.

    Args:
        data: 2-D array indexed as ``data[time, feature]``; it is sliced with
            ``data[a:b]`` and ``data[a:b, 0]``.
        src_len: number of consecutive rows in each input window.
        tgt_len: number of rows, immediately after the input window, whose
            column 0 forms the target.

    Returns:
        List of ``(src, tgt)`` tuples where ``src`` is ``data[i:i + src_len]``
        and ``tgt`` is ``data[i + src_len:i + src_len + tgt_len, 0]``, for
        every start ``i`` at which both windows fit completely inside
        ``data``. The list is empty when ``data`` has fewer than
        ``src_len + tgt_len`` rows.
    """
    # "+ 1" keeps the final window, whose target ends on the last row of data.
    n_windows = max(len(data) - src_len - tgt_len + 1, 0)

    return [
        (
            data[start:start + src_len],
            data[start + src_len:start + src_len + tgt_len, 0],
        )
        for start in range(n_windows)
    ]

In [11]:
# Window sizes in time steps: rows fed to the model and rows to forecast.
input_length = 700
output_length = 300

sequences = create_sequences(scaled_df, input_length, output_length)

# Stack the per-window arrays into two dense arrays: inputs and targets.
src_seqs = np.array([window for window, _ in sequences])
tgt_seqs = np.array([horizon for _, horizon in sequences])

In [12]:
class SlidingWindowAttention(nn.Module):
    """Parameter-free scaled dot-product attention restricted to a local band.

    Position ``i`` attends only to positions ``j`` with
    ``|i - j| <= window_size``; the band is clipped at both sequence ends.
    """

    def __init__(self, window_size):
        """
        Args:
            window_size: number of neighbours visible on each side of a
                position; 0 means every position attends only to itself.

        Raises:
            ValueError: if ``window_size`` is negative.
        """
        super(SlidingWindowAttention, self).__init__()
        if window_size < 0:
            raise ValueError(f"window_size must be non-negative, got {window_size}")
        self.window_size = window_size

    def forward(self, q, k, v):
        """
        q, k, v: Tensors of shape (batch_size, seq_len, d_model)

        Returns a tensor of shape (batch_size, seq_len, d_model) in which row
        ``i`` is the softmax-weighted average of the rows of ``v`` that lie
        inside the window around ``i``.

        All positions are processed at once through a band mask instead of one
        Python-level iteration per position. This is much faster, at the cost
        of materialising a (batch_size, seq_len, seq_len) score tensor.
        """
        _, seq_len, d_model = q.size()

        # True wherever key position j is outside the window of query i.
        positions = torch.arange(seq_len, device=q.device)
        outside_window = (
            positions.unsqueeze(1) - positions.unsqueeze(0)
        ).abs() > self.window_size

        attn_scores = torch.bmm(q, k.transpose(1, 2)) / (d_model ** 0.5)
        # -inf scores become exactly zero probability after the softmax; the
        # diagonal is never masked, so every row keeps at least one entry.
        attn_scores = attn_scores.masked_fill(outside_window, float('-inf'))
        attn_probs = F.softmax(attn_scores, dim=-1)

        return torch.bmm(attn_probs, v)

In [13]:
class EncoderLayer(nn.Module):
    """One encoder block: windowed self-attention followed by a feed-forward net.

    Each of the two sub-layers is applied as
    ``LayerNorm(x + Dropout(sublayer(x)))``.
    """

    def __init__(self, d_model, window_size, dim_feedforward, dropout):
        """
        Args:
            d_model: size of the feature dimension entering and leaving the layer.
            window_size: forwarded to ``SlidingWindowAttention``.
            dim_feedforward: hidden width of the two-layer feed-forward net.
            dropout: probability used by all three dropout modules.
        """
        super().__init__()
        self.self_attn = SlidingWindowAttention(window_size)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src):
        """Apply the attention and feed-forward sub-layers to ``src``."""
        # Self-attention sub-layer: residual connection, then normalisation.
        attended = self.self_attn(src, src, src)
        hidden = self.norm1(src + self.dropout1(attended))

        # Feed-forward sub-layer: Linear -> ReLU -> Dropout -> Linear.
        expanded = self.dropout(F.relu(self.linear1(hidden)))
        projected = self.linear2(expanded)

        return self.norm2(hidden + self.dropout2(projected))

In [14]:
class WeatherForecastTransformer(nn.Module):
    """Encoder-only forecaster: linear embedding, encoder stack, linear head.

    The head is applied to the last ``output_seq_len`` encoder positions, so
    the forecast has one step per retained position.

    Note: ``num_heads`` and ``num_decoder_layers`` are accepted but never read,
    and ``input_seq_len`` is only stored on the instance.
    """

    def __init__(
        self,
        input_dim,
        model_dim,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        output_dim,
        input_seq_len,
        output_seq_len,
        window_size,
        dim_feedforward,
        dropout,
    ):
        """
        Args:
            input_dim: number of features per input time step.
            model_dim: width of the embedding and of every encoder layer.
            num_heads: unused.
            num_encoder_layers: number of stacked ``EncoderLayer`` blocks.
            num_decoder_layers: unused.
            output_dim: number of values predicted per forecast step.
            input_seq_len: stored as an attribute only.
            output_seq_len: number of trailing positions passed to the head.
            window_size: attention window forwarded to every encoder layer.
            dim_feedforward: feed-forward width forwarded to every encoder layer.
            dropout: dropout probability forwarded to every encoder layer.
        """
        super().__init__()
        self.input_seq_len = input_seq_len
        self.output_seq_len = output_seq_len

        self.embedding = nn.Linear(input_dim, model_dim)

        encoder_stack = [
            EncoderLayer(model_dim, window_size, dim_feedforward=dim_feedforward, dropout=dropout)
            for _ in range(num_encoder_layers)
        ]
        self.encoder_layers = nn.ModuleList(encoder_stack)

        self.fc_out = nn.Linear(model_dim, output_dim)

    def forward(self, src):
        """Embed ``src``, encode it, and project the trailing positions."""
        hidden = self.embedding(src)

        for encoder_layer in self.encoder_layers:
            hidden = encoder_layer(hidden)

        # Only the last output_seq_len positions are turned into predictions.
        forecast_slice = hidden[:, -self.output_seq_len:, :]
        return self.fc_out(forecast_slice)

In [15]:
def train_val_split(sequences_x, sequences_y, train_percentage, val_percentage):
    """Split ordered samples into a leading training part and a trailing validation part.

    Args:
        sequences_x: sliceable sequence of inputs.
        sequences_y: sliceable sequence of targets, aligned with ``sequences_x``.
        train_percentage: share (0-100) of samples taken from the start.
        val_percentage: share (0-100) of samples taken from the end.

    Returns:
        ``(train_x, val_x, train_y, val_y)``. Samples between the two parts
        are not returned when the shares add up to less than 100.

    Raises:
        ValueError: if a percentage lies outside [0, 100], or if the two
            shares add up to more than 100 (the parts would overlap and leak
            training samples into validation).
    """
    if not (0 <= train_percentage <= 100 and 0 <= val_percentage <= 100):
        raise ValueError(
            "train_percentage and val_percentage must be between 0 and 100, "
            f"got {train_percentage} and {val_percentage}"
        )
    if train_percentage + val_percentage > 100:
        raise ValueError(
            "train_percentage + val_percentage must not exceed 100, "
            f"got {train_percentage} + {val_percentage}"
        )

    n_samples = len(sequences_x)

    # Multiply before dividing: int(100 * (29 / 100)) is 28 because 0.29 has no
    # exact binary representation, whereas int(100 * 29 / 100) is 29.
    train_end_idx = int(n_samples * train_percentage / 100)
    val_start_idx = int(n_samples * (100 - val_percentage) / 100)

    train_x, train_y = sequences_x[:train_end_idx], sequences_y[:train_end_idx]
    val_x, val_y = sequences_x[val_start_idx:], sequences_y[val_start_idx:]

    return train_x, val_x, train_y, val_y


In [16]:
# Shares (in percent) of all windows: training comes from the start of the
# series, validation from the end; windows in between are left unused.
train_percentage = 10
val_percentage = 25

src_train, src_val, tgt_train, tgt_val = train_val_split(
    src_seqs, tgt_seqs, train_percentage, val_percentage
)

# Convert every split to a float32 tensor placed on the selected device.
src_train, tgt_train, src_val, tgt_val = (
    torch.tensor(split, dtype=torch.float32).to(device)
    for split in (src_train, tgt_train, src_val, tgt_val)
)

In [17]:
# Mini-batch size shared by both loaders.
batch_size = 64

# Pair each input window with its target window.
train_dataset = TensorDataset(src_train, tgt_train)
val_dataset = TensorDataset(src_val, tgt_val)

# Only the training loader shuffles; validation keeps its order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

In [18]:
def train_model(model, optimizer, num_epochs, train_loader, val_loader, criterion):
    """Train ``model`` and return its validation loss averaged over all epochs.

    Args:
        model: module called as ``model(x_batch)``; the last dimension of its
            output is squeezed before the loss is computed.
        optimizer: optimizer that updates ``model``'s parameters.
        num_epochs: number of passes over ``train_loader``.
        train_loader: iterable of ``(x_batch, y_batch)`` training batches.
        val_loader: iterable of ``(x_batch, y_batch)`` validation batches.
        criterion: loss called as ``criterion(prediction, target)``.

    Returns:
        Mean over epochs of the per-epoch validation loss, where each
        per-epoch value is the mean of the per-batch losses. This is an
        average across epochs, not the best or the last epoch.

    Raises:
        ValueError: if either loader contains no batches.
    """
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each contain at least one batch")

    # Mixed precision is only used when the model lives on a CUDA device;
    # requesting CUDA autocast on a CPU-only machine just emits a warning for
    # every batch and is then disabled anyway.
    use_amp = any(param.is_cuda for param in model.parameters())

    # float16 autocast needs loss scaling so small gradients do not underflow.
    # The scaler is a transparent pass-through when use_amp is False.
    if hasattr(torch.amp, "GradScaler"):
        scaler = torch.amp.GradScaler("cuda", enabled=use_amp)
    else:  # older PyTorch releases only ship the CUDA-specific class
        scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    epoch_val_losses = []

    for _ in range(num_epochs):
        # Training pass
        model.train()
        for x_batch, y_batch in train_loader:
            optimizer.zero_grad()

            with torch.amp.autocast(device_type='cuda', enabled=use_amp):
                output = model(x_batch)
                loss = criterion(output.squeeze(-1), y_batch)

            scaler.scale(loss).backward()
            # Gradients must be unscaled before clipping so the threshold
            # applies to their true magnitude.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            scaler.step(optimizer)
            scaler.update()

        # Validation pass (full precision, no gradients)
        model.eval()
        running_val_loss = 0.0
        with torch.no_grad():
            for x_val_batch, y_val_batch in val_loader:
                output_val = model(x_val_batch)
                running_val_loss += criterion(output_val.squeeze(-1), y_val_batch).item()

        epoch_val_losses.append(running_val_loss / len(val_loader))

    return np.mean(epoch_val_losses)

In [19]:
def objective(trial):
    """Optuna objective: build a model from the trial's hyper-parameters and score it.

    Reads the module-level ``train_loader``, ``val_loader`` and ``device``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        The value returned by ``train_model`` after 6 epochs, i.e. the
        validation loss averaged over those epochs (lower is better).
    """
    input_seq_len = 700
    output_seq_len = 300
    input_dim = 10  # number of feature columns fed to the model
    output_dim = 1

    model_dim = trial.suggest_int('model_dim', 64, 256, step=64)
    # NOTE(review): num_heads is sampled and passed on, but
    # WeatherForecastTransformer never reads it, so it only adds a dummy search
    # dimension. It is kept so the parameter set matches the saved study.
    num_heads = trial.suggest_int('num_heads', 2, 8)
    num_encoder_layers = trial.suggest_int('num_encoder_layers', 1, 6)
    window_size = trial.suggest_int('window_size', 8, 64, step=8)
    dim_feedforward = trial.suggest_int('dim_feedforward', 512, 1536, step=512)
    dropout = trial.suggest_float('dropout', 0.1, 0.5)
    # suggest_float(..., log=True) is the supported replacement for the
    # deprecated suggest_loguniform and samples from the same distribution.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)

    model = WeatherForecastTransformer(input_dim, model_dim, num_heads, num_encoder_layers,
                                       num_decoder_layers=0, output_dim=output_dim,
                                       input_seq_len=input_seq_len, output_seq_len=output_seq_len,
                                       window_size=window_size, dim_feedforward=dim_feedforward,
                                       dropout=dropout)

    model = torch.nn.DataParallel(model).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    criterion = nn.MSELoss()

    try:
        mean_val_loss = train_model(model, optimizer, num_epochs=6, train_loader=train_loader,
                                    val_loader=val_loader, criterion=criterion)
    finally:
        # Trials run back to back in one process: drop this trial's model and
        # optimizer state before the next trial allocates its own.
        del model, optimizer
        cleanup_memory()

    return mean_val_loss

In [20]:
def save_study_as_csv(study, filename="/kaggle/working/study.csv"):
    """Write the study's trials table to ``filename`` as CSV, without the index column.

    Args:
        study: object exposing ``trials_dataframe()`` (an Optuna study).
        filename: destination path of the CSV file.
    """
    study.trials_dataframe().to_csv(filename, index=False)

In [21]:
def save_study_callback(study, trial):
    """Optuna callback that rewrites the study CSV checkpoint.

    Args:
        study: study whose trials table is saved.
        trial: unused; present because callbacks are called with (study, trial).
    """
    checkpoint_path = "/kaggle/working/study.csv"
    save_study_as_csv(study, checkpoint_path)

In [None]:
def load_study_from_csv(filename):
    """Rebuild an in-memory Optuna study from a CSV of ``study.trials_dataframe()``.

    Only rows describing finished trials are restored: a row is skipped when
    its ``state`` column (if the file has one) is not COMPLETE, or when its
    objective value or any hyper-parameter is missing. Such rows have no
    usable result and must not be registered as completed trials.

    Args:
        filename: path of the CSV file. It must contain a ``value`` column
            and one ``params_<name>`` column per hyper-parameter below.

    Returns:
        A new minimising study named "loaded_study" holding one COMPLETE
        trial per restored row.
    """
    # Search space; must stay in sync with the suggest_* calls in ``objective``.
    # Built once, because it is identical for every row.
    distributions = {
        'model_dim': optuna.distributions.IntDistribution(64, 256, step=64),
        'num_heads': optuna.distributions.IntDistribution(2, 8),
        'num_encoder_layers': optuna.distributions.IntDistribution(1, 6),
        'window_size': optuna.distributions.IntDistribution(8, 64, step=8),
        'dim_feedforward': optuna.distributions.IntDistribution(512, 1536, step=512),
        'dropout': optuna.distributions.FloatDistribution(0.1, 0.5),
        'learning_rate': optuna.distributions.FloatDistribution(1e-5, 1e-2, log=True)
    }
    param_columns = {name: f'params_{name}' for name in distributions}

    study = optuna.create_study(study_name="loaded_study", direction="minimize")
    trials_df = pd.read_csv(filename)
    has_state_column = 'state' in trials_df.columns

    for _, row in trials_df.iterrows():
        # Accept both "COMPLETE" and "TrialState.COMPLETE" spellings.
        if has_state_column and str(row['state']).split('.')[-1].upper() != 'COMPLETE':
            continue
        if pd.isna(row['value']) or any(pd.isna(row[column]) for column in param_columns.values()):
            continue

        # Values read from CSV may arrive as NumPy scalars or as floats (e.g.
        # 128.0); store them as the native type their distribution expects.
        params = {}
        for name, distribution in distributions.items():
            raw_value = row[param_columns[name]]
            if isinstance(distribution, optuna.distributions.IntDistribution):
                params[name] = int(raw_value)
            else:
                params[name] = float(raw_value)

        trial = optuna.trial.create_trial(
            params=params,
            value=float(row['value']),
            distributions=distributions,
            state=optuna.trial.TrialState.COMPLETE
        )

        study.add_trial(trial)

    return study

In [None]:
# Resume from the trials stored in the checkpoint CSV, then run 30 more trials;
# the callback rewrites the CSV checkpoint after every finished trial.
study = load_study_from_csv("../input/study-checkpoint/study (1).csv")

study.optimize(
    objective,
    n_trials=30,
    callbacks=[save_study_callback],
)