In [2]:
import cdsapi


In [3]:
import pandas as pd

# Location of the daily observation file: ';'-separated text, '#' starts a comment,
# and MESS_DATUM is parsed into a datetime column.
RAW_DATA_FILE = "/Users/omkarsomeshwarkondhalkar/Movies/Notes/PCDL25/Research Project/data/raw/tageswerte_KL_01975_19360101_20241231_hist/produkt_klima_tag_19360101_20241231_01975.txt"

df = pd.read_csv(RAW_DATA_FILE, sep=";", comment="#", parse_dates=["MESS_DATUM"])

# Keep 2015-01-01 .. 2024-12-31 (both bounds inclusive) and renumber the rows from 0.
# NOTE(review): the later cells visible in this notebook keep working on `df`,
# not on `df_recent` -- confirm that the full history is really what should be modelled.
in_recent_window = df["MESS_DATUM"].between("2015-01-01", "2024-12-31")
df_recent = df.loc[in_recent_window].reset_index(drop=True)

print(df_recent.head())


   STATIONS_ID MESS_DATUM  QN_3    FX    FM  QN_4   RSK  RSKF   SDK  SHK_TAG  \
0         1975 2015-01-01    10  17.8   6.4    10   3.9     6   1.2        0   
1         1975 2015-01-02    10  21.9   9.4    10   4.0     8   0.8        0   
2         1975 2015-01-03    10  16.3   7.0    10   6.1     6   0.0        0   
3         1975 2015-01-04    10  13.0   4.9    10   0.0     0   6.4        0   
4         1975 2015-01-05    10   9.9   3.9    10   0.0     6   0.0        0   

     NM   VPM      PM   TMK   UPM   TXK   TNK   TGK  eor  
0   7.2   6.7  1026.2   3.6  84.0   5.5   2.0   1.1  eor  
1   6.5   7.7  1014.6   6.3  81.0   9.2   4.1   2.6  eor  
2   5.5   7.0  1014.6   4.5  83.0   5.9   2.4   1.0  eor  
3   3.0   6.9  1023.2   4.0  85.0   6.4   2.6   0.2  eor  
4   7.8   7.2  1026.4   3.8  90.0   4.4   2.6   2.6  eor  


In [4]:
from sklearn.preprocessing import StandardScaler

# Remove the columns that are not part of the model feature list used further down.
# Note: this reassigns `df`, so running the cell a second time raises a KeyError
# because the columns are already gone.
unused_columns = ['STATIONS_ID', 'QN_4', 'RSKF', 'QN_3', 'eor']
df = df.drop(columns=unused_columns)

In [5]:
# Show the header names exactly as they were read (the printed list reveals the
# padding spaces), then trim the surrounding whitespace from every name.
print(list(df.columns))
df.columns = df.columns.map(str.strip)

['MESS_DATUM', '  FX', '  FM', ' RSK', ' SDK', 'SHK_TAG', '  NM', ' VPM', '  PM', ' TMK', ' UPM', ' TXK', ' TNK', ' TGK']


In [7]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Strip whitespace from column names (harmless if they are already clean).
df.columns = df.columns.str.strip()

# Columns fed to the model: every measurement column, i.e. everything except MESS_DATUM.
features = ['FX', 'FM', 'RSK', 'SDK', 'SHK_TAG', 'NM', 'VPM', 'PM', 'TMK', 'UPM', 'TXK', 'TNK', 'TGK']

# The DWD station files encode missing observations as the sentinel -999 rather than
# leaving the field empty, so read_csv delivers them as ordinary numbers. Convert them
# to NaN first; otherwise the fill below has nothing to fill and the sentinel values
# distort the mean/std used for scaling.
df[features] = df[features].replace(-999, np.nan)

# Forward-fill gaps with the last valid observation. `DataFrame.ffill()` is the
# supported spelling; `fillna(method='ffill')` is deprecated and emits a FutureWarning.
# The trailing back-fill only affects gaps at the very start of a column, where no
# earlier value exists to carry forward.
df = df.ffill().bfill()

# Feature scaling.
# Fit the scaler on the chronologically earliest 70% of rows only. The split cell
# further down uses the first 70% of the windows for training, so fitting on the whole
# series would leak validation/test statistics into the training inputs.
TRAIN_FRACTION = 0.7
n_fit_rows = int(TRAIN_FRACTION * len(df))

scaler = StandardScaler()
scaler.fit(df[features].iloc[:n_fit_rows])
df[features] = scaler.transform(df[features])

# Create sequence data
def create_seq(data, features, target_col, seq_length=7):
    """Build sliding-window samples for next-step prediction.

    Window ``i`` holds rows ``i .. i + seq_length - 1`` of the ``features`` columns,
    and its label is the value of ``target_col`` in row ``i + seq_length``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table; rows are used in their existing (positional) order.
    features : list of str
        Column names that make up each window.
    target_col : str
        Column whose value right after the window is the label.
    seq_length : int, default 7
        Number of consecutive rows per window.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with shape ``(n, seq_length, len(features))`` and ``y`` with shape
        ``(n,)``, where ``n = max(len(data) - seq_length, 0)``. When there are too
        few rows for a single window, both arrays are empty but ``X`` keeps its
        three dimensions so that downstream shape checks still work.

    Raises
    ------
    ValueError
        If ``seq_length`` is negative.
    """
    if seq_length < 0:
        raise ValueError(f"seq_length must be non-negative, got {seq_length}")

    # Convert to NumPy once, instead of slicing the DataFrame on every iteration.
    feature_values = data[features].to_numpy()
    target_values = data[target_col].to_numpy()

    n_windows = max(len(data) - seq_length, 0)

    # Row-index matrix: entry (i, j) is i + j, i.e. the j-th row of window i.
    window_rows = np.arange(n_windows)[:, None] + np.arange(seq_length)[None, :]

    xs = feature_values[window_rows]
    # Copy so the labels do not alias the DataFrame's underlying buffer.
    ys = target_values[seq_length:seq_length + n_windows].copy()
    return xs, ys

# Build 7-row input windows over all feature columns; each label is the TMK value
# of the row that immediately follows its window.
X, y = create_seq(
    data=df,
    features=features,
    target_col='TMK',
    seq_length=7,
)
print(X.shape, y.shape)

  df = df.fillna(method='ffill')


(32501, 7, 13) (32501,)


In [8]:
n_samples = len(X)

# Boundaries of the 70% / 15% / 15% partition, expressed as sample indices.
train_end = int(0.7 * n_samples)
val_end = int(0.85 * n_samples)

# Contiguous slices keep the samples in their original order: the earliest block
# is used for training, the middle one for validation and the latest one for testing.
train_part = slice(None, train_end)
val_part = slice(train_end, val_end)
test_part = slice(val_end, None)

X_train, y_train = X[train_part], y[train_part]
X_val, y_val = X[val_part], y[val_part]
X_test, y_test = X[test_part], y[test_part]

# Report the shape of every partition.
print('Train set:', X_train.shape, y_train.shape)
print('Validation set:', X_val.shape, y_val.shape)
print('Test set:', X_test.shape, y_test.shape)


Train set: (22750, 7, 13) (22750,)
Validation set: (4875, 7, 13) (4875,)
Test set: (4876, 7, 13) (4876,)


In [10]:
import torch
from torch.utils.data import TensorDataset, DataLoader

def _to_float_tensor(array):
    """Return `array` as a float32 torch tensor."""
    return torch.tensor(array, dtype=torch.float32)


# Convert every split from NumPy to float32 tensors.
X_train, y_train = _to_float_tensor(X_train), _to_float_tensor(y_train)
X_val, y_val = _to_float_tensor(X_val), _to_float_tensor(y_val)
X_test, y_test = _to_float_tensor(X_test), _to_float_tensor(y_test)

# Mini-batch size shared by all three loaders.
batch_size = 32

# Pair each input tensor with its target tensor.
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
test_dataset = TensorDataset(X_test, y_test)

# Only the training loader shuffles its samples; the validation and test loaders
# iterate in the stored order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size)

# Sanity check: number of samples behind each loader.
print(f"Train set size: {len(train_loader.dataset)}")
print(f"Validation set size: {len(val_loader.dataset)}")
print(f"Test set size: {len(test_loader.dataset)}")

Train set size: 22750
Validation set size: 4875
Test set size: 4876
