In [1]:
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.data.encoders import NaNLabelEncoder, GroupNormalizer
from pytorch_forecasting.metrics import QuantileLoss
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks import EarlyStopping
from itertools import product
import numpy as np

# Load data
data = pd.read_csv('./MachineLearning.csv')
data['ref_date'] = pd.to_datetime(data['ref_date'], format='%d-%m-%Y')

# Remove unwanted entries
data = data[
    (data['geo'] != 'Canada') &
    (data['noc_code'] != 101) &
    (data['job_char'] != 'Type of work, all types')
]

# Filter data for training (2015-2022) and future prediction (2023-2024)
train_data = data[(data['ref_date'] >= '01-01-2015') & (data['ref_date'] <= '31-12-2022')]
future_data = data[(data['ref_date'] >= '01-01-2023') & (data['ref_date'] <= '31-12-2024')]

# Generate future dates for 2025-2027
future_dates = pd.date_range(start="01-01-2025", end="31-12-2027", freq="QS")

# Generate all combinations for future data
geo_values = train_data['geo'].unique()
noc_desc_values = train_data['noc_desc'].unique()
job_char_values = train_data['job_char'].unique()

future_combinations = pd.DataFrame(
    list(product(future_dates, geo_values, noc_desc_values, job_char_values)),
    columns=["ref_date", "geo", "noc_desc", "job_char"]
)
future_combinations['total_vacancies'] = None

# Combine with existing future data
future_data = pd.concat([future_data, future_combinations], ignore_index=True)

# Fill missing values
future_data['total_vacancies'].fillna(0, inplace=True)

# Scale target variable
scaler = StandardScaler()
train_data['total_vacancies_scaled'] = scaler.fit_transform(train_data[['total_vacancies']])
future_data['total_vacancies_scaled'] = scaler.transform(future_data[['total_vacancies']])

# Encode categorical variables
geo_encoder = NaNLabelEncoder()
sector_encoder = NaNLabelEncoder()
job_char_encoder = NaNLabelEncoder()

train_data['geo_encoded'] = geo_encoder.fit_transform(train_data['geo'])
train_data['noc_desc_encoded'] = sector_encoder.fit_transform(train_data['noc_desc'])
train_data['job_char_encoded'] = job_char_encoder.fit_transform(train_data['job_char'])

future_data['geo_encoded'] = geo_encoder.transform(future_data['geo'])
future_data['noc_desc_encoded'] = sector_encoder.transform(future_data['noc_desc'])
future_data['job_char_encoded'] = job_char_encoder.transform(future_data['job_char'])

# Create time index
train_data['time_idx'] = (train_data['ref_date'] - train_data['ref_date'].min()).dt.days
future_data['time_idx'] = (future_data['ref_date'] - train_data['ref_date'].min()).dt.days

# Define TimeSeriesDataSet
max_encoder_length = 16
max_prediction_length = 8

train_dataset = TimeSeriesDataSet(
    train_data,
    time_idx="time_idx",
    target="total_vacancies_scaled",
    group_ids=["geo_encoded", "noc_desc_encoded", "job_char_encoded"],
    min_encoder_length=1,
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    time_varying_unknown_reals=["total_vacancies_scaled"],
    categorical_encoders={
        "geo_encoded": NaNLabelEncoder(),
        "noc_desc_encoded": NaNLabelEncoder(),
        "job_char_encoded": NaNLabelEncoder()
    },
    target_normalizer=GroupNormalizer(groups=["geo_encoded", "noc_desc_encoded", "job_char_encoded"]),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    allow_missing_timesteps=True,
)

# Create DataLoaders
train_loader = train_dataset.to_dataloader(train=True, batch_size=64, shuffle=True, num_workers=4)
val_loader = train_dataset.to_dataloader(train=False, batch_size=64, num_workers=4)


  from tqdm.autonotebook import tqdm
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  future_data['total_vacancies'].fillna(0, inplace=True)
  future_data['total_vacancies'].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['total_vacancies_scaled'] = scaler.fit_transform(train_data[['total_vacancies']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [2]:

# Define and train Temporal Fusion Transformer

# Define and train Temporal Fusion Transformer
class TFTModule(LightningModule):
	def __init__(self, tft):
		super().__init__()
		self.tft = tft

	def forward(self, x):
		y_pred = self.tft(x)
		# Ensure y_pred has the correct shape
		if isinstance(y_pred, tuple):
			y_pred = y_pred[0]
		return y_pred

	def training_step(self, batch, batch_idx):
		x, y = batch
		y_hat = self(x)
		loss = self.tft.loss(y_hat, y)
		self.log("train_loss", loss)
		return loss

	def validation_step(self, batch, batch_idx):
		x, y = batch
		y_hat = self(x)
		loss = self.tft.loss(y_hat, y)
		self.log("val_loss", loss)
		return loss

	def configure_optimizers(self):
		return torch.optim.Adam(self.parameters(), lr=0.03)


tft = TFTModule(TemporalFusionTransformer.from_dataset(
	train_dataset,
	learning_rate=0.005,
	hidden_size=64,
	attention_head_size=8,
	dropout=0.1,
	hidden_continuous_size=32,
	output_size=len(QuantileLoss().quantiles),  # QuantileLoss output size
	loss=QuantileLoss(),
	optimizer="adam",
	log_interval=10
))

early_stop_callback = EarlyStopping(monitor="val_loss", patience=5, verbose=True, mode="min", min_delta=0.001)
trainer = Trainer(
	max_epochs=50,  # Increased number of epochs
	accelerator="cpu",
	devices=1,
	callbacks=[early_stop_callback],
	log_every_n_steps=1,  # Log every step
    enable_progress_bar=True,
    enable_checkpointing=False  # Skip checkpoints to save time
)
trainer.fit(tft, train_dataloaders=train_loader, val_dataloaders=val_loader)


c:\Users\sanja\AppData\Local\Programs\Python\Python312\Lib\site-packages\lightning\pytorch\utilities\parsing.py:208: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
c:\Users\sanja\AppData\Local\Programs\Python\Python312\Lib\site-packages\lightning\pytorch\utilities\parsing.py:208: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name | Type                      | Params | Mode 
-----------------------------------------------------------
0 | tft  | TemporalFusionTransformer | 247 K  | train
-----------------------------------------------------------
247 K     Trainable params
0         Non-trainable par

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\sanja\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:419: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


                                                                           

c:\Users\sanja\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\utilities\data.py:78: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 64. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
c:\Users\sanja\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:419: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Epoch 0: 100%|██████████| 83/83 [00:32<00:00,  2.59it/s, v_num=161]

c:\Users\sanja\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\utilities\data.py:78: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 40. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
Metric val_loss improved. New best score: 0.163


Epoch 1: 100%|██████████| 83/83 [00:56<00:00,  1.47it/s, v_num=161]

Metric val_loss improved by 0.023 >= min_delta = 0.001. New best score: 0.140


Epoch 2: 100%|██████████| 83/83 [01:00<00:00,  1.37it/s, v_num=161]

Metric val_loss improved by 0.004 >= min_delta = 0.001. New best score: 0.136


Epoch 3: 100%|██████████| 83/83 [00:56<00:00,  1.47it/s, v_num=161]

Metric val_loss improved by 0.004 >= min_delta = 0.001. New best score: 0.132


Epoch 4: 100%|██████████| 83/83 [00:53<00:00,  1.55it/s, v_num=161]

Metric val_loss improved by 0.004 >= min_delta = 0.001. New best score: 0.128


Epoch 9: 100%|██████████| 83/83 [00:54<00:00,  1.53it/s, v_num=161]

Monitored metric val_loss did not improve in the last 5 records. Best score: 0.128. Signaling Trainer to stop.


Epoch 9: 100%|██████████| 83/83 [00:54<00:00,  1.53it/s, v_num=161]


In [5]:
# Make predictions
combined_data = pd.concat([train_data, future_data]).drop_duplicates(subset=['time_idx', 'geo_encoded', 'noc_desc_encoded', 'job_char_encoded'])
combined_data.reset_index(drop=True, inplace=True)

combined_dataset = TimeSeriesDataSet.from_dataset(train_dataset, combined_data)
dataloader = combined_dataset.to_dataloader(train=False, batch_size=64)

# Use the TemporalFusionTransformer model directly for predictions
predictions = tft.tft.predict(dataloader, mode="quantiles")


c:\Users\sanja\AppData\Local\Programs\Python\Python312\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=17` in the `DataLoader` to improve performance.


In [6]:


# Denormalize predictions
predicted_vacancies = predictions[:, :, 0].numpy().flatten().reshape(-1, 1)
denormalized_vacancies = scaler.inverse_transform(predicted_vacancies).flatten()

# Ensure the lengths of the arrays are the same
min_length = min(len(future_data['ref_date'].values), len(denormalized_vacancies))

# Save results
results = pd.DataFrame({
    "ref_date": future_data['ref_date'].values[:min_length],
    "geo": geo_encoder.inverse_transform(future_data['geo_encoded'].values[:min_length]),
    "noc_desc": sector_encoder.inverse_transform(future_data['noc_desc_encoded'].values[:min_length]),
    "job_char": job_char_encoder.inverse_transform(future_data['job_char_encoded'].values[:min_length]),
    "predicted_vacancies": denormalized_vacancies[:min_length]
})
results.to_csv("NEWWWWWW.csv", index=False)
print("Denormalized predictions saved to 'denormalized_predictions_sanjay.csv'.")


Denormalized predictions saved to 'denormalized_predictions_sanjay.csv'.
