In [1]:
import pandas as pd
import torch
from sklearn.preprocessing import MinMaxScaler
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.data.encoders import NaNLabelEncoder
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from pytorch_forecasting.metrics import QuantileLoss

  from tqdm.autonotebook import tqdm


In [2]:
# Load and preprocess data
data = pd.read_csv('D:/MAC-Course/Sem 2/ADT/Final Project/Predicting_Canadian_Job_Vacancies-main/Predicting_Canadian_Job_Vacancies-main/Database/Resources/MachineLearning.csv')
data['ref_date'] = pd.to_datetime(data['ref_date'], format='%m-%d-%Y')


In [3]:
# Aggregate data by date, geo, and sector
aggregated_data = (
    data.groupby(['ref_date', 'geo', 'noc_desc', 'job_char'], as_index=False)
    .agg({'total_vacancies': 'sum'})
)

In [4]:
# Scale total_vacancies
scaler = MinMaxScaler()
aggregated_data['total_vacancies_scaled'] = scaler.fit_transform(
    aggregated_data[['total_vacancies']]
)


In [5]:
# Handle missing values in categorical columns before encoding
aggregated_data['geo'] = aggregated_data['geo'].fillna('unknown')  # Replace NaNs with 'unknown'
aggregated_data['noc_desc'] = aggregated_data['noc_desc'].fillna('unknown')  # Replace NaNs with 'unknown'
aggregated_data['job_char'] = aggregated_data['job_char'].fillna('unknown')  # Replace NaNs with 'unknown'


In [6]:
# Initialize encoders
geo_encoder = NaNLabelEncoder()
sector_encoder = NaNLabelEncoder()
job_char_encoder = NaNLabelEncoder()

# Fit and transform the categorical columns
aggregated_data['geo_encoded'] = geo_encoder.fit_transform(aggregated_data['geo'])
aggregated_data['noc_desc_encoded'] = sector_encoder.fit_transform(aggregated_data['noc_desc'])
aggregated_data['job_char_encoded'] = job_char_encoder.fit_transform(aggregated_data['job_char'])


In [7]:
# Create a time index
aggregated_data['time_idx'] = (aggregated_data['ref_date'] - aggregated_data['ref_date'].min()).dt.days


In [8]:
# Step 1: Fill missing time steps

# Expand time index for continuity
full_range = (
    aggregated_data.groupby(['geo_encoded', 'noc_desc_encoded', 'job_char_encoded'])['time_idx']
    .apply(lambda x: pd.RangeIndex(start=x.min(), stop=x.max() + 1))
)

In [9]:
# Create a new DataFrame with all combinations of groups and time_idx
expanded_data = (
    full_range.reset_index()
    .explode('time_idx')
    .merge(aggregated_data, on=['geo_encoded', 'noc_desc_encoded', 'job_char_encoded' ,'time_idx'], how='left')
)
expanded_data.ffill(inplace=True)

  expanded_data.ffill(inplace=True)


In [10]:
# # Step 3: Check the differences between consecutive time_idx to ensure continuity
# print(expanded_data.groupby(['geo_encoded', 'noc_desc_encoded'])['time_idx'].diff().unique())


In [11]:
# Define TimeSeriesDataSet
max_encoder_length = 30
max_prediction_length = 60  

# Filter valid groups
group_counts = expanded_data.groupby(["geo_encoded", "noc_desc_encoded", 'job_char_encoded']).size()
min_required_length = max_encoder_length + max_prediction_length
valid_groups = group_counts[group_counts >= min_required_length].index

expanded_data = expanded_data[
    expanded_data.set_index(["geo_encoded", "noc_desc_encoded", 'job_char_encoded']).index.isin(valid_groups)
]

train_df, val_df = train_test_split(expanded_data, test_size=0.2, random_state=42)

In [12]:
train_df

Unnamed: 0,geo_encoded,noc_desc_encoded,job_char_encoded,time_idx,ref_date,geo,noc_desc,job_char,total_vacancies,total_vacancies_scaled
195316,1,10,1,2081,2020-01-10,British Columbia,"Trades, transport and equipment operators and ...",Part-time,2250.0,0.002180
1187874,13,9,2,229,2015-01-10,Yukon,"Total, all occupations","Type of work, all types",465.0,0.000450
846699,9,9,0,241,2015-01-10,Ontario,"Total, all occupations",Full-time,95165.0,0.092195
204594,2,0,1,1495,2019-01-10,Canada,"Business, finance and administration occupations",Part-time,8125.0,0.007871
1030612,11,10,0,2588,2022-01-10,Quebec,"Trades, transport and equipment operators and ...",Full-time,31775.0,0.030783
...,...,...,...,...,...,...,...,...,...,...
110270,1,1,0,2141,2020-01-10,British Columbia,Health occupations,Full-time,4905.0,0.004752
259181,2,6,0,1647,2019-01-10,Canada,"Occupations in education, law and social, comm...",Full-time,20570.0,0.019928
131934,1,3,2,798,2017-01-10,British Columbia,Natural and applied sciences and related occup...,"Type of work, all types",4240.0,0.004108
671197,7,6,1,1616,2019-01-10,Nova Scotia,"Occupations in education, law and social, comm...",Part-time,485.0,0.000470


In [13]:
train_dataset = TimeSeriesDataSet(
    train_df,
    time_idx="time_idx",
    target="total_vacancies_scaled",
    group_ids=["geo_encoded", "noc_desc_encoded", 'job_char_encoded'],
    min_encoder_length=max_encoder_length // 2,
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
    time_varying_unknown_reals=["total_vacancies_scaled"],
    categorical_encoders={
        "geo_encoded": NaNLabelEncoder(),
        "noc_desc_encoded": NaNLabelEncoder(),
        'job_char_encoded': NaNLabelEncoder()
    },
    target_normalizer=GroupNormalizer(groups=["geo_encoded", "noc_desc_encoded", 'job_char_encoded']),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    allow_missing_timesteps=True,
)

val_dataset = TimeSeriesDataSet.from_dataset(train_dataset, val_df, stop_randomization=True)

train_loader = train_dataset.to_dataloader(train=True, batch_size=64, shuffle=True)
val_loader = val_dataset.to_dataloader(train=False, batch_size=64)


# Step 4: Verify that the dataset is correctly formatted
print(f"Number of samples in dataset: {len(val_dataset)}")
print(f"train_loader: {len(train_loader)}")
print(f"val_loader: {len(val_loader)}")

Number of samples in dataset: 327863
train_loader: 17131
val_loader: 5123


In [14]:
print(f"Valid groups after filtering: {len(valid_groups)}")
print(expanded_data[['geo_encoded', 'noc_desc_encoded', 'job_char_encoded']].drop_duplicates())


Valid groups after filtering: 392
         geo_encoded  noc_desc_encoded  job_char_encoded
0                  0                 0                 0
3288               0                 0                 1
6576               0                 0                 2
9864               0                 1                 0
13152              0                 1                 1
...              ...               ...               ...
1184360           13                 9                 1
1187648           13                 9                 2
1190936           13                10                 0
1193862           13                10                 1
1196420           13                10                 2

[392 rows x 3 columns]


In [15]:
expanded_data

Unnamed: 0,geo_encoded,noc_desc_encoded,job_char_encoded,time_idx,ref_date,geo,noc_desc,job_char,total_vacancies,total_vacancies_scaled
0,0,0,0,3,2015-01-04,Alberta,"Business, finance and administration occupations",Full-time,4705.0,0.004558
1,0,0,0,4,2015-01-04,Alberta,"Business, finance and administration occupations",Full-time,4705.0,0.004558
2,0,0,0,5,2015-01-04,Alberta,"Business, finance and administration occupations",Full-time,4705.0,0.004558
3,0,0,0,6,2015-01-07,Alberta,"Business, finance and administration occupations",Full-time,4260.0,0.004127
4,0,0,0,7,2015-01-07,Alberta,"Business, finance and administration occupations",Full-time,4260.0,0.004127
...,...,...,...,...,...,...,...,...,...,...
1199703,13,10,2,3286,2023-01-10,Yukon,"Trades, transport and equipment operators and ...","Type of work, all types",240.0,0.000233
1199704,13,10,2,3287,2024-01-01,Yukon,"Trades, transport and equipment operators and ...","Type of work, all types",165.0,0.000160
1199705,13,10,2,3288,2024-01-01,Yukon,"Trades, transport and equipment operators and ...","Type of work, all types",165.0,0.000160
1199706,13,10,2,3289,2024-01-01,Yukon,"Trades, transport and equipment operators and ...","Type of work, all types",165.0,0.000160


In [16]:
# from sklearn.model_selection import train_test_split

# # Assuming 'expanded_data' is your preprocessed DataFrame
# train_df, val_df = train_test_split(expanded_data, test_size=0.2, random_state=42)


In [17]:
# train_dataset = TimeSeriesDataSet(
#     train_df,
#     time_idx="time_idx",
#     target="total_vacancies_scaled",
#     group_ids=["geo_encoded", "noc_desc_encoded"],
#     min_encoder_length=max_encoder_length // 2,
#     max_encoder_length=max_encoder_length,
#     min_prediction_length=1,
#     max_prediction_length=max_prediction_length,
#     time_varying_unknown_reals=["total_vacancies_scaled"],
#     categorical_encoders={
#         "geo_encoded": NaNLabelEncoder(),
#         "noc_desc_encoded": NaNLabelEncoder(),
#     },
#     target_normalizer=GroupNormalizer(
#         groups=["geo_encoded", "noc_desc_encoded"], transformation="softplus"
#     ),
#     add_relative_time_idx=True,
#     add_target_scales=True,
#     add_encoder_length=True,
#     allow_missing_timesteps=True,  # Added this line
# )


# val_dataset = TimeSeriesDataSet(
#     val_df,
#     time_idx="time_idx",
#     target="total_vacancies_scaled",
#     group_ids=["geo_encoded", "noc_desc_encoded"],
#     min_encoder_length=max_encoder_length // 2,
#     max_encoder_length=max_encoder_length,
#     min_prediction_length=1,
#     max_prediction_length=max_prediction_length,
#     time_varying_unknown_reals=["total_vacancies_scaled"],
#     categorical_encoders={
#         "geo_encoded": NaNLabelEncoder(),
#         "noc_desc_encoded": NaNLabelEncoder(),
#     },
#     target_normalizer=GroupNormalizer(
#         groups=["geo_encoded", "noc_desc_encoded"], transformation="softplus"
#     ),
#     add_relative_time_idx=True,
#     add_target_scales=True,
#     add_encoder_length=True,
#     allow_missing_timesteps=True,  # Fix applied here as well

# )


In [18]:
# from torch.utils.data import DataLoader

# train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# val_dataloader = DataLoader(val_dataset, batch_size=64)


In [19]:
# # Create DataLoaders from the TimeSeriesDataSet
# batch_size = 64
# train_loader = train_dataset.to_dataloader(train=True, batch_size=batch_size, shuffle=True)
# val_loader = val_dataset.to_dataloader(train=False, batch_size=batch_size)


In [20]:
# Define and train Temporal Fusion Transformer
tft = TemporalFusionTransformer.from_dataset(
    train_dataset,
    learning_rate=0.03,
    hidden_size=16,
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=8,
    output_size=7,  # QuantileLoss output size
    loss=QuantileLoss(),
)

c:\Users\sanja\AppData\Local\Programs\Python\Python312\Lib\site-packages\lightning\pytorch\utilities\parsing.py:208: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
c:\Users\sanja\AppData\Local\Programs\Python\Python312\Lib\site-packages\lightning\pytorch\utilities\parsing.py:208: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.
  super().__init__(loss=loss, logging_metrics=logging_metrics, **kwargs)


In [21]:
from torch.utils.data import DataLoader

# Convert dataset to a DataLoader
dataloader = train_dataset.to_dataloader(train=True, batch_size=1)

# Fetch one batch
batch = next(iter(dataloader))
print(batch)


({'encoder_cat': tensor([], size=(1, 30, 0), dtype=torch.int64), 'encoder_cont': tensor([[[ 1.0000, -0.2729, -0.2814, -1.0000,  0.3095],
         [ 1.0000, -0.2729, -0.2814, -0.9667,  0.3095],
         [ 1.0000, -0.2729, -0.2814, -0.9333,  0.3095],
         [ 1.0000, -0.2729, -0.2814, -0.9000,  0.3095],
         [ 1.0000, -0.2729, -0.2814, -0.8667,  0.3095],
         [ 1.0000, -0.2729, -0.2814, -0.8333,  0.3095],
         [ 1.0000, -0.2729, -0.2814, -0.8000,  0.3095],
         [ 1.0000, -0.2729, -0.2814, -0.7667,  0.3095],
         [ 1.0000, -0.2729, -0.2814, -0.7333,  0.3095],
         [ 1.0000, -0.2729, -0.2814, -0.7000,  0.3095],
         [ 1.0000, -0.2729, -0.2814, -0.6667,  0.3095],
         [ 1.0000, -0.2729, -0.2814, -0.6333,  0.3095],
         [ 1.0000, -0.2729, -0.2814, -0.6000,  0.3095],
         [ 1.0000, -0.2729, -0.2814, -0.5667,  0.3095],
         [ 1.0000, -0.2729, -0.2814, -0.5333,  0.3095],
         [ 1.0000, -0.2729, -0.2814, -0.5000,  0.3095],
         [ 1.0000, -0.2

In [22]:
print(expanded_data.head())


   geo_encoded  noc_desc_encoded  job_char_encoded  time_idx   ref_date  \
0            0                 0                 0         3 2015-01-04   
1            0                 0                 0         4 2015-01-04   
2            0                 0                 0         5 2015-01-04   
3            0                 0                 0         6 2015-01-07   
4            0                 0                 0         7 2015-01-07   

       geo                                          noc_desc   job_char  \
0  Alberta  Business, finance and administration occupations  Full-time   
1  Alberta  Business, finance and administration occupations  Full-time   
2  Alberta  Business, finance and administration occupations  Full-time   
3  Alberta  Business, finance and administration occupations  Full-time   
4  Alberta  Business, finance and administration occupations  Full-time   

   total_vacancies  total_vacancies_scaled  
0           4705.0                0.004558  
1       

In [23]:
for idx, batch in enumerate(dataloader):
    x, y = batch
    print(f"Batch {idx}:")
    print("Inputs:", x)
    print("Targets:", y)
    break  # Only check the first batch


Batch 0:
Inputs: {'encoder_cat': tensor([], size=(1, 30, 0), dtype=torch.int64), 'encoder_cont': tensor([[[ 1.0000,  0.1779,  0.1379, -1.0000, -0.5930],
         [ 1.0000,  0.1779,  0.1379, -0.9667, -0.5930],
         [ 1.0000,  0.1779,  0.1379, -0.9333, -0.5930],
         [ 1.0000,  0.1779,  0.1379, -0.9000, -0.5930],
         [ 1.0000,  0.1779,  0.1379, -0.8667, -0.5930],
         [ 1.0000,  0.1779,  0.1379, -0.8333, -0.5930],
         [ 1.0000,  0.1779,  0.1379, -0.8000, -0.5930],
         [ 1.0000,  0.1779,  0.1379, -0.7667, -0.5930],
         [ 1.0000,  0.1779,  0.1379, -0.7333, -0.5930],
         [ 1.0000,  0.1779,  0.1379, -0.7000, -0.5930],
         [ 1.0000,  0.1779,  0.1379, -0.6667, -0.5930],
         [ 1.0000,  0.1779,  0.1379, -0.6333, -0.5930],
         [ 1.0000,  0.1779,  0.1379, -0.6000, -0.5930],
         [ 1.0000,  0.1779,  0.1379, -0.5667, -0.5930],
         [ 1.0000,  0.1779,  0.1379, -0.5333, -0.5930],
         [ 1.0000,  0.1779,  0.1379, -0.5000, -0.5930],
       

In [24]:
print(train_dataset.reals)
print(train_dataset.categoricals)


['encoder_length', 'total_vacancies_scaled_center', 'total_vacancies_scaled_scale', 'relative_time_idx', 'total_vacancies_scaled']
[]


In [25]:
print(expanded_data['total_vacancies_scaled'].describe())
print(expanded_data['total_vacancies_scaled'].isna().sum())


count    1.199658e+06
mean     1.236873e-02
std      4.617354e-02
min      0.000000e+00
25%      1.986020e-04
50%      1.036606e-03
75%      5.928997e-03
max      1.000000e+00
Name: total_vacancies_scaled, dtype: float64
0


In [26]:
from torch.utils.data import DataLoader

dataloader = DataLoader(train_dataset, batch_size=1, shuffle=False)


In [27]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x21961c407a0>

In [28]:
expanded_data["geo_encoded"] = expanded_data["geo_encoded"].astype(str)
expanded_data["noc_desc_encoded"] = expanded_data["noc_desc_encoded"].astype(str)
expanded_data["job_char_encoded"] = expanded_data["job_char_encoded"].astype(str)
expanded_data["total_vacancies_scaled"] = expanded_data["total_vacancies_scaled"].astype(float)


In [29]:
group_counts = expanded_data.groupby(["geo_encoded", "noc_desc_encoded", 'job_char_encoded']).size()
print(group_counts[group_counts >= (max_encoder_length + max_prediction_length)])


geo_encoded  noc_desc_encoded  job_char_encoded
0            0                 0                   3288
                               1                   3288
                               2                   3288
             1                 0                   3288
                               1                   3288
                                                   ... 
9            8                 1                   3288
                               2                   3288
             9                 0                   3288
                               1                   3288
                               2                   3288
Length: 392, dtype: int64


In [30]:
from pytorch_forecasting import TemporalFusionTransformer
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning import LightningModule

class TFTModule(LightningModule):
	def __init__(self, tft):
		super().__init__()
		self.tft = tft

	def forward(self, x):
		y_pred = self.tft(x)
		# Ensure y_pred has the correct shape
		if isinstance(y_pred, tuple):
			y_pred = y_pred[0]
		return y_pred

	def training_step(self, batch, batch_idx):
		x, y = batch
		y_hat = self(x)
		loss = self.tft.loss(y_hat, y)
		self.log("train_loss", loss)
		return loss

	def validation_step(self, batch, batch_idx):
		x, y = batch
		y_hat = self(x)
		loss = self.tft.loss(y_hat, y)
		self.log("val_loss", loss)
		return loss

	def configure_optimizers(self):
		return torch.optim.Adam(self.parameters(), lr=0.03)

tft_module = TFTModule(tft)

early_stop_callback = EarlyStopping(monitor="val_loss", patience=5, verbose=True, mode="min")
trainer = Trainer(max_epochs=1, accelerator="auto", callbacks=[early_stop_callback])

trainer.fit(tft_module, train_dataloaders=train_loader, val_dataloaders=val_loader)




GPU available: False, used: False
TPU available: False, using: 0 TPU cores


HPU available: False, using: 0 HPUs

  | Name | Type                      | Params | Mode 
-----------------------------------------------------------
0 | tft  | TemporalFusionTransformer | 17.6 K | train
-----------------------------------------------------------
17.6 K    Trainable params
0         Non-trainable params
17.6 K    Total params
0.070     Total estimated model params size (MB)
230       Modules in train mode
0         Modules in eval mode


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

c:\Users\sanja\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=17` in the `DataLoader` to improve performance.


                                                                           

c:\Users\sanja\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\utilities\data.py:78: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 64. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
c:\Users\sanja\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=17` in the `DataLoader` to improve performance.


Epoch 0:   0%|          | 37/17131 [00:19<2:32:29,  1.87it/s, v_num=109]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [None]:
# # Save the model
# torch.save(tft_module.state_dict(), "tft_model_job_char.pth")

In [31]:
# Load the model
tft_module = TFTModule(tft)  # Recreate the TFTModule
tft_module.load_state_dict(torch.load("tft_model_job_char.pth"))
tft_module.eval()  # Set the model to evaluation mode

  tft_module.load_state_dict(torch.load("tft_model_job_char.pth"))


TFTModule(
  (tft): TemporalFusionTransformer(
    	"attention_head_size":               1
    	"categorical_groups":                {}
    	"causal_attention":                  True
    	"dataset_parameters":                {'time_idx': 'time_idx', 'target': 'total_vacancies_scaled', 'group_ids': ['geo_encoded', 'noc_desc_encoded', 'job_char_encoded'], 'weight': None, 'max_encoder_length': 30, 'min_encoder_length': 15, 'min_prediction_idx': 0, 'min_prediction_length': 60, 'max_prediction_length': 60, 'static_categoricals': None, 'static_reals': None, 'time_varying_known_categoricals': None, 'time_varying_known_reals': None, 'time_varying_unknown_categoricals': None, 'time_varying_unknown_reals': ['total_vacancies_scaled'], 'variable_groups': None, 'constant_fill_strategy': None, 'allow_missing_timesteps': True, 'lags': None, 'add_relative_time_idx': True, 'add_target_scales': True, 'add_encoder_length': True, 'target_normalizer': GroupNormalizer(
    		method='standard',
    		groups=

In [32]:
# Prepare future data for prediction
future_time_idx = range(expanded_data['time_idx'].max() + 1, expanded_data['time_idx'].max() + 121)
future_data = []

for geo in aggregated_data['geo_encoded'].unique():
    for sector in aggregated_data['noc_desc_encoded'].unique():
        for job_char in aggregated_data['job_char_encoded'].unique():
            for time_idx in future_time_idx:
                future_data.append([time_idx, geo, sector, job_char,None])

future_df = pd.DataFrame(
    future_data,
    columns=['time_idx', 'geo_encoded', 'noc_desc_encoded', 'job_char_encoded' ,'total_vacancies_scaled']
)
future_df['total_vacancies_scaled'] = 0  # Placeholder for predictions

# Ensure unique index for combined data
combined_data = pd.concat([expanded_data, future_df]).drop_duplicates(subset=['time_idx', 'geo_encoded', 'noc_desc_encoded','job_char_encoded'])
combined_data.reset_index(drop=True, inplace=True)

# Ensure consistent data types
combined_data["geo_encoded"] = combined_data["geo_encoded"].astype(int)
combined_data["noc_desc_encoded"] = combined_data["noc_desc_encoded"].astype(int)
combined_data["job_char_encoded"] = combined_data["job_char_encoded"].astype(int)

combined_dataset = TimeSeriesDataSet.from_dataset(train_dataset, combined_data)


In [33]:
# Add missing columns with default values to future_df
future_df['ref_date'] = pd.NaT  # Use NaT (Not a Timestamp) for datetime columns
future_df['geo'] = "Unknown"  # Default string value
future_df['noc_desc'] = "Unknown"  # Default string value
future_df['total_vacancies'] = 0  # Default numeric value
future_df['job_char'] = "Unknown"  # Default string value
# Determine the starting date from expanded_data (if available)
start_date = pd.to_datetime(expanded_data['ref_date'].min())  # Adjust as necessary

# Populate ref_date in future_df based on time_idx
future_df['ref_date'] = future_df['time_idx'].apply(lambda idx: start_date + pd.Timedelta(days=idx))

# Combine expanded_data and updated future_df
combined_data = pd.concat([expanded_data, future_df]).drop_duplicates(subset=['time_idx', 'geo_encoded', 'noc_desc_encoded', 'job_char_encoded'])
combined_data.reset_index(drop=True, inplace=True)


In [34]:
combined_data.shape

(1255098, 10)

In [35]:
missing_values = combined_data.isnull().sum()
print("Missing values per column:\n", missing_values)


Missing values per column:
 geo_encoded               0
noc_desc_encoded          0
job_char_encoded          0
time_idx                  0
ref_date                  0
geo                       0
noc_desc                  0
job_char                  0
total_vacancies           0
total_vacancies_scaled    0
dtype: int64


In [36]:
combined_data['total_vacancies_scaled'].fillna(0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_data['total_vacancies_scaled'].fillna(0, inplace=True)


In [37]:
# Check for None or NaN values
assert combined_data.isnull().sum().sum() == 0, "Dataset contains missing values."

# Inspect for unexpected data types
print(combined_data.dtypes)

# Ensure combined_data contains expected columns and no empty rows
print(combined_data.columns)
print(combined_data.shape)


geo_encoded                       object
noc_desc_encoded                  object
job_char_encoded                  object
time_idx                           int64
ref_date                  datetime64[ns]
geo                               object
noc_desc                          object
job_char                          object
total_vacancies                  float64
total_vacancies_scaled           float64
dtype: object
Index(['geo_encoded', 'noc_desc_encoded', 'job_char_encoded', 'time_idx',
       'ref_date', 'geo', 'noc_desc', 'job_char', 'total_vacancies',
       'total_vacancies_scaled'],
      dtype='object')
(1255098, 10)


In [None]:
# for i in range(len(combined_dataset)):
#     print(f"Index {i}: {combined_dataset[i]}")


In [38]:
combined_data['geo_encoded'] = combined_data['geo_encoded'].astype(str)
combined_data['noc_desc_encoded'] = combined_data['noc_desc_encoded'].astype(str)
combined_data['job_char_encoded'] = combined_data['job_char_encoded'].astype(str)

combined_data.fillna(0, inplace=True)  # Replace NaN with 0
print(combined_data.isnull().sum())    # Confirm no missing values


geo_encoded               0
noc_desc_encoded          0
job_char_encoded          0
time_idx                  0
ref_date                  0
geo                       0
noc_desc                  0
job_char                  0
total_vacancies           0
total_vacancies_scaled    0
dtype: int64


In [39]:
for column in combined_data.columns:
    print(f"{column}: {combined_data[column].apply(type).value_counts()}")


geo_encoded: geo_encoded
<class 'str'>    1255098
Name: count, dtype: int64
noc_desc_encoded: noc_desc_encoded
<class 'str'>    1255098
Name: count, dtype: int64
job_char_encoded: job_char_encoded
<class 'str'>    1255098
Name: count, dtype: int64
time_idx: time_idx
<class 'int'>    1255098
Name: count, dtype: int64
ref_date: ref_date
<class 'pandas._libs.tslibs.timestamps.Timestamp'>    1255098
Name: count, dtype: int64
geo: geo
<class 'str'>    1255098
Name: count, dtype: int64
noc_desc: noc_desc
<class 'str'>    1255098
Name: count, dtype: int64
job_char: job_char
<class 'str'>    1255098
Name: count, dtype: int64
total_vacancies: total_vacancies
<class 'float'>    1255098
Name: count, dtype: int64
total_vacancies_scaled: total_vacancies_scaled
<class 'float'>    1255098
Name: count, dtype: int64


In [40]:
# Confirm that combined_data is not empty
print(f"Filtered Combined Data Length: {len(combined_data)}")

Filtered Combined Data Length: 1255098


In [41]:
# Reset index after filtering
combined_data.reset_index(drop=True, inplace=True)

In [42]:
numerical_cols = ['time_idx', 'total_vacancies_scaled']

In [43]:
# Identify your numerical and categorical columns
numerical_cols = ['time_idx', 'total_vacancies_scaled']  # replace with your actual numerical column names
categorical_cols = ['geo_encoded', 'noc_desc_encoded', 'job_char_encoded']  # replace with your actual categorical column names

# Fill missing values in numerical columns with 0
combined_data[numerical_cols] = combined_data[numerical_cols].fillna(0)

In [44]:
combined_data

Unnamed: 0,geo_encoded,noc_desc_encoded,job_char_encoded,time_idx,ref_date,geo,noc_desc,job_char,total_vacancies,total_vacancies_scaled
0,0,0,0,3,2015-01-04,Alberta,"Business, finance and administration occupations",Full-time,4705.0,0.004558
1,0,0,0,4,2015-01-04,Alberta,"Business, finance and administration occupations",Full-time,4705.0,0.004558
2,0,0,0,5,2015-01-04,Alberta,"Business, finance and administration occupations",Full-time,4705.0,0.004558
3,0,0,0,6,2015-01-07,Alberta,"Business, finance and administration occupations",Full-time,4260.0,0.004127
4,0,0,0,7,2015-01-07,Alberta,"Business, finance and administration occupations",Full-time,4260.0,0.004127
...,...,...,...,...,...,...,...,...,...,...
1255093,13,2,1,3406,2024-04-29,Unknown,Unknown,Unknown,0.0,0.000000
1255094,13,2,1,3407,2024-04-30,Unknown,Unknown,Unknown,0.0,0.000000
1255095,13,2,1,3408,2024-05-01,Unknown,Unknown,Unknown,0.0,0.000000
1255096,13,2,1,3409,2024-05-02,Unknown,Unknown,Unknown,0.0,0.000000


In [45]:
# Inspect the structure of an item in combined_dataset
for item in combined_dataset:
    print(f"Item: {item}")
    break  # Inspect just the first item

Item: ({'x_cat': tensor([], size=(90, 0), dtype=torch.int64), 'x_cont': tensor([[ 1.0000, -0.1683, -0.1865, -1.0000, -0.1814],
        [ 1.0000, -0.1683, -0.1865, -0.9667, -0.1814],
        [ 1.0000, -0.1683, -0.1865, -0.9333, -0.1814],
        [ 1.0000, -0.1683, -0.1865, -0.9000, -0.3560],
        [ 1.0000, -0.1683, -0.1865, -0.8667, -0.3560],
        [ 1.0000, -0.1683, -0.1865, -0.8333, -0.3560],
        [ 1.0000, -0.1683, -0.1865, -0.8000, -0.6522],
        [ 1.0000, -0.1683, -0.1865, -0.7667, -0.6522],
        [ 1.0000, -0.1683, -0.1865, -0.7333, -0.6522],
        [ 1.0000, -0.1683, -0.1865, -0.7000, -0.6522],
        [ 1.0000, -0.1683, -0.1865, -0.6667, -0.6522],
        [ 1.0000, -0.1683, -0.1865, -0.6333, -0.6522],
        [ 1.0000, -0.1683, -0.1865, -0.6000, -0.6522],
        [ 1.0000, -0.1683, -0.1865, -0.5667, -0.6522],
        [ 1.0000, -0.1683, -0.1865, -0.5333, -0.6522],
        [ 1.0000, -0.1683, -0.1865, -0.5000, -0.6522],
        [ 1.0000, -0.1683, -0.1865, -0.4667, -0.

In [46]:
# Validate combined_data
combined_data.fillna(0, inplace=True)  # Replace missing values
print(combined_data.isnull().sum())   # Confirm no missing values
print(combined_data.head())           # Preview dataset

geo_encoded               0
noc_desc_encoded          0
job_char_encoded          0
time_idx                  0
ref_date                  0
geo                       0
noc_desc                  0
job_char                  0
total_vacancies           0
total_vacancies_scaled    0
dtype: int64
  geo_encoded noc_desc_encoded job_char_encoded  time_idx   ref_date      geo  \
0           0                0                0         3 2015-01-04  Alberta   
1           0                0                0         4 2015-01-04  Alberta   
2           0                0                0         5 2015-01-04  Alberta   
3           0                0                0         6 2015-01-07  Alberta   
4           0                0                0         7 2015-01-07  Alberta   

                                           noc_desc   job_char  \
0  Business, finance and administration occupations  Full-time   
1  Business, finance and administration occupations  Full-time   
2  Business, finance a

In [47]:
dataloader = DataLoader(combined_dataset, batch_size=64)

In [48]:
combined_dataset

TimeSeriesDataSet[length=1224380](
	time_idx='time_idx',
	target='total_vacancies_scaled',
	group_ids=['geo_encoded', 'noc_desc_encoded', 'job_char_encoded'],
	weight=None,
	max_encoder_length=30,
	min_encoder_length=15,
	min_prediction_idx=0,
	min_prediction_length=60,
	max_prediction_length=60,
	static_categoricals=None,
	static_reals=None,
	time_varying_known_categoricals=None,
	time_varying_known_reals=None,
	time_varying_unknown_categoricals=None,
	time_varying_unknown_reals=['total_vacancies_scaled'],
	variable_groups=None,
	constant_fill_strategy=None,
	allow_missing_timesteps=True,
	lags=None,
	add_relative_time_idx=True,
	add_target_scales=True,
	add_encoder_length=True,
	target_normalizer=GroupNormalizer(
	method='standard',
	groups=['geo_encoded', 'noc_desc_encoded', 'job_char_encoded'],
	center=True,
	scale_by_group=False,
	transformation=None,
	method_kwargs={}
),
	categorical_encoders={'geo_encoded': NaNLabelEncoder(add_nan=False, warn=True), 'noc_desc_encoded': NaNLabelE

In [49]:
# Identify missing values
print(val_df.isnull().sum())

# Drop or fill missing values
test_df = val_df.fillna(0)  # Replace missing values with 0 or another strategy

test_df


geo_encoded               0
noc_desc_encoded          0
job_char_encoded          0
time_idx                  0
ref_date                  0
geo                       0
noc_desc                  0
job_char                  0
total_vacancies           0
total_vacancies_scaled    0
dtype: int64


Unnamed: 0,geo_encoded,noc_desc_encoded,job_char_encoded,time_idx,ref_date,geo,noc_desc,job_char,total_vacancies,total_vacancies_scaled
249632,2,5,0,1962,2020-01-10,Canada,"Occupations in art, culture, recreation and sport",Full-time,5790.0,0.005609
675618,7,6,2,2752,2022-01-10,Nova Scotia,"Occupations in education, law and social, comm...","Type of work, all types",2080.0,0.002015
350963,3,5,0,2482,2021-01-10,Manitoba,"Occupations in art, culture, recreation and sport",Full-time,310.0,0.000300
202393,2,0,0,2582,2022-01-10,Canada,"Business, finance and administration occupations",Full-time,86635.0,0.083931
1186639,13,9,1,2282,2021-01-10,Yukon,"Total, all occupations",Part-time,270.0,0.000262
...,...,...,...,...,...,...,...,...,...,...
73219,0,8,1,1258,2018-01-10,Alberta,Sales and service occupations,Part-time,8260.0,0.008002
317361,3,1,0,652,2016-01-10,Manitoba,Health occupations,Full-time,585.0,0.000567
459395,4,7,0,2057,2020-01-10,New Brunswick,Occupations in manufacturing and utilities,Full-time,1755.0,0.001700
1075642,12,4,0,1208,2018-01-10,Saskatchewan,"Natural resources, agriculture and related pro...",Full-time,505.0,0.000489


In [None]:
combined_dataset = [item for item in combined_dataset if item is not None]

In [None]:
# import pickle

# # Save it to a file
# with open("combined_dataset_job_char.pkl", "wb") as file:
#     pickle.dump(combined_dataset, file)


In [None]:
# from joblib import dump
# dump(combined_dataset, "combined_dataset_job_char.pkl")


In [None]:
# # import pickle

# # # Load the dataset
# # with open("combined_dataset_job_char.pkl", "rb") as file:
# #     combined_dataset = pickle.load(file)

# from joblib import load

# # Load the joblib file
# combined_dataset = load("combined_dataset_job_char.pkl")



In [50]:
combined_dataset

TimeSeriesDataSet[length=1224380](
	time_idx='time_idx',
	target='total_vacancies_scaled',
	group_ids=['geo_encoded', 'noc_desc_encoded', 'job_char_encoded'],
	weight=None,
	max_encoder_length=30,
	min_encoder_length=15,
	min_prediction_idx=0,
	min_prediction_length=60,
	max_prediction_length=60,
	static_categoricals=None,
	static_reals=None,
	time_varying_known_categoricals=None,
	time_varying_known_reals=None,
	time_varying_unknown_categoricals=None,
	time_varying_unknown_reals=['total_vacancies_scaled'],
	variable_groups=None,
	constant_fill_strategy=None,
	allow_missing_timesteps=True,
	lags=None,
	add_relative_time_idx=True,
	add_target_scales=True,
	add_encoder_length=True,
	target_normalizer=GroupNormalizer(
	method='standard',
	groups=['geo_encoded', 'noc_desc_encoded', 'job_char_encoded'],
	center=True,
	scale_by_group=False,
	transformation=None,
	method_kwargs={}
),
	categorical_encoders={'geo_encoded': NaNLabelEncoder(add_nan=False, warn=True), 'noc_desc_encoded': NaNLabelE

In [None]:
# Recreate combined_dataset as a TimeSeriesDataSet
combined_dataset = TimeSeriesDataSet.from_dataset(
	train_dataset, 
	combined_data,
	categorical_encoders={
		"geo_encoded": NaNLabelEncoder(add_nan=True),
		"noc_desc_encoded": NaNLabelEncoder(add_nan=True),
        "job_char_encoded": NaNLabelEncoder(add_nan=True)
        
	}
)

In [None]:
# Convert to DataLoader
dataloader = combined_dataset.to_dataloader(train=False, batch_size=64)

In [None]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x1ed0ba8e870>

In [None]:
test_df

Unnamed: 0,geo_encoded,noc_desc_encoded,job_char_encoded,time_idx,ref_date,geo,noc_desc,job_char,total_vacancies,total_vacancies_scaled
249632,2,5,0,1962,2020-01-10,Canada,"Occupations in art, culture, recreation and sport",Full-time,5790.0,0.005609
675618,7,6,2,2752,2022-01-10,Nova Scotia,"Occupations in education, law and social, comm...","Type of work, all types",2080.0,0.002015
350963,3,5,0,2482,2021-01-10,Manitoba,"Occupations in art, culture, recreation and sport",Full-time,310.0,0.000300
202393,2,0,0,2582,2022-01-10,Canada,"Business, finance and administration occupations",Full-time,86635.0,0.083931
1186639,13,9,1,2282,2021-01-10,Yukon,"Total, all occupations",Part-time,270.0,0.000262
...,...,...,...,...,...,...,...,...,...,...
73219,0,8,1,1258,2018-01-10,Alberta,Sales and service occupations,Part-time,8260.0,0.008002
317361,3,1,0,652,2016-01-10,Manitoba,Health occupations,Full-time,585.0,0.000567
459395,4,7,0,2057,2020-01-10,New Brunswick,Occupations in manufacturing and utilities,Full-time,1755.0,0.001700
1075642,12,4,0,1208,2018-01-10,Saskatchewan,"Natural resources, agriculture and related pro...",Full-time,505.0,0.000489


In [None]:
# import pickle

# # Save it to a file
# with open("dataloader.pkl", "wb") as file:
#     pickle.dump(dataloader, file)

In [51]:
from joblib import load

# Load the joblib file
dataloader = load("dataloader.pkl")

In [52]:
# Make predictions
predictions = tft.predict(dataloader, return_x=False)
print(predictions)

c:\Users\sanja\AppData\Local\Programs\Python\Python312\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=17` in the `DataLoader` to improve performance.


tensor([[ 0.0050,  0.0050,  0.0050,  ...,  0.0050,  0.0050,  0.0050],
        [ 0.0050,  0.0050,  0.0050,  ...,  0.0050,  0.0050,  0.0050],
        [ 0.0050,  0.0050,  0.0050,  ...,  0.0050,  0.0050,  0.0050],
        ...,
        [-0.0006, -0.0004, -0.0006,  ..., -0.0005, -0.0005, -0.0005],
        [-0.0006, -0.0004, -0.0006,  ..., -0.0005, -0.0005, -0.0005],
        [-0.0006, -0.0004, -0.0006,  ..., -0.0005, -0.0005, -0.0005]])


In [53]:
import pickle

# Save predictions to a file
with open("predictions_job_char.pkl", "wb") as file:
    pickle.dump(predictions, file)

# from joblib import dump
# dump(predictions, "predictions_job_char.pkl")


In [None]:
# import pickle

# # Load predictions from a file
# with open("predictions.pkl", "rb") as file:
#     predictions = pickle.load(file)


In [54]:
predictions.shape

torch.Size([1224380, 60])

In [55]:
# Check the unique values and lengths
print(f"Predictions Shape: {predictions.shape}")
print(f"Future DF Length: {len(future_df)}")
print(f"Unique Geo Encoded: {future_df['geo_encoded'].nunique()}")
print(f"Unique NOC Desc Encoded: {future_df['noc_desc_encoded'].nunique()}")
print(f"Unique Job Char Encoded: {future_df['job_char_encoded'].nunique()}")

future_df
future_df.to_csv("future_predictions.csv", index=False)
print("Future predictions saved to 'future_predictions.csv'.")

Predictions Shape: torch.Size([1224380, 60])
Future DF Length: 55440
Unique Geo Encoded: 14
Unique NOC Desc Encoded: 11
Unique Job Char Encoded: 3
Future predictions saved to 'future_predictions.csv'.


In [74]:
import numpy as np

# Save results
future_dates = pd.date_range(start='2022-04-01', periods=20, freq='QS')  # Generate quarterly start dates

# Ensure the lengths of the arrays match
geo = geo_encoder.inverse_transform(future_df['geo_encoded'])
noc_desc = sector_encoder.inverse_transform(future_df['noc_desc_encoded'])
job_char = job_char_encoder.inverse_transform(future_df['job_char_encoded'])
scaled_vacancies = future_df['total_vacancies_scaled']

predicted_vacancies = predictions.flatten()

# Calculate the number of unique combinations of geo, noc_desc, and job_char
num_combinations = len(geo_encoder.classes_) * len(sector_encoder.classes_) * len(job_char_encoder.classes_)

# Ensure the lengths of the arrays match
num_predictions = len(predicted_vacancies)
geo = geo[:num_predictions]
noc_desc = noc_desc[:num_predictions]
job_char = job_char[:num_predictions]
scaled_vacancies = scaled_vacancies[:num_predictions]
future_dates = np.tile(future_dates, num_combinations)[:num_predictions]

# Ensure all arrays have the same length
min_length = min(len(future_dates), len(geo), len(noc_desc), len(predicted_vacancies))
future_dates = future_dates[:min_length]
geo = geo[:min_length]
noc_desc = noc_desc[:min_length]
job_char = job_char[:min_length]
scaled_vacancies = scaled_vacancies[:min_length]
predicted_vacancies = predicted_vacancies[:min_length]

results = pd.DataFrame({
    "ref_date": future_dates,
    "geo": geo,
    "noc_desc": noc_desc,
    "predicted_vacancies": predicted_vacancies,
    "job_char": job_char,
    "scaled_vacancies": scaled_vacancies
})

# Remove duplicate rows
results.drop_duplicates(inplace=True)

print(results)
# Remove rows with duplicates except for 'predicted_vacancies'
results = results.loc[results.drop(columns=['predicted_vacancies']).drop_duplicates().index]

print(results)
# Denormalize the predicted vacancies
scaler = MinMaxScaler()
scaler.fit(expanded_data[['total_vacancies']])  # Fit the scaler on the original data

# Inverse transform the predicted vacancies
results['predicted_vacancies'] = scaler.inverse_transform(results[['predicted_vacancies']])

# Save the results to a CSV file
results.to_csv("denormalized_predictions.csv", index=False)
print("Denormalized predictions saved to 'denormalized_predictions.csv'.")

       ref_date      geo                                           noc_desc  \
0    2022-04-01   Quebec                             Total, all occupations   
1    2022-07-01   Quebec                             Total, all occupations   
2    2022-10-01   Quebec                             Total, all occupations   
3    2023-01-01   Quebec                             Total, all occupations   
4    2023-04-01   Quebec                             Total, all occupations   
...         ...      ...                                                ...   
9175 2026-01-01  Alberta  Natural and applied sciences and related occup...   
9176 2026-04-01  Alberta  Natural and applied sciences and related occup...   
9177 2026-07-01  Alberta  Natural and applied sciences and related occup...   
9178 2026-10-01  Alberta  Natural and applied sciences and related occup...   
9179 2027-01-01  Alberta  Natural and applied sciences and related occup...   

      predicted_vacancies                 job_char 

In [None]:
# # Assuming 'scaler' is the MinMaxScaler used for normalization
# predicted_vacancies = predictions.flatten().reshape(-1, 1)  # Reshape predictions to match scaler's expected input
# denormalized_vacancies = scaler.inverse_transform(predicted_vacancies)

# # Convert to a DataFrame for better readability
# denormalized_vacancies_df = pd.DataFrame(denormalized_vacancies, columns=["denormalized_vacancies"])

# # Display the denormalized vacancies
# print(denormalized_vacancies_df.head())

In [68]:
print(results['geo'].value_counts())  # Should show all provinces
print(results['noc_desc'].value_counts())  # Should show all sectors
print(results.head())  # Preview of the results

geo
Quebec          660
Saskatchewan    660
Alberta         220
Name: count, dtype: int64
noc_desc
Total, all occupations                                                         180
Business, finance and administration occupations                               180
Health occupations                                                             180
Natural and applied sciences and related occupations                           160
Natural resources, agriculture and related production occupations              120
Occupations in art, culture, recreation and sport                              120
Occupations in education, law and social, community and government services    120
Occupations in manufacturing and utilities                                     120
Sales and service occupations                                                  120
Trades, transport and equipment operators and related occupations              120
Legislative and senior management occupations                          

In [75]:
results.to_csv("C:/Users/sanja/OneDrive/Desktop/nor.csv", index=False)
print("Predictions saved to 'predicted_job_vacancies_2025_to_2035.csv'.")

Predictions saved to 'predicted_job_vacancies_2025_to_2035.csv'.


In [None]:
# print(f"Length of combined_dataset: {len(combined_dataset)}")
# print(combined_dataset[0])  # Verify what data it returns


In [None]:
# # Prepare combined dataset
# combined_dataset = TimeSeriesDataSet.from_dataset(dataset, combined_data)

In [None]:
# # Make predictions
# predictions, _ = tft.predict(DataLoader(combined_dataset, batch_size=batch_size), return_x=True)

In [None]:
# # Inverse scale the predictions
# predictions = scaler.inverse_transform(predictions)

In [None]:
# # Save predictions to CSV
# future_dates = pd.date_range(start='2025-01-01', periods=120, freq='M')
# results = pd.DataFrame({
#     "ref_date": future_dates,
#     "geo": geo_encoder.inverse_transform(future_df['geo_encoded']),
#     "noc_desc": sector_encoder.inverse_transform(future_df['noc_desc_encoded']),
#     "predicted_vacancies": predictions.flatten()
# })
# results.to_csv("predicted_job_vacancies_2025_to_2035.csv", index=False)

# print("Predictions saved to 'predicted_job_vacancies_2025_to_2035.csv'.")