In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
import io

#data loading
#zero-based positions of the two rows this notebook reads from the raw export
HEADER_LINE_IDX = 9    #row whose trailing fields are the month labels
DATA_LINE_IDX = 14     #row holding the 'Job vacancy rate' series

with open('data/target.csv', 'r', encoding='utf-8') as f:
    file_content = f.read()
lines = file_content.splitlines()

#validate the layout BEFORE indexing into `lines`, so that a short file fails
#with the descriptive ValueError rather than an IndexError from the debug prints
if len(lines) <= HEADER_LINE_IDX:
    raise ValueError("CSV file does not have enough lines to extract header.")
if len(lines) <= DATA_LINE_IDX:
    raise ValueError("CSV file does not have enough lines to extract job vacancy rate data.")

#debug
print(f"DEBUG: Content of lines[9]: {lines[HEADER_LINE_IDX]}")
print(f"DEBUG: Content of lines[14]: {lines[DATA_LINE_IDX]}")

#extracts and parses data
#NOTE(review): a plain split(',') does not honour quoted fields; the differing
#offsets (2 for the header row, 3 for the data row) presumably compensate for a
#comma inside a quoted label -- confirm against the file before changing this.
header_parts = [part.strip('"') for part in lines[HEADER_LINE_IDX].split(',')]
date_strings = header_parts[2:]
print(f"DEBUG: Extracted date_strings (first 5): {date_strings[:5]}")
print(f"DEBUG: Length of date_strings: {len(date_strings)}")

#extract the data line for 'Job vacancy rate'
data_parts = [part.strip('"') for part in lines[DATA_LINE_IDX].split(',')]
raw_values = data_parts[3:]

#debug
print(f"DEBUG: Full data_parts for line 14: {data_parts}")
print(f"DEBUG: Extracted raw_values (first 5): {raw_values[:5]}")
print(f"DEBUG: Length of raw_values: {len(raw_values)}")

#convert raw_values to numeric. handling '..' as NaN & commas in numbers
values = []
for val in raw_values:
    if val == '..':
        values.append(np.nan)
    else:
        try:
            values.append(float(val.replace(',', '')))
        except ValueError as e:
            #unparseable cells are kept as gaps and filled by interpolation below
            print(f"DEBUG: ValueError converting '{val}' to float: {e}")
            values.append(np.nan)
print(f"DEBUG: Converted values (first 5): {values[:5]}")
print(f"DEBUG: Number of non-NaN values in 'values': {len([x for x in values if not np.isnan(x)])}")


#create pandas series, then convert to dataframe
min_len = min(len(date_strings), len(values))

#drop empty date labels together with the value in the same position, so every
#remaining value stays attached to its own month even if a blank label is not
#at the very end of the row
dated_values = [
    (d, v) for d, v in zip(date_strings[:min_len], values[:min_len]) if d
]
valid_date_strings = [d for d, _ in dated_values]
valid_values = [v for _, v in dated_values]

dates = pd.to_datetime(valid_date_strings, format='%B %Y')
job_vacancy_rate = pd.Series(valid_values, index=dates, name='Job vacancy rate')

#convert the series to a dataframe
df = job_vacancy_rate.to_frame()

#assign the result back: calling interpolate(inplace=True) on the column
#selection works on a temporary object and is not guaranteed to update `df`
#(it is a no-op under pandas Copy-on-Write).
#NOTE: forward linear interpolation does not fill NaNs that precede the first
#valid observation.
df['Job vacancy rate'] = df['Job vacancy rate'].interpolate(method='linear')

print("DataFrame Head:")
display(df.head())
print("\nDataFrame Info:")
df.info()

#normalize data
#NOTE(review): the scaler is fitted on the whole series, including the most
#recent months that are later held out for testing -- consider fitting on the
#training portion only to avoid leaking test-period information.
scaler = MinMaxScaler()
data_normalized = scaler.fit_transform(df[['Job vacancy rate']].values)

#function to create sequences, sliding window
def create_sequences(data, seq_length):
    """Build (window, next-value) training pairs with a sliding window.

    Args:
        data: Sliceable sequence of observations (e.g. a list or NumPy array).
        seq_length: Number of consecutive observations in each input window.

    Returns:
        A tuple ``(xs, ys)`` of NumPy arrays: ``xs[i]`` is
        ``data[i:i + seq_length]`` and ``ys[i]`` is ``data[i + seq_length]``.
        Both are empty when ``len(data) <= seq_length``.
    """
    window_count = len(data) - seq_length
    #range() of a non-positive count is empty, so short inputs yield no pairs
    starts = range(window_count)
    windows = [data[start:start + seq_length] for start in starts]
    targets = [data[start + seq_length] for start in starts]
    return np.array(windows), np.array(targets)

#window size: each sample uses SEQ_LENGTH consecutive observations as input
#and the observation that follows them as the target
SEQ_LENGTH = 12
X, y = create_sequences(data=data_normalized, seq_length=SEQ_LENGTH)

#pytorch data and data loader
class JobVacancyDataset(Dataset):
    """Map-style dataset pairing input windows with their target values.

    Both inputs are copied into ``float32`` tensors at construction time.
    """

    @staticmethod
    def _as_float_tensor(array_like):
        """Copy ``array_like`` into a new float32 tensor."""
        return torch.tensor(array_like, dtype=torch.float32)

    def __init__(self, X, y):
        """Store the windows ``X`` and the targets ``y`` as float32 tensors."""
        self.X = self._as_float_tensor(X)
        self.y = self._as_float_tensor(y)

    def __len__(self):
        """Return the number of samples (length of the first axis of ``X``)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(window, target)`` pair stored at position ``idx``."""
        window = self.X[idx]
        target = self.y[idx]
        return window, target

#split into train or test
#chronological split: the last TEST_SIZE windows are held out for testing and
#everything before them is used for training
TEST_SIZE = 12

#guard against a series that is too short: a non-positive train_size would be
#interpreted as a negative slice bound and silently produce a wrong split
if len(X) <= TEST_SIZE:
    raise ValueError(
        f"Need more than {TEST_SIZE} sequences to build a train/test split, got {len(X)}."
    )

train_size = len(X) - TEST_SIZE
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]

#create dataLoaders
train_dataset = JobVacancyDataset(X_train, y_train)
test_dataset = JobVacancyDataset(X_test, y_test)

#shuffle=False keeps the samples in their original (chronological) order
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

#model architecture with lstm model
class LSTMForecaster(nn.Module):
    """LSTM followed by a linear head that predicts a single next value.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size=1, hidden_size=50, num_layers=1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        #lstm Layer (batch_first=True: inputs are (batch, seq, feature))
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        #maps the final hidden output to one predicted value
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Run the LSTM over ``x`` and project the last time step.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        #initialize hidden state & cell state with 0s; new_zeros allocates them
        #directly on x's device and in x's dtype, so no extra copy is needed and
        #non-float32 models (e.g. after .double()) receive matching states
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)

        #forward pass
        out, _ = self.lstm(x, (h0, c0))

        #only the output at the last time step feeds the prediction head
        out = self.fc(out[:, -1, :])
        return out

#initialize model
#seed PyTorch's RNG first so the weight initialisation -- and therefore the
#training result -- is repeatable across kernel restarts
SEED = 42
torch.manual_seed(SEED)
model = LSTMForecaster(input_size=1, hidden_size=50, num_layers=1)

#test
#smoke test: push one batch through the untrained model to confirm the shapes;
#no gradients are needed for this check
sample_input, sample_target = next(iter(train_loader))
with torch.no_grad():
    sample_output = model(sample_input)

print(f"Input Shape: {sample_input.shape}")
print(f"Output Shape: {sample_output.shape}")

In [None]:
#evaluate model on test set
from sklearn.metrics import mean_absolute_error, mean_squared_error

model.eval() #set model to evaluation mode

all_test_predictions = []
all_test_targets = []

with torch.no_grad():
    for features, targets in test_loader:
        outputs = model(features)
        all_test_predictions.extend(outputs.cpu().numpy().flatten())
        all_test_targets.extend(targets.cpu().numpy().flatten())

#convert to numpy arrays
all_test_predictions = np.array(all_test_predictions).reshape(-1, 1)
all_test_targets = np.array(all_test_targets).reshape(-1, 1)

#inverse transform to original scale
predictions_test_actual = scaler.inverse_transform(all_test_predictions)
targets_test_actual = scaler.inverse_transform(all_test_targets)

#calculate metrics
mae = mean_absolute_error(targets_test_actual, predictions_test_actual)
rmse = np.sqrt(mean_squared_error(targets_test_actual, predictions_test_actual))

print(f"\nTest Set Evaluation:")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")


In [None]:
#visualize test set predictions
import matplotlib.pyplot as plt

test_dates = df.index[len(df) - len(predictions_test_actual):]

plt.figure(figsize=(12, 6))
plt.plot(test_dates, targets_test_actual, label='Actual Test Data', color='blue')
plt.plot(test_dates, predictions_test_actual, label='Predicted Test Data', color='green', linestyle='--')

plt.title('Job Vacancy Rate: Actual vs. Predicted (Test Set)')
plt.xlabel('Date')
plt.ylabel('Job Vacancy Rate')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
#training loop
import torch.optim as optim

#loss function MSE & optimizer using adam
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 100
model.train()

print("Starting Training...")
for epoch in range(num_epochs):
    epoch_loss = 0
    for features, targets in train_loader:
        #forward pass
        outputs = model(features)
        loss = criterion(outputs, targets)

        #backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/len(train_loader):.4f}')

In [None]:
#forecast 2026
model.eval()

future_steps = 15
predictions = []

#seed window: the last SEQ_LENGTH normalized observations, with a leading
#batch axis added so the model receives a single-sample batch
seed_window = data_normalized[-SEQ_LENGTH:]
current_sequence = torch.tensor(seed_window, dtype=torch.float32).unsqueeze(0)

print("Forecasting future months...")
with torch.no_grad():
    for _step in range(future_steps):
        #one-step-ahead prediction from the current window
        next_value = model(current_sequence)

        #keep the prediction in normalized units; it is rescaled later
        predictions.append(next_value.item())

        #slide the window: drop the oldest step and append the new prediction,
        #so later forecasts are conditioned on earlier forecasts
        window_tail = current_sequence[:, 1:, :]
        current_sequence = torch.cat((window_tail, next_value.unsqueeze(1)), dim=1)

print("Forecasting complete.")

In [None]:
#visualize
import matplotlib.pyplot as plt

#bring the normalized forecasts back to the original units
normalized_column = np.array(predictions).reshape(-1, 1)
predictions_actual = scaler.inverse_transform(normalized_column)

#generate the future dates: month starts, beginning one month after the last observation
last_date = df.index[-1]
first_forecast_date = last_date + pd.DateOffset(months=1)
future_dates = pd.date_range(start=first_forecast_date, periods=future_steps, freq='MS')

#create a dataframe for the forecast
forecast_df = pd.DataFrame(predictions_actual, index=future_dates, columns=['Forecast'])

#historical series and forecast on one set of axes
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df.index, df['Job vacancy rate'], label='Historical Data (2015-2025)')
ax.plot(forecast_df.index, forecast_df['Forecast'], label='Forecast (2025-2026)', color='red', linestyle='--')

ax.set_title('Job Vacancy Rate Forecast: 2026 Outlook')
ax.set_xlabel('Date')
ax.set_ylabel('Job Vacancy Rate')
ax.legend()
ax.grid(True)
plt.show()

#report only the forecast months that fall in calendar year 2026
print("\nPredicted Job Market for 2026:")
in_2026 = forecast_df.index.year == 2026
print(forecast_df[in_2026])