Step 1: Install Dependencies

In [2]:
!pip install pytorch-forecasting
!pip install pytorch-lightning
!pip install torch
!pip install pandas
!pip install matplotlib

Collecting pytorch-forecasting
  Downloading pytorch_forecasting-1.4.0-py3-none-any.whl.metadata (14 kB)
Collecting lightning<3.0.0,>=2.0.0 (from pytorch-forecasting)
  Downloading lightning-2.5.2-py3-none-any.whl.metadata (38 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch!=2.0.1,<3.0.0,>=2.0.0->pytorch-forecasting)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch!=2.0.1,<3.0.0,>=2.0.0->pytorch-forecasting)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch!=2.0.1,<3.0.0,>=2.0.0->pytorch-forecasting)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch!=2.0.1,<3.0.0,>=2.0.0->pytorch-forecasting)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.m

Step 2: Import Required Libraries

In [12]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import torch
import pytorch_lightning as pl
from pytorch_forecasting import TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_lightning import Trainer  # Corrected import here
from pytorch_forecasting import metrics
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt


Step 3: Load and Preprocess Data

In [35]:
# Load your dataset (replace 'path_to_data.csv' with your actual path)
data = pd.read_csv("/kaggle/input/america-dengue-data/dengue data 17-07-2025.csv")
data.index
# Filter data for the city of San Jose ('sj')
data_sj = data[data['city'] == 'sj'].copy()

# Ensure the week_start_date is in datetime format
data_sj['week_start_date'] = pd.to_datetime(data_sj['week_start_date'])

# Feature engineering: create additional time-based features (e.g., year, month, day, week of the year)
data_sj['year'] = data_sj['week_start_date'].dt.year
data_sj['month'] = data_sj['week_start_date'].dt.month
data_sj['day'] = data_sj['week_start_date'].dt.day
data_sj['dayofweek'] = data_sj['week_start_date'].dt.dayofweek
data_sj['weekofyear'] = data_sj['week_start_date'].dt.isocalendar().week

# Normalize continuous variables (e.g., NDVI, temperature, precipitation, etc.)
scaler = MinMaxScaler()
scaled_columns = ['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw',
                  'precipitation_amt_mm', 'reanalysis_air_temp_k',
                  'reanalysis_relative_humidity_percent', 'station_avg_temp_c',
                  'station_max_temp_c', 'station_min_temp_c']
data_sj[scaled_columns] = scaler.fit_transform(data_sj[scaled_columns])

# Inspect the first few rows
data_sj.head()


  data_sj['week_start_date'] = pd.to_datetime(data_sj['week_start_date'])


Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases,month,day,dayofweek
0,sj,1990,18,1990-04-30,0.58784,0.626763,0.523701,0.541889,0.031797,0.261008,...,2.628571,0.359684,6.9,0.303371,0.282051,16.0,4,4,30,0
1,sj,1990,19,1990-05-07,0.640416,0.669811,0.4353,0.492142,0.058423,0.362993,...,2.371429,0.535573,6.371429,0.561798,0.564103,8.6,5,5,7,0
2,sj,1990,20,1990-05-14,0.487412,0.704284,0.42268,0.526662,0.088428,0.454027,...,2.3,0.535573,6.485714,0.617978,0.641026,41.4,4,5,14,0
3,sj,1990,21,1990-05-21,0.594546,0.785005,0.594845,0.672866,0.039324,0.486881,...,2.428571,0.640316,6.771429,0.741573,0.705128,4.0,3,5,21,0
4,sj,1990,22,1990-05-28,0.669649,0.804187,0.652699,0.698613,0.019252,0.571755,...,3.014286,0.843874,9.371429,0.932584,0.782051,5.8,6,5,28,0


In [24]:
!pip install --upgrade pytorch-forecasting
!pip show pytorch-forecasting



Name: pytorch-forecasting
Version: 1.4.0
Summary: Forecasting timeseries with PyTorch - dataloaders, normalizers, metrics and models
Home-page: 
Author: Jan Beitner
Author-email: 
License: 
Location: /usr/local/lib/python3.11/dist-packages
Requires: lightning, numpy, pandas, scikit-learn, scipy, torch
Required-by: 


Step 4: Prepare Time Series Dataset

In [37]:
# # Define the target and time series features
# target = 'total_cases'  # What we're trying to predict

# # Define the categorical and continuous features
# categorical_cols = ['city', 'year', 'month', 'dayofweek']  # Add 'month' and 'dayofweek' for better time features
# continuous_cols = ['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 
#                    'reanalysis_air_temp_k', 'reanalysis_relative_humidity_percent', 
#                    'station_avg_temp_c', 'station_precip_mm']  # Example columns

# # Ensure data is sorted by time before passing to TimeSeriesDataSet
# data_sj['week_start_date'] = pd.to_datetime(data_sj['week_start_date'])
# data_sj = data_sj.sort_values(['city', 'week_start_date'])

# # Scale continuous features for the model
# scaler = MinMaxScaler()
# data_sj[continuous_cols] = scaler.fit_transform(data_sj[continuous_cols])

# # Create the dataset for time series forecasting
# max_encoder_length = 24  # Number of time steps used for the encoder (past data)
# max_prediction_length = 1  # We are predicting the next week's 'total_cases'

# # Create a TimeSeriesDataset for the TFT model
# from pytorch_forecasting import TimeSeriesDataSet

# try:
#     training = TimeSeriesDataSet(
#         data_sj,
#         time_idx='weekofyear',  # Using week of year as time index
#         target=target,
#         group_ids=['city'],  # Group by city
#         categorical_columns=categorical_cols,  # Correct argument name for categorical variables
#         continuous_columns=continuous_cols,  # Correct argument name for continuous variables
#         min_encoder_length=max_encoder_length,
#         max_encoder_length=max_encoder_length,
#         min_prediction_length=max_prediction_length,
#         max_prediction_length=max_prediction_length,
#         static_categoricals=['city'],  # Use city as static categorical
#         time_varying_known_reals=continuous_cols,  # These are known features (i.e., input to the model)
#         time_varying_unknown_reals=[target]  # We want to predict the 'total_cases' variable
#     )
    
#     # Split the data into training and validation sets
#     train_size = int(len(data_sj) * 0.8)
#     train_dataset = training[:train_size]
#     val_dataset = training[train_size:]

#     # Inspect the data
#     print(f"Training dataset size: {len(train_dataset)}")
#     print(f"Validation dataset size: {len(val_dataset)}")

# except Exception as e:
#     print(f"Error: {e}")


Error: TimeSeriesDataSet.__init__() got an unexpected keyword argument 'categorical_columns'


In [44]:
# Convert 'month' to string to treat it as a categorical variable
data_sj['month'] = data_sj['month'].astype(str)

# Verify the data types again
print(data_sj[categorical_cols].dtypes)  # Should show 'object' for categorical columns

# Scale continuous features for the model
continuous_cols = ['ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 
                   'reanalysis_air_temp_k', 'reanalysis_relative_humidity_percent', 
                   'station_avg_temp_c', 'station_precip_mm']

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data_sj[continuous_cols] = scaler.fit_transform(data_sj[continuous_cols])

# Now, recreate the TimeSeriesDataset for the TFT model
from pytorch_forecasting import TimeSeriesDataSet

# Define the target variable
target = 'total_cases'

# Create the TimeSeriesDataset
max_encoder_length = 24  # Number of time steps used for the encoder (past data)
max_prediction_length = 1  # We are predicting the next week's 'total_cases'

try:
    training = TimeSeriesDataSet(
        data_sj,
        time_idx='weekofyear',  # 'weekofyear' is an integer
        target=target,
        group_ids=['city'],  # Group by city
        static_categoricals=['city'],  # 'city' is static and does not change over time
        static_reals=[],  # No static continuous variables
        time_varying_known_categoricals=['year', 'month', 'dayofweek'],  # These are categorical features that change over time
        time_varying_known_reals=continuous_cols,  # These are continuous features that change over time
        time_varying_unknown_reals=[target],  # 'total_cases' is what we're predicting
        min_encoder_length=max_encoder_length,
        max_encoder_length=max_encoder_length,
        min_prediction_length=max_prediction_length,
        max_prediction_length=max_prediction_length
    )
    
    # Split the data into training and validation sets
    train_size = int(len(data_sj) * 0.8)
    train_dataset = training[:train_size]
    val_dataset = training[train_size:]

    # Inspect the data
    print(f"Training dataset size: {len(train_dataset)}")
    print(f"Validation dataset size: {len(val_dataset)}")

except Exception as e:
    print(f"Error: {e}")


city         object
year         object
month        object
dayofweek     int32
dtype: object
Error: Data type of category dayofweek was found to be numeric - use a string type / categorified string


Step 5: Model Building (Temporal Fusion Transformer)


In [28]:
# Instantiate the Temporal Fusion Transformer model
tft = TemporalFusionTransformer.from_dataset(
    train_dataset,
    learning_rate=0.03,
    hidden_size=16,  # Size of hidden layers
    attention_head_size=1,
    dropout=0.1,
    hidden_continuous_size=8,
    output_size=1,  # Single output (total_cases)
    loss=pl.metrics.MeanSquaredError(),  # Loss function
    log_interval=10,  # Log every 10 batches
    reduce_on_plateau_patience=4,  # Patience for learning rate scheduler
)

# Model Summary
print(tft)


NameError: name 'train_dataset' is not defined

Step 6: Training the Model

In [None]:
# Set up the PyTorch Lightning trainer
trainer = Trainer(
    max_epochs=20,  # Number of epochs
    gpus=1 if torch.cuda.is_available() else 0,  # Use GPU if available
    gradient_clip_val=0.1,  # Gradient clipping to avoid exploding gradients
    limit_train_batches=30,  # Limit train batches for faster debugging
    limit_val_batches=30,  # Limit validation batches for faster debugging
)

# Train the model
trainer.fit(
    tft,
    train_dataloader=train_dataset.to_dataloader(train=True, batch_size=64, num_workers=4),
    val_dataloaders=val_dataset.to_dataloader(train=False, batch_size=64, num_workers=4),
)


Step 7: Evaluation and Prediction

In [None]:
# Make predictions on the validation set
raw_predictions, x = tft.predict(val_dataset, mode="raw", return_x=True)

# Extract predictions and ground truth values for evaluation
predictions = raw_predictions['prediction']
actuals = x['target']

# Compare predictions and actuals
plt.figure(figsize=(12, 6))
plt.plot(predictions.numpy(), label="Predictions")
plt.plot(actuals.numpy(), label="Actuals", linestyle="--")
plt.title("Predictions vs Actuals for Total Cases")
plt.xlabel("Week")
plt.ylabel("Total Cases")
plt.legend()
plt.show()

# Calculate performance metrics (e.g., Mean Absolute Error, RMSE)
from sklearn.metrics import mean_absolute_error, mean_squared_error
mae = mean_absolute_error(actuals.numpy(), predictions.numpy())
rmse = np.sqrt(mean_squared_error(actuals.numpy(), predictions.numpy()))

print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error: {rmse}")
