In [None]:
from dataset import TreeCoverLossDataset, DriverTypeDataset
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.preprocessing import MinMaxScaler

# Prepare sequences for a specific country
def prepare_country_data(country_data, seq_length=5):
    """
    Build sliding-window samples from one country's records.

    The first column of ``country_data`` (the year) is dropped. Of the
    remaining columns, column 0 (TreeCoverLoss) and column 1 (CO2) are each
    min-max scaled by their own scaler, fitted on the data passed in.

    Returns a tuple ``(sequences, targets, scaler_loss)``: every sequence holds
    ``seq_length`` consecutive scaled rows, every target is the scaled
    TreeCoverLoss of the row right after its window, and ``scaler_loss`` is the
    fitted TreeCoverLoss scaler (usable to map values back to original units).
    """
    features = np.array(country_data)[:, 1:]  # drop the leading year column

    # One scaler per feature so each column is normalised over its own range.
    scaler_loss = MinMaxScaler()
    scaler_co2 = MinMaxScaler()
    loss_column = scaler_loss.fit_transform(features[:, [0]])
    co2_column = scaler_co2.fit_transform(features[:, [1]])
    scaled = np.hstack((loss_column, co2_column))

    # Each window start yields one (input window, next-step TreeCoverLoss) pair.
    window_starts = range(len(scaled) - seq_length)
    windows = [scaled[start:start + seq_length] for start in window_starts]
    next_losses = [scaled[start + seq_length, 0] for start in window_starts]
    return np.array(windows), np.array(next_losses), scaler_loss

class LSTMModel(nn.Module):
    """LSTM stack followed by a linear layer applied to the final time step."""

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, time, feature).
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the linear projection of the LSTM output at the last time step."""
        sequence_output, _state = self.lstm(x)
        last_step = sequence_output[:, -1, :]
        return self.fc(last_step)

def train_model(model, train_loader, criterion, optimizer, num_epochs=50):
    """
    Train ``model`` on ``train_loader`` for ``num_epochs`` epochs.

    Args:
        model: Network to optimise; it is switched to training mode.
        train_loader: Iterable yielding ``(seq, target)`` tensor batches.
        criterion: Loss callable invoked as ``criterion(prediction, target)``.
        optimizer: Optimiser that updates the model's parameters.
        num_epochs: Number of passes over ``train_loader``.

    After every epoch the mean of the per-batch losses is printed (``nan`` if
    the loader yielded no batches).
    """
    model.train()

    # Run on whichever device the model already lives on, rather than relying
    # on a module-level ``device`` global being defined before this is called.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        for seq, target in train_loader:
            seq = seq.float().to(run_device)
            target = target.float().to(run_device)

            # Forward pass. Only the trailing dimension is squeezed: a bare
            # squeeze() would also drop the batch dimension of a single-sample
            # batch and make the loss broadcast a 0-d prediction against a
            # 1-d target.
            output = model(seq)
            loss = criterion(output.squeeze(-1), target)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {mean_loss:.4f}")


def evaluate_model(model, test_loader, scaler_loss):
    """
    Compute the RMSE of ``model`` on ``test_loader`` in original units.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        test_loader: Iterable yielding ``(seq, target)`` tensor batches.
        scaler_loss: Object exposing ``inverse_transform`` that maps a column
            of scaled values back to original units. It should be the scaler
            that produced the targets in ``test_loader``.

    Returns:
        The root-mean-square error between rescaled predictions and rescaled
        targets (also printed).

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    model.eval()

    # Run on whichever device the model already lives on, rather than relying
    # on a module-level ``device`` global being defined before this is called.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    predictions, actuals = [], []
    with torch.no_grad():
        for seq, target in test_loader:
            output = model(seq.float().to(run_device))
            # Flatten every batch so batches of different sizes (e.g. a short
            # final batch) can be joined afterwards.
            predictions.append(output.cpu().numpy().reshape(-1))
            actuals.append(target.float().cpu().numpy().reshape(-1))

    if not predictions:
        raise ValueError("test_loader produced no batches to evaluate.")

    # Rescale predictions and actuals for TreeCoverLoss. np.concatenate is used
    # because np.array() on a list of unequal-length batches does not produce a
    # numeric array.
    predictions = scaler_loss.inverse_transform(np.concatenate(predictions).reshape(-1, 1))
    actuals = scaler_loss.inverse_transform(np.concatenate(actuals).reshape(-1, 1))

    # Calculate RMSE
    rmse = np.sqrt(np.mean((predictions - actuals) ** 2))
    print(f"RMSE: {rmse:.4f}")
    return rmse

# Parameters
seq_length = 5
input_size = 2  # TreeCoverLoss and CO2
hidden_size = 64
num_layers = 2
output_size = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the data and initialize the model, loss and optimizer
dataset = TreeCoverLossDataset("TreeCoverLoss_2001-2020_ByRegion.csv", split_train_test=True)
model = LSTMModel(input_size, hidden_size, num_layers, output_size).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# NOTE(review): one model/optimizer pair is shared by every country, so the
# weights learned on earlier countries carry over to later ones. Confirm this
# sequential fine-tuning is intended rather than one fresh model per country.

rmse_list = []
# Train and evaluate for each country
for country in dataset.country_list:
    print(f"\nProcessing country: {country}")

    # At least seq_length + 1 rows are needed to form one (window, target) pair.
    if len(dataset.train_data[country]) < seq_length + 1:
        print(f"Skipping {country}: Insufficient training data.")
        continue
    if len(dataset.test_data[country]) < seq_length + 1:
        print(f"Skipping {country}: Insufficient testing data.")
        continue

    # Prepare train and test data. Each call fits its own scalers on the data
    # it is given, so the test split comes back with its own TreeCoverLoss scaler.
    train_seq, train_target, scaler_loss = prepare_country_data(dataset.train_data[country], seq_length)
    test_seq, test_target, test_scaler_loss = prepare_country_data(dataset.test_data[country], seq_length)

    if len(train_seq) == 0 or len(test_seq) == 0:
        print(f"Skipping {country}: No valid sequences.")
        continue

    # Create DataLoaders
    train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_seq), torch.tensor(train_target))
    test_dataset = torch.utils.data.TensorDataset(torch.tensor(test_seq), torch.tensor(test_target))

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16)

    # Train the model
    train_model(model, train_loader, criterion, optimizer, num_epochs=50)

    # Evaluate the model. The test targets were scaled by test_scaler_loss, so
    # that same scaler must undo the scaling; using the train-fitted scaler
    # would map the actual values back to the wrong units.
    # NOTE(review): fitting scalers on the test split leaks test statistics;
    # scaling the test split with the train-fitted scalers would be cleaner but
    # needs prepare_country_data to accept pre-fitted scalers.
    rmse = evaluate_model(model, test_loader, test_scaler_loss)
    rmse_list.append(rmse)
    print(f"RMSE for {country}: {rmse:.4f}")

# Guard against every country having been skipped (empty list -> division by zero).
if rmse_list:
    print(f"Average RMSE for all countries: {sum(rmse_list) / len(rmse_list)}")
else:
    print("Average RMSE for all countries: n/a (no country had enough data)")


Processing country: ABW
Skipping ABW: Insufficient training data.

Processing country: AFG
Epoch 1/50, Loss: 0.3635
Epoch 2/50, Loss: 0.3425
Epoch 3/50, Loss: 0.3225
Epoch 4/50, Loss: 0.3034
Epoch 5/50, Loss: 0.2850
Epoch 6/50, Loss: 0.2671
Epoch 7/50, Loss: 0.2498
Epoch 8/50, Loss: 0.2328
Epoch 9/50, Loss: 0.2163
Epoch 10/50, Loss: 0.2003
Epoch 11/50, Loss: 0.1849
Epoch 12/50, Loss: 0.1702
Epoch 13/50, Loss: 0.1566
Epoch 14/50, Loss: 0.1445
Epoch 15/50, Loss: 0.1346
Epoch 16/50, Loss: 0.1277
Epoch 17/50, Loss: 0.1245
Epoch 18/50, Loss: 0.1258
Epoch 19/50, Loss: 0.1307
Epoch 20/50, Loss: 0.1367
Epoch 21/50, Loss: 0.1407
Epoch 22/50, Loss: 0.1413
Epoch 23/50, Loss: 0.1390
Epoch 24/50, Loss: 0.1351
Epoch 25/50, Loss: 0.1310
Epoch 26/50, Loss: 0.1276
Epoch 27/50, Loss: 0.1253
Epoch 28/50, Loss: 0.1241
Epoch 29/50, Loss: 0.1239
Epoch 30/50, Loss: 0.1242
Epoch 31/50, Loss: 0.1249
Epoch 32/50, Loss: 0.1257
Epoch 33/50, Loss: 0.1264
Epoch 34/50, Loss: 0.1269
Epoch 35/50, Loss: 0.1272
Epoch 3