In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import MinMaxScaler

In [None]:
# Runtime configuration: reproducibility seed and compute device

# Seed PyTorch's global RNG once, up front, so that everything drawing from it
# later in the notebook (random dataset split, weight initialisation, dropout)
# gives the same result on every "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

In [None]:
# Step 1: Load and preprocess the data

# Read the raw table, force 'year' to an integer dtype and put the rows in
# chronological order with a fresh 0..n-1 index.
data = (
    pd.read_csv("../data.csv")
    .assign(year=lambda frame: frame["year"].astype(int))
    .sort_values("year")
    .reset_index(drop=True)
)

# The three emission series that are modelled jointly
emission_columns = [
    "Coal_Emissions_Global",
    "NaturalGas_Emissions_Global",
    "Petroleum_Emissions_Global",
]

# Rescale every series to [0, 1]; the fitted scaler is kept so that model
# outputs can be mapped back to the original units later.
scaler = MinMaxScaler()
emissions = scaler.fit_transform(data[emission_columns])

In [None]:
# Step 2: Create a PyTorch dataset and dataloader


# Define a custom dataset class
class TimeSeriesDataset(Dataset):
    """Sliding-window view over a time-ordered array.

    Item ``i`` is the pair ``(x, y)`` where ``x`` holds the ``seq_length``
    consecutive rows starting at row ``i`` and ``y`` is the row that
    immediately follows that window.

    Args:
        data: Array-like of rows ordered in time; converted to a float32 tensor.
        seq_length: Number of consecutive rows in each input window.
    """

    def __init__(self, data, seq_length):
        self.data = torch.tensor(data, dtype=torch.float32)
        self.seq_length = seq_length

    def __len__(self):
        # Clamp at zero: a series shorter than one window has no samples, and
        # a negative value would make the built-in len() raise ValueError.
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, index):
        """Return the ``(window, next_row)`` pair at ``index``.

        Negative indices count from the end, as with a list.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        num_windows = len(self)
        if index < 0:
            # Without this, a negative start would slice an empty or
            # misaligned window instead of failing or wrapping around.
            index += num_windows
        if not 0 <= index < num_windows:
            raise IndexError(
                f"index {index} is out of range for a dataset of {num_windows} windows"
            )
        x = self.data[index : index + self.seq_length]
        y = self.data[index + self.seq_length]
        return x, y


# Hyper-parameters for windowing and batching
SEQ_LENGTH = 5  # Number of previous years to consider
BATCH_SIZE = 16

# Wrap the scaled series in sliding windows of SEQ_LENGTH years
dataset = TimeSeriesDataset(emissions, SEQ_LENGTH)

# 80 % of the windows go to training, the remainder to validation
num_windows = len(dataset)
train_size = int(num_windows * 0.8)
val_size = num_windows - train_size

# NOTE(review): consecutive windows overlap, so a random split lets rows that
# are validation targets also appear inside training inputs. Consider a
# chronological split if the validation loss should measure true hold-out error.
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size]
)

# Both loaders iterate their subset in a fixed order (no per-epoch shuffling)
train_loader, val_loader = (
    DataLoader(subset, batch_size=BATCH_SIZE, shuffle=False)
    for subset in (train_dataset, val_dataset)
)

In [None]:
# Step 3: Define the Transformer model


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch-first sequence.

    Even-indexed feature columns receive ``sin(pos * w_k)`` and odd-indexed
    columns ``cos(pos * w_k)``, with ``w_k = exp(-2k * ln(10000) / d_model)``.

    Args:
        d_model: Size of the last (feature) dimension; odd and even values
            are both supported.
        dropout: Dropout probability applied after the encoding is added.
        max_len: Number of positions that are precomputed; inputs with more
            time steps than this cannot be encoded.
    """

    def __init__(self, d_model, dropout=0.1, max_len=500):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Create constant 'pe' matrix with values dependent on
        # pos and i
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
        angles = position.float() * div_term  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd-indexed column than
        # even-indexed ones, so drop the surplus angle column; for an even
        # d_model the slice keeps every column.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)  # leading axis of size 1 broadcasts over the batch
        # Registered as a buffer: follows .to(device) but is not a parameter
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Return ``dropout(x + pe)`` for ``x`` of shape (batch, seq, d_model)."""
        # Add positional encoding to input tensor
        x = x + self.pe[:, : x.size(1)]
        return self.dropout(x)


class TimeSeriesTransformer(nn.Module):
    """Encoder-only Transformer that maps a window of rows to the next row.

    Args:
        num_features: Number of values per time step (input and output width).
        d_model: Width of the internal representation.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden width of each layer's feed-forward sub-block.
        dropout: Dropout probability for the positional encoding and encoder.
    """

    def __init__(
        self,
        num_features,
        d_model=64,
        nhead=4,
        num_layers=2,
        dim_feedforward=128,
        dropout=0.1,
    ):
        super().__init__()
        self.d_model = d_model

        # Lift raw features into the model width, then tag each step with its position
        self.input_projection = nn.Linear(num_features, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout)

        # Stack of identical self-attention layers (sequence-first layout)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
            ),
            num_layers=num_layers,
        )

        # Project the encoded representation back to one value per feature
        self.decoder = nn.Linear(d_model, num_features)

    def forward(self, src):
        """Predict the next row for each window.

        Args:
            src: Tensor of shape [batch_size, seq_length, num_features].

        Returns:
            Tensor of shape [batch_size, num_features].
        """
        embedded = self.input_projection(src) * np.sqrt(self.d_model)
        embedded = self.positional_encoding(embedded)
        # The encoder is built without batch_first, so it wants
        # [seq_length, batch_size, d_model]
        encoded = self.transformer_encoder(embedded.permute(1, 0, 2))
        # Only the representation of the final time step feeds the output head
        return self.decoder(encoded[-1])


# Build the network with one input/output channel per emission series and
# place it on the selected device
num_features = emissions.shape[1]
model = TimeSeriesTransformer(num_features=num_features)
model = model.to(DEVICE)

# Mean-squared error on the scaled values, optimised with Adam
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Step 4: Train the model

EPOCHS = 50

for epoch in range(EPOCHS):
    # ---- optimisation pass over the training batches ----
    model.train()  # enables dropout
    train_loss = 0.0
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(DEVICE), y_batch.to(DEVICE)

        optimizer.zero_grad()
        output = model(x_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()

        # The criterion yields a per-batch mean; weight it by the batch size
        # so the epoch figure below is a per-sample average.
        train_loss += loss.item() * len(x_batch)

    train_loss /= len(train_loader.dataset)

    # ---- evaluation pass over the validation batches (no gradients) ----
    model.eval()  # disables dropout
    val_loss = 0.0
    with torch.no_grad():
        for x_batch, y_batch in val_loader:
            x_batch, y_batch = x_batch.to(DEVICE), y_batch.to(DEVICE)

            output = model(x_batch)
            loss = criterion(output, y_batch)
            val_loss += loss.item() * len(x_batch)

    val_loss /= len(val_loader.dataset)

    epoch_report = (
        f"Epoch {epoch + 1}/{EPOCHS}, "
        f"Training Loss: {train_loss:.4f}, "
        f"Validation Loss: {val_loss:.4f}"
    )
    print(epoch_report)

In [None]:
# Step 5: Predict future emissions until 2030

# Last calendar year to forecast (inclusive)
FORECAST_END_YEAR = 2030

# Switch to inference mode (disables dropout)
model.eval()

# Years to predict: everything after the last observed year up to the horizon
last_year = data["year"].iloc[-1]
predict_years = np.arange(last_year + 1, FORECAST_END_YEAR + 1)
num_predictions = len(predict_years)

# Fail early with a readable message; otherwise the empty prediction list
# would only surface later as an obscure array-shape error.
if num_predictions == 0:
    raise ValueError(
        f"Data already extends to {last_year}; "
        f"there are no years left to forecast up to {FORECAST_END_YEAR}."
    )

# Seed the rolling window with the last SEQ_LENGTH observed (scaled) rows,
# shaped (1, SEQ_LENGTH, num_features)
input_seq = (
    torch.tensor(emissions[-SEQ_LENGTH:], dtype=torch.float32).unsqueeze(0).to(DEVICE)
)

# Autoregressive roll-out: each prediction is appended to the window and the
# oldest step is dropped, so later years are forecast from earlier forecasts.
predictions = []
with torch.no_grad():
    for _ in range(num_predictions):
        output = model(input_seq)  # (1, num_features)
        predictions.append(output.cpu().numpy())
        # Insert the time axis explicitly so the step is (batch, 1, num_features)
        next_input = output.unsqueeze(1)
        input_seq = torch.cat((input_seq[:, 1:, :], next_input), dim=1)

# Stack the per-year rows into (num_predictions, num_features) and map them
# back to the original scale
predictions = np.concatenate(predictions, axis=0)
predictions = scaler.inverse_transform(predictions)

In [None]:
# Step 6: Prepare and save the combined data

# Column names of the three forecast series, in the order the model emits them
coal_col, gas_col, petroleum_col = (
    "Coal_Emissions_Global",
    "NaturalGas_Emissions_Global",
    "Petroleum_Emissions_Global",
)

# Forecast rows: one per predicted year
future_data = pd.DataFrame(predictions, columns=[coal_col, gas_col, petroleum_col])
future_data["year"] = predict_years

# Observed rows first, forecast rows appended underneath
historical_data = data.copy()
combined_data = pd.concat([historical_data, future_data], ignore_index=True)

# Row-wise total across the three sources
combined_data["Total_Emissions"] = (
    combined_data[coal_col] + combined_data[gas_col] + combined_data[petroleum_col]
)

# Persist the combined table (without the index column)
combined_data.to_csv("emissions_predictions.csv", index=False)
print('Combined data saved to "emissions_predictions.csv".')

In [None]:
# Step 7: Plot and save the data
# ------------------------------

# Total emissions over time, observed and forecast on one line
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    combined_data["year"],
    combined_data["Total_Emissions"],
    label="Total Emissions",
    color="blue",
)

# Dashed marker at the last observed year; everything to its right is forecast
ax.axvline(
    x=last_year,
    color="red",
    linestyle="--",
    label=f"Prediction Start (Year {last_year + 1})",
)

ax.set_xlabel("Year")
ax.set_ylabel("Total Emissions")
ax.set_title("Historical and Predicted Total Emissions (Global)")
ax.legend()
ax.grid(True)
fig.tight_layout()

# Write the figure to disk before showing it
fig.savefig("total_emissions.png")
plt.show()
print('Plot saved as "total_emissions.png".')

In [None]:
# Step 8: Display total emissions for predicted years
# ---------------------------------------------------

# Sum each source over the forecast years, then add the three sums together
predicted_total_emissions = sum(
    future_data[source].sum()
    for source in (
        "Coal_Emissions_Global",
        "NaturalGas_Emissions_Global",
        "Petroleum_Emissions_Global",
    )
)

first_year, final_year = predict_years[0], predict_years[-1]
print(
    f"Total predicted emissions from {first_year} to {final_year}: "
    f"{predicted_total_emissions:.2f}"
)

import matplotlib.pyplot as plt

# Share of each source in the forecast-period emissions
labels = [
    "Coal_Emissions_Global",
    "NaturalGas_Emissions_Global",
    "Petroleum_Emissions_Global",
]

# Slice sizes come from the current forecast rather than literals pasted from
# an earlier run, so the chart stays in sync whenever the model is retrained.
# NOTE(review): the previous hard-coded values looked like per-source totals
# over the predicted years -- confirm that this is the intended quantity.
sizes = [float(future_data[label].sum()) for label in labels]

# Plot configuration
colors = ["#ff9999", "#66b3ff", "#99ff99"]  # Pleasant colors for each segment
explode = (0.0, 0.0, 0.0)  # All zeros: no slice is pulled out of the pie

# Create the pie chart
fig, ax = plt.subplots()
ax.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct="%1.1f%%",
    startangle=90,
)
ax.axis("equal")  # Equal aspect ratio ensures that pie is drawn as a circle.

# Set the background color of the figure to be transparent
fig.patch.set_alpha(0.0)
ax.patch.set_alpha(0.0)

# Save the plot with a transparent background
plt.savefig(
    "pie_chart_transparent_background.png", bbox_inches="tight", transparent=True
)
plt.show()