# London Atmospheric Emissions Inventory Analysis

The code below predicts atmospheric emissions from the data provided for London's atmospheric emissions in four years: 2008, 2010, 2013, and 2020. The dataset focuses on major roads in London and the atmospheric emissions recorded for each of those years.

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim

## Load in datasets and concatenate dataframes

In [None]:
# Read one workbook per inventory year.
df_2008_org = pd.read_excel('Major Roads/LAEI2013_MajorRoads_EmissionsbyLink_2008.xlsx')
df_2010_org = pd.read_excel('Major Roads/LAEI2013_MajorRoads_EmissionsbyLink_2010.xlsx')
df_2013_org = pd.read_excel('Major Roads/LAEI2013_MajorRoads_EmissionsbyLink_2013.xlsx')
df_2020_org = pd.read_excel('Major Roads/LAEI2013_MajorRoads_EmissionsbyLink_2020.xlsx')

# Give the borough column one common name so it lines up as a single column after concatenation.
# NOTE(review): the 2010 frame is not renamed here; presumably it already uses
# 'Borough_ExactCut' -- confirm against the workbook.
borough_rename = {'BoroughExactCut': 'Borough_ExactCut'}

# The 2020 frame additionally loses its unneeded 'DotRef' column.
df_2020_org = df_2020_org.drop(columns=['DotRef']).rename(columns=borough_rename)
df_2008_org = df_2008_org.rename(columns=borough_rename)
df_2013_org = df_2013_org.rename(columns=borough_rename)

# Stack all years into one frame and print its column summary.
yearly_frames = [df_2008_org, df_2010_org, df_2013_org, df_2020_org]
main_df = pd.concat(yearly_frames)
main_df.info()

## Function to convert all object columns to Numeric Encoding

In [None]:
def convert_object_columns_to_numeric(df):
    """
        Return a copy of the dataframe in which every object-dtype column has been label encoded.

        Each object column is fitted and transformed independently with a LabelEncoder and stored
        as integers, so the returned frame has no object columns left. The input dataframe is not
        modified. A summary of the encoded frame is printed via ``DataFrame.info()`` before returning.
    """
    encoder = LabelEncoder()
    object_columns = df.select_dtypes(include=['object']).columns
    encoded = df.copy()

    for name in object_columns:
        # fit_transform re-fits the shared encoder, so every column gets its own code mapping.
        encoded[name] = encoder.fit_transform(encoded[name]).astype(int)

    encoded.info()
    return encoded

## Function to split the data based on target column and filter train/test set based on the Year of the data

In [None]:
def split_data_based_on_target_label(df, target_column, test_year):
    """
        Split a dataframe chronologically on its 'Year' column and separate features from the target.

        Rows with Year < test_year form the training set; rows with Year >= test_year form the test set.
        E.g. test_year = 2020 puts every row before 2020 into training and every row from 2020 onwards
        into the test set. The target column is removed from the feature frames; every other column
        (including 'Year') is kept.

        Returns the tuple (X_train, X_test, y_train, y_test).
    """
    years = df['Year']

    def _features_and_target(subset):
        # Features are everything except the target; the target is returned on its own.
        return subset.drop(columns=target_column), subset[target_column]

    X_train, y_train = _features_and_target(df[years < test_year])
    X_test, y_test = _features_and_target(df[years >= test_year])

    return X_train, X_test, y_train, y_test

## Function to analyze the performance of the model

In [None]:
def analyze_model(y_test, predictions, model_type):
    """
        Report regression metrics for a set of predictions and plot them against the ground truth.

        Prints MSE, RMSE, MAE and the R2 score, then shows a scatter plot of actual vs. predicted
        values titled with model_type. Returns the tuple (mse, rmse).
    """
    def _report(label, value):
        # Print one metric line and hand the value back so it can be reused.
        print(label, value)
        return value

    mse = _report("Mean Squared Error (MSE):", mean_squared_error(y_test, predictions))
    rmse = _report("Root Mean Squared Error (RMSE):", np.sqrt(mse))
    _report("Mean Absolute Error (MAE):", mean_absolute_error(y_test, predictions))
    _report("R-squared (R2) Score:", r2_score(y_test, predictions))

    # Actual vs. predicted scatter
    plt.scatter(y_test, predictions)
    plt.xlabel('Actual values')
    plt.ylabel('Predicted values')
    plt.title(model_type + ' Actual vs. Predicted Values')

    # Dashed identity line: a perfect model would put every point on it
    lowest, highest = y_test.min(), y_test.max()
    plt.plot([lowest, highest], [lowest, highest], 'k--', lw=4)
    plt.show()

    return mse, rmse

## Function to perform model training

In [None]:
def train_and_analyze_model(X_train, y_train, X_test, y_test, model, model_type):
    """
        Fit the given model on the training split, predict on the test split and report the results.

        The model only needs fit(X, y) and predict(X) methods. The predictions are passed to
        analyze_model together with model_type, and its (mse, rmse) result is returned.
    """
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)

    # analyze_model prints the metrics and draws the actual-vs-predicted plot
    test_mse, test_rmse = analyze_model(y_test, test_predictions, model_type)
    return test_mse, test_rmse

## Neural Network Architecture

In [None]:
# Define the neural network architecture
class Net(nn.Module):
    """
        Fully connected regression network: input_size -> 128 -> 64 -> 1.

        A ReLU activation follows each of the first two linear layers; the final layer
        outputs a single value per input row with no activation.
    """

    def __init__(self, input_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        """Run x through both hidden layers and return the single-unit output."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

## Function to build the neural network model

In [None]:
def build_neural_network_and_analyze_model(X_train, y_train, X_test, y_test, model_type,
                                           epochs=8000, lr=0.001, log_every=10):
    """
        Scale the data, train a Net regressor with full-batch Adam, and analyze its test predictions.

        Parameters:
            X_train, X_test: feature tables accepted by StandardScaler (fit on train, applied to test).
            y_train, y_test: targets exposing ``.values`` (e.g. pandas Series).
            model_type: label used in the plot title produced by analyze_model.
            epochs: number of full-batch training iterations (default 8000).
            lr: Adam learning rate (default 0.001).
            log_every: print the training loss every ``log_every`` epochs (default 10);
                pass 0 or None to silence the per-epoch log.

        Returns:
            The (mse, rmse) tuple produced by analyze_model on the test set.
    """
    # Standardize features by removing the mean and scaling to unit variance
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert data to PyTorch tensors; targets become column vectors to match the network output
    X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

    # Initialize the network, loss function and optimizer
    model = Net(X_train_tensor.shape[1])
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training the model (the whole training set is one batch)
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

        if log_every and (epoch + 1) % log_every == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

    # Predict on the test data
    model.eval()
    with torch.no_grad():
        predictions = model(X_test_tensor)

    # Compute the loss (MSE) and other metrics (MAE and R²) directly on the tensors
    mse = criterion(predictions, y_test_tensor).item()
    mae = torch.mean(torch.abs(predictions - y_test_tensor)).item()
    residual_ss = torch.sum((y_test_tensor - predictions) ** 2)
    total_ss = torch.sum((y_test_tensor - torch.mean(y_test_tensor)) ** 2)
    r2 = 1 - (residual_ss / total_ss).item()

    print(f"Local mse, mae, and r2 values are {mse}, {mae}, {r2}")

    # analyze_model relies on scikit-learn metrics and matplotlib, which expect array-likes,
    # so hand it flat NumPy arrays instead of (n, 1) torch tensors.
    y_true = y_test_tensor.detach().cpu().numpy().ravel()
    y_pred = predictions.detach().cpu().numpy().ravel()
    mse, rmse = analyze_model(y_true, y_pred, model_type)
    return mse, rmse


In [None]:
def split_dataset_and_train_models(df, target_column, model_list, test_year=2020):
    """
        Encode the dataframe, split it by year, train every requested model and report the best one.

        Parameters:
            df: raw dataframe; object columns are label encoded before splitting.
            target_column: name of the column to predict.
            model_list: dict mapping a display name to an estimator with fit/predict. The key
                "NeuralNetwork" is special: its value is ignored and the PyTorch network is trained instead.
            test_year: rows with Year >= test_year are used as the test set (default 2020).

        Prints the best algorithm by MSE and by RMSE. Returns None.
    """
    df_encoded = convert_object_columns_to_numeric(df)
    X_train, X_test, y_train, y_test = split_data_based_on_target_label(df_encoded, target_column, test_year)

    # Start from infinity so the first finite score always wins, however large the errors are
    # (a fixed sentinel such as 999999 would leave the winner empty when every error exceeds it).
    lowest_mse_algorithm = ""
    lowest_mse = float("inf")
    lowest_rmse_algorithm = ""
    lowest_rmse = float("inf")

    for model_name, estimator in model_list.items():
        if model_name == "NeuralNetwork":
            mse, rmse = build_neural_network_and_analyze_model(X_train, y_train, X_test, y_test, model_name)
        else:
            mse, rmse = train_and_analyze_model(X_train, y_train, X_test, y_test, estimator, model_name)

        if mse < lowest_mse:
            lowest_mse = mse
            lowest_mse_algorithm = model_name

        if rmse < lowest_rmse:
            lowest_rmse = rmse
            lowest_rmse_algorithm = model_name

    print(f"Best performing algorithm according to Mean Squared Error is {lowest_mse_algorithm} and has a value of {lowest_mse}")
    print(f"Best performing algorithm according to Root Mean Squared Error is {lowest_rmse_algorithm} and has a value of {lowest_rmse}")

In [None]:
# Candidate models keyed by display name. The "NeuralNetwork" value is only a placeholder:
# split_dataset_and_train_models dispatches on that key and trains the PyTorch network itself.
model_list = {
    "LinearRegression": LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(),
    "NeuralNetwork": True,
}

# Predict the "PetrolCar" column, holding out the 2020 rows as the test set.
split_dataset_and_train_models(main_df, "PetrolCar", model_list, test_year=2020)