In [1]:
import scipy.stats as stats
import numpy as np
import sklearn as sk
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [2]:
def _lagged_features_and_target(df, year, horizon):
    """Build one year's (X, y) pair with the target shifted `horizon` rows ahead.

    X holds every column except 'UTC time' (so it includes 'carbon_intensity'
    at the current row); y holds 'carbon_intensity' `horizon` rows later.

    Raises:
        ValueError: if the year has too few rows to form a single pair.
    """
    year_rows = df[df['UTC time'].dt.year == year]
    n_rows = len(year_rows)
    if n_rows <= horizon:
        raise ValueError(
            f"Year {year} has {n_rows} rows; need more than horizon={horizon} "
            "rows to build at least one (features, target) pair."
        )
    n_pairs = n_rows - horizon
    X = year_rows.drop(columns=['UTC time']).values[:n_pairs]
    y = year_rows['carbon_intensity'].values[horizon:].reshape(-1, 1)
    return X, y


def prepare_data(df, train_year, test_year, horizon=24):
    """Create standardized train/test arrays for a `horizon`-rows-ahead forecast.

    For each year, row i of X is paired with the 'carbon_intensity' value found
    `horizon` rows later in the same year. Both scalers are fitted on the
    training year only and then applied to the test year.

    NOTE(review): a row shift equals a time shift only if the rows are sorted
    chronologically and evenly spaced (presumably hourly, making the default
    a 24-hour-ahead forecast) -- confirm against the dataset.

    Side effect: df['UTC time'] is converted to datetime in place on the
    caller's DataFrame.

    Args:
        df: DataFrame with a 'UTC time' column, a 'carbon_intensity' column,
            and any further feature columns.
        train_year: calendar year whose rows are used to fit the scalers.
        test_year: calendar year whose rows form the test set.
        horizon: number of rows between the features and the target (>= 1).

    Returns:
        (X_train, y_train, X_test, y_test, y_scaler), where the four arrays are
        standardized, the y arrays have shape (n, 1), and y_scaler can
        inverse-transform predictions back to original units.

    Raises:
        ValueError: if horizon < 1 or either year has <= horizon rows.
    """
    horizon = int(horizon)
    if horizon < 1:
        raise ValueError(f"horizon must be a positive integer, got {horizon}")

    # Ensure the timestamp column is datetime so `.dt.year` works
    df['UTC time'] = pd.to_datetime(df['UTC time'])

    X_train, y_train = _lagged_features_and_target(df, train_year, horizon)
    X_test, y_test = _lagged_features_and_target(df, test_year, horizon)

    # Standardize features and target; fit on the training year only so that
    # no test-year statistics leak into the transformation.
    X_scaler = StandardScaler()
    y_scaler = StandardScaler()

    X_train_normalized = X_scaler.fit_transform(X_train)
    y_train_normalized = y_scaler.fit_transform(y_train)
    X_test_normalized = X_scaler.transform(X_test)
    y_test_normalized = y_scaler.transform(y_test)

    print("X_train_normalized shape:", X_train_normalized.shape)
    print("y_train_normalized shape:", y_train_normalized.shape)
    print("X_test_normalized shape:", X_test_normalized.shape)
    print("y_test_normalized shape:", y_test_normalized.shape)
    print(X_train_normalized[:5])
    print(y_train_normalized[:5])
    print(X_test_normalized[:5])
    print(y_test_normalized[:5])

    return X_train_normalized, y_train_normalized, X_test_normalized, y_test_normalized, y_scaler

    
    



# Load your dataset
df = pd.read_csv('./data/BPAT/BPAT_direct_emissions.csv')

# Specify the training and testing years
train_year = 2020
test_year = 2021

# Prepare the data. prepare_data already pairs each feature row with the
# target 24 rows ahead, so X_* and y_* are aligned row-for-row from here on.
X_train, y_train, X_test, y_test, y_scaler = prepare_data(df, train_year, test_year)

# Train the model (Linear Regression) on the standardized training year
model = LinearRegression()
model.fit(X_train, y_train)

# Predict for every test row. No extra [:-24] / [24:] slicing here: shifting
# again would score each prediction against the value 48 rows ahead instead
# of the 24-rows-ahead target the model was trained on.
y_pred_normalized = model.predict(X_test)

# Map predictions and ground truth back to original units before scoring
y_pred = y_scaler.inverse_transform(y_pred_normalized)
y_test_original = y_scaler.inverse_transform(y_test)

# Evaluate the model on aligned (truth, prediction) pairs.
# Note: sklearn's MAPE is a fraction (0.05 == 5%), not a percentage.
rmse = np.sqrt(mean_squared_error(y_test_original, y_pred))
mape = mean_absolute_percentage_error(y_test_original, y_pred)

print("Predicted values (first 5):", y_pred[:5])  # Show the first few predictions for verification
print(f"RMSE: {rmse}")
print(f"MAPE: {mape}")

X_train_normalized shape: (8760, 8)
y_train_normalized shape: (8760, 1)
X_test_normalized shape: (8736, 8)
y_test_normalized shape: (8736, 1)
[[-1.7318531   0.54286793  1.06426151  0.34010433 -0.21739122 -0.62317695
   1.03737889  1.24607327]
 [-1.73145765  0.46252179  1.07024468  0.35370776 -0.09174209 -0.62317695
   1.10749225  1.10786917]
 [-1.73106221  0.43760826  1.05827834  0.35824223 -0.08878066 -0.62317695
   1.26900339  1.21152224]
 [-1.73066676  0.41954595  1.06126992  0.35824223  0.06098294 -0.62317695
   0.90216027  1.10786917]
 [-1.73027131  0.54411361  1.05528675  0.37184566 -0.17593124 -0.62317695
   0.74565723  1.00421609]]
[[0.68627927]
 [0.52179499]
 [0.4308302 ]
 [0.51058015]
 [0.61151369]]
[[ 1.74173923  0.16418224  1.08221103  0.4217249   0.33808792 -0.10586913
   0.94222505 -0.72333518]
 [ 1.74213468  0.02030659  1.09118579  0.4217249   0.81445465 -0.58338404
   0.71185257 -0.86153929]
 [ 1.74253012  0.00473563  1.0881942   0.42625937  0.9181046  -0.62317695
   0.

Next, we identify the energy source that best predicts the carbon intensity for a particular region.