In [65]:
import scipy.stats as stats
import numpy as np
import sklearn as sk
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

In [66]:
def prepare_data(df, train_year, test_year):
    """Split the emissions frame into train/test arrays by calendar year.

    The single feature is the ``nat_gas`` column and the label is the
    ``carbon_intensity`` column. Rows dated 2020-02-29 are discarded so the
    leap year 2020 covers the same calendar days as a regular year.

    The caller's ``df`` is left untouched: timestamps are parsed into a
    separate Series instead of being written back into the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'UTC time'``, ``'nat_gas'`` and
        ``'carbon_intensity'``.
    train_year : int
        Calendar year whose rows form the training set.
    test_year : int
        Calendar year whose rows form the test set.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, y_train, X_test, y_test)``, each of shape ``(n_rows, 1)``.
    """
    # Parse into a standalone Series; assigning back to df['UTC time'] would
    # silently rewrite the frame owned by the caller.
    timestamps = pd.to_datetime(df['UTC time'])
    years = timestamps.dt.year

    # Filter out 2020-02-29 (leap day)
    is_leap_day = (years == 2020) & (timestamps.dt.month == 2) & (timestamps.dt.day == 29)

    # Filter for training year (e.g., 2020)
    train_data = df[~is_leap_day & (years == train_year)]
    X_train = train_data['nat_gas'].to_numpy().reshape(-1, 1)  # Feature for training
    y_train = train_data['carbon_intensity'].to_numpy().reshape(-1, 1)  # Label for training

    print("Training data shape (X_train, y_train):", X_train.shape, y_train.shape)
    # Indexing [-1] on an empty array raises IndexError, so only show the last
    # sample when the training year actually has rows.
    if len(X_train):
        print("Last training values:", X_train[-1], y_train[-1])

    # Filter for testing year (e.g., 2021)
    test_data = df[~is_leap_day & (years == test_year)]
    X_test = test_data['nat_gas'].to_numpy().reshape(-1, 1)  # Feature for testing
    y_test = test_data['carbon_intensity'].to_numpy().reshape(-1, 1)  # Label for testing

    print("Testing data shape (X_test, y_test):", X_test.shape, y_test.shape)

    return X_train, y_train, X_test, y_test

# Hourly direct-emissions data for the BPAT region
df = pd.read_csv('./data/BPAT/BPAT_direct_emissions.csv')

# Fit on one calendar year, score on the next
train_year, test_year = 2020, 2021

# Year-based split into (n, 1) feature / label arrays
X_train, y_train, X_test, y_test = prepare_data(df, train_year, test_year)

# Single-feature ordinary least squares baseline (fit returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Out-of-sample predictions for the test year
y_pred = model.predict(X_test)

# Error metrics on the held-out year
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = mean_absolute_percentage_error(y_test, y_pred)

# Show the first few predictions as a sanity check, then the metrics
print("Predicted values:", y_pred[:5])
print(f"RMSE: {rmse}")
print(f"MAPE: {mape}")

Training data shape (X_train, y_train): (8760, 1) (8760, 1)
Last training values: [892] [35.02]
Testing data shape (X_test, y_test): (8760, 1) (8760, 1)
Predicted values: [[46.33226543]
 [46.46267876]
 [46.41920765]
 [46.37573654]
 [46.46267876]]
RMSE: 10.038769465040714
MAPE: 0.2209973091532864


Next, find the energy source that best predicts the carbon intensity for a given region.

In [67]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

def prepare_data(df, train_year, test_year, feature_column):
    """Split the emissions frame into single-feature train/test arrays by year.

    Rows dated 2020-02-29 are discarded so the leap year 2020 covers the same
    calendar days as a regular year. The caller's ``df`` is left untouched:
    timestamps are parsed into a separate Series instead of being written
    back into the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'UTC time'``, ``'carbon_intensity'`` and
        ``feature_column``.
    train_year : int
        Calendar year whose rows form the training set.
    test_year : int
        Calendar year whose rows form the test set.
    feature_column : str
        Name of the column used as the only predictor.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, y_train, X_test, y_test)``, each of shape ``(n_rows, 1)``.
    """
    # NOTE(review): this shadows the three-argument prepare_data defined in an
    # earlier cell of this notebook.

    # Parse into a standalone Series; assigning back to df['UTC time'] would
    # silently rewrite the frame owned by the caller.
    timestamps = pd.to_datetime(df['UTC time'])
    years = timestamps.dt.year

    # Filter out February 29 for the leap year (2020) to ensure alignment
    is_leap_day = (years == 2020) & (timestamps.dt.month == 2) & (timestamps.dt.day == 29)

    def _arrays_for(year):
        # Column vectors (feature, label) for every retained row of `year`
        rows = df[~is_leap_day & (years == year)]
        features = rows[feature_column].to_numpy().reshape(-1, 1)
        labels = rows['carbon_intensity'].to_numpy().reshape(-1, 1)
        return features, labels

    X_train, y_train = _arrays_for(train_year)
    X_test, y_test = _arrays_for(test_year)

    return X_train, y_train, X_test, y_test

def evaluate_energy_sources(df, train_year, test_year, energy_sources):
    """Score each energy source as a lone linear predictor of carbon intensity.

    For every column name in ``energy_sources`` a ``LinearRegression`` is
    fitted on the ``train_year`` rows and evaluated on the ``test_year`` rows.
    Per-source metrics are printed, followed by the best source by RMSE and
    by MAPE.

    Parameters
    ----------
    df : pandas.DataFrame
        Emissions data handed to ``prepare_data``.
    train_year : int
        Calendar year used for fitting.
    test_year : int
        Calendar year used for evaluation.
    energy_sources : iterable of str
        Candidate feature columns.

    Returns
    -------
    pandas.DataFrame
        One row per source with columns ``"Energy Source"``, ``"RMSE"`` and
        ``"MAPE"``. When ``energy_sources`` is empty, an empty frame with
        those columns is returned and no "best" summary is printed.
    """
    results = []

    for feature_column in energy_sources:
        # Prepare data for the current energy source
        X_train, y_train, X_test, y_test = prepare_data(df, train_year, test_year, feature_column)

        # Train the model
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Evaluate the model
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mape = mean_absolute_percentage_error(y_test, y_pred)

        # Store the results
        results.append({
            "Energy Source": feature_column,
            "RMSE": rmse,
            "MAPE": mape
        })

        # Print the results for each energy source
        print(f"Energy Source: {feature_column}")
        print(f"  RMSE: {rmse}")
        print(f"  MAPE: {mape}\n")

    # Pin the column names so that an empty result list still yields a frame
    # with the expected schema instead of a column-less one.
    results_df = pd.DataFrame(results, columns=["Energy Source", "RMSE", "MAPE"])

    # Nothing to rank: idxmin on an empty column would raise.
    if results_df.empty:
        return results_df

    # Identify the best predictor based on RMSE
    best_rmse = results_df.loc[results_df['RMSE'].idxmin()]
    print("Best Energy Source based on RMSE:")
    print(best_rmse)

    # Identify the best predictor based on MAPE
    best_mape = results_df.loc[results_df['MAPE'].idxmin()]
    print("\nBest Energy Source based on MAPE:")
    print(best_mape)

    return results_df

# Hourly direct-emissions data for the BPAT region
df = pd.read_csv('./data/BPAT/BPAT_direct_emissions.csv')

# Fit on one calendar year, score on the next
train_year, test_year = 2020, 2021

# Candidate single-feature predictors
energy_sources = [
    'nat_gas',
    'nuclear',
    'hydro',
    'solar',
    'wind',
    'other',
]

# Rank every candidate by out-of-sample error
results_df = evaluate_energy_sources(
    df, train_year, test_year, energy_sources
)

Energy Source: nat_gas
  RMSE: 10.038769465040714
  MAPE: 0.2209973091532864

Energy Source: nuclear
  RMSE: 19.10432176624607
  MAPE: 0.4388524534475594

Energy Source: hydro
  RMSE: 12.924598880458367
  MAPE: 0.3639017143450299

Energy Source: solar
  RMSE: 17.699558927316264
  MAPE: 0.3747868824344542

Energy Source: wind
  RMSE: 17.28711321387423
  MAPE: 0.3649581258873038

Energy Source: other
  RMSE: 17.72385883629802
  MAPE: 0.3930488792712189

Best Energy Source based on RMSE:
Energy Source      nat_gas
RMSE             10.038769
MAPE              0.220997
Name: 0, dtype: object

Best Energy Source based on MAPE:
Energy Source      nat_gas
RMSE             10.038769
MAPE              0.220997
Name: 0, dtype: object


In [68]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

def prepare_data(df, train_year, test_year, feature_column):
    """Split the emissions frame into single-feature train/test arrays by year.

    Rows dated 2020-02-29 are discarded so the leap year 2020 covers the same
    calendar days as a regular year. The caller's ``df`` is left untouched:
    timestamps are parsed into a separate Series instead of being written
    back into the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'UTC time'``, ``'carbon_intensity'`` and
        ``feature_column``.
    train_year : int
        Calendar year whose rows form the training set.
    test_year : int
        Calendar year whose rows form the test set.
    feature_column : str
        Name of the column used as the only predictor.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, y_train, X_test, y_test)``, each of shape ``(n_rows, 1)``.
    """
    # NOTE(review): the previous cell already defines this function; prefer
    # defining it once and reusing it across regions.

    # Parse into a standalone Series; assigning back to df['UTC time'] would
    # silently rewrite the frame owned by the caller.
    timestamps = pd.to_datetime(df['UTC time'])
    years = timestamps.dt.year

    # Filter out February 29 for the leap year (2020) to ensure alignment
    is_leap_day = (years == 2020) & (timestamps.dt.month == 2) & (timestamps.dt.day == 29)

    def _arrays_for(year):
        # Column vectors (feature, label) for every retained row of `year`
        rows = df[~is_leap_day & (years == year)]
        features = rows[feature_column].to_numpy().reshape(-1, 1)
        labels = rows['carbon_intensity'].to_numpy().reshape(-1, 1)
        return features, labels

    X_train, y_train = _arrays_for(train_year)
    X_test, y_test = _arrays_for(test_year)

    return X_train, y_train, X_test, y_test

def evaluate_energy_sources(df, train_year, test_year, energy_sources):
    """Score each energy source as a lone linear predictor of carbon intensity.

    For every column name in ``energy_sources`` a ``LinearRegression`` is
    fitted on the ``train_year`` rows and evaluated on the ``test_year`` rows.
    Per-source metrics are printed, followed by the best source by RMSE and
    by MAPE.

    Parameters
    ----------
    df : pandas.DataFrame
        Emissions data handed to ``prepare_data``.
    train_year : int
        Calendar year used for fitting.
    test_year : int
        Calendar year used for evaluation.
    energy_sources : iterable of str
        Candidate feature columns.

    Returns
    -------
    pandas.DataFrame
        One row per source with columns ``"Energy Source"``, ``"RMSE"`` and
        ``"MAPE"``. When ``energy_sources`` is empty, an empty frame with
        those columns is returned and no "best" summary is printed.
    """
    # NOTE(review): the previous cell already defines this function; prefer
    # defining it once and reusing it across regions.
    results = []

    for feature_column in energy_sources:
        # Prepare data for the current energy source
        X_train, y_train, X_test, y_test = prepare_data(df, train_year, test_year, feature_column)

        # Train the model
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Evaluate the model
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mape = mean_absolute_percentage_error(y_test, y_pred)

        # Store the results
        results.append({
            "Energy Source": feature_column,
            "RMSE": rmse,
            "MAPE": mape
        })

        # Print the results for each energy source
        print(f"Energy Source: {feature_column}")
        print(f"  RMSE: {rmse}")
        print(f"  MAPE: {mape}\n")

    # Pin the column names so that an empty result list still yields a frame
    # with the expected schema instead of a column-less one.
    results_df = pd.DataFrame(results, columns=["Energy Source", "RMSE", "MAPE"])

    # Nothing to rank: idxmin on an empty column would raise.
    if results_df.empty:
        return results_df

    # Identify the best predictor based on RMSE
    best_rmse = results_df.loc[results_df['RMSE'].idxmin()]
    print("Best Energy Source based on RMSE:")
    print(best_rmse)

    # Identify the best predictor based on MAPE
    best_mape = results_df.loc[results_df['MAPE'].idxmin()]
    print("\nBest Energy Source based on MAPE:")
    print(best_mape)

    return results_df

# Hourly direct-emissions data for the ERCO region
df = pd.read_csv('./data/ERCO/ERCO_direct_emissions.csv')

# Fit on one calendar year, score on the next
train_year, test_year = 2020, 2021

# Candidate single-feature predictors
energy_sources = [
    'nat_gas',
    'nuclear',
    'hydro',
    'solar',
    'wind',
    'other',
]

# Rank every candidate by out-of-sample error
results_df = evaluate_energy_sources(
    df, train_year, test_year, energy_sources
)

Energy Source: nat_gas
  RMSE: 43.11696585422935
  MAPE: 0.13021757670814138

Energy Source: nuclear
  RMSE: 65.70470366158392
  MAPE: 0.21621781973145446

Energy Source: hydro
  RMSE: 82.58709287548777
  MAPE: 0.2160555169471528

Energy Source: solar
  RMSE: 74.39346683701321
  MAPE: 0.24146404070994104

Energy Source: wind
  RMSE: 33.57935463447589
  MAPE: 0.10188746741891733

Energy Source: other
  RMSE: 65.62217840418519
  MAPE: 0.21590175935396377

Best Energy Source based on RMSE:
Energy Source         wind
RMSE             33.579355
MAPE              0.101887
Name: 4, dtype: object

Best Energy Source based on MAPE:
Energy Source         wind
RMSE             33.579355
MAPE              0.101887
Name: 4, dtype: object


In [69]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

def prepare_data(df, train_year, test_year, feature_column):
    """Split the emissions frame into single-feature train/test arrays by year.

    Rows dated 2020-02-29 are discarded so the leap year 2020 covers the same
    calendar days as a regular year. The caller's ``df`` is left untouched:
    timestamps are parsed into a separate Series instead of being written
    back into the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'UTC time'``, ``'carbon_intensity'`` and
        ``feature_column``.
    train_year : int
        Calendar year whose rows form the training set.
    test_year : int
        Calendar year whose rows form the test set.
    feature_column : str
        Name of the column used as the only predictor.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, y_train, X_test, y_test)``, each of shape ``(n_rows, 1)``.
    """
    # NOTE(review): the previous cell already defines this function; prefer
    # defining it once and reusing it across regions.

    # Parse into a standalone Series; assigning back to df['UTC time'] would
    # silently rewrite the frame owned by the caller.
    timestamps = pd.to_datetime(df['UTC time'])
    years = timestamps.dt.year

    # Filter out February 29 for the leap year (2020) to ensure alignment
    is_leap_day = (years == 2020) & (timestamps.dt.month == 2) & (timestamps.dt.day == 29)

    def _arrays_for(year):
        # Column vectors (feature, label) for every retained row of `year`
        rows = df[~is_leap_day & (years == year)]
        features = rows[feature_column].to_numpy().reshape(-1, 1)
        labels = rows['carbon_intensity'].to_numpy().reshape(-1, 1)
        return features, labels

    X_train, y_train = _arrays_for(train_year)
    X_test, y_test = _arrays_for(test_year)

    return X_train, y_train, X_test, y_test

def evaluate_energy_sources(df, train_year, test_year, energy_sources):
    """Score each energy source as a lone linear predictor of carbon intensity.

    For every column name in ``energy_sources`` a ``LinearRegression`` is
    fitted on the ``train_year`` rows and evaluated on the ``test_year`` rows.
    Per-source metrics are printed, followed by the best source by RMSE and
    by MAPE.

    Parameters
    ----------
    df : pandas.DataFrame
        Emissions data handed to ``prepare_data``.
    train_year : int
        Calendar year used for fitting.
    test_year : int
        Calendar year used for evaluation.
    energy_sources : iterable of str
        Candidate feature columns.

    Returns
    -------
    pandas.DataFrame
        One row per source with columns ``"Energy Source"``, ``"RMSE"`` and
        ``"MAPE"``. When ``energy_sources`` is empty, an empty frame with
        those columns is returned and no "best" summary is printed.
    """
    # NOTE(review): the previous cell already defines this function; prefer
    # defining it once and reusing it across regions.
    results = []

    for feature_column in energy_sources:
        # Prepare data for the current energy source
        X_train, y_train, X_test, y_test = prepare_data(df, train_year, test_year, feature_column)

        # Train the model
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Evaluate the model
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mape = mean_absolute_percentage_error(y_test, y_pred)

        # Store the results
        results.append({
            "Energy Source": feature_column,
            "RMSE": rmse,
            "MAPE": mape
        })

        # Print the results for each energy source
        print(f"Energy Source: {feature_column}")
        print(f"  RMSE: {rmse}")
        print(f"  MAPE: {mape}\n")

    # Pin the column names so that an empty result list still yields a frame
    # with the expected schema instead of a column-less one.
    results_df = pd.DataFrame(results, columns=["Energy Source", "RMSE", "MAPE"])

    # Nothing to rank: idxmin on an empty column would raise.
    if results_df.empty:
        return results_df

    # Identify the best predictor based on RMSE
    best_rmse = results_df.loc[results_df['RMSE'].idxmin()]
    print("Best Energy Source based on RMSE:")
    print(best_rmse)

    # Identify the best predictor based on MAPE
    best_mape = results_df.loc[results_df['MAPE'].idxmin()]
    print("\nBest Energy Source based on MAPE:")
    print(best_mape)

    return results_df

# Hourly direct-emissions data for the CISO region
df = pd.read_csv('./data/CISO/CISO_direct_emissions.csv')

# Fit on one calendar year, score on the next
train_year, test_year = 2020, 2021

# Candidate single-feature predictors
energy_sources = [
    'nat_gas',
    'nuclear',
    'hydro',
    'solar',
    'wind',
    'other',
]

# Rank every candidate by out-of-sample error
results_df = evaluate_energy_sources(
    df, train_year, test_year, energy_sources
)


Energy Source: nat_gas
  RMSE: 51.40177102680604
  MAPE: 0.2901440988572895

Energy Source: nuclear
  RMSE: 65.660818450447
  MAPE: 0.39824348108206126

Energy Source: hydro
  RMSE: 65.23097382434575
  MAPE: 0.4089411162289022

Energy Source: solar
  RMSE: 37.57897293141436
  MAPE: 0.1867512894737963

Energy Source: wind
  RMSE: 63.36786099810229
  MAPE: 0.387323503026966

Energy Source: other
  RMSE: 144.72404726762346
  MAPE: 0.5728172114430853

Best Energy Source based on RMSE:
Energy Source        solar
RMSE             37.578973
MAPE              0.186751
Name: 3, dtype: object

Best Energy Source based on MAPE:
Energy Source        solar
RMSE             37.578973
MAPE              0.186751
Name: 3, dtype: object


In [70]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

def prepare_data(df, train_year, test_year, feature_column):
    """Split the emissions frame into single-feature train/test arrays by year.

    Rows dated 2020-02-29 are discarded so the leap year 2020 covers the same
    calendar days as a regular year. The caller's ``df`` is left untouched:
    timestamps are parsed into a separate Series instead of being written
    back into the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'UTC time'``, ``'carbon_intensity'`` and
        ``feature_column``.
    train_year : int
        Calendar year whose rows form the training set.
    test_year : int
        Calendar year whose rows form the test set.
    feature_column : str
        Name of the column used as the only predictor.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, y_train, X_test, y_test)``, each of shape ``(n_rows, 1)``.
    """
    # NOTE(review): the previous cell already defines this function; prefer
    # defining it once and reusing it across regions.

    # Parse into a standalone Series; assigning back to df['UTC time'] would
    # silently rewrite the frame owned by the caller.
    timestamps = pd.to_datetime(df['UTC time'])
    years = timestamps.dt.year

    # Filter out February 29 for the leap year (2020) to ensure alignment
    is_leap_day = (years == 2020) & (timestamps.dt.month == 2) & (timestamps.dt.day == 29)

    def _arrays_for(year):
        # Column vectors (feature, label) for every retained row of `year`
        rows = df[~is_leap_day & (years == year)]
        features = rows[feature_column].to_numpy().reshape(-1, 1)
        labels = rows['carbon_intensity'].to_numpy().reshape(-1, 1)
        return features, labels

    X_train, y_train = _arrays_for(train_year)
    X_test, y_test = _arrays_for(test_year)

    return X_train, y_train, X_test, y_test

def evaluate_energy_sources(df, train_year, test_year, energy_sources):
    """Score each energy source as a lone linear predictor of carbon intensity.

    For every column name in ``energy_sources`` a ``LinearRegression`` is
    fitted on the ``train_year`` rows and evaluated on the ``test_year`` rows.
    Per-source metrics are printed, followed by the best source by RMSE and
    by MAPE.

    Parameters
    ----------
    df : pandas.DataFrame
        Emissions data handed to ``prepare_data``.
    train_year : int
        Calendar year used for fitting.
    test_year : int
        Calendar year used for evaluation.
    energy_sources : iterable of str
        Candidate feature columns.

    Returns
    -------
    pandas.DataFrame
        One row per source with columns ``"Energy Source"``, ``"RMSE"`` and
        ``"MAPE"``. When ``energy_sources`` is empty, an empty frame with
        those columns is returned and no "best" summary is printed.
    """
    # NOTE(review): the previous cell already defines this function; prefer
    # defining it once and reusing it across regions.
    results = []

    for feature_column in energy_sources:
        # Prepare data for the current energy source
        X_train, y_train, X_test, y_test = prepare_data(df, train_year, test_year, feature_column)

        # Train the model
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Predict on the test set
        y_pred = model.predict(X_test)

        # Evaluate the model
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mape = mean_absolute_percentage_error(y_test, y_pred)

        # Store the results
        results.append({
            "Energy Source": feature_column,
            "RMSE": rmse,
            "MAPE": mape
        })

        # Print the results for each energy source
        print(f"Energy Source: {feature_column}")
        print(f"  RMSE: {rmse}")
        print(f"  MAPE: {mape}\n")

    # Pin the column names so that an empty result list still yields a frame
    # with the expected schema instead of a column-less one.
    results_df = pd.DataFrame(results, columns=["Energy Source", "RMSE", "MAPE"])

    # Nothing to rank: idxmin on an empty column would raise.
    if results_df.empty:
        return results_df

    # Identify the best predictor based on RMSE
    best_rmse = results_df.loc[results_df['RMSE'].idxmin()]
    print("Best Energy Source based on RMSE:")
    print(best_rmse)

    # Identify the best predictor based on MAPE
    best_mape = results_df.loc[results_df['MAPE'].idxmin()]
    print("\nBest Energy Source based on MAPE:")
    print(best_mape)

    return results_df

# Hourly direct-emissions data for the ISNE region
df = pd.read_csv('./data/ISNE/ISNE_direct_emissions.csv')

# Fit on one calendar year, score on the next
train_year, test_year = 2020, 2021

# Candidate single-feature predictors
energy_sources = [
    'nat_gas',
    'nuclear',
    'hydro',
    'solar',
    'wind',
    'other',
]

# Rank every candidate by out-of-sample error
results_df = evaluate_energy_sources(
    df, train_year, test_year, energy_sources
)


Energy Source: nat_gas
  RMSE: 22.90030145115215
  MAPE: 0.07311825625341264

Energy Source: nuclear
  RMSE: 34.03537962269723
  MAPE: 0.1231958734885468

Energy Source: hydro
  RMSE: 35.863594937354705
  MAPE: 0.12854641503957318

Energy Source: solar
  RMSE: 36.521454728646326
  MAPE: 0.1313896674474732

Energy Source: wind
  RMSE: 33.27434722637004
  MAPE: 0.11691482796788312

Energy Source: other
  RMSE: 36.215191688867556
  MAPE: 0.12931268294235554

Best Energy Source based on RMSE:
Energy Source      nat_gas
RMSE             22.900301
MAPE              0.073118
Name: 0, dtype: object

Best Energy Source based on MAPE:
Energy Source      nat_gas
RMSE             22.900301
MAPE              0.073118
Name: 0, dtype: object
