In [54]:
import itertools
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import BayesianRidge, Ridge, LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR

In [55]:
# `warnings` is imported in the notebook's import cell so that all dependencies
# are declared in one place.
# NOTE(review): this filter hides every warning for the rest of the session,
# including pandas deprecation and chained-assignment warnings - consider
# narrowing it to the specific category that is noisy.
warnings.filterwarnings("ignore")

# Load the hourly emissions CSV; the path is relative to the working directory.
df = pd.read_csv('emissions-hourly-2023-wa.csv')

In [56]:
# Preview the first rows of the freshly loaded frame to check columns and missing values
df.head()

Unnamed: 0,State,Facility Name,Facility ID,Unit ID,Associated Stacks,Date,Hour,Operating Time,Gross Load (MW),Steam Load (1000 lb/hr),...,Heat Input (mmBtu),Heat Input Measure Indicator,Primary Fuel Type,Secondary Fuel Type,Unit Type,SO2 Controls,NOx Controls,PM Controls,Hg Controls,Program Code
0,WA,Fredonia Generating Station,607,CT3,,2023-01-01,0,0.0,,,...,,,Natural Gas,Diesel Oil,Combustion turbine,,"Water Injection,Selective Catalytic Reduction",,,ARP
1,WA,Fredonia Generating Station,607,CT3,,2023-01-01,1,0.0,,,...,,,Natural Gas,Diesel Oil,Combustion turbine,,"Water Injection,Selective Catalytic Reduction",,,ARP
2,WA,Fredonia Generating Station,607,CT3,,2023-01-01,2,0.0,,,...,,,Natural Gas,Diesel Oil,Combustion turbine,,"Water Injection,Selective Catalytic Reduction",,,ARP
3,WA,Fredonia Generating Station,607,CT3,,2023-01-01,3,0.0,,,...,,,Natural Gas,Diesel Oil,Combustion turbine,,"Water Injection,Selective Catalytic Reduction",,,ARP
4,WA,Fredonia Generating Station,607,CT3,,2023-01-01,4,0.0,,,...,,,Natural Gas,Diesel Oil,Combustion turbine,,"Water Injection,Selective Catalytic Reduction",,,ARP


In [57]:
# Drop the features that are entirely null or more than 75% null.
# The columns are addressed by name rather than by position: positional indices
# point at different columns as soon as the frame has changed, so re-running a
# positional, in-place drop silently removes the wrong features.
# errors='ignore' makes the cell safe to re-run after the columns are gone.
drop_columns = [
    'Associated Stacks',
    'Steam Load (1000 lb/hr)',
    'Secondary Fuel Type',
    'SO2 Controls',
    'PM Controls',
    'Hg Controls',
]
df = df.drop(columns=drop_columns, errors='ignore')

In [58]:
# Keep only fully populated rows. Gross Load (MW), the SO2 / CO2 / NOx mass and
# rate columns with their measure indicators, and the Heat Input columns are all
# needed for the analysis, so any row missing one of them is discarded.
# .copy() makes df_cleaned an explicitly independent frame, so the column
# assignments performed on it in later cells are unambiguous and do not raise
# pandas' SettingWithCopyWarning.
df_cleaned = df.dropna().copy()

In [59]:
# Normalise the 'Unit ID' labels so that spelling variants of the same unit
# collapse into one category.
#
# regex=True is required for the first replacement: from pandas 2.0 onwards
# `Series.str.replace` treats the pattern as a literal string by default, so the
# character-class pattern would never match and the step would do nothing.
unit_id = (
    df_cleaned['Unit ID']
    .str.replace(r'[^A-Za-z0-9]+', '', regex=True)  # strip non-alphanumeric characters
    .str.upper()                                     # convert to uppercase
)

# Canonical hyphenated spellings, applied as literal substring replacements in
# this order.
unit_id_spellings = {
    'CT1': 'CT-1',
    'CT2': 'CT-2',
    'CT3': 'CT-3',
    'CT4': 'CT-4',
    'CTG1': 'CTG-1',
    'BW22': 'BW-22',
}
for compact, hyphenated in unit_id_spellings.items():
    unit_id = unit_id.str.replace(compact, hyphenated, regex=False)

# assign() returns a new frame, so nothing is written into a possible slice of `df`.
df_cleaned = df_cleaned.assign(**{'Unit ID': unit_id})

In [60]:
# Row count per Unit ID label; dropna=False also reports missing labels, if any
df_cleaned['Unit ID'].value_counts(dropna=False)  # after formatting

Unit ID
CT-1     26244
1        14136
CT-2     13085
CT-3      9271
F1CT      7378
BW-22     7155
CTG-1     7046
2         6836
CT-1A     6506
CT-1B     6129
CT-4      2905
Name: count, dtype: int64

In [61]:
# SO2 rates above 1.0 lbs/mmBtu are treated as outliers. They are overwritten
# with the median of the remaining (non-outlier) rows.
outliers = df_cleaned['SO2 Rate (lbs/mmBtu)'].gt(1.0)
replacement_value = df_cleaned['SO2 Rate (lbs/mmBtu)'][~outliers].median()
df_cleaned.loc[outliers, 'SO2 Rate (lbs/mmBtu)'] = replacement_value

In [62]:
# Count the missing values per column; every count being zero confirms the data is clean
df_cleaned.isna().sum()

State                           0
Facility Name                   0
Facility ID                     0
Unit ID                         0
Date                            0
Hour                            0
Operating Time                  0
Gross Load (MW)                 0
SO2 Mass (lbs)                  0
SO2 Mass Measure Indicator      0
SO2 Rate (lbs/mmBtu)            0
SO2 Rate Measure Indicator      0
CO2 Mass (short tons)           0
CO2 Mass Measure Indicator      0
CO2 Rate (short tons/mmBtu)     0
CO2 Rate Measure Indicator      0
NOx Mass (lbs)                  0
NOx Mass Measure Indicator      0
NOx Rate (lbs/mmBtu)            0
NOx Rate Measure Indicator      0
Heat Input (mmBtu)              0
Heat Input Measure Indicator    0
Primary Fuel Type               0
Unit Type                       0
NOx Controls                    0
Program Code                    0
dtype: int64

In [64]:
# Preview the first rows of the cleaned frame
df_cleaned.head()

Unnamed: 0,State,Facility Name,Facility ID,Unit ID,Date,Hour,Operating Time,Gross Load (MW),SO2 Mass (lbs),SO2 Mass Measure Indicator,...,NOx Mass (lbs),NOx Mass Measure Indicator,NOx Rate (lbs/mmBtu),NOx Rate Measure Indicator,Heat Input (mmBtu),Heat Input Measure Indicator,Primary Fuel Type,Unit Type,NOx Controls,Program Code
1373,WA,Fredonia Generating Station,607,CT-3,2023-02-27,5,0.1,16.0,0.039,Measured,...,2.668,Calculated,0.122,Measured,21.87,Measured,Natural Gas,Combustion turbine,"Water Injection,Selective Catalytic Reduction",ARP
1374,WA,Fredonia Generating Station,607,CT-3,2023-02-27,6,1.0,57.0,1.008,Measured,...,11.184,Calculated,0.02,Measured,559.2,Measured,Natural Gas,Combustion turbine,"Water Injection,Selective Catalytic Reduction",ARP
1375,WA,Fredonia Generating Station,607,CT-3,2023-02-27,7,1.0,58.0,1.012,Measured,...,8.421,Calculated,0.015,Measured,561.4,Measured,Natural Gas,Combustion turbine,"Water Injection,Selective Catalytic Reduction",ARP
1376,WA,Fredonia Generating Station,607,CT-3,2023-02-27,8,1.0,54.0,0.807,Measured,...,11.082,Calculated,0.021,Measured,527.7,Measured,Natural Gas,Combustion turbine,"Water Injection,Selective Catalytic Reduction",ARP
1377,WA,Fredonia Generating Station,607,CT-3,2023-02-27,9,1.0,52.0,0.6,Measured,...,7.228,Calculated,0.014,Measured,516.3,Measured,Natural Gas,Combustion turbine,"Water Injection,Selective Catalytic Reduction",ARP


In [9]:
# Build the pairwise correlation matrix of the numeric columns. It is used below
# to identify features that are barely related to the target so they can be dropped.
numeric_columns = list(df_cleaned.select_dtypes(include=[np.number]).columns)
correlation_matrix = df_cleaned.loc[:, numeric_columns].corr()

In [10]:
# Correlation of every numeric feature with the target column; the target's own
# (self-correlation) entry is removed.
target = 'NOx Rate (lbs/mmBtu)'
correlation_with_target = correlation_matrix.loc[:, target].drop(index=target)

In [11]:
# Features whose absolute correlation with the target is below the threshold
threshold = 0.1
low_correlation_features = (
    correlation_with_target
    .loc[correlation_with_target.abs().lt(threshold)]
    .index
    .tolist()
)

In [12]:
# Report which features fell below the correlation threshold
print("Features with low correlation to the target variable:" + str(low_correlation_features))

Features with low correlation to the target variable:['Hour']


In [13]:
# Remove the weakly correlated features from the cleaned frame
df_cleaned = df_cleaned.drop(low_correlation_features, axis='columns')

In [14]:
# NOTE(review): despite the label, this reports the shape of df_cleaned, i.e. the
# frame after the row and feature drops above - not the raw dataset.
print(f"Original dataset shape: {df_cleaned.shape}")

Original dataset shape: (106691, 25)


In [53]:
# NOTE(review): this previews `df`, not `df_cleaned`, and the cell's execution
# count is out of sequence with its neighbours - likely a leftover inspection cell.
df.head()

Unnamed: 0,State,Facility Name,Facility ID,Unit ID,Date,Hour,Operating Time,Gross Load (MW),SO2 Mass (lbs),SO2 Mass Measure Indicator,...,NOx Mass (lbs),NOx Mass Measure Indicator,NOx Rate (lbs/mmBtu),NOx Rate Measure Indicator,Heat Input (mmBtu),Heat Input Measure Indicator,Primary Fuel Type,Unit Type,NOx Controls,Program Code
0,WA,Fredonia Generating Station,607,CT3,2023-01-01,0,0.0,,,,...,,,,,,,Natural Gas,Combustion turbine,"Water Injection,Selective Catalytic Reduction",ARP
1,WA,Fredonia Generating Station,607,CT3,2023-01-01,1,0.0,,,,...,,,,,,,Natural Gas,Combustion turbine,"Water Injection,Selective Catalytic Reduction",ARP
2,WA,Fredonia Generating Station,607,CT3,2023-01-01,2,0.0,,,,...,,,,,,,Natural Gas,Combustion turbine,"Water Injection,Selective Catalytic Reduction",ARP
3,WA,Fredonia Generating Station,607,CT3,2023-01-01,3,0.0,,,,...,,,,,,,Natural Gas,Combustion turbine,"Water Injection,Selective Catalytic Reduction",ARP
4,WA,Fredonia Generating Station,607,CT3,2023-01-01,4,0.0,,,,...,,,,,,,Natural Gas,Combustion turbine,"Water Injection,Selective Catalytic Reduction",ARP


In [15]:
# One-hot encode the unit identifier (one indicator column per distinct Unit ID)
df_cleaned = pd.get_dummies(data=df_cleaned, columns=['Unit ID'])

In [16]:
# Label blank indicator values as 'Substitute', then one-hot encode the column.
# The replaced column is assigned back instead of calling
# `replace(..., inplace=True)` on the selected column: that chained in-place call
# operates on an intermediate object, so pandas does not guarantee that
# df_cleaned is updated (FutureWarning in pandas 2.2, no effect under Copy-on-Write).
df_cleaned = df_cleaned.assign(**{
    'Heat Input Measure Indicator':
        df_cleaned['Heat Input Measure Indicator'].replace('', 'Substitute'),
})
df_cleaned = pd.get_dummies(df_cleaned, columns=['Heat Input Measure Indicator'])

In [17]:
# Columns holding a single distinct value carry no information; find and drop them
distinct_counts = df_cleaned.nunique()
single_value_columns = distinct_counts[distinct_counts.eq(1)].index.tolist()
df_cleaned.drop(columns=single_value_columns, inplace=True)

In [18]:
# One-hot encode the remaining categorical columns in a single call rather than
# one copy-pasted cell per column. get_dummies appends the indicator columns in
# the order the source columns are listed, so the resulting column layout is the
# same as encoding them one after another.
categorical_columns = [
    'Facility Name',
    'SO2 Mass Measure Indicator',
    'CO2 Mass Measure Indicator',
    'Primary Fuel Type',
    'Unit Type',
    'NOx Controls',
    'Program Code',
]
df_cleaned = pd.get_dummies(df_cleaned, columns=categorical_columns)

In [25]:
# Cast every boolean column to integers (True -> 1, False -> 0)
bool_columns = df_cleaned.select_dtypes(include='bool').columns
df_cleaned = df_cleaned.astype({name: int for name in bool_columns})

In [26]:
# Rescale every numeric column with MinMaxScaler (imported in the notebook's
# import cell) and rebuild df_cleaned from the scaled values. Non-numeric columns
# are not carried over, and the new frame gets a fresh default index.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# below, and the target column is scaled too - so the reported errors are in
# scaled units and test rows influence the scaling.
numerical_columns = df_cleaned.select_dtypes(include='number')
scaler = MinMaxScaler()
scaled_data = scaler.fit(numerical_columns).transform(numerical_columns)
df_cleaned = pd.DataFrame(data=scaled_data, columns=numerical_columns.columns)

In [29]:
# Predictor columns used by the models, and the NOx rate column they predict
feature_columns = [
    'Gross Load (MW)',
    'SO2 Mass (lbs)',
    'Heat Input (mmBtu)',
    'NOx Mass (lbs)',
    'CO2 Rate (short tons/mmBtu)',
    'CO2 Mass (short tons)',
    'SO2 Rate (lbs/mmBtu)',
]
features = df_cleaned[feature_columns]
target = df_cleaned['NOx Rate (lbs/mmBtu)']
print(features.shape[1])  # number of predictor columns

7


In [30]:
# Stage 1: hold out 25% of the rows; the remaining 75% is the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=42,
)

# Stage 2: cut the held-out rows in half - one half for testing, one for validation.
X_test, X_validation, y_test, y_validation = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [31]:
# Report the shape of each split as a sanity check
for split_name, split_data in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
    ("X_validation", X_validation),
    ("y_validation", y_validation),
):
    print(f"Shape of {split_name}:", split_data.shape)

Shape of X_train: (80018, 7)
Shape of y_train: (80018,)
Shape of X_test: (13336, 7)
Shape of y_test: (13336,)
Shape of X_validation: (13337, 7)
Shape of y_validation: (13337,)


In [42]:
# 2) Linear Regression model: create the estimator and fit it on the training split
model_linear = LinearRegression()
model_linear.fit(X=X_train, y=y_train)

In [43]:
# Predict with the fitted linear model on the test split and on the validation split
model_linear_predict_test, model_linear_predict_validation = (
    model_linear.predict(split) for split in (X_test, X_validation)
)

In [44]:
# Test split: mean squared error between the true and the predicted target values
model_linear_mse_test = mean_squared_error(
    y_true=y_test,
    y_pred=model_linear_predict_test,
)
print(f"MSE for test data : {model_linear_mse_test}")

MSE for test data : 0.0002263784256541415


In [45]:
# Test split: coefficient of determination (R2) of the linear model
model_linear_r2_score_test = r2_score(
    y_true=y_test,
    y_pred=model_linear_predict_test,
)
print(f"R2 score for test data : {model_linear_r2_score_test}")

R2 score for test data : 0.7893677915414341


In [46]:
# Validation split: mean squared error between the true and the predicted target values
model_linear_mse_validation = mean_squared_error(
    y_true=y_validation,
    y_pred=model_linear_predict_validation,
)
print(f"MSE for validation data : {model_linear_mse_validation}")

MSE for validation data : 0.0002099576712308199


In [47]:
# Validation split: coefficient of determination (R2) of the linear model
model_linear_r2_score_validation = r2_score(
    y_true=y_validation,
    y_pred=model_linear_predict_validation,
)
print(f"R2 score for validation data : {model_linear_r2_score_validation}")

R2 score for validation data : 0.8011947939144273


In [50]:
# Save the trained model to a pickle file (written to the working directory).
# `pickle` is imported in the notebook's import cell so that all dependencies are
# declared in one place.
# Pickle files can execute arbitrary code when loaded - only load this file back
# from a trusted location.
with open('linear_regression_model.pkl', 'wb') as model_file:
    pickle.dump(model_linear, model_file)