In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file found under the Kaggle input directory
# (walking it top-down), then print them one per line.
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

> Implement polynomial regression using the Ridge Regression method available in scikit-learn, see `sklearn.linear_model.Ridge()`, and look at the behavior of the solution when changing the parameter alpha (𝛼).

This notebook focuses on analyzing multiple ridge regression models over the "Coal Electric Power Sector CO2 Emissions" section of the "Carbon dioxide emissions from electricity generation" table available at https://www.eia.gov/electricity/data.php#elecenv. It will focus mostly on the technical aspects of creating a ridge regression model, exploring the effects of different polynomials and alphas and plotting the results, while evaluating their accuracy in predicting monthly trends on a smaller test sample.

In [None]:
#Import necessary libraries
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error 


First, we will load the dataset and separate the data we wish to manipulate: monthly emission estimates from coal energy production.

In [None]:
# Full emissions table; `ds` is reused further down to look at the other series.
ds = pd.read_csv('/kaggle/input/co2emissions/MER_T12_06.csv')

# Column_Order == 1 selects the Coal Electric Power Sector CO2 Emissions rows.
coal_rows = ds.loc[ds['Column_Order'] == 1]

# Yearly values sit at every 13th position (starting at position 12) of the
# selection; collect their index labels and keep everything else, ordered by date code.
yearly_index = coal_rows.iloc[12::13].index
is_monthly = ~coal_rows.index.isin(yearly_index)
ds_1 = coal_rows.loc[is_monthly].sort_values(by='YYYYMM')


Let's plot the starting data:

In [None]:
# Dates: parse the YYYYMM codes into timestamps. Emissions: cast to float.
x = pd.to_datetime(ds_1['YYYYMM'], format='%Y%m')
y = ds_1['Value'].astype(np.float64)

# Scatter of the raw monthly series on a wide canvas.
fig, ax = plt.subplots(figsize=(25, 10))
ax.set_xlabel('Date')
ax.set_ylabel('Million Metric Tons of Carbon Dioxide')

ax.scatter(x, y)

Now we will normalize the dates and emissions while retaining the ability to invert the transform for plotting.
We will also separate the data into a training and a test section, to evaluate the regressions' accuracy.

In [None]:
# Fixed seed: without it the split, and therefore every score and curve in the
# cells below, changes on each run.
RANDOM_STATE = 42

# Split BEFORE fitting the scalers so that the held-out rows never influence
# the preprocessing statistics.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=RANDOM_STATE
)

# Separate scalers for dates and emissions; both are kept around so that
# predictions can be mapped back to the original units for plotting.
x_scaler = StandardScaler()
y_scaler = StandardScaler()

# Fit on the training rows only, then reuse the training mean/std everywhere else.
# StandardScaler expects 2-D input, hence the reshape to a single column.
X_train = x_scaler.fit_transform(np.array(x_train).reshape(-1, 1))
Y_train = y_scaler.fit_transform(np.array(y_train).reshape(-1, 1))
X_test = x_scaler.transform(np.array(x_test).reshape(-1, 1))
Y_test = y_scaler.transform(np.array(y_test).reshape(-1, 1))

# The complete series in scaled space, in the same row order as x / y; used
# below to draw each fitted curve over the whole date range.
X = x_scaler.transform(x.values.reshape(-1, 1))
Y = y_scaler.transform(y.values.reshape(-1, 1))

print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

Next, we will plot the effect of varying the polynomial degree (and therefore the number of features) used in our polynomial regression. Note that alpha is set to 0, so we are only evaluating the basic, unregularized regression.

The resulting difference may be more or less obvious depending on the training sample selected, but we can see a stagnation or even a decrease in the model score after the polynomial degree rises over a certain amount, due to overfitting.

In [None]:

plt.figure(figsize=(25, 10))
plt.xlabel('Date')
plt.ylabel('Million Metric Tons of Carbon Dioxide')

# Raw observations, drawn semi-transparent so the fitted curves stay readable.
plt.scatter(x, y, alpha=0.3)

# Fit one polynomial model per degree with no regularization (alpha = 0) and
# print, per degree: the degree, the test MSE and the test R^2. Both metrics
# are computed on the standardized targets.
# NOTE: scikit-learn advises against Ridge(alpha=0) for numerical reasons and
# suggests LinearRegression instead; Ridge is kept here because the exercise
# is specifically about Ridge and its alpha parameter.
for deg in range(1, 18):
    model = make_pipeline(PolynomialFeatures(deg), Ridge(alpha=0))
    model.fit(X_train, Y_train)

    r_test = mean_squared_error(Y_test, model.predict(X_test))
    # The degree is an integer, so use an integer format spec; the float spec
    # previously rendered it as e.g. "1.000000".
    print('{:13d}: {}, {}'.format(deg, r_test, model.score(X_test, Y_test)))

    # Predict over the full scaled date range and map the result back to the
    # original emission units before plotting.
    plt.plot(x, y_scaler.inverse_transform(model.predict(X)), label='deg={}'.format(deg))

plt.legend()

We will now evaluate the effects of different alpha values on a ridge regression from a 6th degree polynomial.

Depending on the original accuracy, the polynomial degree and the logarithmic scale chosen, we should see that lower values of alpha don't affect the model score very much and some intermediate values may improve it, while a value of alpha that is too large will start pulling the regression away from the original data until, for values of alpha approaching +Inf, the coefficients approach 0.

In [None]:


fig, ax = plt.subplots(figsize=(25, 10))
ax.set_xlabel('Date')
ax.set_ylabel('Million Metric Tons of Carbon Dioxide')

# Here `alpha` is matplotlib's transparency, not the regularization strength.
ax.scatter(x, y, alpha=0.3)

# Regularization strengths to compare: 11 values, log-spaced from 1e-5 to 1e5.
alphas = np.logspace(-5, 5, 11)

for alpha in alphas:
    # Degree-6 polynomial features feeding a ridge regressor with this alpha.
    model = make_pipeline(PolynomialFeatures(6), Ridge(alpha=alpha))
    model.fit(X_train, Y_train)

    # Held-out error and score, both measured on the standardized targets.
    test_predictions = model.predict(X_test)
    r_test = mean_squared_error(Y_test, test_predictions)
    test_score = model.score(X_test, Y_test)
    print('{:13f}: {}, {}'.format(alpha, r_test, test_score))

    # Fitted curve over the full date range, mapped back to original units.
    fitted_curve = y_scaler.inverse_transform(model.predict(X))
    ax.plot(x, fitted_curve, label='alpha={}'.format(alpha))

ax.legend()

Looking at the other sources of CO2 emissions for energy production, we can see that while some are following an upward trend, the vast majority of total CO2 emissions, still dominated by coal plants, has been on a significant decline over the last 10 years.

It is worth noting that a ridge regression model using some of the better performing parameters from the previous sections is able to correctly capture the overall trend even for series, such as natural gas emissions, that are highly dependent on seasonal demand.

In [None]:
fig, ax = plt.subplots(figsize=(25, 10))
ax.set_xlabel('Date')
ax.set_ylabel('Million Metric Tons of Carbon Dioxide')

# One pass per Column_Order value from 1 to 9: scatter the monthly points of
# that series and overlay a degree-6 ridge fit trained on all of its points.
for i in range(1, 10):
    ds_2 = ds.loc[ds['Column_Order'] == i]

    # Remove yearly values (every 13th position, starting at position 12),
    # then order the remaining rows by date code.
    yearly_index = ds_2.iloc[12::13].index
    ds_2 = ds_2.loc[~ds_2.index.isin(yearly_index)].sort_values(by='YYYYMM')

    # Rows whose Value is the string 'Not Available' are dropped so that the
    # cast to float below succeeds.
    ds_2 = ds_2.loc[ds_2['Value'] != 'Not Available']

    x2 = pd.to_datetime(ds_2['YYYYMM'], format='%Y%m')
    y2 = ds_2['Value'].astype(np.float64)

    # The legend entry is the series' own description (taken from its first row).
    series_label = ds_2['Description'].iloc[0]
    ax.scatter(x2, y2, s=20, label=series_label, alpha=0.30)

    # Each series gets its own scalers, fitted on that series alone.
    x2scaler = StandardScaler()
    y2scaler = StandardScaler()

    X2 = x2scaler.fit_transform(x2.values[:, np.newaxis])
    Y2 = y2scaler.fit_transform(y2.values[:, np.newaxis])

    model = make_pipeline(PolynomialFeatures(6), Ridge(alpha=0.001))
    model.fit(X2, Y2)

    # Bring the fitted values back to the original units for plotting.
    fitted_curve = y2scaler.inverse_transform(model.predict(X2))
    ax.plot(x2, fitted_curve)

ax.legend()


