In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import necessary libraries

import numpy as np
import pandas as pd
import math
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import learning_curve
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the regression table from the Kaggle input directory into a DataFrame,
# then preview its first ten rows as the cell output
csv_path = '/kaggle/input/polynomial-regression/regression_table.csv'
regdf = pd.read_csv(csv_path)
regdf.head(10)

In [None]:
# Print a plain-text overview of the dataset: shape, element count,
# descriptive statistics, unique counts, null counts and column dtypes

separator = '-'*60

print('The shape of the dataframe is - {}'.format(regdf.shape))
print('The number of entries - {}'.format(regdf.size))

# Each remaining section is preceded by a separator line
sections = [
    ('The basic statistics -\n{}', regdf.describe()),
    ('Unique values per column -\n{}', regdf.nunique()),
    ('Null values per column -\n{}', regdf.isnull().sum()),
    ('Data types per column -\n{}', regdf.dtypes),
]
for template, value in sections:
    print(separator)
    print(template.format(value))

In [None]:
# Generate a scatter plot to visualize the pattern of the data

# The bare 'seaborn' style name was deprecated in Matplotlib 3.6 and removed in 3.8.
# Use the renamed 'seaborn-v0_8' sheet when it exists and fall back to the old
# name on older Matplotlib releases.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
# Apply the style BEFORE creating the figure so figure-level settings take effect too
plt.style.use(seaborn_style)

fig, ax = plt.subplots(figsize=(18, 9))
ax.scatter(regdf['X'], regdf['Y'])
ax.set_title('Scatter Plot')
ax.set_xlabel('X Values')
ax.set_ylabel('Y Values')
plt.show()

In [None]:
# A quick implementation of polynomial regression models of increasing order.
# Choose the highest polynomial order (degree) up to which models are fitted and
# visualized. For every order 1..degree the fitted curve is drawn over the data
# and the R^2 score (computed on the same data used for fitting) is shown in the title.
#
# Results kept for later cells:
#   poly_x[i] - polynomial feature matrix of order i+1
#   model[i]  - LinearRegression fitted on poly_x[i]

degree = 20
fig, ax = plt.subplots(math.ceil(degree/2), 2, figsize=(12, 20))
ax = ax.flatten()         # Flatten the 2-D grid of axes so it can be indexed with a single index
poly_x = []
model = []

# A line plot connects points in the order given, so draw the fitted curve along
# X sorted ascending; otherwise unsorted data would produce a zigzag line.
# The sort is used for drawing only - fitting uses the rows in their original order.
x_values = regdf['X'].to_numpy()
plot_order = np.argsort(x_values)

for i in range(degree):
    # Create and store each model into "model" and its polynomial features into "poly_x"
    poly_reg = PolynomialFeatures(degree=i+1)
    x = poly_reg.fit_transform(regdf[['X']])
    poly_x.append(x)
    model.append(LinearRegression())
    model[i].fit(x, regdf['Y'])
    # Predict once and reuse the result for both the curve and the score
    y_pred = model[i].predict(x)
    # Plot the predicted regression curve on the scatter plot of feature and target
    ax[i].scatter(regdf['X'], regdf['Y'])
    ax[i].plot(x_values[plot_order], y_pred[plot_order], color='r')
    ax[i].set_xlabel('X Values')
    ax[i].set_ylabel('Y Values')
    # Include the R^2 score in the title
    ax[i].set_title('Degree {} polynomial, r2 score = {:.3f}'.format(i+1, r2_score(regdf['Y'], y_pred)))

# The grid always has an even number of panels; hide the spare one when degree is odd
for unused_ax in ax[degree:]:
    unused_ax.set_visible(False)

fig.tight_layout()

**Comparing Performances of Different Estimators**

K-fold cross-validation is used to train and validate each model. A learning curve is plotted for each model to show how the training and validation errors change as the training set size increases; the curves also help to diagnose overfitting or underfitting. Twenty models (polynomial degrees 1 to 20) are generated, and 20-fold cross-validation is used to fit them and calculate the error metrics. Root Mean Squared Error (RMSE) is used as the error metric. A large value of k is used because:

- The sample size is small.
- It keeps each training set large enough to get a good fit.

In [None]:
# Plot a learning curve (training vs. validation RMSE against training-set size)
# for every polynomial order 1..degree, using K-fold cross-validation.
#
# Results kept for later cells:
#   rmsetrain[i] - training RMSE of the order-(i+1) model at the largest training size
#   rmseval[i]   - validation RMSE of the order-(i+1) model at the largest training size

degree = 20
# The bare 'seaborn' style name was deprecated in Matplotlib 3.6 and removed in 3.8.
# Use the renamed 'seaborn-v0_8' sheet when it exists and fall back to the old
# name on older Matplotlib releases.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
# Apply the style BEFORE creating the figure so figure-level settings take effect too
plt.style.use(seaborn_style)
fig, ax = plt.subplots(math.ceil(degree/2), 2, figsize=(12, 20), sharey=True)
ax = ax.flatten()
train_sizes = [150, 300, 450, 600, 800, 950]  # Specify absolute sizes of the training sets for calculating scores
K = 20                                        # Choosing K for K-Fold Cross-Validation
estimator = LinearRegression()
rmseval = []
rmsetrain = []
# NOTE(review): learning_curve is called with its default shuffle setting, so the
# training subsets are prefixes of each fold's training rows. If the CSV rows are
# ordered (e.g. by X) this can distort the curves - confirm, and consider shuffle=True.
for i in range(degree):   # Tied to "degree" so the loop always matches the number of axes created above
    # Generate "i+1"th degree polynomial features to train model
    poly_reg = PolynomialFeatures(degree=i+1)
    x = poly_reg.fit_transform(regdf[['X']])
    # Get scores on training and validation sets at the predetermined training sizes.
    # The sizes actually used are returned separately so the requested list is not overwritten.
    train_sizes_abs, train_scores, validation_scores = learning_curve(estimator = estimator, X = x, y = regdf['Y'], cv = K, train_sizes=train_sizes, scoring = 'neg_mean_squared_error')
    # Average the (negated) MSE over the folds for each training size, then convert to RMSE
    train_scores_mean = np.sqrt(-train_scores.mean(axis = 1))
    validation_scores_mean = np.sqrt(-validation_scores.mean(axis = 1))
    # Store the errors for the final (largest) training size, whatever the number of sizes
    rmseval.append(validation_scores_mean[-1])
    rmsetrain.append(train_scores_mean[-1])
    # Display error metrics for different regression models
    print('Degree={}'.format(i+1))
    print('train sizes = {}'.format(train_sizes_abs))
    print('train error = {}'.format(train_scores_mean))
    print('validation error = {}'.format(validation_scores_mean))
    print('-'*30)
    # Plot learning curve
    ax[i].plot(train_sizes_abs, train_scores_mean, label = 'Training error')
    ax[i].plot(train_sizes_abs, validation_scores_mean, label = 'Validation error')
    ax[i].set_yscale('log')
    ax[i].set_ylabel('RMSE', fontsize = 14)
    ax[i].set_xlabel('Training set size', fontsize = 14)
    ax[i].legend()
    ax[i].set_title('Learning curve for degree {} polynomial'.format(i+1))

# The grid always has an even number of panels; hide the spare one when degree is odd
for unused_ax in ax[degree:]:
    unused_ax.set_visible(False)

fig.tight_layout()

**Get the Parameters of the Predicted $n^{\text{th}}$-Degree Polynomial Function**

Since the results show the best performance for the $10^{\text{th}}$-order polynomial, we find the coefficients and the intercept term of that model. A 10th-degree polynomial function of one variable is:

$$y = f(x) = ax + bx^{2} + cx^{3} + dx^{4} + ex^{5} + fx^{6} + gx^{7} + hx^{8} + ix^{9} + jx^{10} + k$$

Here, $a$ through $j$ are the coefficients and $k$ is the intercept.

[Note: The coefficients and intercept are collected from a previously trained model using all the sample data.]

In [None]:
# Report the intercept and coefficients of the order-n polynomial model
n = 10
fitted = model[n-1]                 # model is 0-indexed, so order n lives at position n-1
intercept = fitted.intercept_
# Drop the first coefficient: it belongs to the constant (bias) feature column
coefficient = fitted.coef_[1:]
print('Intercept : {}\nCoefficient : {}'.format(intercept, coefficient))