In [None]:
#Importing libraries
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Default size (width, height in inches) for figures that do not set their own
plt.rc('figure', figsize=(10, 6))

# Understanding underfitting and overfitting

## Data generation

In [None]:
# Angles from 10 to 358 degrees in 3-degree steps, converted to radians
x = np.arange(10, 360, 3) * np.pi / 180

# Seed the global NumPy generator so the Gaussian noise is reproducible
np.random.seed(10)
noise = np.random.normal(0, 0.15, len(x))
y = np.sin(x) + noise

# Two-column frame: the angle and its noisy sine
data = pd.DataFrame({'x': x, 'y': y})
data.head()

In [None]:
# Scatter of the generated samples; labelled so the figure can be read on its own
plt.figure(figsize=(12,8))
plt.plot(data['x'],data['y'],'.')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Generated data')
# plt.show() renders the figure and keeps the Line2D repr out of the cell output
plt.show()

In [None]:
# Append the powers x^2 ... x^15 of the 'x' column as extra columns x_2 ... x_15
for power in range(2, 16):
    data['x_{}'.format(power)] = data['x'] ** power
data.head()

## Creating the train and test sets randomly

In [None]:
# Tag every row with a random integer in 1..5 drawn from the global NumPy
# generator; rows that drew 1-3 form the training set, rows that drew 4-5 the
# test set. The helper column is dropped from both subsets afterwards.
data['randNumCol'] = np.random.randint(1, 6, data.shape[0])
goes_to_train = data['randNumCol'] <= 3
train = data.loc[goes_to_train].drop(columns='randNumCol')
test = data.loc[~goes_to_train].drop(columns='randNumCol')

In [None]:
# Split each subset into a feature matrix (every column except 'y') and the
# target vector 'y', both as plain NumPy arrays
X_train, y_train = train.drop(columns='y').values, train['y'].values
X_test, y_test = test.drop(columns='y').values, test['y'].values

In [None]:
#Import Linear Regression model from scikit-learn.
from sklearn.linear_model import LinearRegression

def fit_model(max_exponent):
    """Fit a linear regression on the leading feature columns and report the errors.

    The first ``max_exponent`` columns of the notebook-level ``X_train`` and
    ``X_test`` matrices are used as predictors. The function prints the mean
    squared residual on the training and test sets, prints a rough
    underfitting / overfitting / good-fit verdict based on fixed thresholds,
    and plots the fitted values over the training points.

    Parameters
    ----------
    max_exponent : int
        Number of leading feature columns to use; must be at least 1.

    Raises
    ------
    ValueError
        If ``max_exponent`` is smaller than 1.
    """
    if max_exponent < 1:
        raise ValueError("max_exponent must be at least 1")

    train_features = X_train[:, 0:max_exponent]
    test_features = X_test[:, 0:max_exponent]

    # LinearRegression no longer accepts ``normalize=True`` (removed in
    # scikit-learn 1.2), so the columns are rescaled by hand. Scaling does not
    # change the least-squares predictions mathematically, but it keeps the
    # problem well conditioned when the columns differ by many orders of
    # magnitude.
    column_scale = train_features.std(axis=0)
    column_scale[column_scale == 0] = 1.0  # constant column: leave it unscaled

    model = LinearRegression()
    model.fit(train_features / column_scale, y_train)
    y_train_pred = model.predict(train_features / column_scale)
    y_test_pred = model.predict(test_features / column_scale)

    # Mean of the squared residuals on each set
    rss_train = np.mean((y_train_pred - y_train) ** 2)
    rss_test = np.mean((y_test_pred - y_test) ** 2)

    print("Training Error", round(rss_train,3))
    print("Testing Error",round(rss_test,3))

    # Heuristic verdict: both errors high -> underfit; test error exceeding the
    # training error by more than 0.1 -> overfit; otherwise a good fit
    if rss_train > 0.15 and rss_test > 0.15:
        print("*** Underfitting ***")
    elif rss_train - rss_test < -0.1:
        print("*** Overfitting ***")
    else:
        print("*** Good fit ***")

    # Fitted curve and the training points, both against the first feature column
    plt.plot(X_train[:,0:1],y_train_pred)
    plt.plot(X_train[:,0:1],y_train,'.')
    plt.show()

In [None]:
# Use only the first feature column as predictor
fit_model(max_exponent=1)

In [None]:
# Use the first three feature columns as predictors
fit_model(max_exponent=3)

In [None]:
# Use the first fifteen feature columns as predictors
fit_model(max_exponent=15)

In [None]:
# defining a function which will fit linear regression model, plot the results, and return the coefficients
def linear_regression(train_x, train_y, test_x, test_y, features, models_to_plot):
    """Fit a linear regression, optionally plot the fit, and return errors and coefficients.

    Parameters
    ----------
    train_x, test_x : array-like, 2-D
        Predictor matrices with one row per sample. The first column of
        ``train_x`` is used as the horizontal axis when a plot is drawn.
    train_y, test_y : array-like, 1-D
        Targets for the training and test rows.
    features : int
        Key looked up in ``models_to_plot``; also shown in the subplot title.
    models_to_plot : dict
        Maps a ``features`` value to the position code handed to
        ``plt.subplot``. A plot is drawn only when ``features`` is a key.

    Returns
    -------
    list
        ``[train error, test error, intercept, coef_1, ..., coef_k]`` where the
        errors are mean squared residuals and the coefficients apply to the
        original, unscaled predictor columns.
    """
    train_x = np.asarray(train_x, dtype=float)
    test_x = np.asarray(test_x, dtype=float)

    # LinearRegression no longer accepts ``normalize=True`` (removed in
    # scikit-learn 1.2), so the columns are rescaled by hand to keep the
    # least-squares problem well conditioned.
    column_scale = train_x.std(axis=0)
    column_scale[column_scale == 0] = 1.0  # constant column: leave it unscaled

    #Fit the model
    linreg = LinearRegression()
    linreg.fit(train_x / column_scale, train_y)
    train_y_pred = linreg.predict(train_x / column_scale)
    test_y_pred = linreg.predict(test_x / column_scale)

    #Check if a plot is to be made for the entered features
    if features in models_to_plot:
        plt.subplot(models_to_plot[features])
        plt.plot(train_x[:,0:1],train_y_pred)
        plt.plot(train_x[:,0:1],train_y,'.')
        plt.title('Number of Predictors: %d'%features)
        # Adjust the layout after the subplot has its content and title
        plt.tight_layout()

    #Return the result in pre-defined format
    rss_train = np.mean((train_y_pred - train_y) ** 2)
    rss_test = np.mean((test_y_pred - test_y) ** 2)

    # The model saw X / scale, so dividing its coefficients by the same scale
    # expresses them for the unscaled columns; the intercept is unaffected.
    ret = [rss_train, rss_test, linreg.intercept_]
    ret.extend(linreg.coef_ / column_scale)

    return ret

In [None]:
# Empty results table: one row per model size (1 to 15 variables); the columns
# hold the two error values, the intercept and up to 15 coefficients
col = ['mrss_train', 'mrss_test', 'intercept']
col += ['coef_Var_{}'.format(k) for k in range(1, 16)]
ind = list(map('Number_of_variable_{}'.format, range(1, 16)))
coef_matrix_simple = pd.DataFrame(columns=col, index=ind)

In [None]:
# Position code passed to plt.subplot for each model size that should be drawn
models_to_plot = {1: 231, 3: 232, 6: 233, 9: 234, 12: 235, 15: 236}

# Fit one model per number of leading feature columns (1 to 15) and store its
# errors, intercept and coefficients in the matching row of the results table
plt.figure(figsize=(12, 8))
train_y, test_y = y_train, y_test
for n_predictors in range(1, 16):
    train_x = X_train[:, :n_predictors]
    test_x = X_test[:, :n_predictors]

    # Each result has n_predictors + 3 entries, filling the row from the left
    coef_matrix_simple.iloc[n_predictors - 1, :n_predictors + 3] = linear_regression(
        train_x, train_y, test_x, test_y,
        features=n_predictors, models_to_plot=models_to_plot)

In [None]:
# First five rows of the results table
coef_matrix_simple.head(n=5)

In [None]:
# Training and test error of each of the 15 fitted models, on shared axes
model_sizes = range(1, 16)
plt.plot(model_sizes, coef_matrix_simple['mrss_train'].values, label='train')
plt.plot(model_sizes, coef_matrix_simple['mrss_test'].values, label='test')
plt.xlabel('polynomial degree')
plt.ylabel('MRSS')
plt.legend()