# Machine Learning 2 - Multiple Polynomial Regression

## Import libraries

In [6]:
import pandas as pd
import seaborn as sns
import numpy as np
from IPython.display import display
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from statistics import median
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Apply seaborn's default plot theme for any figures drawn in this notebook.
sns.set()

In [7]:
# Load the S&P 500 table from outputs/ and preview it. The preview shows
# quarterly-dated rows with Close, Volume, ClosePrev and CloseNext columns;
# "Unnamed: 0" is presumably the index that was saved into the CSV.
SPX = pd.read_csv("outputs/SPX_out1.csv")
SPX

Unnamed: 0,Date,Close,Volume,ClosePrev,CloseNext
0,1950-04-01,17.186667,2.371296e+07,,18.180000
1,1950-07-01,18.180000,2.760741e+07,17.186667,18.570000
2,1950-10-01,18.570000,2.260370e+07,18.180000,19.816667
3,1951-01-01,19.816667,3.068148e+07,18.570000,21.620000
4,1951-04-01,21.620000,3.020555e+07,19.816667,21.636667
...,...,...,...,...,...
284,2021-04-01,3832.760000,5.277860e+10,3549.220000,4227.593333
285,2021-07-01,4227.593333,4.502311e+10,3832.760000,4408.493333
286,2021-10-01,4408.493333,4.207726e+10,4227.593333,4646.186667
287,2022-01-01,4646.186667,4.631885e+10,4408.493333,4433.720000


In [8]:
# Load the combined table (economic indicator columns plus the S&P 500
# Close/Volume/ClosePrev/CloseNext columns) and preview it. This frame
# supplies both the predictors and the target used in the cells below.
df_full = pd.read_csv("outputs/df_full.csv")
df_full

Unnamed: 0.1,Unnamed: 0,gdp,gnp,real_gdp,real_gdp_per_capita,net_exports,gni,govt_spending,consumer_spending,private_domestic_investment,cpi,consumer_oil_price,ir,unemployment_rate,Close,Volume,ClosePrev,CloseNext
0,1950-07-01,308.153,309.760,2340.112,15398.0,-0.740,307.413,600.663,200.505,1.247,24.203,11.267,1.61,4.6,18.180000,2.760741e+07,17.186667,18.570000
1,1950-10-01,319.945,321.554,2384.920,15623.0,-0.154,319.791,643.100,197.946,1.289,24.693,11.500,1.75,4.2,18.570000,2.260370e+07,18.180000,19.816667
2,1951-01-01,336.000,337.537,2417.311,15769.0,0.177,336.177,711.537,209.207,1.296,25.697,11.700,1.75,3.5,19.816667,3.068148e+07,18.570000,21.620000
3,1951-04-01,344.090,345.973,2459.196,15979.0,1.943,346.033,806.376,204.942,1.332,25.947,11.933,1.75,3.1,21.620000,3.020555e+07,19.816667,21.636667
4,1951-07-01,351.385,353.381,2509.880,16234.0,3.742,355.127,895.015,207.616,1.385,25.933,11.933,1.75,3.2,21.636667,2.172778e+07,21.620000,22.980000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,2020-04-01,19477.444,19649.442,17258.205,52031.0,-538.876,18938.568,3378.132,12989.729,519.850,256.418,219.570,0.25,13.0,2921.443333,5.985144e+10,3136.440000,3019.010000
280,2020-07-01,21138.574,21365.412,18560.774,55933.0,-725.723,20412.851,3360.238,14293.832,539.864,259.438,232.403,0.25,8.8,3019.010000,6.714349e+10,2921.443333,3378.143333
281,2020-10-01,21477.597,21728.223,18767.778,56533.0,-798.431,20679.166,3356.030,14467.611,561.269,260.879,234.862,0.25,6.8,3378.143333,5.204018e+10,3019.010000,3549.220000
282,2021-01-01,22038.226,22273.060,19055.655,57405.0,-872.540,21165.686,3390.921,15005.444,576.340,263.525,274.983,0.25,6.2,3549.220000,5.152288e+10,3378.143333,3832.760000


## Select polynomial degree

In [9]:
# Target: the CloseNext column; predictors: five indicator columns of df_full.
# Both are kept as DataFrames because the export cell further down reuses them.
y = pd.DataFrame(df_full['CloseNext'])
X = pd.DataFrame(df_full[['real_gdp', 'consumer_spending', 'gnp', 'gdp', 'private_domestic_investment']])

# For every candidate degree (1..10) the model is fitted on 20 different
# 75/25 train/test splits and the median of each metric is kept, so that a
# single lucky or unlucky split cannot decide which degree looks best.
#
# Rows are collected in a plain list and converted to a DataFrame once:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
degree_rows = []

for i in range(1, 11):
    r2_train = []   # R^2 on the training split, one entry per split
    mse_train = []  # MSE on the training split, one entry per split
    mse_test = []   # MSE on the held-out split, one entry per split
    for j in range(20):
        # random_state=j makes each of the 20 splits reproducible.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=j)

        poly = PolynomialFeatures(degree=i)
        poly_reg = LinearRegression()

        X_ = poly.fit_transform(X_train)
        # NOTE(review): PolynomialFeatures puts the constant (bias) column at
        # index 0, so index 1 is the linear term of the first input feature
        # (real_gdp). If the intent was to drop the redundant bias column, use
        # PolynomialFeatures(..., include_bias=False) instead. Left unchanged
        # here so the reported numbers stay reproducible.
        X_ = np.delete(X_, 1, axis=1)
        poly_reg.fit(X_, y_train)
        y_pred = poly_reg.predict(X_)
        r2_train.append(poly_reg.score(X_, y_train))

        # Only transform the test split; the transformer was already fitted
        # on the training split and must not be refitted on held-out data.
        X_2 = poly.transform(X_test)
        X_2 = np.delete(X_2, 1, axis=1)
        y_pred2 = poly_reg.predict(X_2)

        mse_train.append(mse(y_train, y_pred))
        mse_test.append(mse(y_test, y_pred2))

    degree_rows.append({
        'R^2': median(r2_train),
        'MSE train': median(mse_train),
        'MSE test': median(mse_test),
    })

# One row per degree; shift the 0-based index so it equals the degree itself.
res_df2 = pd.DataFrame(degree_rows, columns=['R^2', 'MSE train', 'MSE test'])
res_df2.index += 1
res_df2.index.name = 'polynomial degree'
display(res_df2)

Unnamed: 0_level_0,R^2,MSE train,MSE test
polynomial degree,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.979564,14448.885878,15351.01
2,0.995696,3071.476351,4643.098
3,0.998536,1019.524458,13020.98
4,0.999748,177.297906,361070.7
5,0.99996,28.431422,157262300.0
6,0.999965,24.722715,3368364000.0
7,0.999972,19.827908,52205020000.0
8,0.99994,40.578037,245316100000.0
9,0.999944,41.435046,1201690000000.0
10,0.999906,68.692818,1731801000000.0


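As the table above shows, the lowest median test-set MSE is achieved at polynomial degree 2.<br/>
Beyond degree 2 the training MSE stays small while the test MSE grows by orders of magnitude, i.e. the model is <b>overfitting</b>.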

## Obtain Samples & Export

In [10]:
# Repeat the degree-2 fit over many random 75/25 splits and export the
# per-trial metrics (training R^2, training MSE, test MSE). No median is
# computed here; summarising the trials is left to whoever reads the CSV.
trials_n = 1001 # no. of trials (presumably odd so a median is an actual trial; confirm)

# Rows are collected in a plain list and converted to a DataFrame once:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
trial_rows = []

for i in range(trials_n):
    # random_state=i makes every trial's split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=i)

    poly = PolynomialFeatures(degree=2)
    poly_reg = LinearRegression()

    X_ = poly.fit_transform(X_train)
    # NOTE(review): index 1 is the linear term of the first input feature,
    # not the bias column (which PolynomialFeatures places at index 0).
    # If dropping the bias was intended, use include_bias=False instead.
    # Left unchanged here so the exported numbers stay reproducible.
    X_ = np.delete(X_, 1, axis=1)
    poly_reg.fit(X_, y_train)
    y_pred = poly_reg.predict(X_)
    var = poly_reg.score(X_, y_train)  # R^2 on the training split

    # Only transform the test split; the transformer was already fitted
    # on the training split and must not be refitted on held-out data.
    X_2 = poly.transform(X_test)
    X_2 = np.delete(X_2, 1, axis=1)
    y_pred2 = poly_reg.predict(X_2)

    new_row = {'R^2':var, 'MSE train':mse(y_train, y_pred), 'MSE test':mse(y_test, y_pred2)}
    trial_rows.append(new_row)

# One row per trial, indexed 0..trials_n-1; the index is written to the CSV too.
res_df2 = pd.DataFrame(trial_rows, columns=['R^2', 'MSE train', 'MSE test'])
res_df2.to_csv('outputs/res_df2.csv')