In [24]:
import numpy as np
import statsmodels.formula.api as smf
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [2]:
# Set seed for reproducible results
np.random.seed(414)

In [3]:
# Gen toy data
X = np.linspace(0, 15, 1000)
y = 3 * np.sin(X) + np.random.normal(1 + X, .2, 1000)

In [4]:
train_X, train_y = X[:700], y[:700]
test_X, test_y = X[700:], y[700:]

In [7]:
train_df = pd.DataFrame({'X': train_X, 'y': train_y})
test_df = pd.DataFrame({'X': test_X, 'y': test_y})

In [14]:
# Linear Fit
poly_l = smf.ols(formula='y ~ 1 + X', data=train_df).fit()

In [9]:
# Quadratic Fit
poly_q = smf.ols(formula='y ~ 1 + X + I(X**2)', data=train_df).fit()

In [10]:
# Linear Fit
poly_lt = smf.ols(formula='y ~ 1 + X', data=test_df).fit()

In [11]:
# Quadratic Fit
poly_qt = smf.ols(formula='y ~ 1 + X + I(X**2)', data=test_df).fit()

In [15]:
print(poly_l.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.642
Model:                            OLS   Adj. R-squared:                  0.642
Method:                 Least Squares   F-statistic:                     1254.
Date:                Wed, 06 Jul 2016   Prob (F-statistic):          5.52e-158
Time:                        20:47:07   Log-Likelihood:                -1483.4
No. Observations:                 700   AIC:                             2971.
Df Residuals:                     698   BIC:                             2980.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      1.9959      0.152     13.104      0.0

In [16]:
print(poly_q.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.666
Model:                            OLS   Adj. R-squared:                  0.665
Method:                 Least Squares   F-statistic:                     694.4
Date:                Wed, 06 Jul 2016   Prob (F-statistic):          1.25e-166
Time:                        20:47:24   Log-Likelihood:                -1459.6
No. Observations:                 700   AIC:                             2925.
Df Residuals:                     697   BIC:                             2939.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      3.1458      0.221     14.261      0.0

In [17]:
print(poly_lt.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.958
Model:                            OLS   Adj. R-squared:                  0.957
Method:                 Least Squares   F-statistic:                     6715.
Date:                Wed, 06 Jul 2016   Prob (F-statistic):          1.96e-206
Time:                        20:47:30   Log-Likelihood:                -336.79
No. Observations:                 300   AIC:                             677.6
Df Residuals:                     298   BIC:                             685.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept    -20.6762      0.425    -48.684      0.0

In [18]:
print(poly_qt.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.964
Model:                            OLS   Adj. R-squared:                  0.964
Method:                 Least Squares   F-statistic:                     3954.
Date:                Wed, 06 Jul 2016   Prob (F-statistic):          9.17e-215
Time:                        20:47:42   Log-Likelihood:                -312.73
No. Observations:                 300   AIC:                             631.5
Df Residuals:                     297   BIC:                             642.6
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept    -51.1460      4.258    -12.013      0.0

In [22]:
model = sm.OLS(y,X)

In [31]:
plt.plot(X,y)
plt.show()

In [35]:
import statsmodels.regression.linear_model as lm

In [40]:
lm.RegressionResults.mse_model(poly_l)

5099.7841870644406

In [41]:
lm.RegressionResults.mse_model(poly_lt)

3737.4434519777815

In [42]:
np.sqrt([5099.7841870644406,3737.4434519777815])

array([ 71.41277328,  61.13463382])

In [45]:
def rmse(model):
    print(np.sqrt(lm.RegressionResults.mse_model(model)))


In [46]:
rmse(poly_l)

71.4127732767


In [47]:
rmse(poly_lt)

61.1346338173


In [48]:
rmse(poly_q)

51.4125040631


In [49]:
rmse(poly_qt)

43.37059528


In [53]:
poly_l.predict(test_df)

array([ 11.34605278,  11.35941012,  11.37276745,  11.38612479,
        11.39948213,  11.41283947,  11.4261968 ,  11.43955414,
        11.45291148,  11.46626882,  11.47962616,  11.49298349,
        11.50634083,  11.51969817,  11.53305551,  11.54641284,
        11.55977018,  11.57312752,  11.58648486,  11.59984219,
        11.61319953,  11.62655687,  11.63991421,  11.65327154,
        11.66662888,  11.67998622,  11.69334356,  11.7067009 ,
        11.72005823,  11.73341557,  11.74677291,  11.76013025,
        11.77348758,  11.78684492,  11.80020226,  11.8135596 ,
        11.82691693,  11.84027427,  11.85363161,  11.86698895,
        11.88034629,  11.89370362,  11.90706096,  11.9204183 ,
        11.93377564,  11.94713297,  11.96049031,  11.97384765,
        11.98720499,  12.00056232,  12.01391966,  12.027277  ,
        12.04063434,  12.05399168,  12.06734901,  12.08070635,
        12.09406369,  12.10742103,  12.12077836,  12.1341357 ,
        12.14749304,  12.16085038,  12.17420771,  12.18

In [55]:
print(test_df)

             X          y
0    10.510511   9.113557
1    10.525526   8.798065
2    10.540541   8.931348
3    10.555556   9.047734
4    10.570571   8.659095
5    10.585586   9.115196
6    10.600601   8.966861
7    10.615616   8.947426
8    10.630631   8.761166
9    10.645646   8.570319
10   10.660661   8.542211
11   10.675676   8.840849
12   10.690691   8.685590
13   10.705706   9.116769
14   10.720721   9.013263
15   10.735736   8.645331
16   10.750751   8.956652
17   10.765766   8.911172
18   10.780781   8.496090
19   10.795796   8.867917
20   10.810811   8.936554
21   10.825826   8.829840
22   10.840841   9.052438
23   10.855856   8.630678
24   10.870871   8.828351
25   10.885886   8.995500
26   10.900901   8.735839
27   10.915916   9.041912
28   10.930931   8.613124
29   10.945946   8.498318
..         ...        ...
270  14.564565  18.319723
271  14.579580  18.164795
272  14.594595  18.381468
273  14.609610  18.050854
274  14.624625  18.022420
275  14.639640  18.263179
276  14.6546

In [56]:
poly_l.predict(test_df) - test_df['y']

0      2.232496
1      2.561345
2      2.441419
3      2.338391
4      2.740387
5      2.297643
6      2.459336
7      2.492128
8      2.691746
9      2.895950
10     2.937415
11     2.652134
12     2.820751
13     2.402929
14     2.519792
15     2.901082
16     2.603119
17     2.661955
18     3.090395
19     2.731925
20     2.676645
21     2.796717
22     2.587477
23     3.022593
24     2.838278
25     2.684486
26     2.957505
27     2.664789
28     3.106934
29     3.235098
         ...   
270   -3.367189
271   -3.198904
272   -3.402219
273   -3.058248
274   -3.016457
275   -3.243858
276   -3.497930
277   -3.300452
278   -3.308628
279   -3.055096
280   -3.243268
281   -3.124255
282   -3.296146
283   -3.335235
284   -3.133595
285   -2.900085
286   -2.930093
287   -2.737897
288   -3.087257
289   -2.898749
290   -3.145400
291   -3.147115
292   -2.800047
293   -2.517928
294   -2.929250
295   -2.394407
296   -2.664105
297   -2.776308
298   -2.593389
299   -2.792384
Name: y, dtype: float64

In [57]:
z = np.mean(((poly_l.predict(test_df) - test_df['y'])**2))

In [58]:
print(np.sqrt(z))

2.55881638154
