# Predict Stock Returns in Scikit Learn - Ridge/Lasso Regression 

In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline

In [65]:
df = pd.read_csv('data.csv').drop('Unnamed: 0', axis=1)

In [66]:
# f_
fundamentals_df = df[['Revenue', 'Cost_of_Sales', 'Gross_profit',
       'Operating_profit', 'Net_Profit', 'Issue_of_shares', 'Share_repurchase',
       'Non_current_assets', 'Current_assets', 'Non_current_liabilities',
       'Current_liabilities', 'net_cash_op_act','target_return']]

In [67]:
fundamentals_df.head()

Unnamed: 0,Revenue,Cost_of_Sales,Gross_profit,Operating_profit,Net_Profit,Issue_of_shares,Share_repurchase,Non_current_assets,Current_assets,Non_current_liabilities,Current_liabilities,net_cash_op_act,target_return
0,951.4,621.9,329.5,431.7,317.9,3.9,3.9,732.8,237.2,220.7,454.3,959.6,0.090136
1,845.3,606.0,239.3,141.3,115.0,0.3,0.0,741.4,188.0,221.0,396.3,816.6,-0.053612
2,991.8,609.9,381.9,-8.9,-136.0,2.6,0.0,719.6,203.3,306.8,328.0,1003.0,0.050286
3,1066.8,665.9,400.9,138.2,44.3,559.6,0.0,753.0,248.5,412.7,197.5,1056.6,-0.060365
4,1109.5,701.6,407.9,197.8,40.4,3.1,3.1,779.3,172.3,444.2,97.4,1095.5,-0.061601


create X and y for each df

In [68]:
y = fundamentals_df['target_return']
X = fundamentals_df.drop(['target_return'], axis=1)

In [69]:
from sklearn.preprocessing import StandardScaler

In [70]:
scaler = StandardScaler()

In [71]:
X_scaled = scaler.fit_transform(X)

In [72]:
X_standardize = pd.DataFrame(X_scaled,columns=X.columns)
X_standardize.head()

Unnamed: 0,Revenue,Cost_of_Sales,Gross_profit,Operating_profit,Net_Profit,Issue_of_shares,Share_repurchase,Non_current_assets,Current_assets,Non_current_liabilities,Current_liabilities,net_cash_op_act
0,-0.258862,-0.247648,-0.27656,-0.192073,-0.255228,-0.173122,-0.174834,-0.189262,-0.193271,-0.191939,-0.183768,-0.195021
1,-0.259293,-0.247743,-0.277666,-0.193437,-0.260497,-0.173219,-0.175136,-0.189261,-0.193319,-0.191938,-0.183781,-0.196623
2,-0.258698,-0.24772,-0.275918,-0.194142,-0.267016,-0.173157,-0.175136,-0.189264,-0.193304,-0.191906,-0.183797,-0.194535
3,-0.258394,-0.247383,-0.275685,-0.193451,-0.262333,-0.158028,-0.175136,-0.189259,-0.19326,-0.191866,-0.183827,-0.193934
4,-0.258221,-0.247169,-0.275599,-0.193172,-0.262434,-0.173143,-0.174896,-0.189255,-0.193335,-0.191854,-0.18385,-0.193499


Now let's import the train_test_split method from sklearn.model_selection:

In [73]:
from sklearn.model_selection import train_test_split

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X_standardize, y, test_size=0.3, random_state=23)

In [75]:
from sklearn.linear_model import LinearRegression

In [76]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [77]:
ridge001 = Ridge(alpha=0.001)
ridge01 = Ridge(alpha=0.01)
ridge1 = Ridge(alpha=0.1)
ridge = Ridge(alpha=1)

In [114]:
lasso = Lasso(alpha=1)
linear = LinearRegression()

In [79]:
ridge001.fit(X_train,y_train)
ridge01.fit(X_train,y_train)
ridge1.fit(X_train,y_train)
ridge.fit(X_train,y_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [80]:
ridge001.score(X_train,y_train)

0.020753123656174099

In [115]:
lasso.fit(X_train,y_train)

Lasso(alpha=1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [116]:
pred_lasso = lasso.predict(X_test)

In [122]:
test20 = lasso.predict([20])

ValueError: Expected 2D array, got 1D array instead:
array=[20].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [83]:
pred_ridge001 = ridge001.predict(X_test)
pred_ridge01 = ridge01.predict(X_test)
pred_ridge1 = ridge1.predict(X_test)
pred_ridge = ridge.predict(X_test)

In [117]:
from sklearn import metrics

In [118]:
metrics.mean_squared_error(y_test, pred_lasso)

0.016604158489132295

In [86]:
print('Ridge    {}'.format( metrics.mean_squared_error(y_test, pred_ridge))) #0.001

print('Ridge001 {}'.format(metrics.mean_squared_error(y_test, pred_ridge001)))

print('Ridge01  {}'.format(metrics.mean_squared_error(y_test, pred_ridge01)))

print('Ridge1   {}'.format(metrics.mean_squared_error(y_test, pred_ridge1)))

Ridge    0.01641075479536693
Ridge001 0.01644522608068567
Ridge01  0.016428135819976264
Ridge1   0.01639164361981555


In [119]:
linear.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [120]:
pd.DataFrame(np.array([linear.intercept_, ridge001.intercept_, lasso.intercept_]),
             columns=['Intercept'], 
             index=['Linear', 'Ridge', 'Lasso'])

Unnamed: 0,Intercept
Linear,0.029137
Ridge,0.028139
Lasso,0.02848


In [123]:
# Create dataframe for coefficents of linear model
lm_coef = pd.DataFrame(linear.coef_,index=X.columns,columns=['Linear'])

# Create dataframe for coefficents of ridge model
ridge_coef = pd.DataFrame(ridge001.coef_,index=X.columns,columns=['Ridge'])

# Create dataframe for coefficents of lasso model
lasso_coef = pd.DataFrame(lasso.coef_,index=X.columns,columns=['Lasso'])

pd.concat([lm_coef, ridge_coef, lasso_coef], axis=1)

Unnamed: 0,Linear,Ridge,Lasso
Revenue,33.982114,0.544245,0.0
Cost_of_Sales,-22.971161,-0.388429,0.0
Gross_profit,-11.292392,-0.21903,0.0
Operating_profit,0.048679,0.050363,0.0
Net_Profit,-0.001551,-0.00355,0.0
Issue_of_shares,-0.00214,-0.001596,0.0
Share_repurchase,-0.032834,-0.032029,0.0
Non_current_assets,-0.232145,-0.22594,0.0
Current_assets,0.009982,0.007999,0.0
Non_current_liabilities,0.117075,0.110609,0.0


In [113]:
0.014949*0.5

0.0074745

In [18]:
lm.intercept_

0.030743091269107608

In [90]:
coeff_df = pd.DataFrame(lm.coef_,X.columns,columns=['Coefficient'])
coeff_df

NameError: name 'lm' is not defined

In [20]:
from sklearn import metrics

MSE

In [21]:
# Get predictions for training data
fit_lm = lm.predict(X_train)

# Print MSE for models
print('MSE (train)')
print('Linear:', metrics.mean_squared_error(y_train, fit_lm))

MSE (train)
Linear: 0.0154314938524


R$^2$

In [22]:
print('R SQUARED SCORE')
print('Linear:', lm.score(X_train, y_train))

R SQUARED SCORE
Linear: 0.0280715616586





## Using statsmodels rather than sklearn

In [65]:
from pandas.core import datetools
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [84]:
# f_
f_traindata = f_X_train.join(f_y_train)
f_lm_sm = smf.ols( 'target_return~ Revenue + Cost_of_Sales + Gross_profit\
                + Operating_profit + Net_Profit + Issue_of_shares + Share_repurchase\
                + Non_current_assets + Current_assets + Non_current_liabilities\
                + Current_liabilities + net_cash_op_act',
                data = f_traindata).fit()
print(f_lm_sm.summary()) 

                            OLS Regression Results                            
Dep. Variable:          target_return   R-squared:                       0.028
Model:                            OLS   Adj. R-squared:                 -0.021
Method:                 Least Squares   F-statistic:                    0.5776
Date:                Tue, 17 Apr 2018   Prob (F-statistic):              0.859
Time:                        09:54:48   Log-Likelihood:                 168.68
No. Observations:                 253   AIC:                            -311.4
Df Residuals:                     240   BIC:                            -265.4
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

In [85]:
# pf_
pf_traindata = pf_X_train.join(pf_y_train)
pf_lm_sm = smf.ols( 'target_return~ current_price + moving_average + moving_volatility \
                + trading_range + momentum',
                data = pf_traindata).fit()
print(pf_lm_sm.summary()) 

                            OLS Regression Results                            
Dep. Variable:          target_return   R-squared:                       0.026
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     1.295
Date:                Tue, 17 Apr 2018   Prob (F-statistic):              0.267
Time:                        09:54:54   Log-Likelihood:                 168.35
No. Observations:                 253   AIC:                            -324.7
Df Residuals:                     247   BIC:                            -303.5
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept             0.0337      0.01

In [86]:
# m_
m_traindata = m_X_train.join(m_y_train)
m_lm_sm = smf.ols( 'target_return~ exp_market_change',
                data = m_traindata).fit()
print(m_lm_sm.summary()) 

                            OLS Regression Results                            
Dep. Variable:          target_return   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     2.180
Date:                Tue, 17 Apr 2018   Prob (F-statistic):              0.141
Time:                        09:54:57   Log-Likelihood:                 166.18
No. Observations:                 253   AIC:                            -328.4
Df Residuals:                     251   BIC:                            -321.3
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept             0.0333      0.00

In [87]:
# ex_
ex_traindata = ex_X_train.join(ex_y_train)
ex_lm_sm = smf.ols( 'target_return~ rates',
                data = ex_traindata).fit()
print(ex_lm_sm.summary()) 

                            OLS Regression Results                            
Dep. Variable:          target_return   R-squared:                       0.022
Model:                            OLS   Adj. R-squared:                  0.018
Method:                 Least Squares   F-statistic:                     5.570
Date:                Tue, 17 Apr 2018   Prob (F-statistic):             0.0190
Time:                        09:54:59   Log-Likelihood:                 167.86
No. Observations:                 253   AIC:                            -331.7
Df Residuals:                     251   BIC:                            -324.6
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.1824      0.066      2.777      0.0

In [89]:
print('Adjusted R squared SCORE')
print('Model pf_: ', pf_lm_sm.rsquared_adj) 
print('Model f_:  ', f_lm_sm.rsquared_adj)
print('Model m_:  ', m_lm_sm.rsquared_adj)
print('Model ex_: ', ex_lm_sm.rsquared_adj)

Adjusted R squared SCORE
Model pf_:  0.00581222044023
Model f_:   -0.0205248602585
Model m_:   0.00466086847822
Model ex_:  0.0178110417458


In [92]:
m_lm = LinearRegression()
m_lm.fit(m_X_train,m_y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [93]:
pf_lm = LinearRegression()
pf_lm.fit(pf_X_train,pf_y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [94]:
f_lm = LinearRegression()
f_lm.fit(f_X_train,f_y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [95]:
ex_lm = LinearRegression()
ex_lm.fit(ex_X_train,ex_y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [96]:
m_predictions = m_lm.predict(m_X_test)
pf_predictions = pf_lm.predict(pf_X_test)
f_predictions = f_lm.predict(f_X_test)
ex_predictions = ex_lm.predict(ex_X_test)

In [98]:
print('Test MSE')
print('Model m_: ', metrics.mean_squared_error(m_y_test, m_predictions))
print('Model pf_:', metrics.mean_squared_error(pf_y_test, pf_predictions))
print('Model f_: ', metrics.mean_squared_error(f_y_test, f_predictions))
print('Model ex_:', metrics.mean_squared_error(ex_y_test, ex_predictions))

Test MSE
Model m_:  0.0161412361756
Model pf_: 0.0162671937496
Model f_:  0.0182799024515
Model ex_: 0.0170650399279


In [99]:
print('pf_Intercept:              ', pf_lm.intercept_)

pf_Intercept:               0.0337036272142


In [101]:
-0.0244668917227 + 0.0161412361756

-0.008325655547100002

In [110]:
abs_res = abs(ex_predictions-ex_y_test)

In [118]:
 abs_res_li = sorted(list(abs_res))

In [121]:
abs_res_li[len(abs_res_li)//2]

0.065835776525266176

In [113]:
abs_res.mode()

0      0.000084
1      0.001732
2      0.002643
3      0.003829
4      0.004772
5      0.005047
6      0.008019
7      0.011180
8      0.011784
9      0.013021
10     0.014072
11     0.014301
12     0.016849
13     0.018450
14     0.018831
15     0.019248
16     0.019339
17     0.020635
18     0.021385
19     0.022630
20     0.024096
21     0.024129
22     0.024709
23     0.025617
24     0.026814
25     0.029493
26     0.031672
27     0.032307
28     0.032890
29     0.032989
         ...   
79     0.116015
80     0.118933
81     0.125729
82     0.126175
83     0.126365
84     0.127904
85     0.135128
86     0.138359
87     0.139105
88     0.141029
89     0.143038
90     0.149764
91     0.153364
92     0.159608
93     0.170219
94     0.173417
95     0.176538
96     0.183332
97     0.199749
98     0.212007
99     0.228817
100    0.264977
101    0.275077
102    0.278195
103    0.285989
104    0.298688
105    0.368864
106    0.383506
107    0.413120
108    0.466926
Length: 109, dtype: floa

In [None]:
traindata = X_train.join(y_train)
lm_sm = smf.ols( 'target_return~ current_price + momentum + moving_average\
             + moving_volatility + trading_range + exp_market_change\
             + rates + Revenue + Cost_of_Sales + Gross_profit\
             + Operating_profit + Net_Profit + Issue_of_shares\
             + Share_repurchase + Non_current_assets + Current_assets\
             + Non_current_liabilities + Current_liabilities + net_cash_op_act',
             data = traindata).fit()
print(lm_sm.summary()) 

                            OLS Regression Results                            
Dep. Variable:          target_return   R-squared:                       0.084
Model:                            OLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     1.121
Date:                Tue, 05 Dec 2017   Prob (F-statistic):              0.330
Time:                        09:51:05   Log-Likelihood:                 206.36
No. Observations:                 253   AIC:                            -372.7
Df Residuals:                     233   BIC:                            -302.0
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

## Predicting

The trained model can now be used to predict future stock returns. We use the predict() method on our model instance. Note that now we input the testing data => only the features since our model is already trained and now needs to predict the returns (labels).

In [None]:
predictions = lm.predict(X_test)

Next we look at some statistical measures to test our output. In this case we use MSE. A low MSE means that our model generalizes well to unseen data.

In [None]:
print('MSE:', metrics.mean_squared_error(y_test, predictions))

MSE: 0.0244668917227


## Comparing two models

To illustrate how we can compare two linear models, let us build a second model which is almost identical to the model run above, but excludes 'net_cash_op_act' from the predictors. We will then look at three metrics for comparing models, namely adjusted $R^2$, AIC and BIC. 

In [None]:
# Lables
y2 = df['target_return']

# Features
X2 = df.drop(['Date', 'company', 'target_return','net_cash_op_act'], axis=1)

# Train/Test Split
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, random_state=50)

# Fitting linear regression to training data
lm2 = LinearRegression()
lm2.fit(X_train2,y_train2)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [None]:
print('Intercept:              ', lm2.intercept_)
print('\n')
coeff_df = pd.DataFrame(lm2.coef_,X2.columns,columns=['Coefficient'])
print( coeff_df)

Intercept:               0.241654317677


                          Coefficient
current_price            4.465901e-07
momentum                -1.985497e-02
moving_average          -3.233334e-07
moving_volatility        1.070858e-05
trading_range           -2.330393e-05
exp_market_change       -3.550128e-01
rates                   -3.527479e-02
Revenue                 -2.017030e-05
Cost_of_Sales            2.023262e-05
Gross_profit             2.019402e-05
Operating_profit        -8.856042e-08
Net_Profit              -1.758764e-06
Issue_of_shares          8.152830e-08
Share_repurchase         9.587784e-07
Non_current_assets       1.028664e-09
Current_assets           1.029827e-08
Non_current_liabilities  2.478594e-08
Current_liabilities     -4.328677e-09


In [None]:
traindata2 = X_train2.join(y_train2)
lm2_sm = smf.ols( 'target_return~ current_price + momentum + moving_average\
             + moving_volatility + trading_range + exp_market_change\
             + rates + Revenue + Cost_of_Sales + Gross_profit\
             + Operating_profit + Net_Profit + Issue_of_shares\
             + Share_repurchase + Non_current_assets + Current_assets\
             + Non_current_liabilities + Current_liabilities',
             data = traindata2).fit()
print(lm2_sm.summary()) 

                            OLS Regression Results                            
Dep. Variable:          target_return   R-squared:                       0.081
Model:                            OLS   Adj. R-squared:                  0.011
Method:                 Least Squares   F-statistic:                     1.153
Date:                Tue, 05 Dec 2017   Prob (F-statistic):              0.302
Time:                        09:51:24   Log-Likelihood:                 206.04
No. Observations:                 253   AIC:                            -374.1
Df Residuals:                     234   BIC:                            -306.9
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

Let us compare the $R^2$ from the two models

In [None]:
print('R squared SCORE')
print('Model 1:', lm.score(X_train, y_train)) 
print('Model 2:', lm2.score(X_train2, y_train2))


R squared SCORE
Model 1: 0.0837699589315
Model 2: 0.0814820736495


If we only used the $R^2$ statistic to compare the two models, we would assume that model 1 provides a better fit. However, model 1 also has 19 parameters compared to model 2 which only has 18. One of the disadvantages with $R^2$ is that it does not adjust for using different parameters, and as you add more and more parameters to your model, you will generally increase the $R^2$ value. To overcome this problem, we can rather look at $$ Adjusted R^2 = 1 - (1-R^2) \times \frac{n-1}{n - p -1} $$ 

In [None]:
print('Adjusted R squared SCORE')
print('Model 1:', lm_sm.rsquared_adj) 
print('Model 2:', lm2_sm.rsquared_adj)


Adjusted R squared SCORE
Model 1: 0.00905592124781
Model 2: 0.0108268485457


We can see that with the adjusted $R^2$ values, model 2 is a better fit. Two alternative methods for comparing model selection is to look at information criteria e.g Akaike's information criterion (AIC) or Bayesian information criterion(BIC). The aim is to choose a model with the lowest AIC or BIC. For further information you can read up  http://scott.fortmann-roe.com/docs/MeasuringError.html.

In [None]:
print('AIC')
print('Model 1:', lm_sm.aic) 
print('Model 2:', lm2_sm.aic)
print('BIC')
print('Model 1:', lm_sm.bic) 
print('Model 2:', lm2_sm.bic)


AIC
Model 1: -372.711806076
Model 2: -374.080836279
BIC
Model 1: -302.044016301
Model 2: -306.946435993


Based on both AIC and BIC model 2 appears to give a better fit. However, let us compare the two models on the test data set and compare the MSE to see which model gives the best fit on the test data.

In [None]:
predictions2 = lm2.predict(X_test2)
print('Test MSE')
print('Model 1:', metrics.mean_squared_error(y_test, predictions))
print('Model 2:', metrics.mean_squared_error(y_test2, predictions2))


Test MSE
Model 1: 0.0244668917227
Model 2: 0.0246023224392


In this scenario we can see that the Test MSE are almost identical. Thus to avoid overfitting, it would be better to choose the model with 18 parameters.

In the next notebook we will look at techniques to improve a multifactor linear regression model.