# Model analysis of _Multivariate Linear Regression_

$$Y~=~\beta_0+\beta_1X_1+\beta_2X_2+\dots+\beta_pX_p+\text{error}$$

### Import the Data to Train and Test On

In [2]:
#imports
import pandas as pd

In [3]:
# af_: the full feature set.
# The 'Unnamed: 0', 'Date' and 'company' columns are removed right after loading,
# so they are not among the columns of the frame used below.
# NOTE(review): 'Unnamed: 0' is presumably an index column written by an earlier
# to_csv call -- confirm against the script that produced data_all.csv.
df_all_feat = pd.read_csv('data_all.csv').drop(columns=['Unnamed: 0', 'Date', 'company'])
df_all_feat.head()

Unnamed: 0,current_price,momentum,moving_average,moving_volatility,trading_range,target_return,exp_market_change,rates,Revenue,Cost_of_Sales,...,Non_current_liabilities,Current_liabilities,net_cash_op_act,Debt_M,Debt(perc_GDP),Debt_Per_Capita,CPI_perc,GDP_Mill_dollars,GDP_Growth_perc,Unemployment_alt
0,29500.0,-0.103343,30598.216667,1854.530002,850.55,0.090136,-0.016852,7.08,951.4,621.9,...,220.7,454.3,959.6,130020,34.68,2557.0,5.468,375.304,3.0,24.7
1,31150.0,0.139982,28743.15,1079.933674,697.066667,-0.053612,0.044321,6.58,845.3,606.0,...,221.0,396.3,816.6,130020,34.68,2557.0,5.468,375.304,3.0,24.7
2,31420.0,-0.026943,31682.016667,1388.464601,864.983333,0.050286,0.008204,6.42,991.8,609.9,...,306.8,328.0,1003.0,130020,34.68,2557.0,5.468,375.304,3.0,24.7
3,34987.0,0.096771,32142.716667,784.923382,705.033333,-0.060365,0.136024,5.65,1066.8,665.9,...,412.7,197.5,1056.6,130020,34.68,2557.0,5.468,375.304,3.0,24.7
4,33230.0,-0.007586,32411.016667,1134.366223,629.016667,-0.061601,0.076571,5.53,1109.5,701.6,...,444.2,97.4,1095.5,159429,38.23,3091.0,3.571,416.879,3.3,24.7


In [38]:
# sf_: the reduced data set holding only the significant features.
# Only the 'Unnamed: 0' column is removed here; every other column in the file
# is kept as-is.
df_sig_feat = pd.read_csv('data_signif.csv').drop(columns='Unnamed: 0')
df_sig_feat.head()

Unnamed: 0,target_return,exp_market_change,rates,Debt_M,Debt_Per_Capita,CPI_perc,GDP_Mill_dollars,GDP_Growth_perc,Unemployment_alt
0,0.090136,-0.016852,7.08,130020,2557.0,5.468,375.304,3.0,24.7
1,-0.053612,0.044321,6.58,130020,2557.0,5.468,375.304,3.0,24.7
2,0.050286,0.008204,6.42,130020,2557.0,5.468,375.304,3.0,24.7
3,-0.060365,0.136024,5.65,130020,2557.0,5.468,375.304,3.0,24.7
4,-0.061601,0.076571,5.53,159429,3091.0,3.571,416.879,3.3,24.7


In [39]:
# af_: split the frame into predictors (every column except target_return)
# and the response (target_return). df_all_feat itself is left untouched.
af_X = df_all_feat.drop(columns='target_return')
af_y = df_all_feat['target_return']

In [40]:
# sf_: split the frame into predictors (every column except target_return)
# and the response (target_return). df_sig_feat itself is left untouched.
sf_X = df_sig_feat.drop(columns='target_return')
sf_y = df_sig_feat['target_return']

### Train/Test Split

In [41]:
from sklearn.model_selection import train_test_split

In [42]:
# af_: hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable from run to run.
af_X_train, af_X_test, af_y_train, af_y_test = train_test_split(
    af_X,
    af_y,
    test_size=0.3,
    random_state=42,
)

In [43]:
# sf_: hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable from run to run.
sf_X_train, sf_X_test, sf_y_train, sf_y_test = train_test_split(
    sf_X,
    sf_y,
    test_size=0.3,
    random_state=42,
)

### Model / Regression Analysis Tool:
__model1:__ using all the columns

__model2:__ using _best features_


In [45]:
from sklearn.linear_model import LinearRegression

In [46]:
# Two independent, unfitted estimators with default settings:
# af_lm for the all-features data, sf_lm for the significant-features data.
af_lm, sf_lm = LinearRegression(), LinearRegression()

In [48]:
# Fit each model on its own training partition only; the test partitions are
# not passed to fit here.
af_lm.fit(X=af_X_train, y=af_y_train)

# Kept as the last expression so the cell output is still the fitted sf_lm.
sf_lm.fit(X=sf_X_train, y=sf_y_train)

In [107]:
# Report the fitted intercept of each model, one line per model.
# A single print with sep='\n' emits the same three lines as three prints.
print(
    'Intercept',
    'af_ data with all the features:            {}'.format(af_lm.intercept_),
    'sf_ data with only significant features:  {}'.format(sf_lm.intercept_),
    sep='\n',
)

Intercept
af_ data with all the features:            3.6676436106798143
sf_ data with only significant features:  -7.830813629934605


In [52]:
# One row per af_ predictor: the row label is the column name from af_X and the
# single column holds the coefficient fitted by af_lm.
af_coeff_df = pd.DataFrame(
    data=af_lm.coef_,
    index=af_X.columns,
    columns=['af_Coefficient'],
)
af_coeff_df

Unnamed: 0,af_Coefficient
current_price,-8.886334e-06
momentum,0.03244438
moving_average,8.923288e-06
moving_volatility,4.199535e-05
trading_range,-5.790419e-05
exp_market_change,-0.7091349
rates,-0.1500951
Revenue,-4.013521e-05
Cost_of_Sales,4.03237e-05
Gross_profit,4.028782e-05


In [53]:
# One row per sf_ predictor: the row label is the column name from sf_X and the
# single column holds the coefficient fitted by sf_lm.
sf_coeff_df = pd.DataFrame(
    data=sf_lm.coef_,
    index=sf_X.columns,
    columns=['sf_Coefficient'],
)
sf_coeff_df

Unnamed: 0,sf_Coefficient
exp_market_change,-0.755018
rates,-0.14655
Debt_M,0.000223
Debt_Per_Capita,-0.014213
CPI_perc,0.079574
GDP_Mill_dollars,0.021879
GDP_Growth_perc,0.284082
Unemployment_alt,0.273447


## Performance
$R^2$, F-tests, adjusted $R^2$, AIC, BIC

In [54]:
# In-sample predictions: each fitted model evaluated on its own training rows.
af_fit_lm, sf_fit_lm = af_lm.predict(af_X_train), sf_lm.predict(sf_X_train)

# Out-of-sample predictions: the same models evaluated on the held-out test rows.
af_fit_test_lm, sf_fit_test_lm = af_lm.predict(af_X_test), sf_lm.predict(sf_X_test)

In [55]:
from sklearn import metrics

In [78]:
# MSE (train): mean squared error between the training targets and the
# in-sample predictions of each model.
af_mse_train = metrics.mean_squared_error(y_true=af_y_train, y_pred=af_fit_lm)
sf_mse_train = metrics.mean_squared_error(y_true=sf_y_train, y_pred=sf_fit_lm)

# MSE (test): the same metric on the held-out targets and test predictions.
af_mse_test = metrics.mean_squared_error(y_true=af_y_test, y_pred=af_fit_test_lm)
sf_mse_test = metrics.mean_squared_error(y_true=sf_y_test, y_pred=sf_fit_test_lm)

In [93]:
# Side-by-side MSE table: header row, then train/test values for each model.
# A single print with sep='\n' emits the same three lines as three prints.
print(
    'MEAN SQUARED ERROR                       MSE (train)               MSE (test)',
    'af_ data with all the features:          {}      {}'.format(af_mse_train, af_mse_test),
    'sf_ data with only significant features: {}      {}'.format(sf_mse_train, sf_mse_test),
    sep='\n',
)

MEAN SQUARED ERROR                       MSE (train)               MSE (test)
af_ data with all the features:          0.013257051340309297      0.014914183378949565
sf_ data with only significant features: 0.013964740553623118      0.01400770930792062


In [97]:
# R^2 (train): score of each fitted model on the data it was trained on.
af_R2_train = af_lm.score(X=af_X_train, y=af_y_train)
sf_R2_train = sf_lm.score(X=sf_X_train, y=sf_y_train)

# R^2 (test): score of each fitted model on its held-out test partition.
af_R2_test = af_lm.score(X=af_X_test, y=af_y_test)
sf_R2_test = sf_lm.score(X=sf_X_test, y=sf_y_test)

In [104]:
# Side-by-side R^2 table: header row, then train/test values for each model.
# A single print with sep='\n' emits the same three lines as three prints.
print(
    'R SQUARED SCORE                          R^2(train)               R^2(test)',
    'af_ data with all the features:          {}      {}'.format(af_R2_train, af_R2_test),
    'sf_ data with only significant features: {}      {}'.format(sf_R2_train, sf_R2_test),
    sep='\n',
)

R SQUARED SCORE                          R^2(train)               R^2(test)
af_ data with all the features:          0.16884159444470093      0.08761309621561031
sf_ data with only significant features: 0.12447261501877714      0.14306736079132054


### Pros

### Cons