# Solution Assignment

## 0. Load needed libraries and data

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import statsmodels.api as sm
from statsmodels.iolib.summary2 import summary_col

In [2]:
df = pd.read_stata('https://github.com/QuantEcon/lecture-source-py/blob/master/source/_static/lecture_specific/ols/maketable2.dta?raw=true')
df['const'] = 1 # Adding a constant vector to a dataset.

In [3]:
df.head(10)

Unnamed: 0,shortnam,africa,lat_abst,avexpr,logpgp95,other,asia,loghjypl,baseco,const
0,AFG,0.0,0.366667,,,0.0,1.0,,,1
1,AGO,1.0,0.136667,5.363636,7.770645,0.0,0.0,-3.411248,1.0,1
2,ARE,0.0,0.266667,7.181818,9.804219,0.0,1.0,,,1
3,ARG,0.0,0.377778,6.386364,9.133459,0.0,0.0,-0.872274,1.0,1
4,ARM,0.0,0.444444,,7.682482,0.0,1.0,,,1
5,AUS,0.0,0.3,9.318182,9.897972,1.0,0.0,-0.170788,1.0,1
6,AUT,0.0,0.524444,9.727273,9.974877,0.0,0.0,-0.3439,,1
7,AZE,0.0,0.447778,,7.306531,0.0,1.0,,,1
8,BDI,1.0,0.036667,,6.565265,0.0,0.0,-3.506558,,1
9,BEL,0.0,0.561111,9.681818,9.992871,0.0,0.0,-0.179127,,1


In [4]:
df.loc[0:1,['africa','asia',]]

Unnamed: 0,africa,asia
0,0.0,1.0
1,1.0,0.0


In [5]:
df.iloc[0:2,[1,6]]

Unnamed: 0,africa,asia
0,0.0,1.0
1,1.0,0.0


## 1.Defining three regression models and estimate them

In [6]:
# Define three sets of features (independant variables)
df=df[['logpgp95','const', 'avexpr', 'lat_abst', 'asia', 'africa', 'other']].dropna().reset_index()
X1 = df[['const', 'avexpr']]
X2 = df[['const', 'avexpr', 'lat_abst']]
X3 = df[['const', 'avexpr', 'lat_abst', 'asia', 'africa', 'other']]
Y = df['logpgp95']
# Regress and drop observations if there are missing values.
reg1 = sm.OLS(Y, X1).fit()
reg2 = sm.OLS(Y, X2).fit()
reg3 = sm.OLS(Y, X3).fit()

In [7]:
info_dict={'R-squared Adj' : lambda x: f"{x.rsquared:.2f}",
           'No. observations' : lambda x: f"{int(x.nobs):d}"}

results_table = summary_col(results=[reg1,reg2,reg3],
                            float_format='%0.2f',
                            stars = True,
                            model_names=['Model 1',
                                         'Model 2',
                                         'Model 3'],
                            info_dict=info_dict,
                            regressor_order=['const',
                                             'avexpr',
                                             'lat_abst',
                                             'asia', 'africa', 'other'
                                                    ])

results_table.add_title('Table 1 - OLS Regressions')

print(results_table)

        Table 1 - OLS Regressions
                 Model 1 Model 2 Model 3 
-----------------------------------------
const            4.63*** 4.87*** 5.85*** 
                 (0.30)  (0.33)  (0.34)  
avexpr           0.53*** 0.46*** 0.39*** 
                 (0.04)  (0.06)  (0.05)  
lat_abst                 0.87*   0.33    
                         (0.49)  (0.45)  
asia                             -0.15   
                                 (0.15)  
africa                           -0.92***
                                 (0.17)  
other                            0.30    
                                 (0.37)  
R-squared        0.61    0.62    0.70    
                 0.61    0.62    0.72    
R-squared Adj    0.61    0.62    0.72    
No. observations 111     111     111     
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01


## 2. Splitting into 5-folds cross validation with Mean Absolute Error (MAE) Metrics.

In [8]:
from sklearn.model_selection import KFold 

In [9]:
k = 5 # Let in be 5 folds
kf = KFold(n_splits=k)

In [10]:
for X,Y in zip([X1,X2,X3],[Y,Y,Y]):
    err = 0
    for train,test in kf.split(X):
            reg = sm.OLS(Y.loc[train], X.loc[train]).fit()
            y_pred =reg.predict(X.loc[test])
            e = Y.loc[test]-y_pred
            err += np.sum(np.absolute(e))     
    MAE_5 = err/(len(Y))
    print('MAE on 5-fold CV: {}'.format(MAE_5))

MAE on 5-fold CV: 0.5428409582409092
MAE on 5-fold CV: 0.559264498873126
MAE on 5-fold CV: 0.4891565491239306


In [11]:
for X,Y in zip([X1,X2,X3],[Y,Y,Y]):
    err = 0
    for train,test in kf.split(X):
            reg = sm.OLS(Y.loc[train], X.loc[train]).fit()
            y_pred =reg.predict(X.loc[test])
            e = Y.loc[test]-y_pred
            err += np.sum(np.absolute(e))/len(Y.loc[test])  # Denominator is the number of out-sample observations 
    MAE_5 = err/k # k is the number of cross validatio folds. Thus, len(Y.loc[test])*k = number of observations
    print('MAE on 5-fold CV: {}'.format(MAE_5))

MAE on 5-fold CV: 0.543164089507955
MAE on 5-fold CV: 0.559550302371463
MAE on 5-fold CV: 0.48932017925615046


## 3. Conclusions on MAE comparing to R-sq

From MAE, model 3 yielded the lowest MAE, the highest accuracy for out-sample forcasting among peers. With the same direction to R-sq, model 3 gave the best in-sample predictive ability due to its highest R-sq value. In conclusion, getting more variables might help improving both in-sample and out-sample accuracy, but further investigation may be required.