In [186]:
from linearmodels import PanelOLS
import pandas as pd
import numpy as np
import numpy.linalg as la
import quantecon as qe
import statsmodels as sm


In [187]:
%store -r digital

In [188]:
# Rebind `digital` to an independent copy of the frame restored by `%store -r`,
# so the column/index changes made in later cells operate on this copy.
digital = digital.copy()
# Summary statistics of the numeric columns (last expression, so it is displayed).
digital.describe()

Unnamed: 0,date,Access Electricity,Cellular %,Fixed broadband %,GDP pcp PPP,Landline %,Secure Servers,Urbanisation,internet_users
count,4760.0,4281.0,4706.0,2582.0,4609.0,4716.0,2544.0,4760.0,4112.0
mean,2003.603151,74.965174,47.593259,8.127566,13725.649134,18.386481,5035.287264,54.723774,22.301736
std,7.486815,32.99137,52.354724,11.018232,17127.331388,18.871348,31531.731324,24.149608,26.907292
min,1991.0,0.013999,0.0,0.0,242.001214,0.0,1.0,5.491,0.0
25%,1997.0,51.980549,0.814175,0.1602,2393.315832,2.25942,11.0,33.8025,1.059273
50%,2004.0,94.973044,24.868669,2.345964,7144.218057,11.498102,73.0,54.74,8.71
75%,2010.0,100.0,89.504912,12.675103,18921.2116,29.457233,622.0,73.93325,38.225
max,2016.0,100.0,332.090701,61.738849,140037.115597,110.191151,530309.0,100.0,98.32361


In [189]:
digital.head()

Unnamed: 0,country,date,Access Electricity,Cellular %,Fixed broadband %,GDP pcp PPP,Landline %,Secure Servers,Urbanisation,internet_users,Code,Region,cgroup
0,Afghanistan,2016,,66.003744,0.02689,1876.544682,0.348892,49.0,27.132,10.595726,AFG,South Asia,L
1,Afghanistan,2015,,61.577682,0.02208,1861.124332,0.343677,46.0,26.703,8.26,AFG,South Asia,L
2,Afghanistan,2014,89.5,58.845471,0.004795,1875.447407,0.325861,32.0,26.282,7.0,AFG,South Asia,L
3,Afghanistan,2013,75.154373,55.012226,0.00491,1877.411953,0.313466,30.0,25.871,5.9,AFG,South Asia,L
4,Afghanistan,2012,69.1,51.434547,0.005029,1873.153946,0.301822,33.0,25.468,5.454545,AFG,South Asia,L


In [190]:
# Add a year categorical variable for the pooled OLS analysis.
# It is built here, while 'date' is still an ordinary column; a later cell moves
# 'date' into the index and then attaches this array as the 'year' column.
year = pd.Categorical(digital['date'])




In [191]:
year

[2016, 2015, 2014, 2013, 2012, ..., 1995, 1994, 1993, 1992, 1991]
Length: 4760
Categories (26, int64): [1991, 1992, 1993, 1994, ..., 2013, 2014, 2015, 2016]

In [192]:
# Index the panel by (entity, time) = (country, date). The result is rebound
# rather than mutated in place, so the cell reads as "new frame from old frame".
digital = digital.set_index(['country', 'date'])
# `year` is a plain Categorical (no index), so it is assigned positionally;
# it was built from the same rows in the same order before the index changed.
digital['year'] = year


In [193]:
digital.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Access Electricity,Cellular %,Fixed broadband %,GDP pcp PPP,Landline %,Secure Servers,Urbanisation,internet_users,Code,Region,cgroup,year
country,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Afghanistan,2016,,66.003744,0.02689,1876.544682,0.348892,49.0,27.132,10.595726,AFG,South Asia,L,2016
Afghanistan,2015,,61.577682,0.02208,1861.124332,0.343677,46.0,26.703,8.26,AFG,South Asia,L,2015
Afghanistan,2014,89.5,58.845471,0.004795,1875.447407,0.325861,32.0,26.282,7.0,AFG,South Asia,L,2014
Afghanistan,2013,75.154373,55.012226,0.00491,1877.411953,0.313466,30.0,25.871,5.9,AFG,South Asia,L,2013
Afghanistan,2012,69.1,51.434547,0.005029,1873.153946,0.301822,33.0,25.468,5.454545,AFG,South Asia,L,2012


In [194]:
# For analysis: the regressions below need an explicit intercept, supplied here
# as a constant column of ones.
digital['intercept'] = 1
# Alternative: sm.add_constant
# NOTE(review): `sm` is bound to the top-level `statsmodels` package in the
# imports cell; add_constant is normally reached via `statsmodels.api` - confirm
# before switching to it.

In [195]:
digital.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Access Electricity,Cellular %,Fixed broadband %,GDP pcp PPP,Landline %,Secure Servers,Urbanisation,internet_users,Code,Region,cgroup,year,intercept
country,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Afghanistan,2016,,66.003744,0.02689,1876.544682,0.348892,49.0,27.132,10.595726,AFG,South Asia,L,2016,1
Afghanistan,2015,,61.577682,0.02208,1861.124332,0.343677,46.0,26.703,8.26,AFG,South Asia,L,2015,1
Afghanistan,2014,89.5,58.845471,0.004795,1875.447407,0.325861,32.0,26.282,7.0,AFG,South Asia,L,2014,1
Afghanistan,2013,75.154373,55.012226,0.00491,1877.411953,0.313466,30.0,25.871,5.9,AFG,South Asia,L,2013,1
Afghanistan,2012,69.1,51.434547,0.005029,1873.153946,0.301822,33.0,25.468,5.454545,AFG,South Asia,L,2012,1


In [196]:
# Candidate regressors: every column except the dependent variable
# ('internet_users') and the country code ('Code').
exog = digital.drop(labels=['internet_users', 'Code'], axis='columns')

In [197]:
# Including a dummy for every year makes the regressor matrix lose full column
# rank (cause still to be investigated), so 'year' is left out for now.
test_exog = exog.drop('year', axis = 1)
# Displayed last: only the final expression of a cell is rendered, so the
# preview was previously discarded when it sat at the top of the cell.
exog.head()

For comparison purposes, I will estimate the following models: 
1. Pooled OLS as baseline
2. Random effects 
3. The between estimator
4. Fixed effects (time or entity or both)
5. First difference
Then, I will compare all these different models.

In [198]:
#1 Pooled OLS
# Baseline: regress internet_users on test_exog (all candidate regressors except
# the 'year' categorical, including the explicit 'intercept' column).
from linearmodels.panel import PooledOLS
mod = PooledOLS(digital['internet_users'], test_exog)
# fit() is called with its defaults; the summary below reports the
# "Unadjusted" covariance estimator.
pooled_res = mod.fit()
# Last expression of the cell, so the results summary is rendered.
pooled_res


Inputs contain missing values. Dropping rows with missing observations.


0,1,2,3
Dep. Variable:,internet_users,R-squared:,0.8790
Estimator:,PooledOLS,R-squared (Between):,0.9093
No. Observations:,1903,R-squared (Within):,0.7542
Date:,"Mon, Oct 09 2017",R-squared (Overall):,0.8790
Time:,22:15:24,Log-likelihood,-6965.9
Cov. Estimator:,Unadjusted,,
,,F-statistic:,856.00
Entities:,185,P-value,0.0000
Avg Obs:,10.286,Distribution:,"F(16,1886)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Access Electricity,0.0889,0.0178,5.0083,0.0000,0.0541,0.1238
Cellular %,0.0963,0.0072,13.419,0.0000,0.0822,0.1104
Fixed broadband %,1.4045,0.0369,38.042,0.0000,1.3320,1.4769
GDP pcp PPP,7.445e-05,2.04e-05,3.6503,0.0003,3.445e-05,0.0001
Landline %,-0.0437,0.0245,-1.7798,0.0753,-0.0918,0.0045
Secure Servers,1.842e-05,8.777e-06,2.0985,0.0360,1.205e-06,3.563e-05
Urbanisation,-0.0091,0.0151,-0.6027,0.5468,-0.0387,0.0205
Region.Europe & Central Asia,1.6454,0.7594,2.1667,0.0304,0.1560,3.1347
Region.Latin America & Caribbean,-1.0971,0.7984,-1.3741,0.1696,-2.6629,0.4688


In [265]:
#2 Random Effects
# Same dependent variable and regressors as the pooled model, so the two sets
# of results are directly comparable.
from linearmodels.panel import RandomEffects
mod = RandomEffects(digital['internet_users'], test_exog)
# Default fit; re_res is reused later for the variance decomposition, the
# model comparison table and the Hausman test.
re_res = mod.fit()

Inputs contain missing values. Dropping rows with missing observations.


In [200]:
re_res

0,1,2,3
Dep. Variable:,internet_users,R-squared:,0.8010
Estimator:,RandomEffects,R-squared (Between):,0.8817
No. Observations:,1903,R-squared (Within):,0.7860
Date:,"Mon, Oct 09 2017",R-squared (Overall):,0.8650
Time:,22:15:25,Log-likelihood,-6211.1
Cov. Estimator:,Unadjusted,,
,,F-statistic:,474.41
Entities:,185,P-value,0.0000
Avg Obs:,10.286,Distribution:,"F(16,1886)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Access Electricity,0.0965,0.0315,3.0674,0.0022,0.0348,0.1582
Cellular %,0.1645,0.0063,25.912,0.0000,0.1520,0.1769
Fixed broadband %,1.2738,0.0346,36.843,0.0000,1.2060,1.3416
GDP pcp PPP,-3.968e-05,3.144e-05,-1.2620,0.2071,-0.0001,2.199e-05
Landline %,0.0255,0.0328,0.7769,0.4373,-0.0388,0.0897
Secure Servers,-3.689e-05,1.085e-05,-3.4004,0.0007,-5.817e-05,-1.561e-05
Urbanisation,0.0393,0.0332,1.1830,0.2370,-0.0258,0.1044
Region.Europe & Central Asia,-0.2125,1.7951,-0.1184,0.9058,-3.7331,3.3081
Region.Latin America & Caribbean,-3.8845,1.8535,-2.0958,0.0362,-7.5196,-0.2493


In [201]:
re_res.variance_decomposition

Effects                   42.856798
Residual                  38.315311
Percent due to Effects     0.527974
Name: Variance Decomposition, dtype: float64

In [202]:
re_res.theta.head()

Unnamed: 0_level_0,theta
entity,Unnamed: 1_level_1
Afghanistan,0.610534
Albania,0.6994
Algeria,0.736681
Angola,0.6994
Antigua and Barbuda,0.6994


In [203]:
#3 Between Estimator
# Same dependent variable and regressors as the pooled and random-effects
# models; the summary below reports one observation per entity (185).
from linearmodels.panel import BetweenOLS
mod = BetweenOLS(digital['internet_users'], test_exog)
be_res = mod.fit()

Inputs contain missing values. Dropping rows with missing observations.


In [204]:
be_res

0,1,2,3
Dep. Variable:,internet_users,R-squared:,0.9220
Estimator:,BetweenOLS,R-squared (Between):,0.9220
No. Observations:,185,R-squared (Within):,0.6359
Date:,"Mon, Oct 09 2017",R-squared (Overall):,0.8599
Time:,22:15:27,Log-likelihood,-611.86
Cov. Estimator:,Unadjusted,,
,,F-statistic:,124.11
Entities:,185,P-value,0.0000
Avg Obs:,10.286,Distribution:,"F(16,168)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Access Electricity,0.1314,0.0425,3.0870,0.0024,0.0474,0.2154
Cellular %,-0.0126,0.0247,-0.5088,0.6116,-0.0614,0.0363
Fixed broadband %,1.6787,0.1770,9.4815,0.0000,1.3292,2.0282
GDP pcp PPP,0.0001,6.001e-05,2.1342,0.0343,9.599e-06,0.0002
Landline %,-0.1443,0.0886,-1.6281,0.1054,-0.3193,0.0307
Secure Servers,6.63e-05,2.552e-05,2.5980,0.0102,1.592e-05,0.0001
Urbanisation,-0.0211,0.0368,-0.5753,0.5659,-0.0937,0.0514
Region.Europe & Central Asia,3.1860,1.8720,1.7019,0.0906,-0.5096,6.8816
Region.Latin America & Caribbean,0.8648,1.8971,0.4559,0.6491,-2.8804,4.6100


In [205]:
#4 Panel with time effects
# PanelOLS is already imported from `linearmodels` in the first cell; this
# re-import binds the same name again.
from linearmodels.panel import PanelOLS
# time_effects=True adds time effects; the regressors (test_exog) still
# contain Region and the explicit intercept.
mod = PanelOLS(digital['internet_users'], test_exog, time_effects = True)
fet_res = mod.fit()

Inputs contain missing values. Dropping rows with missing observations.


In [206]:
fet_res

0,1,2,3
Dep. Variable:,internet_users,R-squared:,0.8804
Estimator:,PanelOLS,R-squared (Between):,0.9118
No. Observations:,1903,R-squared (Within):,0.6973
Date:,"Mon, Oct 09 2017",R-squared (Overall):,0.8672
Time:,22:15:30,Log-likelihood,-6879.3
Cov. Estimator:,Unadjusted,,
,,F-statistic:,861.97
Entities:,185,P-value,0.0000
Avg Obs:,10.286,Distribution:,"F(16,1874)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Access Electricity,0.1037,0.0172,6.0389,0.0000,0.0700,0.1373
Cellular %,0.0430,0.0087,4.9551,0.0000,0.0260,0.0601
Fixed broadband %,1.1888,0.0418,28.433,0.0000,1.1068,1.2707
GDP pcp PPP,9.806e-05,1.971e-05,4.9758,0.0000,5.941e-05,0.0001
Landline %,0.0799,0.0262,3.0523,0.0023,0.0286,0.1313
Secure Servers,1.112e-05,8.444e-06,1.3168,0.1881,-5.441e-06,2.768e-05
Urbanisation,0.0176,0.0147,1.1939,0.2327,-0.0113,0.0465
Region.Europe & Central Asia,2.0012,0.7288,2.7457,0.0061,0.5718,3.4305
Region.Latin America & Caribbean,-1.0431,0.7661,-1.3617,0.1735,-2.5455,0.4593


Panel with entity effects does not run, because at least one regressor is fully absorbed by the entity effects (it does not vary within a country over time). Most likely culprit: 
Region

In [207]:
# Regressors for the entity-effects models: test_exog without 'Region' and
# without the explicit 'intercept' column.
fe_exog = test_exog.drop(labels=['Region', 'intercept'], axis='columns')

In [208]:
#4. Fixed effect without region
# Entity (country) effects with the reduced regressor set fe_exog
# (no Region, no intercept).
mod = PanelOLS(digital['internet_users'], fe_exog, entity_effects = True)
# Unlike the pooled / random-effects / between fits above, this one requests a
# robust covariance estimator; keep that in mind when comparing standard errors.
fe_res = mod.fit(cov_type ='robust')

Inputs contain missing values. Dropping rows with missing observations.


In [209]:
fe_res

0,1,2,3
Dep. Variable:,internet_users,R-squared:,0.7897
Estimator:,PanelOLS,R-squared (Between):,0.7713
No. Observations:,1903,R-squared (Within):,0.7897
Date:,"Mon, Oct 09 2017",R-squared (Overall):,0.8050
Time:,22:15:40,Log-likelihood,-6063.1
Cov. Estimator:,Robust,,
,,F-statistic:,641.26
Entities:,185,P-value,0.0000
Avg Obs:,10.286,Distribution:,"F(10,1708)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Access Electricity,0.1076,0.0556,1.9337,0.0533,-0.0015,0.2166
Cellular %,0.1755,0.0088,20.054,0.0000,0.1584,0.1927
Fixed broadband %,1.3026,0.0543,24.002,0.0000,1.1962,1.4091
GDP pcp PPP,-0.0002,6.746e-05,-2.5260,0.0116,-0.0003,-3.809e-05
Landline %,-0.0281,0.0566,-0.4970,0.6193,-0.1391,0.0828
Secure Servers,-6.46e-05,9.308e-06,-6.9407,0.0000,-8.286e-05,-4.635e-05
Urbanisation,0.3642,0.1331,2.7353,0.0063,0.1030,0.6253
cgroup.L,-5.8763,1.5699,-3.7432,0.0002,-8.9554,-2.7973
cgroup.LM,-7.7209,1.3243,-5.8300,0.0000,-10.318,-5.1234


In a later analysis: look at the different options: https://bashtage.github.io/linearmodels/doc/panel/models.html#linearmodels.panel.model.PanelOLS

In [210]:
#4. Fixed effect time and entity (might need to exclude cgroup)
# Two-way specification: both entity and time effects, same regressors
# (fe_exog) as the entity-only model above.
mod = PanelOLS(digital['internet_users'], fe_exog, entity_effects = True, time_effects = True)
# Robust covariance, consistent with the entity-effects fit (fe_res).
fetm_res = mod.fit(cov_type ='robust')

Inputs contain missing values. Dropping rows with missing observations.


In [211]:
fetm_res

0,1,2,3
Dep. Variable:,internet_users,R-squared:,0.2946
Estimator:,PanelOLS,R-squared (Between):,-2.2727
No. Observations:,1903,R-squared (Within):,0.5135
Date:,"Mon, Oct 09 2017",R-squared (Overall):,-1.9309
Time:,22:16:01,Log-likelihood,-5807.3
Cov. Estimator:,Robust,,
,,F-statistic:,70.840
Entities:,185,P-value,0.0000
Avg Obs:,10.286,Distribution:,"F(10,1696)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Access Electricity,-0.3005,0.0555,-5.4157,0.0000,-0.4094,-0.1917
Cellular %,0.0794,0.0103,7.7223,0.0000,0.0592,0.0996
Fixed broadband %,0.7911,0.0644,12.275,0.0000,0.6647,0.9175
GDP pcp PPP,-0.0002,5.436e-05,-3.2709,0.0011,-0.0003,-7.119e-05
Landline %,0.1249,0.0494,2.5281,0.0116,0.0280,0.2217
Secure Servers,-7.884e-05,1.241e-05,-6.3502,0.0000,-0.0001,-5.449e-05
Urbanisation,-0.3182,0.1220,-2.6088,0.0092,-0.5574,-0.0790
cgroup.L,-1.6486,1.3883,-1.1875,0.2352,-4.3716,1.0744
cgroup.LM,-5.6983,1.1422,-4.9888,0.0000,-7.9386,-3.4580


In [266]:
#5. First difference
from linearmodels.panel import FirstDifferenceOLS
# Need to exclude time-invariant variables, in particular Region; the constant
# column is dropped as well because it differences to zero.
# `fd_exog` was not defined anywhere in the notebook (it only existed as
# leftover kernel state), so the cell failed on a fresh Restart & Run All.
# It is built here with the same columns as fe_exog, which matches the
# regressors listed in the first-difference results table.
fd_exog = test_exog.drop(['Region', 'intercept'], axis = 1)
mod = FirstDifferenceOLS(digital['internet_users'], fd_exog)
fd_res = mod.fit()

Inputs contain missing values. Dropping rows with missing observations.


In [267]:
fd_res

0,1,2,3
Dep. Variable:,internet_users,R-squared:,0.5345
Estimator:,FirstDifferenceOLS,R-squared (Between):,-1.3204
No. Observations:,1660,R-squared (Within):,0.7646
Date:,"Mon, Oct 09 2017",R-squared (Overall):,-0.9229
Time:,22:53:51,Log-likelihood,-5007.9
Cov. Estimator:,Unadjusted,,
,,F-statistic:,189.44
Entities:,185,P-value,0.0000
Avg Obs:,10.286,Distribution:,"F(10,1650)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Access Electricity,0.1442,0.0614,2.3493,0.0189,0.0238,0.2647
Cellular %,0.1079,0.0084,12.816,0.0000,0.0914,0.1244
Fixed broadband %,1.2324,0.0522,23.614,0.0000,1.1300,1.3347
GDP pcp PPP,4.912e-05,5.113e-05,0.9606,0.3369,-5.117e-05,0.0001
Landline %,-0.1178,0.0557,-2.1155,0.0345,-0.2269,-0.0086
Secure Servers,-4.342e-05,1.576e-05,-2.7545,0.0059,-7.434e-05,-1.25e-05
Urbanisation,1.0168,0.1907,5.3313,0.0000,0.6427,1.3909
cgroup.L,-1.0900,1.4686,-0.7423,0.4580,-3.9705,1.7904
cgroup.LM,-1.0664,1.1116,-0.9594,0.3375,-3.2468,1.1139


In [214]:
# Side-by-side summary of the between, random-effects, entity fixed-effects and
# pooled results. Note that fe_res was fitted with cov_type='robust' while the
# other three use the default covariance, as the "Cov. Est." row shows.
from linearmodels.panel import compare
compare({'BE':be_res,'RE':re_res,'FE': fe_res, 'Pooled':pooled_res})

0,1,2,3,4
,BE,FE,Pooled,RE
Dep. Variable,internet_users,internet_users,internet_users,internet_users
Estimator,BetweenOLS,PanelOLS,PooledOLS,RandomEffects
No. Observations,185,1903,1903,1903
Cov. Est.,Unadjusted,Robust,Unadjusted,Unadjusted
R-squared,0.9220,0.7897,0.8790,0.8010
R-Squared (Within),0.6359,0.7897,0.7542,0.7860
R-Squared (Between),0.9220,0.7713,0.9093,0.8817
R-Squared (Overall),0.8599,0.8050,0.8790,0.8650
F-statistic,124.11,641.26,856.00,474.41


Is random or fixed effects more appropriate?

In [258]:
#quick and dirty fix: 
import statsmodels.formula.api as sm
from scipy import stats
def hausman(fe, reparams, recov):
    r"""
    Compute the Hausman test for fixed effects vs. random effects models.

    b = beta_fe
    B = beta_re

    From theory we have that b is always consistent, but B is consistent
    only under the null hypothesis (and efficient under it).

    The test statistic is computed as

    z = (b - B)' [V_b - V_B]^{-1} (b - B)

    The statistic is distributed z \sim \chi^2(k), where k is the number
    of regressors in the model.

    Parameters
    ==========
    fe : results object
        Fixed effects results exposing ``params`` (coefficient vector) and
        ``cov`` (coefficient covariance matrix).

    reparams : array-like
        Random effects coefficients, restricted to the same coefficients
        (and order) as ``fe.params``.

    recov : array-like
        Random effects coefficient covariance matrix, restricted to the
        same coefficients (and order) as ``fe.cov``.

    Returns
    =======
    chi2 : float
        The test statistic

    df : int
        The number of degrees of freedom for the distribution of the
        test statistic

    pval : float
        The p-value associated with the null hypothesis

    Notes
    =====
    The null hypothesis supports the claim that the random effects
    estimator is "better". If we reject this hypothesis it is the same
    as saying we should be using fixed effects because there are
    systematic differences in the coefficients.

    The classical form of the test is derived for conventional
    (non-robust) covariance estimates; mixing a robust V_b with a
    conventional V_B should be interpreted with care.

    """

    # Pull data out
    b = fe.params
    B = reparams
    v_b = fe.cov
    v_B = recov

    # NOTE: find df. fe should toss time-invariant variables, but it
    #       doesn't. It does return garbage so we use that to filter
    df = b[np.abs(b) < 1e8].size

    # Coefficient and covariance differences, each computed once.
    coef_diff = b - B
    cov_diff = v_b - v_B

    # compute test statistic and associated p-value
    chi2 = np.dot(coef_diff.T, la.inv(cov_diff).dot(coef_diff))
    pval = stats.chi2.sf(chi2, df)

    return chi2, df, pval

#Code adapted from Econtools


In [232]:
# Keep only the random-effects coefficients that also appear in the
# fixed-effects results, so both vectors cover the same regressors.
re_res_params = re_res.params.drop(
    labels=re_res.params.index.difference(fe_res.params.index)
)

In [247]:
# Restrict the random-effects covariance matrix to the coefficients shared
# with the fixed-effects results: drop the extra labels from the rows first,
# then the same labels from the columns.
re_res_cov = (
    re_res.cov
    .drop(re_res.cov.index.difference(fe_res.cov.index), axis=0)
    .drop(re_res.cov.index.difference(fe_res.cov.index), axis=1)
)

In [268]:
hausman(fe_res, re_res_params, re_res_cov)

(79.161497234698388, 10, 7.328261259649491e-13)

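The test statistic is large (about 79.2 with 10 degrees of freedom, p-value of roughly 7e-13), so we reject the null hypothesis at any reasonable confidence level. Because the null is that the random-effects estimator is consistent, rejecting it means there are systematic differences between the two sets of coefficients and the fixed-effects model is preferred. This holds even though the random-effects specification includes region dummies and the high/low income groups, which are likely to absorb much of the variation across entities.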