# OLS Estimation of Model

In [2]:
import pandas as pd
import numpy as np
import gme as gme
#loading the file and inspect data
url = 'https://www.dropbox.com/s/2uha8rwc8bngcsz/servicesdataset%202.xlsx?dl=1'
df = pd.read_excel(url)

#drop null values from relevant columns in df 
trade_data = df[['exp','imp', 'trade', 'year','gdp_exp', 'gdp_imp', 'contig','comlang_off','distw','ent_cost_imp', 'ent_cost_exp', 'colony']]
trade_data = trade_data.dropna()


#include the accessibility column
trade_data['bilateral accessibility'] = np.exp(-np.log(trade_data['distw']))


#create EU dummy column
EU = ['AUT','BEL','CYP','CZE','DNK','EST','FIN','FRA','DEU','GRC','HUN','IRL','ITA','LVA','LTU','LUX','MLT','NLD','POL','SVK','SVN','ESP','SWE','GBR']
imp_EU = {}
exp_EU = {}
imp_countries = trade_data['imp'].tolist()
exp_countries = trade_data['exp'].tolist()
for country in imp_countries:
    if country in EU:
        imp_EU[country] = 1
        exp_EU[country] = 1
    else:
        imp_EU[country] = 0
        exp_EU[country] = 0
trade_data['imp_is_EU'] = trade_data['imp'].map(imp_EU)
trade_data['exp_is_EU'] = trade_data['exp'].map(exp_EU)
trade_data['between_EU'] = trade_data['imp_is_EU']*trade_data['exp_is_EU']
        
#include log GDP and log distance  
trade_data['log_gdp_exp'] = np.log(trade_data['gdp_exp'])
trade_data['log_gdp_imp'] = np.log(trade_data['gdp_imp'])
trade_data['log_distance'] = np.log(trade_data['distw'])

#create new dataframe with non-zero trade column
non_zero_trade_data = trade_data[trade_data['trade'] != 0]
non_zero_trade_data['log_trade'] = np.log(non_zero_trade_data['trade'])

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## Run the regression model

In [17]:
#OLS
import statsmodels.api as sm
import statsmodels.formula.api as smf
X = non_zero_trade_data[['log_gdp_exp',
               'log_gdp_imp',
               'log_distance',
               'between_EU',
               'comlang_off',
               'contig'
               ]]
    
Y = non_zero_trade_data['log_trade']
X = sm.add_constant(X) # adding a constant

model = sm.OLS(Y, X).fit()

print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:              log_trade   R-squared:                       0.317
Model:                            OLS   Adj. R-squared:                  0.317
Method:                 Least Squares   F-statistic:                     813.6
Date:                Fri, 26 Feb 2021   Prob (F-statistic):               0.00
Time:                        23:03:03   Log-Likelihood:                -22482.
No. Observations:               10525   AIC:                         4.498e+04
Df Residuals:                   10518   BIC:                         4.503e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const          -19.8806      0.450    -44.164   

### The result shows: 
* One percent point increase in exporter GDP leads to on average 0.48 percentage point increase in trade between two countries
* One percentage point increase in distance leads to on average 0.55 percentage point decrease in trade between two countries
* Being both in the EU increases trade between two countries on average 1.55 percentage points

Confounders that may create selection bias between trade and distance, such as common language and border, are controlled in this regression. 

However, confounders that may confound trade and exporter's GDP, such as value of services transacted, are not controlled. This leads to a positive selection bias. Countries that provide more valuable services such as professional services tend to have higher GDP, and if very few countries provide such service, they also trade more. 

### This motivates us to run a 2SLS regression using whether the two countries were in colonial relationship as IV

In [20]:
from statsmodels.sandbox.regression.gmm import IV2SLS  

Instrument = non_zero_trade_data[['colony']]
end = non_zero_trade_data[['log_trade']]
exo = non_zero_trade_data[['log_gdp_exp']]

resultIV = IV2SLS(end, exo, Instrument).fit()
print(resultIV.summary())

                          IV2SLS Regression Results                           
Dep. Variable:              log_trade   R-squared:                       0.267
Model:                         IV2SLS   Adj. R-squared:                  0.267
Method:                     Two Stage   F-statistic:                       nan
                        Least Squares   Prob (F-statistic):                nan
Date:                Fri, 26 Feb 2021                                         
Time:                        23:37:44                                         
No. Observations:               10525                                         
Df Residuals:                   10524                                         
Df Model:                           1                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
log_gdp_exp     0.1495      0.005     30.492      

#### The 2SLS regression reveals that, in fact, a one percentage point increase in exporter's GDP only leads to on average 0.15 percentage point increase in trade. 

### To test the suitability of IV, it needs to satisfy three assumptions: 
* Instrument relevance, where correlation between colony and log_gdp_exp is non-zero
* colony only affects trade through log_gdp_exp
* Instrument is as good as randomly assigned. 

### Assumption 1 can be tested by regressing colony on log_gdp_exp: 

In [5]:
#colony and gdp_ex
import statsmodels.api as sm
import statsmodels.formula.api as smf
model_ce = sm.OLS(non_zero_trade_data['log_gdp_exp'], non_zero_trade_data['colony']).fit()
print(model_ce.summary())

                            OLS Regression Results                            
Dep. Variable:            log_gdp_exp   R-squared:                       0.049
Model:                            OLS   Adj. R-squared:                  0.049
Method:                 Least Squares   F-statistic:                     538.1
Date:                Sat, 27 Feb 2021   Prob (F-statistic):          3.81e-116
Time:                        09:55:02   Log-Likelihood:                -49015.
No. Observations:               10525   AIC:                         9.803e+04
Df Residuals:                   10524   BIC:                         9.804e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
colony        26.0774      1.124     23.198      0.0

## Compare the original dataset and dataset for OLS regression

In [5]:
print(trade_data.shape)
print(non_zero_trade_data.shape)

(25069, 19)
(10525, 20)


#### OLS or 2SLS regression forces us to use a much smaller dataset, where only countries that trade with each other are included. This leads to sample selection bias, where characteristics of countries that trade more are inflated.

For example, countries that do not trade could also be more distant, as such, if they are omitted, we will underestimate the negative effect of distance on trade

## Solution: PPML Estimation

In [21]:
#data for estimation
gme_data = gme.EstimationData(data_frame=trade_data,
                              imp_var_name='imp',
                              exp_var_name='exp',
                              trade_var_name='trade',
                              year_var_name='year')

#basic regression data
model_basic = gme.EstimationModel(estimation_data = gme_data,
                                         lhs_var = 'trade',
                                         rhs_var = ['log_gdp_exp',
                                                    'log_gdp_imp',
                                                    'log_distance',
                                                    'comlang_off',
                                                    'contig'
                                                    ])
basic_estimates = model_basic.estimate()
# Return the result object and save it to a new variable for convenience 
results = basic_estimates['all']
print(results.summary())

select specification variables: ['log_gdp_exp', 'log_gdp_imp', 'log_distance', 'comlang_off', 'contig', 'trade', 'imp', 'exp', 'year'], Observations excluded by user: {'rows': 0, 'columns': 10}
drop_intratrade: no, Observations excluded by user: {'rows': 0, 'columns': 0}
drop_imp: none, Observations excluded by user: {'rows': 0, 'columns': 0}
drop_exp: none, Observations excluded by user: {'rows': 0, 'columns': 0}
keep_imp: all available, Observations excluded by user: {'rows': 0, 'columns': 0}
keep_exp: all available, Observations excluded by user: {'rows': 0, 'columns': 0}
drop_years: none, Observations excluded by user: {'rows': 0, 'columns': 0}
keep_years: all available, Observations excluded by user: {'rows': 0, 'columns': 0}
drop_missing: yes, Observations excluded by user: {'rows': 0, 'columns': 0}
Estimation began at 12:44 AM  on Feb 27, 2021
Omitted Columns: []
Estimation completed at 12:44 AM  on Feb 27, 2021
                 Generalized Linear Model Regression Results       

### As you can see, the coefficient on log_gdp is quite similar to that of the 2SLS regression, further confirming our worries for OVBs.