In [1]:
import pandas as pd

In [28]:
# Load the raw dyadic (country-pair) time-invariant variables.
# NOTE(review): the frame comes back with an 'Unnamed: 0' column, which looks like
# a leftover saved index; it is carried into the cleaned CSVs written later in this
# notebook -- confirm whether it should be dropped (e.g. with index_col=0).
common = pd.read_csv('../data/raw/fixed_dyadic_variables.csv')

In [29]:
# (rows, columns) of the raw dyadic table as loaded.
common.shape

(50176, 7)

In [31]:
# Drop self-loops: keep only the dyads whose origin and destination codes differ.
common_unique = common.loc[common['iso_o'].ne(common['iso_d'])]

In [32]:
# Dropping the self-loops removed 224 rows (50176 -> 49952); the column count is unchanged.
common_unique.shape 

(49952, 7)

In [33]:
# Preview the first rows of the dyad table after the self-loops were removed.
common_unique.head()

Unnamed: 0,iso_o,iso_d,contig,comlang_off,colony,comcol,dist
1,ABW,AFG,0,0,0,0,13257.814
2,ABW,AGO,0,0,0,0,9516.9131
3,ABW,AIA,0,0,0,0,983.26825
4,ABW,ALB,0,0,0,0,9091.7422
5,ABW,AND,0,1,0,0,7572.7881


In [8]:
# Persist the cleaned dyads; index=False keeps the DataFrame index out of the file.
common_unique.to_csv("../data/cleaned/dyads_fixed.csv", encoding='utf-8', index=False)

### Node-level fixed variables

In [9]:
# Load the raw country-level (nodal) time-invariant variables.
# NOTE(review): the file is read as latin1 -- presumably it contains characters
# that are not valid UTF-8; confirm against the raw file.
node = pd.read_csv('../data/raw/fixed_nodal_variables.csv', encoding='latin1')

In [10]:
# (rows, columns) of the raw nodal table as loaded.
node.shape

(238, 9)

In [11]:
# Align the country-code column with the dyadic table, where the exporter
# (origin) code is called 'iso_o'.
node = node.rename({'iso3': 'iso_o'}, axis='columns')

In [12]:
# List the column labels to confirm the rename took effect.
node.columns

Index(['iso2', 'iso_o', 'country', 'landlocked', 'continent', 'lat', 'lon',
       'langoff_1', 'colonizer1'],
      dtype='object')

In [21]:
# Flag duplicated country records (they are dropped in the next step).
# Rows are compared on every column except 'lat' and 'lon'; within each group
# of matching rows only the last one is left unflagged.
equivalent_columns = [
    'iso2', 'iso_o', 'country', 'landlocked',
    'continent', 'langoff_1', 'colonizer1',
]
duplicates = node.duplicated(subset=equivalent_columns, keep='last')
node.loc[duplicates].shape

(13, 9)

In [22]:
# Keep only the rows that were not flagged as duplicates.
node_unique = node.loc[~duplicates]
node_unique.shape

(225, 9)

In [20]:
# Number of distinct country codes; it matches the row count of node_unique,
# so every code appears exactly once after de-duplication.
node_unique['iso_o'].nunique()

225

In [24]:
# Country codes present in the nodal data but absent from the origin column
# of the dyadic data.
difference = set(node_unique['iso_o']) - set(common_unique['iso_o'])

In [25]:
# Show the unmatched country code(s).
difference

{'ATF'}

In [26]:
# Drop every country that has no counterpart among the dyad origins. The codes
# are taken from the `difference` set computed earlier (currently just 'ATF')
# instead of a hard-coded literal, so the filter stays correct if the raw files change.
node_unique_matches = node_unique[~node_unique['iso_o'].isin(difference)]

In [34]:
# (rows, columns) of the nodal table after removing the unmatched country.
node_unique_matches.shape

(224, 9)

In [36]:
# Persist the cleaned nodal table; index=False keeps the DataFrame index out of the file.
node_unique_matches.to_csv("../data/cleaned/nodal_fixed.csv", encoding='utf-8', index=False)

### Merging the two datasets with time-invariant factors

In [37]:
# Attach the origin country's nodal attributes to every dyad. The join key is
# 'iso_o' only, so destination-country attributes are not added here.
# validate='many_to_one' makes pandas raise a MergeError if a country code is
# repeated in the nodal table, which would otherwise silently duplicate dyads.
fixed = common_unique.merge(node_unique_matches, on='iso_o', validate='many_to_one')

In [55]:
# The merge must neither drop nor duplicate dyads.
assert len(fixed) == len(common_unique)

In [56]:
# The dyads, the merged table and the nodal table should all contain the same
# number of distinct origin codes.
common_unique['iso_o'].nunique() == fixed['iso_o'].nunique() == node_unique_matches['iso_o'].nunique()

True

In [57]:
# (rows, columns) of the merged table: the dyad columns plus the nodal columns,
# with the shared 'iso_o' key counted once.
fixed.shape

(49952, 15)

In [58]:
# This dataset of time-invariant variables will be used both in forming networks
# and in the gravity model estimation.
# index=False keeps the DataFrame index out of the written file.
fixed.to_csv("../data/cleaned/fixed.csv", encoding='utf-8', index=False)