In [48]:
import pandas as pd
import numpy as np

**Steps to prepare the fixed dataset:**
> 1. Reading the 'dist_cepii.csv' and 'geo_cepii.csv' files contaning the dyad level fixed variables and node level fixed variables respectively 
> 2. Removing self loops, and duplicates if there is any
> 3. Checking for mismatched countries in the node and dyad level data and removing that if there is any
> 4. For developing and developed category, finding out the mismatch in the 'country' column of IMF data and node level fixed factors data
> 5. There are differences in the country names as IMF website has the latest official names, while the CEPII website uses the older names---the variables are static. Here, I edited the CEPII names while keeping the IMF names as benchmark, as I merged this two datasets on country names.
> 6. Merging the IMF data on 'country' column with the node level fixed factor data
> 7. Saving both cleaned data frames as separate csv files.
> 8. Merging the two dataset on 'iso_o' which is the ISO Alpha3 value for the exporter country

In [49]:
dyad_fixed = pd.read_csv('../data/raw/dist_cepii.csv')

In [50]:
dyad_fixed = dyad_fixed[['iso_o', 'iso_d', 'contig', 'comlang_off', 'colony', 'comcol', 'dist']]

In [51]:
dyad_fixed['log_dist'] = np.log(dyad_fixed['dist'])

In [52]:
dyad_fixed.shape

(50176, 8)

In [53]:
# removing self_loops
dyad_fixed = dyad_fixed[dyad_fixed['iso_o'] != dyad_fixed['iso_d']]

In [54]:
# reduced 224 rows which were self loops
dyad_fixed.shape 

(49952, 8)

In [55]:
dyad_fixed.head(2)

Unnamed: 0,iso_o,iso_d,contig,comlang_off,colony,comcol,dist,log_dist
1,ABW,AFG,0,0,0,0,13257.81,9.492342
2,ABW,AGO,0,0,0,0,9516.913,9.160826


In [56]:
# checking for duplicates
dupli = dyad_fixed.duplicated(keep = 'last')
#dyad_fixed[dupli].shape[0]
dyad_fixed_unique = dyad_fixed[~dupli]
print(dyad_fixed_unique.shape)
# There is no duplicates present in this dist_cepii data

(49952, 8)


In [57]:
print(dyad_fixed_unique['iso_o'].nunique())
print(dyad_fixed_unique['iso_d'].nunique())
print(dyad_fixed_unique.groupby(['iso_o', 'iso_d']).ngroups)

224
224
49952


### Node level fixed variables

In [58]:
node = pd.read_csv('../data/raw/geo_cepii.csv', encoding='latin1')

In [59]:
node.shape

(238, 34)

In [60]:
# renaming the exporter column name to match it with the dyad level fixed variable data
node = node.rename(columns = {'iso3': 'iso_o'})

In [61]:
node = node[['iso_o', 'country', 'landlocked', 'continent', 'lat', 'lon', 'langoff_1','colonizer1']]

In [62]:
# removing duplicates
equivalent_columns = ['iso_o', 'country', 'landlocked', 'continent', 'langoff_1', 'colonizer1']
duplicates = node.duplicated(subset = equivalent_columns, keep = 'last')
print(node[duplicates].shape)
#print(node[duplicates])

(13, 8)


In [63]:
node_unique = node[~duplicates]
node_unique.shape

(225, 8)

#### Checking for mismatch in the dyad level and node level fixed factor data

In [64]:
set(node_unique['iso_o'].unique()) - set(dyad_fixed_unique['iso_o'].unique())

{'ATF'}

In [65]:
set(node_unique['iso_o'].unique()) - set(dyad_fixed_unique['iso_d'].unique())

{'ATF'}

French Southern and Antarctic Lands (ATF) is not present in the dyad level data.

In [66]:
node_unique = node_unique[node_unique['iso_o'] != 'ATF']

In [67]:
set(dyad_fixed_unique['iso_o'].unique()) - set(node_unique['iso_o'].unique())

set()

In [68]:
assert node_unique['iso_o'].nunique() == dyad_fixed_unique['iso_o'].nunique() 

### Country category based on IMF 
https://www.imf.org/en/Publications/WEO/weo-database/2025/april/groups-and-aggregates#oem

In [69]:
imf = pd.read_excel("../data/raw/IMF_classification.xlsx")

In [70]:
imf["development"] = 0

In [71]:
imf.head(2)

Unnamed: 0,country,category,development
0,Andorra,Advanced Economies,0
1,Australia,Advanced Economies,0


In [72]:
imf.loc[imf['category'] == 'Advanced Economies', 'development'] = 1

In [73]:
print(imf['category'].value_counts())
print(imf['development'].value_counts())

category
Emerging and Developing Economies    155
Advanced Economies                    41
Name: count, dtype: int64
development
0    155
1     41
Name: count, dtype: int64


In [74]:
print(imf['country'].nunique())
print(node_unique['country'].nunique())

196
224


In [75]:
m = set(node_unique['country'].unique()) - set(imf['country'].unique())

In [76]:
len(m) #These are in geo_cepii but not in IMF

64

In [77]:
q = set(imf['country'].unique()) - set(node_unique['country'].unique())

In [78]:
len(q) #These are in IMF but not in geo_cepii

36

In [79]:
q

{'Belgium',
 'Cabo Verde',
 'Democratic Republic of the Congo',
 'Eswatini',
 'Hong Kong SAR',
 'Kazakhstan',
 'Kosovo',
 'Kyrgyz Republic',
 'Lao P.D.R.',
 'Libya',
 'Macao SAR',
 'Micronesia',
 'Moldova',
 'Montenegro',
 'Myanmar',
 'North Macedonia',
 'Republic of Congo',
 'Russia',
 'Serbia',
 'Slovak Republic',
 'South Sudan',
 'St. Kitts and Nevis',
 'St. Lucia',
 'St. Vincent and the Grenadines',
 'Syria',
 'São Tomé and Príncipe',
 'Taiwan Province of China',
 'Tanzania',
 'The Bahamas',
 'The Gambia',
 'The Netherlands',
 'Timor-Leste',
 'Türkiye',
 'United States',
 'Vietnam',
 'West Bank and Gaza'}

In [80]:
node_unique.loc[node_unique['country'] == 'Bahamas', 'country'] = 'The Bahamas'
node_unique.loc[node_unique['country'] == 'Belgium and Luxembourg', 'country'] = 'Belgium'
node_unique.loc[node_unique['country'] ==  'Burma', 'country'] = 'Myanmar'
node_unique.loc[node_unique['country'] ==  'Cape Verde', 'country'] = 'Cabo Verde'
node_unique.loc[node_unique['country'] ==  'East Timor', 'country'] =  'Timor-Leste'
node_unique.loc[node_unique['country'] ==  'Gambia', 'country'] =  'The Gambia'
node_unique.loc[node_unique['country'] == 'Hong Kong', 'country'] = 'Hong Kong SAR'
node_unique.loc[node_unique['country'] == 'Kazakstan', 'country'] = 'Kazakhstan'
node_unique.loc[node_unique['country'] == "Korea, Dem. People's Rep. of", 'country'] = "Korea"
node_unique.loc[node_unique['country'] =='Kyrgyzstan', 'country'] = 'Kyrgyz Republic'
node_unique.loc[node_unique['country'] == "Lao People's Democratic Republic", 'country'] =  'Lao P.D.R.'
node_unique.loc[node_unique['country'] == 'Libyan Arab Jamahiriya', 'country'] = 'Libya'
node_unique.loc[node_unique['country'] == 'Macau (Aomen)', 'country'] = 'Macao SAR'
node_unique.loc[node_unique['country'] == 'Macedonia (the former Yugoslav Rep. of)', 'country'] = 'North Macedonia'
node_unique.loc[node_unique['country'] == 'Micronesia (Federated States of)', 'country'] = 'Micronesia'
node_unique.loc[node_unique['country'] == 'Moldova, Rep.of', 'country'] = 'Moldova'
node_unique.loc[node_unique['country'] == 'Netherlands', 'country'] = 'The Netherlands'
node_unique.loc[node_unique['country'] == 'Palestine', 'country'] = 'West Bank and Gaza'
node_unique.loc[node_unique['country'] == 'Russian Federation', 'country'] = 'Russia'
node_unique.loc[node_unique['country'] == 'Saint Kitts and Nevis', 'country'] = 'St. Kitts and Nevis'
node_unique.loc[node_unique['country'] == 'Saint Lucia', 'country'] = 'St. Lucia'
node_unique.loc[node_unique['country'] == 'Saint Vincent and the Grenadines', 'country'] = 'St. Vincent and the Grenadines'
node_unique.loc[node_unique['country'] == 'Sao Tome and Principe', 'country'] = 'São Tomé and Príncipe'
node_unique.loc[node_unique['country'] == 'Serbia and Montenegro', 'country'] = 'Serbia'
node_unique.loc[node_unique['country'] == 'Slovakia', 'country'] = 'Slovak Republic'
node_unique.loc[node_unique['country'] == 'Swaziland', 'country'] = 'Eswatini'
node_unique.loc[node_unique['country'] == 'Syrian Arab Republic', 'country'] =  'Syria'
node_unique.loc[node_unique['country'] ==  'Taiwan', 'country'] =   'Taiwan Province of China'
node_unique.loc[node_unique['country'] ==  'Tanzania, United Rep. of ', 'country'] =  'Tanzania'
node_unique.loc[node_unique['country'] ==  'Turkey', 'country'] =  'Türkiye'
node_unique.loc[node_unique['country'] ==  'United States of America', 'country'] =  'United States'
node_unique.loc[node_unique['country'] ==  'Viet Nam', 'country'] =  'Vietnam'
node_unique.loc[node_unique['country'] ==  'Congo', 'country'] =  'Republic of Congo'
node_unique.loc[node_unique['country'] ==  'Congo (Democratic Republic of the)', 'country'] =  'Democratic Republic of the Congo'

In [81]:
node_unique.head()

Unnamed: 0,iso_o,country,landlocked,continent,lat,lon,langoff_1,colonizer1
0,AND,Andorra,0,Europe,42.5,1.5,Catalan,
1,ARE,United Arab Emirates,0,Asia,24.466667,54.416668,Arabic,GBR
2,AFG,Afghanistan,1,Asia,34.516666,69.199997,Persian,
3,ATG,Antigua and Barbuda,0,America,17.133333,-61.833332,English,GBR
4,AIA,Anguilla,0,America,18.216667,-63.066666,English,GBR


In [82]:
nodeUnique_imf = node_unique.merge(imf, on = 'country', how='left')

In [83]:
print(imf['country'].nunique())
print(node_unique['country'].nunique())
print(nodeUnique_imf['country'].nunique())

196
223
223


In [84]:
nodeUnique_imf['category'].value_counts()

category
Emerging and Developing Economies    152
Advanced Economies                    42
Name: count, dtype: int64

In [85]:
nodeUnique_imf['category'].isna().sum()

30

In [86]:
nodeUnique_imf[nodeUnique_imf['category'].isna()]

Unnamed: 0,iso_o,country,landlocked,continent,lat,lon,langoff_1,colonizer1,category,development
4,AIA,Anguilla,0,America,18.216667,-63.066666,English,GBR,,
7,ANT,Netherland Antilles,0,America,12.1,-68.933334,Dutch,NLD,,
23,BMU,Bermuda,0,America,32.299999,-64.800003,English,GBR,,
33,CCK,Cocos (Keeling) Islands,0,Pacific,-12.5,96.833336,Malay,GBR,,
38,COK,Cook Islands,0,Pacific,-21.200001,-159.76666,English,GBR,,
44,CUB,Cuba,0,America,23.116667,-82.416664,Spanish,ESP,,
46,CXR,Christmas Island,0,Pacific,-10.416667,105.71667,English,GBR,,
58,ESH,Western Sahara,0,Africa,27.166666,-13.183333,Arabic,ESP,,
64,FLK,Falkland Islands,0,America,-51.700001,-57.849998,English,GBR,,
66,FRO,Faroe Islands,0,Europe,62.016666,-6.766667,Faroese,DNK,,


In [87]:
dyad_fixed_unique.to_csv("../data/cleaned/dyads_fixed.csv", encoding='utf-8', index=False)

In [88]:
nodeUnique_imf.to_csv("../data/cleaned/nodal_fixed.csv", encoding='utf-8', index=False)

### Merging the two datasets with time invariant factors

In [89]:
fixed = dyad_fixed_unique.merge(nodeUnique_imf, on = 'iso_o')

In [90]:
print(fixed['iso_o'].nunique())
print(fixed['iso_d'].nunique())

224
224


In [91]:
dyad_fixed_unique['iso_o'].nunique() == fixed['iso_o'].nunique() == node_unique['iso_o'].nunique()

True

In [92]:
fixed.shape

(49952, 17)

In [93]:
fixed.head(2)

Unnamed: 0,iso_o,iso_d,contig,comlang_off,colony,comcol,dist,log_dist,country,landlocked,continent,lat,lon,langoff_1,colonizer1,category,development
0,ABW,AFG,0,0,0,0,13257.81,9.492342,Aruba,0,America,12.55,-70.099998,Dutch,NLD,Emerging and Developing Economies,0.0
1,ABW,AGO,0,0,0,0,9516.913,9.160826,Aruba,0,America,12.55,-70.099998,Dutch,NLD,Emerging and Developing Economies,0.0


In [94]:
# This dataset of time invariant variables will be used both in forming networks and in the gravity model estimation
fixed.to_csv("../data/cleaned/fixed.csv", encoding='utf-8', index=False)