In [1]:
import pandas as pd
import numpy as np

This notebook cleans the data in the 'raw' subfolder and produces 20 CSV files (13 dyad-level and 7 node-level).
Data cleaning steps:
   > - a. Renaming the exporter and importer columns
   > - b. Removing self loops
   > - c. Removing duplicate rows and keeping the first instance only
   > - d. Keeping only the necessary columns
   > - e. Keeping only the countries common to all datasets
   > - f. Sorting values by country pair and time period
   > - g. Resetting the index
   > - h. Saving the data in the cleaned folder

### GHGFP: Emissions embodied in bilateral trade

In [2]:
co2 = pd.read_csv('../data/raw/DF_TRADE.csv')

In [3]:
#co2.head(30)

In [4]:
co2 = co2[co2['EXPORTER'] != co2['IMPORTER']]

In [5]:
# Use the exporter/importer column names shared by the other datasets.
co2 = co2.rename(columns = {'EXPORTER': 'iso_o', 'IMPORTER': 'iso_d'})

In [6]:
co2['ACTIVITY'].unique()

array(['A01_02', 'A03', 'B05_06', 'B07_08', 'B09', 'C10T12', 'C13T15',
       'C16', 'C17_18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25',
       'C26', 'C27', 'C28', 'C29', 'C30', 'C31T33', 'D', 'E', 'F', 'G',
       'H49', 'H50', 'H51', 'H52', 'H53', 'I', 'J58T60', 'J61', 'J62_63',
       'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', '_T', 'A', 'B',
       'C', 'C16T18', 'C19T23', 'C20_21', 'C24_25', 'C26_27', 'C29_30',
       'D_E', 'GTN', 'GTI', 'H', 'J', 'M_N', 'OTT', 'OTQ', 'RTT', 'R_S',
       'BTE', 'GTT', 'JTN', 'FTT', 'INFO'], dtype=object)

In [7]:
co2['PRODUCT_CATEGORY'].unique()

array(['FNL', 'TTL'], dtype=object)

In [8]:
co2[(co2['iso_o'] == 'BGD') & (co2['iso_d'] == 'IND') & (co2['TIME_PERIOD'] == 1995) & (co2['ACTIVITY'] == '_T') & (co2['PRODUCT_CATEGORY'] == 'TTL')]

Unnamed: 0,STRUCTURE,STRUCTURE_ID,ACTION,FREQ,TIME_PERIOD,MEASURE,iso_o,iso_d,ACTIVITY,PRODUCT_CATEGORY,UNIT_MEASURE,UNIT_MULT,OBS_VALUE
18256160,DATAFLOW,OECD.STI.PIE:DSD_ICIO_GHG_TRADE@DF_ICIO_GHG_TR...,R,A,1995,TRADE_GHG,BGD,IND,_T,TTL,T_CO2E,6,0.152


Product category `TTL` contains both intermediate and final products. 
Activity `_T` is the sum of non-overlapping industry demand. I am keeping these two only.

In [9]:
# Keep years from 2000 onwards, the all-activity total ('_T') and the
# total product category ('TTL').
co2_T = co2.loc[
    co2['TIME_PERIOD'].ge(2000)
    & co2['ACTIVITY'].eq('_T')
    & co2['PRODUCT_CATEGORY'].eq('TTL')
]

In [10]:
co2.shape

(21861840, 13)

In [11]:
co2_T.shape

(126126, 13)

In [12]:
print(co2_T['iso_o'].nunique())
print(co2_T['iso_d'].nunique())
print(co2_T['ACTIVITY'].nunique())
print(co2_T['PRODUCT_CATEGORY'].nunique())
print(co2_T['TIME_PERIOD'].nunique())
print(co2_T.groupby(['iso_o', 'iso_d']).ngroups)

78
78
1
1
21
6006


In [13]:
co2_T['TIME_PERIOD'].unique()

array([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020],
      dtype=int64)

In [14]:
# Flag repeated (exporter, importer, year) rows. keep = 'first' flags every
# occurrence after the first, so negating the mask retains the first instance,
# as stated in the cleaning steps at the top of the notebook.
column = ['iso_o', 'iso_d', 'TIME_PERIOD']
dupli = co2_T.duplicated(subset = column, keep = 'first')

In [15]:
co2_uni = co2_T[~dupli]

In [16]:
co2_uni.shape #There is no duplicate.

(126126, 13)

In [17]:
co2_uni = co2_uni[['iso_o', 'iso_d', 'TIME_PERIOD', 'OBS_VALUE']]

In [18]:
co2_uni = co2_uni.reset_index(drop = True)

In [19]:
# Drop rows where either side of the pair is the 'W' code.
co2_uni = co2_uni.loc[lambda d: (d['iso_o'] != 'W') & (d['iso_d'] != 'W')]

In [20]:
co2_uni.shape

(122892, 4)

In [21]:
print(co2_uni['iso_o'].nunique())
print(co2_uni['iso_d'].nunique())
print(co2_uni['TIME_PERIOD'].nunique())
print(co2_uni.groupby(['iso_o', 'iso_d']).ngroups)

77
77
21
5852


In [22]:
assert set(co2_uni['iso_o'].unique()) == set(co2_uni['iso_d'].unique()) #Exporters and importers have same set of countries.

In [23]:
countries = set(co2_uni['iso_o'].unique())

In [24]:
edgelist_co2 = co2_uni.pivot_table(index = ['iso_o', 'iso_d'], columns = 'TIME_PERIOD', values = 'OBS_VALUE', fill_value = 0).reset_index()

In [25]:
assert co2_uni.groupby(['iso_o', 'iso_d']).ngroups == edgelist_co2 .shape[0] 

### Total Trade

#### From 2000 to 2004

In [26]:
c00_04 = pd.read_csv('../data/raw/HS17_2D_DE_2000_To_2004.csv')

In [27]:
c00_04 = c00_04[['REF_AREA', 'COUNTERPART_AREA', 'PRODUCT_HS', 'TIME_PERIOD', 'OBS_VALUE', 'ADJUSTMENT']]

In [28]:
# Drop rows where the reporter or the counterpart is the 'W' code.
c00_04 = c00_04.loc[lambda d: (d['COUNTERPART_AREA'] != 'W') & (d['REF_AREA'] != 'W')]

In [29]:
c00_04_total = c00_04[(c00_04['PRODUCT_HS'] == '_T') & (c00_04['ADJUSTMENT'] == 'B_ADJ_RX') ]

In [30]:
print(c00_04_total['REF_AREA'].nunique())
print(c00_04_total['COUNTERPART_AREA'].nunique())
print(c00_04_total['PRODUCT_HS'].nunique())

197
197
1


#### From 2005 to 2009

In [31]:
c05_09 = pd.read_csv('../data/raw/HS17_2D_DE_2005_To_2009.csv')

In [32]:
c05_09.head(3)

Unnamed: 0,DATAFLOW,REF_AREA,COUNTERPART_AREA,TRADE_FLOW,PRODUCT_TYPE,PRODUCT_CPA,PRODUCT_HS,FREQ,TIME_PERIOD,OBS_VALUE,OBS_STATUS,METHODOLOGY_TYPE,UNIT_MULT,UNIT_MEASURE,ADJUSTMENT,DECIMALS
0,OECD.SDD.TPS:DSD_BIMTS@DF_BIMTS_HS2017_2D(1.0),HUN,MYS,X,C,_Z,HS17_90,A,2006,1.176961,E,AG,6,USD_EXC,B,2
1,OECD.SDD.TPS:DSD_BIMTS@DF_BIMTS_HS2017_2D(1.0),HUN,MYS,X,C,_Z,HS17_90,A,2006,1.176961,E,AG,6,USD_EXC,B_ADJ_RX,2
2,OECD.SDD.TPS:DSD_BIMTS@DF_BIMTS_HS2017_2D(1.0),USA,ROU,X,C,_Z,HS17_45,A,2006,0.004019,E,AG,6,USD_EXC,B,2


In [33]:
c05_09 = c05_09[['REF_AREA', 'COUNTERPART_AREA', 'PRODUCT_HS', 'TIME_PERIOD', 'OBS_VALUE','ADJUSTMENT']]

In [34]:
# Drop rows where the reporter or the counterpart is the 'W' code.
c05_09 = c05_09.loc[lambda d: (d['COUNTERPART_AREA'] != 'W') & (d['REF_AREA'] != 'W')]

In [35]:
c05_09_total = c05_09[(c05_09['PRODUCT_HS'] == '_T') & (c05_09['ADJUSTMENT'] == 'B_ADJ_RX') ]

In [36]:
print(c05_09_total['REF_AREA'].nunique())
print(c05_09_total['COUNTERPART_AREA'].nunique())
print(c05_09_total['PRODUCT_HS'].nunique())

199
199
1


#### From 2010 to 2014

In [37]:
c10_14 = pd.read_csv('../data/raw/HS17_2D_DE_2010_To_2014.csv')

In [38]:
c10_14 = c10_14[['REF_AREA', 'COUNTERPART_AREA', 'PRODUCT_HS', 'TIME_PERIOD', 'OBS_VALUE','ADJUSTMENT']]

In [39]:
# Drop rows where the reporter or the counterpart is the 'W' code.
c10_14 = c10_14.loc[lambda d: (d['COUNTERPART_AREA'] != 'W') & (d['REF_AREA'] != 'W')]

Kosovo and Montenegro gained independence from Serbia in 2008 and 2006, respectively. From 2005 to 2009 Montenegro was present but not Kosovo. Also, South Sudan gained independence from Sudan in 2011. I will check the availability of Kosovo and South Sudan here.

In [40]:
c10_14_total = c10_14[(c10_14['PRODUCT_HS'] == '_T') & (c10_14['ADJUSTMENT'] == 'B_ADJ_RX')]

In [41]:
print(c10_14_total['REF_AREA'].nunique())
print(c10_14_total['COUNTERPART_AREA'].nunique())
print(c10_14_total['PRODUCT_HS'].nunique())

200
200
1


#### From 2015 to 2019

In [42]:
c15_19 = pd.read_csv('../data/raw/HS17_2D_DE_2015_To_2019.csv')

In [43]:
c15_19 = c15_19[['REF_AREA', 'COUNTERPART_AREA', 'PRODUCT_HS', 'TIME_PERIOD', 'OBS_VALUE', 'ADJUSTMENT']]

In [44]:
# Drop rows where the reporter or the counterpart is the 'W' code.
c15_19 = c15_19.loc[lambda d: (d['COUNTERPART_AREA'] != 'W') & (d['REF_AREA'] != 'W')]

In [45]:
c15_19_total = c15_19[(c15_19['PRODUCT_HS'] == '_T') & (c15_19['ADJUSTMENT'] == 'B_ADJ_RX')]

In [46]:
print(c15_19_total['REF_AREA'].nunique())
print(c15_19_total['COUNTERPART_AREA'].nunique())
print(c15_19_total['PRODUCT_HS'].nunique())

200
200
1


#### From 2020 to 2023

In [47]:
c020_23 = pd.read_csv('../data/raw/HS17_2D_DE_2020_To_2023.csv')

In [48]:
c020_23 = c020_23[['REF_AREA', 'COUNTERPART_AREA', 'PRODUCT_HS', 'TIME_PERIOD', 'OBS_VALUE', 'ADJUSTMENT']]

In [49]:
# Drop rows where the reporter or the counterpart is the 'W' code.
c020_23 = c020_23.loc[lambda d: (d['COUNTERPART_AREA'] != 'W') & (d['REF_AREA'] != 'W')]

In [50]:
c020_23_total = c020_23[(c020_23['PRODUCT_HS'] == '_T') & (c020_23['ADJUSTMENT'] == 'B_ADJ_RX')]

In [51]:
print(c020_23_total['REF_AREA'].nunique())
print(c020_23_total['COUNTERPART_AREA'].nunique())
print(c020_23_total['PRODUCT_HS'].nunique())

200
200
1


##### For all datasets I will do the following steps:
1. Concatenating the 5 dataframes for each HS code category to make a time series from 2000 to 2023
2. Renaming the exporter and importer columns
3. Removing self loops
4. Checking for duplicates and removing if there is any
5. Sorting values by country pair and time period
6. Resetting the index
7. Saving the cleaned file in CSV format

#### Appending

In [52]:
# Stack the five period-specific frames of total trade (each already filtered
# to PRODUCT_HS == '_T' and ADJUSTMENT == 'B_ADJ_RX') into one long frame.
total = pd.concat([c00_04_total, c05_09_total, c10_14_total, c15_19_total, c020_23_total], axis = 0)

In [53]:
# Rename the reporter/counterpart columns to the exporter/importer names used
# by the fixed-variable dataset.
total = total.rename(columns = {'REF_AREA': 'iso_o', 'COUNTERPART_AREA': 'iso_d'})

In [54]:
total.shape

(692462, 6)

In [55]:
print(total['TIME_PERIOD'].nunique())
print(total['ADJUSTMENT'].nunique())

24
1


In [56]:
# removing self_loops
total = total[total['iso_o'] != total['iso_d']]

In [57]:
total.shape

(691591, 6)

In [58]:
total.head()

Unnamed: 0,iso_o,iso_d,PRODUCT_HS,TIME_PERIOD,OBS_VALUE,ADJUSTMENT
21,SAU,ERI,_T,2004,35.604232,B_ADJ_RX
39,SAU,EST,_T,2004,0.968338,B_ADJ_RX
122,AFG,NZL,_T,2004,0.08543,B_ADJ_RX
245,AGO,SWE,_T,2004,0.092443,B_ADJ_RX
274,ALB,AUS,_T,2004,0.386606,B_ADJ_RX


In [59]:
# Flag repeated (exporter, importer, year, adjustment) rows. keep = 'first'
# flags every occurrence after the first, so the first instance is the one
# retained, as stated in the cleaning steps at the top of the notebook.
column_subset = ['iso_o', 'iso_d', 'TIME_PERIOD', 'ADJUSTMENT']
duplicate_trade = total.duplicated(subset = column_subset, keep = 'first')
total_unique = total[~duplicate_trade]

In [60]:
total_unique.shape #no duplicate

(691591, 6)

In [61]:
total_unique = total_unique.sort_values(['iso_o', 'iso_d', 'TIME_PERIOD'])

In [62]:
total_unique = total_unique.reset_index(drop= True)

In [63]:
total_unique.tail()

Unnamed: 0,iso_o,iso_d,PRODUCT_HS,TIME_PERIOD,OBS_VALUE,ADJUSTMENT
691586,ZWE,ZMB,_T,2019,62.496404,B_ADJ_RX
691587,ZWE,ZMB,_T,2020,61.145436,B_ADJ_RX
691588,ZWE,ZMB,_T,2021,77.800731,B_ADJ_RX
691589,ZWE,ZMB,_T,2022,93.541923,B_ADJ_RX
691590,ZWE,ZMB,_T,2023,100.392036,B_ADJ_RX


In [64]:
total_unique['TIME_PERIOD'].nunique()

24

In [65]:
total_unique = total_unique[total_unique['TIME_PERIOD'] <= 2020]

In [66]:
# Keep only pairs whose exporter and importer are both in the emissions country set.
total_unique = total_unique.loc[lambda d: d['iso_o'].isin(countries) & d['iso_d'].isin(countries)]

In [67]:
print(total_unique['iso_o'].nunique())
print(total_unique['iso_d'].nunique())
print(total_unique.groupby(['iso_o','iso_d']).ngroups)
print(total_unique['TIME_PERIOD'].nunique())

77
77
5852
21


In [68]:
#total_unique[(total_unique['iso_o'] == 'BGD') & (total_unique['iso_d'] == 'IND')]

In [69]:
assert total_unique['iso_o'].nunique() == total_unique['iso_d'].nunique() == co2_uni['iso_o'].nunique() == co2_uni['iso_d'].nunique()

In [70]:
assert set(co2_uni['iso_o'].unique()) == set(co2_uni['iso_d'].unique()) == set(total_unique['iso_o'].unique()) == set(total_unique['iso_d'].unique()) 

In [71]:
# The trade and emissions panels must cover exactly the same years. Compare
# the year sets themselves: equal counts alone would also pass for two
# different 21-year windows.
assert set(total_unique['TIME_PERIOD'].unique()) == set(co2_uni['TIME_PERIOD'].unique())

In [72]:
edgelist_trade = total_unique.pivot_table(index = ['iso_o', 'iso_d'], columns = 'TIME_PERIOD', values = 'OBS_VALUE', fill_value = 0).reset_index()

In [73]:
edgelist_trade_rounded = edgelist_trade.round(4)

In [74]:
assert total_unique.groupby(['iso_o', 'iso_d']).ngroups == edgelist_trade_rounded.shape[0] 

##### Dyad-level fixed factors

In [75]:
dyad_fixed = pd.read_csv('../data/raw/dist_cepii.csv')

In [76]:
# removing self_loops
dyad_fixed = dyad_fixed[dyad_fixed['iso_o'] != dyad_fixed['iso_d']]

In [77]:
dyad_fixed = dyad_fixed[['iso_o', 'iso_d', 'contig', 'comlang_off', 'colony', 'comcol', 'dist']]

In [78]:
# Add the natural log of distance. assign() returns a new frame, which avoids
# writing a column into the filtered/sub-selected slice that dyad_fixed is at
# this point (pandas raises SettingWithCopyWarning for such writes).
dyad_fixed = dyad_fixed.assign(log_dist = np.log(dyad_fixed['dist']))

In [79]:
dyad_fixed.shape

(49952, 8)

In [80]:
# Check for duplicates: flag rows that are identical across all columns
# (keep = 'last' leaves the last copy unflagged).
dupli = dyad_fixed.duplicated(keep = 'last')
# Optional check: dyad_fixed[dupli].shape[0] gives the number of flagged rows.
dyad_fixed_unique = dyad_fixed[~dupli]
print(dyad_fixed_unique.shape)
# The printed row count equals that of dyad_fixed, so dist_cepii contains no duplicate rows.

(49952, 8)


In [81]:
# Keep only pairs whose exporter and importer are both in the emissions country set.
dyad_fixed_unique = dyad_fixed_unique.loc[lambda d: d['iso_o'].isin(countries) & d['iso_d'].isin(countries)]

In [82]:
print(dyad_fixed_unique['iso_o'].nunique())
print(dyad_fixed_unique['iso_d'].nunique())
print(dyad_fixed_unique.groupby(['iso_o', 'iso_d']).ngroups)

75
75
5550


In [83]:
# Exporters in the emissions data that have no row in dist_cepii.
# NOTE(review): 'ROU' may be missing only because dist_cepii uses a different
# code for Romania (possibly 'ROM') — verify; if so, recoding it before the
# country filter would keep Romania in the sample.
set(co2_uni['iso_o']) - set(dyad_fixed['iso_o'])

{'ROU', 'WXD'}

##### Node level fixed factors

In [84]:
node = pd.read_csv('../data/raw/geo_cepii.csv', encoding='latin1')

In [85]:
node.shape

(238, 34)

In [86]:
# renaming the exporter column name to match it with the dyad level fixed variable data
node = node.rename(columns = {'iso3': 'iso_o'})

In [87]:
node = node[['iso_o', 'country', 'landlocked', 'continent', 'lat', 'lon', 'langoff_1','colonizer1']]

In [88]:
# removing duplicates
equivalent_columns = ['iso_o', 'country', 'landlocked', 'continent', 'langoff_1', 'colonizer1']
duplicates = node.duplicated(subset = equivalent_columns, keep = 'last')
print(node[duplicates].shape)
#print(node[duplicates])

(13, 8)


In [89]:
node_unique = node[~duplicates]
node_unique.shape

(225, 8)

### IMF

In [90]:
imf = pd.read_excel("../data/raw/IMF_classification.xlsx")

In [91]:
imf["development"] = 0

In [92]:
imf.loc[imf['category'] == 'Advanced Economies', 'development'] = 1

In [93]:
print(imf['category'].value_counts())
print(imf['development'].value_counts())

category
Emerging and Developing Economies    155
Advanced Economies                    41
Name: count, dtype: int64
development
0    155
1     41
Name: count, dtype: int64


In [94]:
q = set(imf['country'].unique()) - set(node_unique['country'].unique())

In [95]:
len(q)

36

In [96]:
q

{'Belgium',
 'Cabo Verde',
 'Democratic Republic of the Congo',
 'Eswatini',
 'Hong Kong SAR',
 'Kazakhstan',
 'Kosovo',
 'Kyrgyz Republic',
 'Lao P.D.R.',
 'Libya',
 'Macao SAR',
 'Micronesia',
 'Moldova',
 'Montenegro',
 'Myanmar',
 'North Macedonia',
 'Republic of Congo',
 'Russia',
 'Serbia',
 'Slovak Republic',
 'South Sudan',
 'St. Kitts and Nevis',
 'St. Lucia',
 'St. Vincent and the Grenadines',
 'Syria',
 'São Tomé and Príncipe',
 'Taiwan Province of China',
 'Tanzania',
 'The Bahamas',
 'The Gambia',
 'The Netherlands',
 'Timor-Leste',
 'Türkiye',
 'United States',
 'Vietnam',
 'West Bank and Gaza'}

In [97]:
# Map CEPII country names to the spellings used in the IMF classification so
# the two tables can be merged on 'country'. Keys must match the CEPII text
# exactly (note the trailing space in the Tanzania key).
cepii_to_imf_names = {
    'Bahamas': 'The Bahamas',
    'Belgium and Luxembourg': 'Belgium',
    'Burma': 'Myanmar',
    'Cape Verde': 'Cabo Verde',
    'East Timor': 'Timor-Leste',
    'Gambia': 'The Gambia',
    'Hong Kong': 'Hong Kong SAR',
    'Kazakstan': 'Kazakhstan',
    # NOTE(review): 'Korea' is not in the unmatched set `q` shown above, so a
    # row named 'Korea' presumably already exists in node_unique; this rename
    # would then give two rows the same name and both would receive the IMF
    # 'Korea' classification in the merge — confirm this is intended.
    "Korea, Dem. People's Rep. of": "Korea",
    'Kyrgyzstan': 'Kyrgyz Republic',
    "Lao People's Democratic Republic": 'Lao P.D.R.',
    'Libyan Arab Jamahiriya': 'Libya',
    'Macau (Aomen)': 'Macao SAR',
    'Macedonia (the former Yugoslav Rep. of)': 'North Macedonia',
    'Micronesia (Federated States of)': 'Micronesia',
    'Moldova, Rep.of': 'Moldova',
    'Netherlands': 'The Netherlands',
    'Palestine': 'West Bank and Gaza',
    'Russian Federation': 'Russia',
    'Saint Kitts and Nevis': 'St. Kitts and Nevis',
    'Saint Lucia': 'St. Lucia',
    'Saint Vincent and the Grenadines': 'St. Vincent and the Grenadines',
    'Sao Tome and Principe': 'São Tomé and Príncipe',
    'Serbia and Montenegro': 'Serbia',
    'Slovakia': 'Slovak Republic',
    'Swaziland': 'Eswatini',
    'Syrian Arab Republic': 'Syria',
    'Taiwan': 'Taiwan Province of China',
    'Tanzania, United Rep. of ': 'Tanzania',
    'Turkey': 'Türkiye',
    'United States of America': 'United States',
    'Viet Nam': 'Vietnam',
    'Congo': 'Republic of Congo',
    'Congo (Democratic Republic of the)': 'Democratic Republic of the Congo',
}

# Build a new frame instead of writing through .loc: node_unique is a filtered
# slice of `node`, and in-place writes to it raise SettingWithCopyWarning.
# No target name is also a source name, so a single dictionary replace gives
# the same result as the sequential assignments it replaces.
node_unique = node_unique.assign(country = node_unique['country'].replace(cepii_to_imf_names))

In [98]:
nodeUnique_imf = node_unique.merge(imf, on = 'country', how='left')

In [99]:
nodeUnique_imf = nodeUnique_imf[nodeUnique_imf['iso_o'].isin(countries)]

In [100]:
print(imf['country'].nunique())
print(node_unique['country'].nunique())
print(nodeUnique_imf['country'].nunique())

196
224
75


In [101]:
nodeUnique_imf['category'].value_counts()

category
Emerging and Developing Economies    38
Advanced Economies                   37
Name: count, dtype: int64

In [102]:
nodeUnique_imf['category'].isna().sum()

0

In [103]:
nodeUnique_imf[nodeUnique_imf['category'].isna()]

Unnamed: 0,iso_o,country,landlocked,continent,lat,lon,langoff_1,colonizer1,category,development


##### Merging the two datasets with time invariant factors

In [104]:
fixed = dyad_fixed_unique.merge(nodeUnique_imf, on = 'iso_o')

In [105]:
print(fixed['iso_o'].nunique())
print(fixed['iso_d'].nunique())

75
75


In [106]:
dyad_fixed_unique['iso_o'].nunique() == fixed['iso_o'].nunique() == node_unique['iso_o'].nunique()

False

In [107]:
fixed.shape

(5550, 17)

In [108]:
fixed.head(2)

Unnamed: 0,iso_o,iso_d,contig,comlang_off,colony,comcol,dist,log_dist,country,landlocked,continent,lat,lon,langoff_1,colonizer1,category,development
0,ARG,AUS,0,0,0,0,11801.36,9.37597,Argentina,0,America,-34.666668,-58.5,Spanish,ESP,Emerging and Developing Economies,0.0
1,ARG,AUT,0,0,0,0,11833.76,9.378712,Argentina,0,America,-34.666668,-58.5,Spanish,ESP,Emerging and Developing Economies,0.0


In [109]:
set(fixed['iso_o'].unique()) == set(fixed['iso_d'].unique())

True

#### contiguity edgelist

In [110]:
fixed_con = fixed[['iso_o', 'iso_d', 'contig']] 

In [111]:
print(fixed_con[(fixed_con['iso_o'] == 'BGD') & (fixed_con['iso_d'] == 'IND')])
print(fixed_con[(fixed_con['iso_o'] == 'IND') & (fixed_con['iso_d'] == 'BGD')])

    iso_o iso_d  contig
327   BGD   IND       1
     iso_o iso_d  contig
2372   IND   BGD       1


In [112]:
# Number of distinct exporters, distinct importers and country pairs in the
# contiguity edgelist (the second line previously repeated iso_o).
print(fixed_con['iso_o'].nunique())
print(fixed_con['iso_d'].nunique())
print(fixed_con.groupby(['iso_o', 'iso_d']).ngroups)

75
75
5550


#### Distance

In [113]:
distance = fixed[['iso_o', 'iso_d', 'log_dist']]

In [114]:
print(distance[(distance['iso_o'] == 'BGD') & (distance['iso_d'] == 'IND')])
print(distance[(distance['iso_o'] == 'IND') & (distance['iso_d'] == 'BGD')])

    iso_o iso_d  log_dist
327   BGD   IND  7.259776
     iso_o iso_d  log_dist
2372   IND   BGD  7.259776


In [115]:
print(distance['iso_o'].nunique())
print(distance['iso_d'].nunique())

75
75


In [116]:
distance.head(2)

Unnamed: 0,iso_o,iso_d,log_dist
0,ARG,AUS,9.37597
1,ARG,AUT,9.378712


#### Common colonizer

In [117]:
comcol = fixed[['iso_o', 'iso_d', 'comcol']]

In [118]:
print(comcol[(comcol['iso_o'] == 'BGD') & (comcol['iso_d'] == 'IND')])
print(comcol[(comcol['iso_o'] == 'IND') & (comcol['iso_d'] == 'BGD')])

    iso_o iso_d  comcol
327   BGD   IND       1
     iso_o iso_d  comcol
2372   IND   BGD       1


#### Colony

In [119]:
col = fixed[['iso_o', 'iso_d', 'colony']]

In [120]:
print(col[(col['iso_o'] == 'BGD') & (col['iso_d'] == 'GBR')])
print(col[(col['iso_o'] == 'IND') & (col['iso_d'] == 'GBR')])

    iso_o iso_d  colony
321   BGD   GBR       1
     iso_o iso_d  colony
2394   IND   GBR       1


In [121]:
print(col['iso_o'].nunique())
print(col['iso_d'].nunique())

75
75


#### Common Language

In [122]:
lang = fixed[['iso_o', 'iso_d', 'comlang_off']]

In [123]:
print(lang[(lang['iso_o'] == 'BGD') & (lang['iso_d'] == 'IND')])
print(lang[(lang['iso_o'] == 'IND') & (lang['iso_d'] == 'BGD')])

    iso_o iso_d  comlang_off
327   BGD   IND            0
     iso_o iso_d  comlang_off
2372   IND   BGD            0


In [124]:
print(lang['iso_o'].nunique())
print(lang['iso_d'].nunique())

75
75


In [125]:
e = list(set(lang['iso_o'].unique()))

In [126]:
len(e)

75

In [127]:
# Restrict the long and wide emissions tables to pairs whose exporter and
# importer are both in `e`.
co2_uni = co2_uni.loc[lambda d: d['iso_o'].isin(e) & d['iso_d'].isin(e)]
edgelist_co2 = edgelist_co2.loc[lambda d: d['iso_o'].isin(e) & d['iso_d'].isin(e)]

In [128]:
print(co2_uni['iso_o'].nunique())
print(co2_uni['iso_d'].nunique())
print(edgelist_co2['iso_o'].nunique())
print(edgelist_co2['iso_d'].nunique())

75
75
75
75


In [129]:
# Restrict the long and wide trade tables to pairs whose exporter and importer
# are both in `e`.
total_unique = total_unique.loc[lambda d: d['iso_o'].isin(e) & d['iso_d'].isin(e)]
edgelist_trade_rounded = edgelist_trade_rounded.loc[lambda d: d['iso_o'].isin(e) & d['iso_d'].isin(e)]

In [130]:
print(total_unique['iso_o'].nunique())
print(total_unique['iso_d'].nunique())
print(edgelist_trade_rounded['iso_o'].nunique())
print(edgelist_trade_rounded['iso_d'].nunique())

75
75
75
75


# Time Varying Data

### RTA

In [131]:
rta = pd.read_csv("../data/raw/rta_20241028.csv")

In [132]:
# Align the RTA column names with the other dyadic datasets.
rta = rta.rename(columns = {'exporter': 'iso_o', 'importer': 'iso_d', 'year': 'TIME_PERIOD'})

In [133]:
rta = rta[rta['iso_o'] != rta['iso_d']]

In [134]:
# Keep rows whose exporter is in the emissions country set.
# NOTE(review): only iso_o is filtered here; importers outside `countries`
# (e.g. 'ABW' in the frame displayed below) remain — confirm that iso_d is
# pruned in a later step.
rta = rta[rta['iso_o'].isin(countries)].reset_index(drop= True)

In [135]:
print(rta.groupby(['iso_o', 'iso_d']).ngroups)

21204


In [136]:
rta = rta[(rta['TIME_PERIOD'] >= 2000) & (rta['TIME_PERIOD'] <= 2020)]

In [137]:
rta['TIME_PERIOD'].unique()

array([2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020],
      dtype=int64)

In [138]:
rta = rta[['iso_o', 'iso_d', 'TIME_PERIOD', 'rta']].reset_index(drop = True)

In [139]:
rta['iso_o'].nunique()

76

In [140]:
rta

Unnamed: 0,iso_o,iso_d,TIME_PERIOD,rta
0,ARG,ABW,2000,0
1,ARG,ABW,2001,0
2,ARG,ABW,2002,0
3,ARG,ABW,2003,0
4,ARG,ABW,2004,0
...,...,...,...,...
445279,ZAF,ZWE,2016,1
445280,ZAF,ZWE,2017,1
445281,ZAF,ZWE,2018,1
445282,ZAF,ZWE,2019,1


In [141]:
rta_edgelist = rta.pivot_table(values = 'rta', index = ['iso_o', 'iso_d'], columns = 'TIME_PERIOD').reset_index()

In [142]:
rta_edgelist[(rta_edgelist['iso_o'] == 'BGD') & (rta_edgelist['iso_d'] == 'IND')]

TIME_PERIOD,iso_o,iso_d,2000,2001,2002,2003,2004,2005,2006,2007,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
1232,BGD,IND,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Internet Penetration

In [143]:
internet = pd.read_csv('../data/raw/702e5907-a97c-4e0a-9f8f-511ea9b80ab0_Data.csv')

In [144]:
# Keep the first 265 rows (presumably the country rows, excluding trailing
# footer lines — TODO confirm), rename 'Country Code' to 'iso_o', turn the
# 'YYYY [YRYYYY]' headers for 1999-2020 into integer years, and drop the
# descriptive columns.
internet = (
    internet.iloc[:265]
    .rename(columns = {
        'Country Code': 'iso_o',
        **{f'{y} [YR{y}]': y for y in range(1999, 2021)},
    })
    .drop(columns = ['Country Name', 'Series Name', 'Series Code'])
)

In [145]:
internet = internet[internet['iso_o'].isin(countries)]
internet = internet.reset_index(drop = True)

In [146]:
print(internet['iso_o'].nunique())

75


In [147]:
# Year columns 1999-2020 (integer labels).
year = list(range(1999, 2021))
# Replace the '..' placeholder with NaN, coerce every year column to numeric,
# and round to two decimals.
internet[year] = (
    internet[year]
    .replace('..', np.nan)
    .apply(pd.to_numeric, errors = 'coerce')
    .round(2)
)

In [148]:
missing = internet.isna().sum()
missing

iso_o    0
1999     0
2000     2
2001     0
2002     1
2003     2
2004     2
2005     1
2006     0
2007     0
2008     0
2009     0
2010     0
2011     0
2012     0
2013     0
2014     0
2015     0
2016     0
2017     0
2018     1
2019     0
2020     0
dtype: int64

In [149]:
# Fill each missing value with the mean of its own year column across
# countries. fillna() with the Series of column means fills column by column,
# so every year with gaps is handled in one step; years without gaps are left
# unchanged. The per-year mean_internet_YYYY variables are no longer created.
# NOTE(review): the cross-country mean produces implausible values for some
# countries (e.g. AUS shows 52.7 in 2001, 23.69 in 2002 and 63.0 in 2005 in
# the preview below); consider interpolating along time within each country.
internet[year] = internet[year].fillna(internet[year].mean())

In [150]:
internet.head()

Unnamed: 0,iso_o,1999,2000,2001,2002,2003,2004,2005,2006,2007,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,ARG,3.28,7.04,9.78,10.9,11.9,16.0,17.7,20.9,25.9,...,51.0,55.8,59.9,64.7,68.0,71.0,74.3,77.7,79.9,85.5
1,AUS,40.8,46.8,52.7,23.694054,27.593699,31.496301,63.0,66.0,69.5,...,79.5,79.0,83.5,84.0,84.6,86.5,86.5,90.0,93.6,94.7
2,AUT,23.0,33.7,39.2,36.6,42.7,54.3,58.0,63.6,69.4,...,78.7,80.0,80.6,81.0,83.9,84.3,87.9,87.5,87.8,87.5
3,BGD,0.04,0.07,0.13,0.14,0.16,0.2,0.24,1.0,1.8,...,4.5,5.0,6.63,11.9,12.9,18.1,21.5,25.6,30.4,36.1
4,BLR,0.5,1.86,4.3,8.95,27.593699,31.496301,34.648108,16.2,19.7,...,39.6,46.9,54.2,59.0,62.2,71.1,74.4,79.1,82.8,85.1


In [151]:
internet_long = internet.set_index('iso_o').stack().reset_index()

In [152]:
internet_long.head(2)

Unnamed: 0,iso_o,level_1,0
0,ARG,1999,3.28
1,ARG,2000,7.04


In [153]:
# Name the stacked year level and the value column.
internet_long = internet_long.rename(columns = {'level_1': 'TIME_PERIOD', 0: 'internet'})

In [154]:
assert internet_long['iso_o'].nunique() == internet['iso_o'].nunique()

### GDP per capita

In [155]:
gdp_pct = pd.read_csv('../data/raw/ad57150c-c77e-4bee-bb87-4174bb32e6a0_Data.csv')

In [156]:
# Keep the first 265 rows (presumably the country rows, excluding trailing
# footer lines — TODO confirm), rename 'Country Code' to 'iso_o', turn the
# 'YYYY [YRYYYY]' headers for 1999-2020 into integer years, and drop the
# descriptive columns.
gdp_pct = (
    gdp_pct.iloc[:265]
    .rename(columns = {
        'Country Code': 'iso_o',
        **{f'{y} [YR{y}]': y for y in range(1999, 2021)},
    })
    .drop(columns = ['Country Name', 'Series Name', 'Series Code'])
)

In [157]:
gdp_pct = gdp_pct[gdp_pct['iso_o'].isin(countries)]
gdp_pct = gdp_pct.reset_index(drop = True)

In [158]:
# Replace the '..' placeholder with NaN, coerce every year column to numeric,
# and round to two decimals.
gdp_pct[year] = (
    gdp_pct[year]
    .replace('..', np.nan)
    .apply(pd.to_numeric, errors = 'coerce')
    .round(2)
)

In [159]:
missing1 = gdp_pct.isna().sum()
missing1

iso_o    0
1999     0
2000     0
2001     0
2002     0
2003     0
2004     0
2005     0
2006     0
2007     0
2008     0
2009     0
2010     0
2011     0
2012     0
2013     0
2014     0
2015     0
2016     0
2017     0
2018     0
2019     0
2020     0
dtype: int64

In [160]:
gdp_pct['iso_o'].nunique()

75

In [161]:
gdp_pct.head(3)

Unnamed: 0,iso_o,1999,2000,2001,2002,2003,2004,2005,2006,2007,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,ARG,10838.32,10631.65,10051.94,8861.56,9545.53,10302.45,11099.75,11870.28,12811.9,...,14040.62,13754.43,13946.1,13456.13,13679.63,13265.89,13520.11,13058.33,12706.4,11393.05
1,AUS,44636.94,45859.52,46191.31,47486.28,48394.13,49902.02,50853.2,51553.97,52531.76,...,54111.36,55257.57,55728.02,56327.85,56739.03,57401.07,57750.65,58530.78,58924.73,58132.8
2,AUT,37468.87,38571.09,38929.75,39313.72,39569.29,40333.33,40989.01,42120.16,43568.83,...,44146.2,44221.46,43851.37,43838.83,43915.23,44362.67,45056.64,45951.58,46550.56,43428.7


In [162]:
gdp_pct_long = gdp_pct.set_index('iso_o').stack().reset_index()

In [163]:
gdp_pct_long.head(3)

Unnamed: 0,iso_o,level_1,0
0,ARG,1999,10838.32
1,ARG,2000,10631.65
2,ARG,2001,10051.94


In [164]:
# Name the stacked year level and the value column.
gdp_pct_long = gdp_pct_long.rename(columns = {'level_1': 'TIME_PERIOD', 0: 'gdp_pct'})

In [165]:
# Reshaping to long format must not lose any country: compare the long table
# with the wide table it was built from (the check previously compared
# gdp_pct_long with itself).
assert gdp_pct_long['iso_o'].nunique() == gdp_pct['iso_o'].nunique()

In [166]:
# Wide and long versions of GDP per capita and internet penetration must all
# have the same number of countries (gdp_pct_long previously appeared twice
# and gdp_pct was never checked).
assert gdp_pct['iso_o'].nunique() == gdp_pct_long['iso_o'].nunique() == internet['iso_o'].nunique() == internet_long['iso_o'].nunique()

### Expected Years of Schooling

In [167]:
schooling = pd.read_excel("../data/raw/hdr-data.xlsx")

In [168]:
schooling.tail()

Unnamed: 0,countryIsoCode,country,indexCode,index,dimension,indicatorCode,indicator,year,value,note
4179,ZWE,Zimbabwe,HDI,Human Development Index,,eys,Expected Years of Schooling (years),2016,10.899038,
4180,ZWE,Zimbabwe,HDI,Human Development Index,,eys,Expected Years of Schooling (years),2017,10.930538,
4181,ZWE,Zimbabwe,HDI,Human Development Index,,eys,Expected Years of Schooling (years),2018,10.96213,
4182,ZWE,Zimbabwe,HDI,Human Development Index,,eys,Expected Years of Schooling (years),2019,10.993813,
4183,ZWE,Zimbabwe,HDI,Human Development Index,,eys,Expected Years of Schooling (years),2020,11.025587,


In [169]:
schooling = schooling[['countryIsoCode', 'year', 'value']]

In [170]:
schooling = schooling.rename(columns = {'countryIsoCode': 'iso_o'})

In [171]:
schooling = schooling.rename(columns = {'year': 'TIME_PERIOD'})

In [172]:
schooling.dtypes

iso_o           object
TIME_PERIOD      int64
value          float64
dtype: object

In [173]:
schooling

Unnamed: 0,iso_o,TIME_PERIOD,value
0,AFG,1999,5.564425
1,AFG,2000,5.856422
2,AFG,2001,6.148418
3,AFG,2002,6.440414
4,AFG,2003,6.732410
...,...,...,...
4179,ZWE,2016,10.899038
4180,ZWE,2017,10.930538
4181,ZWE,2018,10.962130
4182,ZWE,2019,10.993813


In [174]:
missing2 = schooling.isna().sum()
missing2

iso_o          0
TIME_PERIOD    0
value          0
dtype: int64

In [176]:
#mean_schooling_1999 = schooling[schooling['TIME_PERIOD'] == 1999].mean()
#schooling['value'] = schooling['value'].fillna(mean_schooling_1999)

In [177]:
# Number of distinct country codes in the raw file, before restricting to the study sample.
schooling['iso_o'].nunique()

194

In [178]:
# Codes in the `countries` sample that have no rows in the schooling data.
miss = countries - set(schooling['iso_o'])
print(miss)

{'WXD', 'TWN'}


In [179]:
# Restrict to the sample countries and renumber the rows from zero.
schooling = schooling.loc[schooling['iso_o'].isin(countries)].reset_index(drop=True)

In [180]:
# Rows and columns remaining after the country filter.
schooling.shape

(1650, 3)

In [181]:
# Distinct countries remaining after the country filter.
schooling['iso_o'].nunique()

75

In [182]:
# Years covered by the schooling series.
schooling['TIME_PERIOD'].unique()

array([1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
       2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020],
      dtype=int64)

In [183]:
# Preview the filtered frame.
schooling.head(3)

Unnamed: 0,iso_o,TIME_PERIOD,value
0,ARG,1999,15.17326
1,ARG,2000,15.68757
2,ARG,2001,16.30578


In [184]:
#print(type(next(iter(countries))))
#print(schooling['iso_o'].apply(repr).head())

In [185]:
# Name the indicator column after the variable so it stays identifiable after merging.
schooling = schooling.rename({'value': 'schooling'}, axis='columns')

In [186]:
# The three node-level sources must cover the same number of countries.
assert internet['iso_o'].nunique() == gdp_pct['iso_o'].nunique() == schooling['iso_o'].nunique()

In [187]:
# Stronger than the count check: the country codes themselves must be identical
# across the three sources, not just equally many.
assert set(internet['iso_o'].unique()) == set(gdp_pct['iso_o'].unique()) == set(schooling['iso_o'].unique())

#### Merging internet penetration, GDP per capita and Mean year of schooling

In [188]:
# Build the node-level panel: start from the internet series and left-join
# GDP per capita and schooling on country and year, so every internet row is kept.
merged = pd.merge(
    pd.merge(internet_long, gdp_pct_long, how='left', on=['iso_o', 'TIME_PERIOD']),
    schooling,
    how='left',
    on=['iso_o', 'TIME_PERIOD'],
)

In [189]:
# Preview the merged node-level panel.
merged.head()

Unnamed: 0,iso_o,TIME_PERIOD,internet,gdp_pct,schooling
0,ARG,1999,3.28,10838.32,15.17326
1,ARG,2000,7.04,10631.65,15.68757
2,ARG,2001,9.78,10051.94,16.30578
3,ARG,2002,10.9,8861.56,16.392509
4,ARG,2003,11.9,9545.53,16.26639


### Pruning data to solve country list mismatch

OECD datasets have 77 countries.
The RTA dataset has 76, while the geo-cepii and dist-cepii extracts and the WDI and UNDP datasets have 75 countries each. Hence, I will retain only the countries present in every dataset; the intersection computed below contains 74 countries.

In [190]:
# Distinct country codes in the merged node-level panel.
# NOTE(review): `common` is not referenced in the remaining cells shown here;
# the pruning uses the set `com` instead. Confirm whether it is still needed.
common = list(set(merged['iso_o'].unique()))

In [191]:
# Countries present in `fixed` but absent from the merged node-level panel.
set(fixed['iso_o'].unique()) - set(merged['iso_o'].unique())

{'TWN'}

In [192]:
# Countries present in `nodeUnique_imf` but absent from the merged node-level panel.
set(nodeUnique_imf['iso_o'].unique()) - set(merged['iso_o'].unique())

{'TWN'}

In [193]:
# The reverse direction: countries in the merged panel that `nodeUnique_imf` lacks.
set(merged['iso_o'].unique()) - set(nodeUnique_imf['iso_o'].unique())

{'ROU'}

In [194]:
# Countries present in every dataset.
# Node-level frames carry only `iso_o`. Dyad-level frames are pruned on both
# `iso_o` and `iso_d` further down, so a country must appear on both sides of
# each of them. (The earlier version intersected each dyad frame's `iso_o`
# twice and never looked at `iso_d`.)
com = set.intersection(
    *(
        set(frame['iso_o'].unique())
        for frame in (
            merged, nodeUnique_imf, internet, internet_long,
            gdp_pct, gdp_pct_long, schooling,
        )
    ),
    *(
        set(frame[side].unique())
        for frame in (
            fixed, dyad_fixed_unique, rta_edgelist, fixed_con, distance,
            comcol, col, lang, co2_uni, edgelist_co2, total_unique,
            edgelist_trade_rounded, rta,
        )
        for side in ('iso_o', 'iso_d')
    ),
)

In [195]:
# Size of the common country set used for pruning below.
len(com)

74

In [196]:
# Keep only dyads whose exporter and importer are both in the common country set.
fixed = fixed[fixed['iso_o'].isin(com) & fixed['iso_d'].isin(com)]

In [197]:
# Keep only dyads whose exporter and importer are both in the common country set.
dyad_fixed_unique = dyad_fixed_unique[
    dyad_fixed_unique['iso_o'].isin(com) & dyad_fixed_unique['iso_d'].isin(com)
]

In [198]:
# Node-level frame: keep only countries in the common set.
nodeUnique_imf = nodeUnique_imf.loc[nodeUnique_imf['iso_o'].isin(com)]

In [199]:
# Keep only dyads whose exporter and importer are both in the common country set.
fixed_con = fixed_con[fixed_con['iso_o'].isin(com) & fixed_con['iso_d'].isin(com)]

In [200]:
# Keep only dyads whose exporter and importer are both in the common country set.
distance = distance[distance['iso_o'].isin(com) & distance['iso_d'].isin(com)]

In [201]:
# Keep only dyads whose exporter and importer are both in the common country set.
comcol = comcol[comcol['iso_o'].isin(com) & comcol['iso_d'].isin(com)]

In [202]:
# Keep only dyads whose exporter and importer are both in the common country set.
col = col[col['iso_o'].isin(com) & col['iso_d'].isin(com)]

In [203]:
# Keep only dyads whose exporter and importer are both in the common country set.
lang = lang[lang['iso_o'].isin(com) & lang['iso_d'].isin(com)]

In [204]:
# Keep only dyads whose exporter and importer are both in the common country set.
co2_uni = co2_uni[co2_uni['iso_o'].isin(com) & co2_uni['iso_d'].isin(com)]

In [205]:
# Keep only dyads whose exporter and importer are both in the common country set.
edgelist_co2 = edgelist_co2[
    edgelist_co2['iso_o'].isin(com) & edgelist_co2['iso_d'].isin(com)
]

In [206]:
# Keep only dyads whose exporter and importer are both in the common country set.
total_unique = total_unique[
    total_unique['iso_o'].isin(com) & total_unique['iso_d'].isin(com)
]

In [207]:
# Keep only dyads whose exporter and importer are both in the common country set.
edgelist_trade_rounded = edgelist_trade_rounded[
    edgelist_trade_rounded['iso_o'].isin(com) & edgelist_trade_rounded['iso_d'].isin(com)
]

In [208]:
# Node-level frames (wide and long): keep only countries in the common set.
internet = internet.loc[internet['iso_o'].isin(com)]
internet_long = internet_long.loc[internet_long['iso_o'].isin(com)]

In [209]:
# Node-level frames (wide and long): keep only countries in the common set.
gdp_pct = gdp_pct.loc[gdp_pct['iso_o'].isin(com)]
gdp_pct_long = gdp_pct_long.loc[gdp_pct_long['iso_o'].isin(com)]

In [210]:
# Node-level frames: keep only countries in the common set.
schooling = schooling.loc[schooling['iso_o'].isin(com)]
merged = merged.loc[merged['iso_o'].isin(com)]

In [211]:
# Keep only dyads whose exporter and importer are both in the common country set.
rta = rta[rta['iso_o'].isin(com) & rta['iso_d'].isin(com)]

In [212]:
# Keep only dyads whose exporter and importer are both in the common country set.
rta_edgelist = rta_edgelist[
    rta_edgelist['iso_o'].isin(com) & rta_edgelist['iso_d'].isin(com)
]

In [213]:
# Sanity check after pruning: for each of the 13 dyad-level frames print the
# number of distinct exporters, then the number of distinct importers
# (two lines per frame, in the order listed).
print(
    *(
        frame[side].nunique()
        for frame in (
            fixed,                   # 1
            dyad_fixed_unique,       # 2
            fixed_con,               # 3
            distance,                # 4
            comcol,                  # 5
            col,                     # 6
            lang,                    # 7
            co2_uni,                 # 8
            edgelist_co2,            # 9
            total_unique,            # 10
            edgelist_trade_rounded,  # 11
            rta,                     # 12
            rta_edgelist,            # 13
        )
        for side in ('iso_o', 'iso_d')
    ),
    sep='\n',
)

74
74
74
74
74
74
74
74
74
74
74
74
74
74
74
74
74
74
74
74
74
74
74
74
74
74


In [214]:
# Sanity check after pruning: number of distinct countries in each of the
# 7 node-level frames (one line per frame, in the order listed).
print(
    *(
        frame['iso_o'].nunique()
        for frame in (
            nodeUnique_imf,  # 1
            internet,        # 2
            internet_long,   # 3
            gdp_pct,         # 4
            gdp_pct_long,    # 5
            schooling,       # 6
            merged,          # 7
        )
    ),
    sep='\n',
)

74
74
74
74
74
74
74


In [215]:
# Write the pruned `fixed` frame; index=False keeps the row index out of the file.
fixed.to_csv("../data/cleaned/fixed.csv", encoding='utf-8', index=False)

In [216]:
# Write the pruned dyad-level frame `dyad_fixed_unique` (row index omitted).
dyad_fixed_unique.to_csv("../data/cleaned/dyads_fixed.csv", encoding='utf-8', index=False)

In [217]:
# Write the pruned node-level frame `nodeUnique_imf` (row index omitted).
nodeUnique_imf.to_csv("../data/cleaned/nodal_fixed.csv", encoding='utf-8', index=False)

In [218]:
# Write the pruned contiguity edgelist (row index omitted).
fixed_con.to_csv('../data/cleaned/contiguity_edgelist.csv', encoding = 'utf-8', index = False) 

In [219]:
# Write the pruned distance edgelist (row index omitted).
distance.to_csv('../data/cleaned/distance_edgelist.csv', encoding = 'utf-8', index = False) 

In [220]:
# Write the pruned `comcol` edgelist (row index omitted).
comcol.to_csv('../data/cleaned/comcol_edgelist.csv', encoding = 'utf-8', index = False)

In [221]:
# Write the pruned `col` edgelist (row index omitted).
col.to_csv('../data/cleaned/colonizer_edgelist.csv', encoding = 'utf-8', index = False)

In [222]:
# Write the pruned `lang` edgelist (row index omitted).
lang.to_csv('../data/cleaned/language_edgelist.csv', encoding = 'utf-8', index = False)

In [223]:
# Write the pruned dyad-level frame `co2_uni` (row index omitted).
co2_uni.to_csv("../data/cleaned/emission_trade2000_2020.csv", encoding='utf-8', index=False)

In [224]:
# Write the pruned `edgelist_co2` edgelist (row index omitted).
edgelist_co2.to_csv("../data/cleaned/emission_trade_edgelist.csv", encoding='utf-8', index=False)

In [225]:
# Write the pruned dyad-level frame `total_unique` (row index omitted).
total_unique.to_csv("../data/cleaned/trade2000_2020.csv", encoding='utf-8', index=False)

In [226]:
# Write the pruned `edgelist_trade_rounded` edgelist (row index omitted).
edgelist_trade_rounded.to_csv("../data/cleaned/trade_edgelist.csv", encoding='utf-8', index=False)

In [227]:
# Write both the wide and the long internet frames (row index omitted).
internet.to_csv("../data/cleaned/internet.csv", encoding = 'utf-8', index = False)
internet_long.to_csv("../data/cleaned/internet_long.csv", encoding = 'utf-8', index = False)              

In [228]:
# Write both the wide and the long GDP per capita frames (row index omitted).
gdp_pct.to_csv("../data/cleaned/gdp_pct.csv", encoding = 'utf-8', index = False)
gdp_pct_long.to_csv("../data/cleaned/gdp_pct_long.csv", encoding = 'utf-8', index = False)  

In [229]:
# Write the schooling series and the merged node-level panel (internet, GDP per
# capita and schooling by country and year); row index omitted.
schooling.to_csv("../data/cleaned/schooling.csv", encoding = 'utf-8', index = False)
merged.to_csv("../data/cleaned/nodal_time_series.csv", encoding = 'utf-8', index = False)  

In [230]:
# Write the pruned dyad-level frame `rta` (row index omitted).
rta.to_csv("../data/cleaned/rta_long.csv", encoding = 'utf-8', index = False)

In [231]:
# Write the pruned `rta_edgelist` edgelist (row index omitted).
rta_edgelist.to_csv("../data/cleaned/rta_edgelist.csv", encoding = 'utf-8', index = False)