In [246]:
import pandas as pd

**Preparing edgelist for SNA takes the following steps:** 

For edgelists on bilateral trade data: 
> 1. Loading total, unique, reproducible trade data and the fixed dataset from the cleaned subfolder of the data folder.
> 2. Checking for mismatched countries in all data frames using inner join method as the study will consider only the countries that are present in all data frames
> 3. Making an edge list for total goods trade and saving it in the 'cleaned' subfolder of the folder 'data'
> 4. Making an edge list for unique cultural goods and saving it in the 'cleaned' subfolder of the folder 'data'
> 5. Making an edge list for reproducible cultural goods and saving it in the 'cleaned' subfolder of the folder 'data'
> 6. Saving the edgelists into the 'cleaned' subfolder of 'data' folder

For fixed undirected distance, common language, contiguity, colonizer and common colonizer edgelists:
> 1. Keeping only the relevant column with the origin and destination column to create a primary data frame
> 2. Checking for a single pair of countries whether the value is same for (A, B) and (B, A)
> 3. Adding log transformed distance column to the distance edgelist
> 4. Saving the edgelists into the 'cleaned' subfolder of 'data' folder

`So this notebook will end up making eight different edgelists: three for bilateral trade and five for bilateral time-invariant factors (contiguity, distance, common colonizer, colonizer and common language).`

In [247]:
# Read every cleaned input table in one pass; the order of the paths below
# matches the order of the names on the left-hand side.
total, unique, repro, books, cinema, tapes, fixed = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/cleaned/total2000_2023.csv',
        '../data/cleaned/unique2000_2023.csv',
        '../data/cleaned/reproducible2000_2023.csv',
        '../data/cleaned/books2000_2023.csv',
        '../data/cleaned/cinema2000_2023.csv',
        '../data/cleaned/tapes2000_2023.csv',
        '../data/cleaned/fixed.csv',
    )
)

In [248]:
# Each triple is (left frame, right frame, country column). The printed set
# holds the codes found in the left frame's column but missing from the right one.
mismatch_checks = [
    (total, unique, 'iso_o'),
    (total, repro, 'iso_o'),
    (repro, unique, 'iso_o'),
    (total, unique, 'iso_d'),
    (total, repro, 'iso_d'),
    (repro, unique, 'iso_d'),
    (books, cinema, 'iso_d'),
    (cinema, tapes, 'iso_d'),
    (tapes, cinema, 'iso_d'),
]
for left_frame, right_frame, country_col in mismatch_checks:
    print(set(left_frame[country_col].unique()) - set(right_frame[country_col].unique()))

set()
set()
set()
set()
set()
set()
set()
set()
set()


##### To ensure seamless production of square matrices from the generated edgelists, I am checking if the exporter and importer countries are the same for all three datasets (total, unique and reproducible).

In [249]:
assert total['iso_o'].nunique() == total['iso_d'].nunique() # checking if the number of items are same in both sets

In [250]:
set(total['iso_o'].unique()) == set(total['iso_d'].unique()) # checking if exactly the items are same in both sets

True

In [251]:
assert unique['iso_o'].nunique() == unique['iso_d'].nunique() # checking if the number of items are same in both sets

In [252]:
set(unique['iso_o'].unique()) == set(unique['iso_d'].unique()) # checking if exactly the items are same in both sets

True

In [253]:
assert repro['iso_o'].nunique() == repro['iso_d'].nunique()

In [254]:
set(repro['iso_o'].unique()) == set(repro['iso_d'].unique()) # checking if exactly the items are same in both sets

True

In [255]:
# Number of distinct exporter (iso_o) and importer (iso_d) codes per dataset,
# printed exporter first, then importer, in the order the datasets are listed.
for frame in (total, repro, unique, books, cinema, tapes, fixed):
    for endpoint in ('iso_o', 'iso_d'):
        print(frame[endpoint].nunique())

201
201
201
201
201
201
201
201
200
201
201
201
224
224


###### So, there are 201 countries in the trade datasets, whereas the fixed dataset has 224 countries.

In [256]:
# Country codes present in the fixed dataset but absent from each trade dataset,
# computed separately for exporters (ego, iso_o) and importers (alter, iso_d).
fixed_exporters = set(fixed['iso_o'].unique())
fixed_importers = set(fixed['iso_d'].unique())

f_u_ego, f_r_ego, f_t_ego = (
    fixed_exporters - set(trade['iso_o'].unique()) for trade in (unique, repro, total)
)
f_u_alter, f_r_alter, f_t_alter = (
    fixed_importers - set(trade['iso_d'].unique()) for trade in (unique, repro, total)
)

In [257]:
len(f_u_ego) == len(f_r_ego) == len(f_t_ego) == len(f_u_alter) == len(f_r_alter) == len(f_t_alter)

True

In [258]:
# Reverse direction: country codes present in each trade dataset but absent
# from the fixed dataset, for exporters (iso_o) and importers (iso_d).
fixed_iso_o = set(fixed['iso_o'].unique())
fixed_iso_d = set(fixed['iso_d'].unique())

u_f_ego = set(unique['iso_o'].unique()).difference(fixed_iso_o)
r_f_ego = set(repro['iso_o'].unique()).difference(fixed_iso_o)
t_f_ego = set(total['iso_o'].unique()).difference(fixed_iso_o)

u_f_alter = set(unique['iso_d'].unique()).difference(fixed_iso_d)
r_f_alter = set(repro['iso_d'].unique()).difference(fixed_iso_d)
t_f_alter = set(total['iso_d'].unique()).difference(fixed_iso_d)

In [259]:
len(u_f_ego) == len(r_f_ego) == len(t_f_ego) == len(u_f_alter) == len(r_f_alter) == len(t_f_alter)

True

In [260]:
len(u_f_ego)

9

In [261]:
len(f_u_ego)

32

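###### So, the set of countries that appear in the fixed dataset but not in a trade dataset has the same size (32) for all three trade datasets, and the reverse differences (in a trade dataset but not in the fixed dataset) all have the same size (9) as well.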

###### So, there are 192 countries present in both datasets. This study will prune other countries for seamless social network analysis. 

In [262]:
# Finding out common elements of both sets:
# Reference: https://www.geeksforgeeks.org/python-print-common-elements-two-lists/
# Country codes shared by the fixed dataset and each trade dataset:
# m, n, o are built from the exporter column (iso_o); p, q, r from the importer column (iso_d).
fixed_o_codes = set(fixed['iso_o'].unique())
fixed_d_codes = set(fixed['iso_d'].unique())

m = list(fixed_o_codes.intersection(set(unique['iso_o'].unique())))
n = list(fixed_o_codes.intersection(set(repro['iso_o'].unique())))
o = list(fixed_o_codes.intersection(set(total['iso_o'].unique())))

p = list(fixed_d_codes.intersection(set(unique['iso_d'].unique())))
q = list(fixed_d_codes.intersection(set(repro['iso_d'].unique())))
r = list(fixed_d_codes.intersection(set(total['iso_d'].unique())))

In [263]:
len(m) == len(n) == len(o) == len(p) == len(q) == len(r)

True

In [264]:
len(m)

192

In [265]:
from collections import Counter

In [266]:
Counter(m) == Counter(n) == Counter(o) == Counter(p) == Counter(q) == Counter(r)

True

##### keeping only the common countries in all dyad level datasets.

In [267]:
def _restrict_to_common(frame):
    """Return the rows of `frame` whose iso_o and iso_d codes are both in `m`."""
    both_common = frame['iso_o'].isin(m) & frame['iso_d'].isin(m)
    return frame[both_common]


unique_com = _restrict_to_common(unique)
repro_com = _restrict_to_common(repro)
total_com = _restrict_to_common(total)
fixed_com = _restrict_to_common(fixed)

In [268]:
# Distinct exporter and importer counts after pruning, one dataset at a time.
for pruned in (total_com, repro_com, unique_com, fixed_com):
    for endpoint in ('iso_o', 'iso_d'):
        print(pruned[endpoint].nunique())

192
192
192
192
192
192
192
192


In [269]:
total_com.head()

Unnamed: 0,iso_o,iso_d,TIME_PERIOD,OBS_VALUE,hysteresis_total
0,ABW,AFG,2017,0.065851,0.0
1,ABW,AFG,2018,0.398446,0.065851
2,ABW,AFG,2019,0.254087,0.418202
3,ABW,AGO,2005,0.002843,0.0
4,ABW,AGO,2006,0.060552,0.002843


### Edgelist for total trade & Hysteresis effect

In [270]:
set(total_com['iso_o'].unique()) == set(total_com['iso_d'].unique())

True

In [271]:
# pivot_table aggregates with the mean by default, so a repeated
# (iso_o, iso_d, TIME_PERIOD) row would be averaged silently. Fail loudly instead.
assert not total_com.duplicated(subset = ['iso_o', 'iso_d', 'TIME_PERIOD']).any(), \
    'total_com has more than one row for some dyad-year'

# Wide edgelist: one row per dyad, one column per year; dyad-years that are
# absent from total_com are filled with 0.
edgelist_total = total_com.pivot_table(
    index = ['iso_o', 'iso_d'],
    columns = 'TIME_PERIOD',
    values = 'OBS_VALUE',
    fill_value = 0,
).reset_index()

In [272]:
edgelist_total_rounded = edgelist_total.round(4)

In [273]:
assert total_com.groupby(['iso_o', 'iso_d']).ngroups == edgelist_total_rounded.shape[0]

In [274]:
edgelist_total_rounded.head(3)

TIME_PERIOD,iso_o,iso_d,2000,2001,2002,2003,2004,2005,2006,2007,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,ABW,AFG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0659,0.3984,0.2541,0.0,0.0,0.0,0.0
1,ABW,AGO,0.0,0.0,0.0,0.0,0.0,0.0028,0.0606,0.0239,...,0.0372,0.0,0.0,0.0,0.0,0.0058,0.0,0.0,0.0,0.015
2,ABW,ALB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0003,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [275]:
edgelist_total_rounded.to_csv("../data/cleaned/total_edgelist.csv", encoding='utf-8', index=False)

In [276]:
total_com.head(3)

Unnamed: 0,iso_o,iso_d,TIME_PERIOD,OBS_VALUE,hysteresis_total
0,ABW,AFG,2017,0.065851,0.0
1,ABW,AFG,2018,0.398446,0.065851
2,ABW,AFG,2019,0.254087,0.418202


In [277]:
# Same wide layout as the trade-value edgelist, but the cells hold the
# hysteresis_total column; missing dyad-years are filled with 0.
hyst_total_wide = total_com.pivot_table(
    index = ['iso_o', 'iso_d'],
    columns = 'TIME_PERIOD',
    values = 'hysteresis_total',
    fill_value = 0,
)
edgelist_total_hyst = hyst_total_wide.reset_index()

In [278]:
edgelist_total_hyst.head()

TIME_PERIOD,iso_o,iso_d,2000,2001,2002,2003,2004,2005,2006,2007,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,ABW,AFG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.065851,0.418202,0.0,0.0,0.0,0.0
1,ABW,AGO,0.0,0.0,0.0,0.0,0.0,0.0,0.002843,0.061405,...,4.822801,0.0,0.0,0.0,0.0,1.484037,0.0,0.0,0.0,0.451015
2,ABW,ALB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ABW,ARE,0.0,0.0,0.0,0.000101,0.0,0.0,0.0,0.049586,...,0.159691,0.177917,0.053394,0.462883,0.528631,1.254906,1.021522,0.607121,0.471481,1.367017
4,ABW,ARG,0.0,0.034091,0.01023,0.028921,11.989098,0.0,22.533631,15.074821,...,0.0,0.0,0.0,6.371348,1.911455,0.57359,0.17208,0.0,0.051938,0.016071


In [279]:
edgelist_total_hyst.to_csv("../data/cleaned/edgelist_total_hyst.csv", encoding='utf-8', index=False)

In [280]:
# Prefix the year columns with 'H_' so they stay distinguishable from the
# trade-value year columns after the merge. The dyad keys and any label that
# already carries the prefix are left untouched, so re-running this cell does
# not produce 'H_H_...' labels (which would break the later rename and merge).
edgelist_total_hyst = edgelist_total_hyst.rename(
    columns = lambda label: label
    if str(label) in ('iso_o', 'iso_d') or str(label).startswith('H_')
    else f'H_{label}'
)

In [281]:
# Give the two dyad key columns their plain names back (no-op for labels that are absent).
edgelist_total_hyst = edgelist_total_hyst.rename(
    columns = {'H_iso_o': 'iso_o', 'H_iso_d': 'iso_d'}
)

In [282]:
assert total_com.groupby(['iso_o', 'iso_d']).ngroups == edgelist_total_rounded.shape[0] == edgelist_total_hyst.shape[0]

In [283]:
# Join the rounded trade values with the hysteresis values on the dyad keys
# (default inner join).
total_merged = pd.merge(
    edgelist_total_rounded,
    edgelist_total_hyst,
    on = ['iso_o', 'iso_d'],
)

In [284]:
total_merged.head(3)

TIME_PERIOD,iso_o,iso_d,2000,2001,2002,2003,2004,2005,2006,2007,...,H_2014,H_2015,H_2016,H_2017,H_2018,H_2019,H_2020,H_2021,H_2022,H_2023
0,ABW,AFG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.065851,0.418202,0.0,0.0,0.0,0.0
1,ABW,AGO,0.0,0.0,0.0,0.0,0.0,0.0028,0.0606,0.0239,...,4.822801,0.0,0.0,0.0,0.0,1.484037,0.0,0.0,0.0,0.451015
2,ABW,ALB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [285]:
total_merged.columns

Index([ 'iso_o',  'iso_d',     2000,     2001,     2002,     2003,     2004,
           2005,     2006,     2007,     2008,     2009,     2010,     2011,
           2012,     2013,     2014,     2015,     2016,     2017,     2018,
           2019,     2020,     2021,     2022,     2023, 'H_2000', 'H_2001',
       'H_2002', 'H_2003', 'H_2004', 'H_2005', 'H_2006', 'H_2007', 'H_2008',
       'H_2009', 'H_2010', 'H_2011', 'H_2012', 'H_2013', 'H_2014', 'H_2015',
       'H_2016', 'H_2017', 'H_2018', 'H_2019', 'H_2020', 'H_2021', 'H_2022',
       'H_2023'],
      dtype='object', name='TIME_PERIOD')

In [286]:
# Reorder so that each year's trade value is immediately followed by its
# hysteresis column: iso_o, iso_d, 2000, H_2000, 2001, H_2001, ..., 2023, H_2023.
interleaved_total_cols = [
    label
    for year in range(2000, 2024)
    for label in (year, f'H_{year}')
]
total_merged = total_merged[['iso_o', 'iso_d', *interleaved_total_cols]]

In [287]:
assert total_com.groupby(['iso_o', 'iso_d']).ngroups == edgelist_total_rounded.shape[0] == edgelist_total_hyst.shape[0] == total_merged.shape[0]

In [288]:
total_merged.to_csv("../data/cleaned/edgelist_total.csv", encoding='utf-8', index=False)

### Edgelist for unique cultural goods trade & Hysteresis Effect

In [289]:
set(unique_com['iso_o'].unique()) == set(unique_com['iso_d'].unique())

True

In [290]:
# pivot_table aggregates with the mean by default, so a repeated
# (iso_o, iso_d, TIME_PERIOD) row would be averaged silently. Fail loudly instead.
assert not unique_com.duplicated(subset = ['iso_o', 'iso_d', 'TIME_PERIOD']).any(), \
    'unique_com has more than one row for some dyad-year'

# Wide edgelist: one row per dyad, one column per year; dyad-years that are
# absent from unique_com are filled with 0.
edgelist_unique = unique_com.pivot_table(
    index = ['iso_o', 'iso_d'],
    columns = 'TIME_PERIOD',
    values = 'OBS_VALUE',
    fill_value = 0,
).reset_index()

In [291]:
edgelist_unique_rounded = edgelist_unique.round(4)

In [292]:
edgelist_unique_rounded['iso_o'].nunique()

192

In [293]:
assert unique_com.groupby(['iso_o', 'iso_d']).ngroups == edgelist_unique_rounded.shape[0] # making sure we have same number of dyads in both panel data and edgelist

In [294]:
edgelist_unique_rounded.to_csv("../data/cleaned/unique_edgelist.csv", encoding='utf-8', index=False)

In [295]:
unique_com.head(3)

Unnamed: 0,iso_o,iso_d,PRODUCT_HS,TIME_PERIOD,OBS_VALUE,ADJUSTMENT,hysteresis_unique
0,ABW,AGO,HS17_97,2014,2.8e-05,B_ADJ_RX,0.0
1,ABW,ARE,HS17_97,2005,0.000641,B_ADJ_RX,0.0
2,ABW,ARE,HS17_97,2014,5.3e-05,B_ADJ_RX,0.000641


In [296]:
# Same wide layout as the trade-value edgelist, but the cells hold the
# hysteresis_unique column; missing dyad-years are filled with 0.
hyst_unique_wide = unique_com.pivot_table(
    index = ['iso_o', 'iso_d'],
    columns = 'TIME_PERIOD',
    values = 'hysteresis_unique',
    fill_value = 0,
)
edgelist_unique_hyst = hyst_unique_wide.reset_index()

In [297]:
edgelist_unique_hyst.head(3)

TIME_PERIOD,iso_o,iso_d,2000,2001,2002,2003,2004,2005,2006,2007,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,ABW,AGO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ABW,ARE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000641,0.0,0.0,0.0,0.0,0.0,0.000245,0.0,9.3e-05,0.000159
2,ABW,ATG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [298]:
edgelist_unique_hyst.to_csv("../data/cleaned/edgelist_unique_hyst.csv", encoding='utf-8', index=False)

In [299]:
# Prefix the year columns with 'H_' so they stay distinguishable from the
# trade-value year columns after the merge. The dyad keys and any label that
# already carries the prefix are left untouched, so re-running this cell does
# not produce 'H_H_...' labels (which would break the later rename and merge).
edgelist_unique_hyst = edgelist_unique_hyst.rename(
    columns = lambda label: label
    if str(label) in ('iso_o', 'iso_d') or str(label).startswith('H_')
    else f'H_{label}'
)

In [300]:
# Give the two dyad key columns their plain names back (no-op for labels that are absent).
edgelist_unique_hyst = edgelist_unique_hyst.rename(
    columns = {'H_iso_o': 'iso_o', 'H_iso_d': 'iso_d'}
)

In [301]:
# Join the rounded trade values with the hysteresis values on the dyad keys
# (default inner join).
unique_merged = pd.merge(
    edgelist_unique_rounded,
    edgelist_unique_hyst,
    on = ['iso_o', 'iso_d'],
)

In [302]:
# Reorder so that each year's trade value is immediately followed by its
# hysteresis column: iso_o, iso_d, 2000, H_2000, 2001, H_2001, ..., 2023, H_2023.
interleaved_unique_cols = [
    label
    for year in range(2000, 2024)
    for label in (year, f'H_{year}')
]
unique_merged = unique_merged[['iso_o', 'iso_d', *interleaved_unique_cols]]

In [303]:
unique_merged.head()

TIME_PERIOD,iso_o,iso_d,2000,H_2000,2001,H_2001,2002,H_2002,2003,H_2003,...,2019,H_2019,2020,H_2020,2021,H_2021,2022,H_2022,2023,H_2023
0,ABW,AGO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ABW,ARE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000245,0.0,0.0,0.0001,9.3e-05,0.0001,0.000159
2,ABW,ATG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ABW,AUS,0.0,0.0,0.0,0.0,0.0002,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ABW,AUT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [304]:
assert unique_com.groupby(['iso_o', 'iso_d']).ngroups == edgelist_unique_rounded.shape[0] == edgelist_unique_hyst.shape[0] == unique_merged.shape[0]

In [305]:
unique_merged.to_csv("../data/cleaned/edgelist_unique.csv", encoding='utf-8', index=False)

### Edgelist for reproducible cultural goods trade & Hysteresis effect

In [306]:
set(repro_com['iso_o'].unique()) == set(repro_com['iso_d'].unique())

True

In [307]:
# pivot_table aggregates with the mean by default, so a repeated
# (iso_o, iso_d, TIME_PERIOD) row would be averaged silently. Fail loudly instead.
assert not repro_com.duplicated(subset = ['iso_o', 'iso_d', 'TIME_PERIOD']).any(), \
    'repro_com has more than one row for some dyad-year'

# Wide edgelist: one row per dyad, one column per year; dyad-years that are
# absent from repro_com are filled with 0.
edgelist_repro = repro_com.pivot_table(
    index = ['iso_o', 'iso_d'],
    columns = 'TIME_PERIOD',
    values = 'OBS_VALUE',
    fill_value = 0,
).reset_index()

In [308]:
edgelist_repro_rounded = edgelist_repro.round(4)

In [309]:
edgelist_repro_rounded['iso_o'].nunique()

192

In [310]:
assert repro_com.groupby(['iso_o', 'iso_d']).ngroups == edgelist_repro_rounded.shape[0]

In [311]:
edgelist_repro_rounded.to_csv("../data/cleaned/reproducible_edgelist.csv", encoding='utf-8', index=False)

In [312]:
# Same wide layout as the trade-value edgelist, but the cells hold the
# hysteresis_repro column; missing dyad-years are filled with 0.
hyst_repro_wide = repro_com.pivot_table(
    index = ['iso_o', 'iso_d'],
    columns = 'TIME_PERIOD',
    values = 'hysteresis_repro',
    fill_value = 0,
)
edgelist_repro_hyst = hyst_repro_wide.reset_index()

In [313]:
edgelist_repro_hyst.to_csv("../data/cleaned/edgelist_repro_hyst.csv", encoding='utf-8', index=False)

In [314]:
# Prefix the year columns with 'H_' so they stay distinguishable from the
# trade-value year columns after the merge. The dyad keys and any label that
# already carries the prefix are left untouched, so re-running this cell does
# not produce 'H_H_...' labels (which would break the later rename and merge).
edgelist_repro_hyst = edgelist_repro_hyst.rename(
    columns = lambda label: label
    if str(label) in ('iso_o', 'iso_d') or str(label).startswith('H_')
    else f'H_{label}'
)

In [315]:
# Give the two dyad key columns their plain names back (no-op for labels that are absent).
edgelist_repro_hyst = edgelist_repro_hyst.rename(
    columns = {'H_iso_o': 'iso_o', 'H_iso_d': 'iso_d'}
)

In [316]:
# Join the rounded trade values with the hysteresis values on the dyad keys
# (default inner join).
repro_merged = pd.merge(
    edgelist_repro_rounded,
    edgelist_repro_hyst,
    on = ['iso_o', 'iso_d'],
)

In [317]:
# Reorder so that each year's trade value is immediately followed by its
# hysteresis column: iso_o, iso_d, 2000, H_2000, 2001, H_2001, ..., 2023, H_2023.
interleaved_repro_cols = [
    label
    for year in range(2000, 2024)
    for label in (year, f'H_{year}')
]
repro_merged = repro_merged[['iso_o', 'iso_d', *interleaved_repro_cols]]

In [318]:
repro_merged.head()

TIME_PERIOD,iso_o,iso_d,2000,H_2000,2001,H_2001,2002,H_2002,2003,H_2003,...,2019,H_2019,2020,H_2020,2021,H_2021,2022,H_2022,2023,H_2023
0,ABW,AFG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ABW,AGO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ABW,ARE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0196,0.11647,0.0061,0.054564,0.0019,0.022476,0.01,0.008654,0.0921,0.012594
3,ABW,ARG,0.0001,0.0,0.0,0.0,0.0003,0.000104,0.0,0.000289,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ABW,ATG,0.0,0.0,0.0002,0.0,0.0004,0.000193,0.0063,0.00045,...,0.0002,0.002042,0.0001,0.000832,0.0,0.0,0.0011,0.000347,0.0,0.0


In [319]:
assert repro_com.groupby(['iso_o', 'iso_d']).ngroups == edgelist_repro_rounded.shape[0] == edgelist_repro_hyst.shape[0] == repro_merged.shape[0]

In [320]:
repro_com.head(2)

Unnamed: 0,iso_o,iso_d,TIME_PERIOD,OBS_VALUE,PRODUCT_HS,product_count,hysteresis_repro
0,ABW,AFG,2019,0.012061,"('HS17_49', 'HS17_85')",2,0.0
1,ABW,AGO,2008,0.000165,"('HS17_85',)",1,0.0


In [321]:
repro_merged.to_csv("../data/cleaned/edgelist_repro.csv", encoding='utf-8', index=False)

### Edgelist for contiguity

In [322]:
set(fixed_com['iso_o'].unique()) == set(fixed_com['iso_d'].unique())

True

In [323]:
fixed_com_con = fixed_com[['iso_o', 'iso_d', 'contig']]

In [324]:
fixed_com_con.shape

(36672, 3)

In [325]:
# Print the same dyad in both directions so the contig values can be compared.
for exporter, importer in [('BGD', 'IND'), ('IND', 'BGD')]:
    is_pair = (fixed_com_con['iso_o'] == exporter) & (fixed_com_con['iso_d'] == importer)
    print(fixed_com_con[is_pair])

     iso_o iso_d  contig
4105   BGD   IND       1
      iso_o iso_d  contig
20534   IND   BGD       1


In [326]:
# Distinct exporters, distinct importers, and number of distinct dyads in the
# contiguity edgelist. The second line previously repeated iso_o, so the
# importer side was never actually counted.
print(fixed_com_con['iso_o'].nunique())
print(fixed_com_con['iso_d'].nunique())
print(fixed_com_con.groupby(['iso_o', 'iso_d']).ngroups)

192
192
36672


In [327]:
fixed_com_con.to_csv('../data/cleaned/contiguity_edgelist.csv', encoding = 'utf-8', index = False) 

### Edgelist for distance

In [328]:
fixed_com.head(2)

Unnamed: 0,iso_o,iso_d,contig,comlang_off,colony,comcol,dist,log_dist,country,landlocked,continent,lat,lon,langoff_1,colonizer1,category,development
0,ABW,AFG,0,0,0,0,13257.81,9.492342,Aruba,0,America,12.55,-70.099998,Dutch,NLD,Emerging and Developing Economies,0.0
1,ABW,AGO,0,0,0,0,9516.913,9.160826,Aruba,0,America,12.55,-70.099998,Dutch,NLD,Emerging and Developing Economies,0.0


In [329]:
distance = fixed_com[['iso_o', 'iso_d', 'log_dist']]

In [330]:
# Print the same dyad in both directions so the log_dist values can be compared.
for exporter, importer in [('BGD', 'IND'), ('IND', 'BGD')]:
    is_pair = (distance['iso_o'] == exporter) & (distance['iso_d'] == importer)
    print(distance[is_pair])

     iso_o iso_d  log_dist
4105   BGD   IND  7.259776
      iso_o iso_d  log_dist
20534   IND   BGD  7.259776


In [331]:
distance.shape

(36672, 3)

In [332]:
print(distance['iso_o'].nunique())
print(distance['iso_d'].nunique())

192
192


In [333]:
distance.head(2)

Unnamed: 0,iso_o,iso_d,log_dist
0,ABW,AFG,9.492342
1,ABW,AGO,9.160826


In [334]:
distance.to_csv('../data/cleaned/distance_edgelist.csv', encoding = 'utf-8', index = False) 

### Edgelist for common colonizer

In [335]:
comcol = fixed_com[['iso_o', 'iso_d', 'comcol']]

In [336]:
comcol.shape

(36672, 3)

In [337]:
# Print the same dyad in both directions so the comcol values can be compared.
for exporter, importer in [('BGD', 'IND'), ('IND', 'BGD')]:
    is_pair = (comcol['iso_o'] == exporter) & (comcol['iso_d'] == importer)
    print(comcol[is_pair])

     iso_o iso_d  comcol
4105   BGD   IND       1
      iso_o iso_d  comcol
20534   IND   BGD       1


In [338]:
print(comcol['iso_o'].nunique())
print(comcol['iso_d'].nunique())

192
192


In [339]:
comcol.to_csv('../data/cleaned/comcol_edgelist.csv', encoding = 'utf-8', index = False)

### Edgelist for colonizer

In [340]:
col = fixed_com[['iso_o', 'iso_d', 'colony']]

In [341]:
col.shape

(36672, 3)

In [342]:
# Spot-check the colony flag for two dyads that both have GBR as the destination.
# NOTE(review): the other fixed-factor sections print one dyad in both directions
# ((BGD, IND) and (IND, BGD)); here neither lookup is the reverse of the other, so
# this does not compare (A, B) with (B, A). If that comparison is intended, also
# inspect (GBR, BGD) — confirm with the author.
print(col[(col['iso_o'] == 'BGD') & (col['iso_d'] == 'GBR')])
print(col[(col['iso_o'] == 'IND') & (col['iso_d'] == 'GBR')])

     iso_o iso_d  colony
4084   BGD   GBR       1
      iso_o iso_d  colony
20587   IND   GBR       1


In [343]:
print(col['iso_o'].nunique())
print(col['iso_d'].nunique())

192
192


In [344]:
col.to_csv('../data/cleaned/colonizer_edgelist.csv', encoding = 'utf-8', index = False)

### Edgelist for common language

In [345]:
lang = fixed_com[['iso_o', 'iso_d', 'comlang_off']]

In [346]:
lang.shape

(36672, 3)

In [347]:
# Print the same dyad in both directions so the comlang_off values can be compared.
for exporter, importer in [('BGD', 'IND'), ('IND', 'BGD')]:
    is_pair = (lang['iso_o'] == exporter) & (lang['iso_d'] == importer)
    print(lang[is_pair])

     iso_o iso_d  comlang_off
4105   BGD   IND            0
      iso_o iso_d  comlang_off
20534   IND   BGD            0


In [348]:
print(lang['iso_o'].nunique())
print(lang['iso_d'].nunique())

192
192


In [349]:
lang.to_csv('../data/cleaned/language_edgelist.csv', encoding = 'utf-8', index = False)