In [1]:
import pandas as pd

**Preparing edgelist for SNA takes the following steps:** 

For edgelists on bilateral trade data: 
> 1. Loading total, unique, reproducible trade data and the fixed dataset from the cleaned subfolder of the data folder.
> 2. Checking for mismatched countries in all data frames using inner join method as the study will consider only the countries that are present in all data frames
> 3. Making an edge list for total goods trade and saving it in the 'cleaned' subfolder of the folder 'data'
> 4. Making an edge list for unique cultural goods and saving it in the 'cleaned' subfolder of the folder 'data'
> 5. Making an edge list for reproducible cultural goods and saving it in the 'cleaned' subfolder of the folder 'data'
> 6. Saving the edgelists into the 'cleaned' subfolder of 'data' folder

For fixed undirected distance, common language, contiguity, colonizer and common colonizer edgelists: 
> 1. Keeping only the relevant column with the origin and destination column to create a primary data frame
> 2. Checking for a single pair of countries whether the value is same for (A, B) and (B, A)
> 3. Adding log transformed distance column to the distance edgelist
> 4. Saving the edgelists into the 'cleaned' subfolder of 'data' folder

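`So this notebook ends up writing eleven edgelist files: six for bilateral trade (a value edgelist and a hysteresis edgelist each for total, unique and reproducible goods) and five for the bilateral time-invariant factors (contiguity, distance, common colonizer, colonizer and common language).`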

In [2]:
# Trade panels: total goods, unique cultural goods and reproducible cultural goods.
total, unique, repro = (
    pd.read_csv('../data/cleaned/total2000_2023.csv'),
    pd.read_csv('../data/cleaned/unique2000_2023.csv'),
    pd.read_csv('../data/cleaned/reproducible2000_2023.csv'),
)
# Product-level panels; the cells below only inspect their country codes.
books, cinema, tapes = (
    pd.read_csv('../data/cleaned/books2000_2023.csv'),
    pd.read_csv('../data/cleaned/cinema2000_2023.csv'),
    pd.read_csv('../data/cleaned/tapes2000_2023.csv'),
)
# Dyad-level table with the time-invariant factors (contiguity, distance, ...).
fixed = pd.read_csv('../data/cleaned/fixed.csv')

In [3]:
# Each entry is (left frame, right frame, column); the loop prints the codes that
# occur in the left frame's column but not in the right frame's column.
# NOTE(review): the differences are one-directional; books vs tapes is never
# compared (cinema/tapes is compared in both directions instead), and iso_o is
# not compared for books, cinema and tapes.
comparisons = (
    (total, unique, 'iso_o'),
    (total, repro, 'iso_o'),
    (repro, unique, 'iso_o'),
    (total, unique, 'iso_d'),
    (total, repro, 'iso_d'),
    (repro, unique, 'iso_d'),
    (books, cinema, 'iso_d'),
    (cinema, tapes, 'iso_d'),
    (tapes, cinema, 'iso_d'),
)
for left, right, column in comparisons:
    print(set(left[column].unique()) - set(right[column].unique()))

set()
set()
set()
set()
set()
set()
set()
set()
set()


##### To ensure seamless production of square matrices from the generated edgelists, I am checking if the exporter and importer countries are the same for all three datasets (total, unique and reproducible).

In [4]:
# Sanity check: `total` has equally many distinct origin (iso_o) and destination (iso_d) codes.
assert total['iso_o'].nunique() == total['iso_d'].nunique()

In [5]:
# Stronger check: the two columns of `total` contain exactly the same codes, not just equally many.
set(total['iso_o'].unique()) == set(total['iso_d'].unique())

True

In [6]:
# Sanity check: `unique` has equally many distinct origin (iso_o) and destination (iso_d) codes.
assert unique['iso_o'].nunique() == unique['iso_d'].nunique()

In [7]:
# Stronger check: the two columns of `unique` contain exactly the same codes, not just equally many.
set(unique['iso_o'].unique()) == set(unique['iso_d'].unique())

True

In [8]:
# Sanity check: `repro` has equally many distinct origin (iso_o) and destination (iso_d) codes.
assert repro['iso_o'].nunique() == repro['iso_d'].nunique()

In [9]:
# Stronger check: the two columns of `repro` contain exactly the same codes, not just equally many.
set(repro['iso_o'].unique()) == set(repro['iso_d'].unique())

True

In [10]:
# Distinct origin and destination codes per dataset, printed as iso_o then iso_d
# for each frame in the order listed.
for frame in (total, repro, unique, books, cinema, tapes, fixed):
    for column in ('iso_o', 'iso_d'):
        print(frame[column].nunique())

201
201
201
201
201
201
201
201
200
201
201
201
224
224


###### So, there are 201 countries in the total, unique and reproducible trade datasets, whereas the fixed dataset has 224 countries. (The cinema dataset has only 200 distinct exporters.)

In [11]:
# Codes present in `fixed` but absent from each trade panel.
# *_ego compares the origin column (iso_o), *_alter the destination column (iso_d).
f_u_ego = set(fixed['iso_o'].unique()).difference(unique['iso_o'].unique())
f_r_ego = set(fixed['iso_o'].unique()).difference(repro['iso_o'].unique())
f_t_ego = set(fixed['iso_o'].unique()).difference(total['iso_o'].unique())
f_u_alter = set(fixed['iso_d'].unique()).difference(unique['iso_d'].unique())
f_r_alter = set(fixed['iso_d'].unique()).difference(repro['iso_d'].unique())
f_t_alter = set(fixed['iso_d'].unique()).difference(total['iso_d'].unique())

In [12]:
# Compare the difference sets themselves, not only their sizes: equal lengths
# alone would not show that the same countries are missing from every trade panel.
# Origin-side and destination-side sets are compared within their own group, and
# the two groups are additionally required to have the same size.
(
    f_u_ego == f_r_ego == f_t_ego
    and f_u_alter == f_r_alter == f_t_alter
    and len(f_u_ego) == len(f_u_alter)
)

True

In [13]:
# Codes present in each trade panel but absent from `fixed`.
# *_ego compares the origin column (iso_o), *_alter the destination column (iso_d).
u_f_ego = set(unique['iso_o'].unique()).difference(fixed['iso_o'].unique())
r_f_ego = set(repro['iso_o'].unique()).difference(fixed['iso_o'].unique())
t_f_ego = set(total['iso_o'].unique()).difference(fixed['iso_o'].unique())
u_f_alter = set(unique['iso_d'].unique()).difference(fixed['iso_d'].unique())
r_f_alter = set(repro['iso_d'].unique()).difference(fixed['iso_d'].unique())
t_f_alter = set(total['iso_d'].unique()).difference(fixed['iso_d'].unique())

In [14]:
# Compare the difference sets themselves, not only their sizes: equal lengths
# alone would not show that every trade panel has the same extra countries.
# Origin-side and destination-side sets are compared within their own group, and
# the two groups are additionally required to have the same size.
(
    u_f_ego == r_f_ego == t_f_ego
    and u_f_alter == r_f_alter == t_f_alter
    and len(u_f_ego) == len(u_f_alter)
)

True

In [15]:
# Number of codes in unique['iso_o'] that are missing from fixed['iso_o'].
len(u_f_ego)

9

In [16]:
# Number of codes in fixed['iso_o'] that are missing from unique['iso_o'].
len(f_u_ego)

32

###### So, the number of countries found only in the fixed dataset is the same (32) against every trade dataset, and the number found only in the trade datasets is likewise the same (9) in every case. 

###### So, 192 countries are present in both the fixed and the trade datasets (224 − 32 = 201 − 9 = 192). This study will prune the other countries for seamless social network analysis. 

In [17]:
# Codes shared by `fixed` and each trade panel, per column (set intersection).
# Reference: https://www.geeksforgeeks.org/python-print-common-elements-two-lists/
fixed_origins = set(fixed['iso_o'].unique())
fixed_destinations = set(fixed['iso_d'].unique())

# Origin-side intersections.
m = list(fixed_origins & set(unique['iso_o'].unique()))
n = list(fixed_origins & set(repro['iso_o'].unique()))
o = list(fixed_origins & set(total['iso_o'].unique()))
# Destination-side intersections.
p = list(fixed_destinations & set(unique['iso_d'].unique()))
q = list(fixed_destinations & set(repro['iso_d'].unique()))
r = list(fixed_destinations & set(total['iso_d'].unique()))

In [18]:
# All six intersections should have the same size (their contents are compared further below).
len(m) == len(n) == len(o) == len(p) == len(q) == len(r)

True

In [19]:
# Number of codes shared by fixed['iso_o'] and unique['iso_o'].
len(m)

192

In [20]:
from collections import Counter

In [21]:
# Counter equality ignores list order, so this confirms the six lists hold the same codes.
Counter(m) == Counter(n) == Counter(o) == Counter(p) == Counter(q) == Counter(r)

True

##### keeping only the common countries in all dyad level datasets.

In [22]:
# Keep a row only when BOTH its origin and its destination are among the shared codes in `m`.
# `m` stands in for all six lists, which the checks above found to hold the same codes.
unique_com = unique.loc[unique['iso_o'].isin(m) & unique['iso_d'].isin(m)]
repro_com = repro.loc[repro['iso_o'].isin(m) & repro['iso_d'].isin(m)]
total_com = total.loc[total['iso_o'].isin(m) & total['iso_d'].isin(m)]
fixed_com = fixed.loc[fixed['iso_o'].isin(m) & fixed['iso_d'].isin(m)]

In [23]:
# Distinct origin and destination codes after filtering, iso_o then iso_d per frame.
for frame in (total_com, repro_com, unique_com, fixed_com):
    for column in ('iso_o', 'iso_d'):
        print(frame[column].nunique())

192
192
192
192
192
192
192
192


### Edgelist for total trade & Hysteresis effect

In [24]:
# After filtering, origin and destination columns of `total_com` should hold the same codes.
set(total_com['iso_o'].unique()) == set(total_com['iso_d'].unique())

True

In [25]:
# Reshape the long panel to one row per (iso_o, iso_d) dyad with one column per
# TIME_PERIOD value; cells with no observation are filled with 0.
# NOTE(review): pivot_table aggregates with its default (mean) when a dyad-period
# occurs more than once — confirm (iso_o, iso_d, TIME_PERIOD) is unique in the panel.
edgelist_total = (
    total_com
    .pivot_table(
        index=['iso_o', 'iso_d'],
        columns='TIME_PERIOD',
        values='OBS_VALUE',
        fill_value=0,
    )
    .reset_index()
)

In [26]:
# Round the numeric value columns to 4 decimal places.
edgelist_total_rounded = edgelist_total.round(4)

In [27]:
# The edgelist must have as many rows as the panel has distinct (iso_o, iso_d) dyads.
assert total_com.groupby(['iso_o', 'iso_d']).ngroups == edgelist_total_rounded.shape[0]

In [28]:
# Write the rounded total-trade edgelist to the cleaned folder, without the row index.
edgelist_total_rounded.to_csv("../data/cleaned/total_edgelist.csv", encoding='utf-8', index=False)

In [29]:
# Preview the long panel, including the hysteresis_total column pivoted next.
total_com.head(3)

Unnamed: 0,iso_o,iso_d,TIME_PERIOD,OBS_VALUE,hysteresis_total
0,ABW,AFG,2017,0.065851,0.0
1,ABW,AFG,2018,0.398446,0.065851
2,ABW,AFG,2019,0.254087,0.418202


In [30]:
# Same reshape as the value edgelist, but on the hysteresis_total column:
# one row per (iso_o, iso_d) dyad, one column per TIME_PERIOD, missing cells set to 0.
# NOTE(review): pivot_table uses its default aggregation (mean) for repeated
# dyad-periods — confirm (iso_o, iso_d, TIME_PERIOD) is unique in the panel.
edgelist_total_hyst = (
    total_com
    .pivot_table(
        index=['iso_o', 'iso_d'],
        columns='TIME_PERIOD',
        values='hysteresis_total',
        fill_value=0,
    )
    .reset_index()
)

In [31]:
# Panel dyad count, value-edgelist rows and hysteresis-edgelist rows must all agree.
assert total_com.groupby(['iso_o', 'iso_d']).ngroups == edgelist_total_rounded.shape[0] == edgelist_total_hyst.shape[0]

In [32]:
# Preview the wide hysteresis edgelist.
edgelist_total_hyst.head(3)

TIME_PERIOD,iso_o,iso_d,2000,2001,2002,2003,2004,2005,2006,2007,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,ABW,AFG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.065851,0.418202,0.0,0.0,0.0,0.0
1,ABW,AGO,0.0,0.0,0.0,0.0,0.0,0.0,0.002843,0.061405,...,4.822801,0.0,0.0,0.0,0.0,1.484037,0.0,0.0,0.0,0.451015
2,ABW,ALB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Write the total-trade hysteresis edgelist (not rounded) to the cleaned folder, without the row index.
edgelist_total_hyst.to_csv("../data/cleaned/edgelist_total_hyst.csv", encoding='utf-8', index=False)

### Edgelist for unique cultural goods trade & Hysteresis Effect

In [34]:
# After filtering, origin and destination columns of `unique_com` should hold the same codes.
set(unique_com['iso_o'].unique()) == set(unique_com['iso_d'].unique())

True

In [35]:
# Reshape the long panel to one row per (iso_o, iso_d) dyad with one column per
# TIME_PERIOD value; cells with no observation are filled with 0.
# NOTE(review): pivot_table aggregates with its default (mean) when a dyad-period
# occurs more than once — confirm (iso_o, iso_d, TIME_PERIOD) is unique in the panel.
edgelist_unique = (
    unique_com
    .pivot_table(
        index=['iso_o', 'iso_d'],
        columns='TIME_PERIOD',
        values='OBS_VALUE',
        fill_value=0,
    )
    .reset_index()
)

In [36]:
# Round the numeric value columns to 4 decimal places.
# NOTE(review): this panel contains OBS_VALUEs as small as 2.8e-05 (see the preview
# of unique_com below); at 4 decimals such values become 0.0 and can no longer be
# told apart from the fill_value=0 cells. The hysteresis edgelist is not rounded.
# Confirm that 4 decimals is the intended precision here.
edgelist_unique_rounded = edgelist_unique.round(4)

In [37]:
# Number of distinct origin codes that appear in the unique-goods edgelist.
edgelist_unique_rounded['iso_o'].nunique()

192

In [38]:
# The edgelist must have as many rows as the panel has distinct (iso_o, iso_d) dyads.
assert unique_com.groupby(['iso_o', 'iso_d']).ngroups == edgelist_unique_rounded.shape[0]

In [39]:
# Write the rounded unique-goods edgelist to the cleaned folder, without the row index.
edgelist_unique_rounded.to_csv('../data/cleaned/unique_edgelist.csv', encoding = 'utf-8', index = False)

In [40]:
# Preview the long panel, including the hysteresis_unique column pivoted next.
unique_com.head(3)

Unnamed: 0,iso_o,iso_d,PRODUCT_HS,TIME_PERIOD,OBS_VALUE,ADJUSTMENT,hysteresis_unique
0,ABW,AGO,HS17_97,2014,2.8e-05,B_ADJ_RX,0.0
1,ABW,ARE,HS17_97,2005,0.000641,B_ADJ_RX,0.0
2,ABW,ARE,HS17_97,2014,5.3e-05,B_ADJ_RX,0.000641


In [41]:
# Same reshape as the value edgelist, but on the hysteresis_unique column:
# one row per (iso_o, iso_d) dyad, one column per TIME_PERIOD, missing cells set to 0.
# NOTE(review): pivot_table uses its default aggregation (mean) for repeated
# dyad-periods — confirm (iso_o, iso_d, TIME_PERIOD) is unique in the panel.
edgelist_unique_hyst = (
    unique_com
    .pivot_table(
        index=['iso_o', 'iso_d'],
        columns='TIME_PERIOD',
        values='hysteresis_unique',
        fill_value=0,
    )
    .reset_index()
)

In [42]:
# Preview the wide hysteresis edgelist.
edgelist_unique_hyst.head(3)

TIME_PERIOD,iso_o,iso_d,2000,2001,2002,2003,2004,2005,2006,2007,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,ABW,AGO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ABW,ARE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000641,0.0,0.0,0.0,0.0,0.0,0.000245,0.0,9.3e-05,0.000159
2,ABW,ATG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
# Panel dyad count, value-edgelist rows and hysteresis-edgelist rows must all agree.
assert unique_com.groupby(['iso_o', 'iso_d']).ngroups == edgelist_unique_rounded.shape[0] == edgelist_unique_hyst.shape[0]

In [44]:
# Write the unique-goods hysteresis edgelist (not rounded) to the cleaned folder, without the row index.
edgelist_unique_hyst.to_csv("../data/cleaned/edgelist_unique_hyst.csv", encoding='utf-8', index=False)

### Edgelist for reproducible cultural goods trade & Hysteresis effect

In [45]:
# After filtering, origin and destination columns of `repro_com` should hold the same codes.
set(repro_com['iso_o'].unique()) == set(repro_com['iso_d'].unique())

True

In [46]:
# Reshape the long panel to one row per (iso_o, iso_d) dyad with one column per
# TIME_PERIOD value; cells with no observation are filled with 0.
# NOTE(review): pivot_table aggregates with its default (mean) when a dyad-period
# occurs more than once — confirm (iso_o, iso_d, TIME_PERIOD) is unique in the panel.
edgelist_repro = (
    repro_com
    .pivot_table(
        index=['iso_o', 'iso_d'],
        columns='TIME_PERIOD',
        values='OBS_VALUE',
        fill_value=0,
    )
    .reset_index()
)

In [47]:
# Round the numeric value columns to 4 decimal places.
# NOTE(review): values below 5e-05 would become 0.0 here, the same as the
# fill_value=0 cells — confirm 4 decimals is enough for this panel.
edgelist_repro_rounded = edgelist_repro.round(4)

In [48]:
# Number of distinct origin codes that appear in the reproducible-goods edgelist.
edgelist_repro_rounded['iso_o'].nunique()

192

In [49]:
# The edgelist must have as many rows as the panel has distinct (iso_o, iso_d) dyads.
assert repro_com.groupby(['iso_o', 'iso_d']).ngroups == edgelist_repro_rounded.shape[0]

In [50]:
# Write the rounded reproducible-goods edgelist to the cleaned folder, without the row index.
edgelist_repro_rounded.to_csv('../data/cleaned/reproducible_edgelist.csv', encoding = 'utf-8', index = False)

In [51]:
# Same reshape as the value edgelist, but on the hysteresis_repro column:
# one row per (iso_o, iso_d) dyad, one column per TIME_PERIOD, missing cells set to 0.
# NOTE(review): pivot_table uses its default aggregation (mean) for repeated
# dyad-periods — confirm (iso_o, iso_d, TIME_PERIOD) is unique in the panel.
edgelist_repro_hyst = (
    repro_com
    .pivot_table(
        index=['iso_o', 'iso_d'],
        columns='TIME_PERIOD',
        values='hysteresis_repro',
        fill_value=0,
    )
    .reset_index()
)

In [52]:
# Panel dyad count, value-edgelist rows and hysteresis-edgelist rows must all agree.
assert repro_com.groupby(['iso_o', 'iso_d']).ngroups == edgelist_repro_rounded.shape[0] == edgelist_repro_hyst.shape[0]

In [53]:
# Preview the long panel, including the hysteresis_repro column pivoted above.
repro_com.head(2)

Unnamed: 0,iso_o,iso_d,TIME_PERIOD,OBS_VALUE,PRODUCT_HS,product_count,hysteresis_repro
0,ABW,AFG,2019,0.012061,"('HS17_49', 'HS17_85')",2,0.0
1,ABW,AGO,2008,0.000165,"('HS17_85',)",1,0.0


In [54]:
# Preview the wide hysteresis edgelist.
edgelist_repro_hyst.head(3)

TIME_PERIOD,iso_o,iso_d,2000,2001,2002,2003,2004,2005,2006,2007,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,ABW,AFG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ABW,AGO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ABW,ARE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000953,0.019174,0.022367,0.11647,0.054564,0.022476,0.008654,0.012594


In [55]:
# Write the reproducible-goods hysteresis edgelist (not rounded) to the cleaned folder, without the row index.
edgelist_repro_hyst.to_csv("../data/cleaned/edgelist_repro_hyst.csv", encoding='utf-8', index=False)

### Edgelist for contiguity

In [56]:
# After filtering, origin and destination columns of `fixed_com` should hold the same codes.
set(fixed_com['iso_o'].unique()) == set(fixed_com['iso_d'].unique())

True

In [57]:
# Contiguity edgelist: the dyad identifiers plus the `contig` indicator only.
fixed_com_con = fixed_com.loc[:, ['iso_o', 'iso_d', 'contig']]

In [58]:
# Rows x columns of the contiguity edgelist (the recorded 36672 rows equal 192 x 191).
fixed_com_con.shape

(36672, 3)

In [59]:
# Spot check on one pair: print the row for (BGD, IND) and for the reversed pair
# so the two contig values can be compared by eye.
for origin, destination in (('BGD', 'IND'), ('IND', 'BGD')):
    pair_mask = (fixed_com_con['iso_o'] == origin) & (fixed_com_con['iso_d'] == destination)
    print(fixed_com_con[pair_mask])

     iso_o iso_d  contig
4105   BGD   IND       1
      iso_o iso_d  contig
20534   IND   BGD       1


In [60]:
# Distinct origin codes, distinct destination codes, then the number of distinct dyads.
# The second line reports iso_d (iso_o was previously printed twice), in line with
# the equivalent checks on the other fixed edgelists.
print(fixed_com_con['iso_o'].nunique())
print(fixed_com_con['iso_d'].nunique())
print(fixed_com_con.groupby(['iso_o', 'iso_d']).ngroups)

192
192
36672


In [61]:
# Write the contiguity edgelist to the cleaned folder, without the row index.
fixed_com_con.to_csv('../data/cleaned/contiguity_edgelist.csv', encoding = 'utf-8', index = False)

### Edgelist for distance

In [62]:
# Preview the filtered fixed table to see which columns are available (dist, log_dist, ...).
fixed_com.head(2)

Unnamed: 0,iso_o,iso_d,contig,comlang_off,colony,comcol,dist,log_dist,country,landlocked,continent,lat,lon,langoff_1,colonizer1,category,development
0,ABW,AFG,0,0,0,0,13257.81,9.492342,Aruba,0,America,12.55,-70.099998,Dutch,NLD,Emerging and Developing Economies,0.0
1,ABW,AGO,0,0,0,0,9516.913,9.160826,Aruba,0,America,12.55,-70.099998,Dutch,NLD,Emerging and Developing Economies,0.0


In [63]:
# Distance edgelist: the dyad identifiers plus the existing `log_dist` column only
# (the raw `dist` column is not carried over).
distance = fixed_com.loc[:, ['iso_o', 'iso_d', 'log_dist']]

In [64]:
# Spot check on one pair: print the row for (BGD, IND) and for the reversed pair
# so the two log_dist values can be compared by eye.
for origin, destination in (('BGD', 'IND'), ('IND', 'BGD')):
    pair_mask = (distance['iso_o'] == origin) & (distance['iso_d'] == destination)
    print(distance[pair_mask])

     iso_o iso_d  log_dist
4105   BGD   IND  7.259776
      iso_o iso_d  log_dist
20534   IND   BGD  7.259776


In [65]:
# Rows x columns of the distance edgelist.
distance.shape

(36672, 3)

In [66]:
# Distinct origin codes, then distinct destination codes.
for column in ('iso_o', 'iso_d'):
    print(distance[column].nunique())

192
192


In [67]:
# Preview the distance edgelist before saving.
distance.head(2)

Unnamed: 0,iso_o,iso_d,log_dist
0,ABW,AFG,9.492342
1,ABW,AGO,9.160826


In [68]:
# Write the distance edgelist to the cleaned folder, without the row index.
distance.to_csv('../data/cleaned/distance_edgelist.csv', encoding = 'utf-8', index = False)

### Edgelist for common colonizer

In [69]:
# Common-colonizer edgelist: the dyad identifiers plus the `comcol` indicator only.
comcol = fixed_com.loc[:, ['iso_o', 'iso_d', 'comcol']]

In [70]:
# Rows x columns of the common-colonizer edgelist.
comcol.shape

(36672, 3)

In [71]:
# Spot check on one pair: print the row for (BGD, IND) and for the reversed pair
# so the two comcol values can be compared by eye.
for origin, destination in (('BGD', 'IND'), ('IND', 'BGD')):
    pair_mask = (comcol['iso_o'] == origin) & (comcol['iso_d'] == destination)
    print(comcol[pair_mask])

     iso_o iso_d  comcol
4105   BGD   IND       1
      iso_o iso_d  comcol
20534   IND   BGD       1


In [72]:
# Distinct origin codes, then distinct destination codes.
for column in ('iso_o', 'iso_d'):
    print(comcol[column].nunique())

192
192


In [73]:
# Write the common-colonizer edgelist to the cleaned folder, without the row index.
comcol.to_csv('../data/cleaned/comcol_edgelist.csv', encoding = 'utf-8', index = False)

### Edgelist for colonizer

In [74]:
# Colonizer edgelist: the dyad identifiers plus the `colony` indicator only.
col = fixed_com.loc[:, ['iso_o', 'iso_d', 'colony']]

In [75]:
# Rows x columns of the colonizer edgelist.
col.shape

(36672, 3)

In [76]:
# Spot check: print the colony value for (BGD, GBR) and for (IND, GBR).
# NOTE(review): unlike the spot checks on the other fixed edgelists, this does not
# print a pair and its reverse (e.g. (GBR, BGD)), so it does not show whether the
# colony value is the same in both directions — confirm this is intended.
for origin, destination in (('BGD', 'GBR'), ('IND', 'GBR')):
    pair_mask = (col['iso_o'] == origin) & (col['iso_d'] == destination)
    print(col[pair_mask])

     iso_o iso_d  colony
4084   BGD   GBR       1
      iso_o iso_d  colony
20587   IND   GBR       1


In [77]:
# Distinct origin codes, then distinct destination codes.
for column in ('iso_o', 'iso_d'):
    print(col[column].nunique())

192
192


In [78]:
# Write the colonizer edgelist to the cleaned folder, without the row index.
col.to_csv('../data/cleaned/colonizer_edgelist.csv', encoding = 'utf-8', index = False)

### Edgelist for common language

In [79]:
# Common-language edgelist: the dyad identifiers plus the `comlang_off` indicator only.
lang = fixed_com.loc[:, ['iso_o', 'iso_d', 'comlang_off']]

In [80]:
# Rows x columns of the common-language edgelist.
lang.shape

(36672, 3)

In [81]:
# Spot check on one pair: print the row for (BGD, IND) and for the reversed pair
# so the two comlang_off values can be compared by eye.
for origin, destination in (('BGD', 'IND'), ('IND', 'BGD')):
    pair_mask = (lang['iso_o'] == origin) & (lang['iso_d'] == destination)
    print(lang[pair_mask])

     iso_o iso_d  comlang_off
4105   BGD   IND            0
      iso_o iso_d  comlang_off
20534   IND   BGD            0


In [82]:
# Distinct origin codes, then distinct destination codes.
for column in ('iso_o', 'iso_d'):
    print(lang[column].nunique())

192
192


In [83]:
# Write the common-language edgelist to the cleaned folder, without the row index.
lang.to_csv('../data/cleaned/language_edgelist.csv', encoding = 'utf-8', index = False)