In [277]:
import pandas as pd

**The OECD BIMTS bulk data at the 2-digit HS code level is split by time period into six files, each covering five consecutive years. Here I consider 2000 to 2023, which is covered by five CSV files. Cleaning and preparing the data takes the following steps:**  

1. Reading the data for each five-year period and keeping only the HS codes that are relevant to this study
2. Removing 'world', which is coded as 'W', from both the reference and counterpart areas 
3. Making five dataframes across all five-year files: one for the **total** export, which appears in the product HS column as the category *_T*, and four for the individual cultural products, named **unique** for HS code *97*, **cinema** for HS code *37*, **books** for HS code *49* and **tapes** for HS code *85*. Each dataset is subset on the adjustment column value **B_ADJ_RX**, which represents the reconciled bilateral trade flow, adjusted for re-exports. 
4. Cleaning this data takes the following steps:
   > - a. Renaming the exporter and importer columns
   > - b. Removing self loops and duplicate rows, keeping a single instance of each duplicated row
   > - c. Computing the hysteresis column for all dataframes
   > - d. Sorting values by country pair and time period
   > - e. Resetting the index   
5. Aggregating the three different reproducible cultural goods datasets by country pair and time, while tracking the HS codes and the number of products traded between each pair in a particular year in the columns 'PRODUCT_HS' and 'product_count'
6. This notebook saves six different csv files in the 'cleaned' subfolder of the folder 'data': four for the different HS code level data and two for the aggregated reproducible cultural goods and non-cultural goods data. 

In [278]:
# Load the five raw extracts (2000-2023) in chronological order, one frame per file.
c00_04, c05_09, c10_14, c15_19, c020_23 = (
    pd.read_csv(raw_path)
    for raw_path in (
        '../data/raw/HS17_2D_DE_2000_To_2004.csv',
        '../data/raw/HS17_2D_DE_2005_To_2009.csv',
        '../data/raw/HS17_2D_DE_2010_To_2014.csv',
        '../data/raw/HS17_2D_DE_2015_To_2019.csv',
        '../data/raw/HS17_2D_DE_2020_To_2023.csv',
    )
)

In [279]:
trade = [c00_04, c05_09, c10_14, c15_19, c020_23]

In [280]:
useful = ['REF_AREA', 'COUNTERPART_AREA', 'PRODUCT_HS', 'TIME_PERIOD', 'OBS_VALUE', 'ADJUSTMENT']

In [281]:
# Apply the same cleaning to every raw extract and collect the cleaned frames.
trade_list = []
for datafile in trade:
    # Keep only the B_ADJ_RX rows and the analysis columns, in a single indexing step.
    datafile = datafile.loc[datafile['ADJUSTMENT'] == 'B_ADJ_RX', useful]
    # Exporter = reference area, importer = counterpart area.
    datafile = datafile.rename(columns={'REF_AREA': 'iso_o', 'COUNTERPART_AREA': 'iso_d'})
    # Drop the world aggregate ('W') on either side, and flows from a country to itself.
    datafile = datafile.loc[
        lambda frame: (frame['iso_o'] != 'W')
        & (frame['iso_d'] != 'W')
        & (frame['iso_o'] != frame['iso_d'])
    ]
    trade_list.append(datafile)

In [282]:
type(trade_list)

list

In [283]:
# Stack the cleaned extracts, order by year then country pair, and renumber the rows from 0.
complete_data = (
    pd.concat(trade_list)
    .sort_values(by=['TIME_PERIOD', 'iso_o', 'iso_d'])
    .reset_index(drop=True)
)

In [284]:
complete_data.shape

(20893174, 6)

In [285]:
complete_data.shape

(20893174, 6)

In [286]:
print(complete_data['iso_o'].nunique())
print(complete_data['iso_d'].nunique())
print(complete_data['TIME_PERIOD'].nunique())
print(complete_data['PRODUCT_HS'].nunique())

201
201
24
98


### Non-cultural products

In [287]:
# Everything that is neither the '_T' total nor one of the four cultural HS codes.
non_cultural = complete_data[
    ~complete_data['PRODUCT_HS'].isin(['_T', 'HS17_97', 'HS17_49', 'HS17_37', 'HS17_85'])
]

In [288]:
# Mark rows that repeat across all columns (every copy except the last is flagged), then keep the unflagged rows.
non_cultural_duplicate = non_cultural.duplicated(keep='last')
non_cultural_unique = non_cultural.loc[~non_cultural_duplicate]

In [289]:
non_cultural.shape

(19079263, 6)

In [290]:
memory = 0.3  # default weight given to the previous period's hysteresis value
def compute_hysteresis(series, decay=None):
    '''Return the hysteresis value for every observation in ``series``.

    The values are walked in the order given. For each position the result is

        hyst[t] = decay * hyst[t-1] + value[t-1]

    with both terms taken as 0 for the first observation, so the first result is always 0 and an
    observation's own value never enters its own hysteresis.

    Parameters
    ----------
    series : iterable of numbers (for example one country pair's ``OBS_VALUE`` values)
        Must already be in chronological order. Only the order is used: consecutive entries are
        treated as consecutive periods even when the underlying years have gaps.
    decay : float, optional
        Weight on the previous hysteresis value. When omitted, the module-level ``memory`` is read
        at call time, which is the original behaviour.

    Returns
    -------
    list
        One hysteresis value per input value, in the same order. An empty input gives an empty list.
    '''
    if decay is None:
        # Looked up on every call so that changing ``memory`` in the notebook still takes effect.
        decay = memory
    hyst = []
    prev_hyst = 0  # hysteresis value of the previous observation
    prev_val = 0   # traded value of the previous observation
    for val in series:
        hyst_val = decay * prev_hyst + prev_val
        hyst.append(hyst_val)
        prev_hyst = hyst_val
        prev_val = val
    return hyst

In [291]:
# Collapse all non-cultural HS codes into one traded value per exporter, importer and year.
non_cultural_unique_grouped = (
    non_cultural_unique
    .groupby(by=['iso_o', 'iso_d', 'TIME_PERIOD'])['OBS_VALUE']
    .sum()
    .reset_index()
)

In [292]:
# Hysteresis is accumulated separately for each exporter-importer pair, in the frame's row order.
non_cultural_unique_grouped['hysteresis_total'] = (
    non_cultural_unique_grouped
    .groupby(by=['iso_o', 'iso_d'])['OBS_VALUE']
    .transform(compute_hysteresis)
)

In [293]:
print(non_cultural_unique['PRODUCT_HS'].nunique())

93


In [294]:
print(non_cultural_unique_grouped['iso_o'].nunique())
print(non_cultural_unique_grouped['iso_d'].nunique())

201
201


In [295]:
non_cultural_unique_grouped = non_cultural_unique_grouped.sort_values(['iso_o', 'iso_d', 'TIME_PERIOD'])

In [296]:
non_cultural_unique_grouped.head(2)

Unnamed: 0,iso_o,iso_d,TIME_PERIOD,OBS_VALUE,hysteresis_total
0,ABW,AFG,2017,0.065851,0.0
1,ABW,AFG,2018,0.398446,0.065851


In [297]:
# NOTE(review): the frame written here is the non-cultural aggregate (every HS code except '_T', 97, 49, 37
# and 85, summed per country pair and year), yet the file and its hysteresis column are labelled "total".
# Confirm that downstream readers of this file expect the non-cultural series rather than the '_T' total.
non_cultural_unique_grouped.to_csv("../data/cleaned/total2000_2023.csv", encoding = 'utf-8', index = False)

### Total trade

In [298]:
total = complete_data[complete_data['PRODUCT_HS'] == '_T']

In [299]:
total.shape

(691591, 6)

In [300]:
# Mark rows that repeat across all columns (every copy except the last is flagged), then keep the unflagged rows.
total_duplicate = total.duplicated(keep='last')
total_unique = total.loc[~total_duplicate]

In [301]:
total_unique.shape

(691591, 6)

### Unique cultural products

In [302]:
unique = complete_data[complete_data['PRODUCT_HS'] == 'HS17_97']

In [303]:
unique.shape

(174305, 6)

In [304]:
# Mark rows that repeat across all columns (every copy except the last is flagged), then keep the unflagged rows.
unique_duplicate = unique.duplicated(keep='last')
unique_unique = unique.loc[~unique_duplicate]

In [305]:
unique_unique.shape

(174305, 6)

In [306]:
# Order each country pair chronologically before accumulating, so the hysteresis does not depend on how
# earlier cells happened to order the rows.
unique_unique = unique_unique.sort_values(['iso_o', 'iso_d', 'TIME_PERIOD'])
# assign() builds a new frame; writing the column directly into the boolean-filtered slice can trigger
# pandas' SettingWithCopyWarning.
unique_unique = unique_unique.assign(
    hysteresis_unique=lambda frame: frame
    .groupby(['iso_o', 'iso_d'])['OBS_VALUE']
    .transform(compute_hysteresis)
)

In [307]:
unique_unique = unique_unique.sort_values(['iso_o', 'iso_d', 'TIME_PERIOD'])

In [308]:
unique_unique.head()

Unnamed: 0,iso_o,iso_d,PRODUCT_HS,TIME_PERIOD,OBS_VALUE,ADJUSTMENT,hysteresis_unique
11312574,ABW,AGO,HS17_97,2014,2.8e-05,B_ADJ_RX,0.0
3671347,ABW,ARE,HS17_97,2005,0.000641,B_ADJ_RX,0.0
11312577,ABW,ARE,HS17_97,2014,5.3e-05,B_ADJ_RX,0.000641
16993269,ABW,ARE,HS17_97,2020,1.9e-05,B_ADJ_RX,0.000245
18914685,ABW,ARE,HS17_97,2022,0.000132,B_ADJ_RX,9.3e-05


In [309]:
unique_unique.to_csv("../data/cleaned/unique2000_2023.csv", encoding='utf-8', index=False)

### Books

In [310]:
books = complete_data[complete_data['PRODUCT_HS'] == 'HS17_49']

In [311]:
books.shape

(339113, 6)

In [312]:
# Mark rows that repeat across all columns (every copy except the last is flagged), then keep the unflagged rows.
books_duplicate = books.duplicated(keep='last')
books_unique = books.loc[~books_duplicate]

In [313]:
# Sort by country pair and year, renumber the rows, then add the per-pair hysteresis column.
books_unique = (
    books_unique
    .sort_values(by=['iso_o', 'iso_d', 'TIME_PERIOD'])
    .reset_index(drop=True)
    .assign(
        hysteresis_books=lambda frame: frame
        .groupby(['iso_o', 'iso_d'])['OBS_VALUE']
        .transform(compute_hysteresis)
    )
)

In [314]:
books_unique.shape

(339113, 7)

In [315]:
books_unique.head(3)

Unnamed: 0,iso_o,iso_d,PRODUCT_HS,TIME_PERIOD,OBS_VALUE,ADJUSTMENT,hysteresis_books
0,ABW,AFG,HS17_49,2019,0.003429,B_ADJ_RX,0.0
1,ABW,ARE,HS17_49,2013,0.000953,B_ADJ_RX,0.0
2,ABW,ARE,HS17_49,2018,0.001486,B_ADJ_RX,0.000953


### Cinema

In [316]:
cinema = complete_data[complete_data['PRODUCT_HS'] == 'HS17_37']

In [317]:
cinema.shape

(138525, 6)

In [318]:
# Mark rows that repeat across all columns (every copy except the last is flagged), then keep the unflagged rows.
cinema_duplicate = cinema.duplicated(keep='last')
cinema_unique = cinema.loc[~cinema_duplicate]

In [319]:
cinema_unique.shape

(138525, 6)

In [320]:
# Sort by country pair and year, renumber the rows, then add the per-pair hysteresis column.
cinema_unique = (
    cinema_unique
    .sort_values(by=['iso_o', 'iso_d', 'TIME_PERIOD'])
    .reset_index(drop=True)
    .assign(
        hysteresis_cinema=lambda frame: frame
        .groupby(['iso_o', 'iso_d'])['OBS_VALUE']
        .transform(compute_hysteresis)
    )
)

In [321]:
cinema_unique.head(3)

Unnamed: 0,iso_o,iso_d,PRODUCT_HS,TIME_PERIOD,OBS_VALUE,ADJUSTMENT,hysteresis_cinema
0,ABW,AUS,HS17_37,2001,0.000258,B_ADJ_RX,0.0
1,ABW,BEL,HS17_37,2008,9.4e-05,B_ADJ_RX,0.0
2,ABW,BEL,HS17_37,2009,1.6e-05,B_ADJ_RX,9.4e-05


### Tapes

In [322]:
tapes = complete_data[complete_data['PRODUCT_HS'] == 'HS17_85']

In [323]:
# Mark rows that repeat across all columns (every copy except the last is flagged), then keep the unflagged rows.
tapes_duplicate = tapes.duplicated(keep='last')
tapes_unique = tapes.loc[~tapes_duplicate]

In [324]:
# Sort by country pair and year, renumber the rows, then add the per-pair hysteresis column.
tapes_unique = (
    tapes_unique
    .sort_values(by=['iso_o', 'iso_d', 'TIME_PERIOD'])
    .reset_index(drop=True)
    .assign(
        hysteresis_tapes=lambda frame: frame
        .groupby(['iso_o', 'iso_d'])['OBS_VALUE']
        .transform(compute_hysteresis)
    )
)

In [325]:
tapes_unique.head()

Unnamed: 0,iso_o,iso_d,PRODUCT_HS,TIME_PERIOD,OBS_VALUE,ADJUSTMENT,hysteresis_tapes
0,ABW,AFG,HS17_85,2019,0.008632,B_ADJ_RX,0.0
1,ABW,AGO,HS17_85,2008,0.000165,B_ADJ_RX,0.0
2,ABW,AGO,HS17_85,2011,7.6e-05,B_ADJ_RX,0.000165
3,ABW,ARE,HS17_85,2016,0.018888,B_ADJ_RX,0.0
4,ABW,ARE,HS17_85,2017,0.016615,B_ADJ_RX,0.018888


In [326]:
tapes_unique.shape

(470377, 7)

In [327]:
tapes_unique.to_csv("../data/cleaned/tapes2000_2023.csv", encoding='utf-8', index=False)
books_unique.to_csv("../data/cleaned/books2000_2023.csv", encoding='utf-8', index=False)
cinema_unique.to_csv("../data/cleaned/cinema2000_2023.csv", encoding='utf-8', index=False)

### Reproducible-cultural goods

##### Steps to aggregate three different reproducible cultural products datasets:
1. Concatenating the tapes, books and cinema dataframes row-wise (stacking them on top of each other). 
2. Grouping the concatenated data by country pair and year
3. Summing over OBS_VALUE, which is the exchange rate converted US dollar value of trade for that particular year
4. Adding a column which records the HS codes of the products traded in that year

In [328]:
# Strip the per-product hysteresis columns so the three frames share one schema before they are stacked.
# errors='ignore' keeps this cell re-runnable: on a second execution the columns are already gone and a
# plain drop() would raise KeyError.
tapes_unique = tapes_unique.drop(columns='hysteresis_tapes', errors='ignore')
books_unique = books_unique.drop(columns='hysteresis_books', errors='ignore')
cinema_unique = cinema_unique.drop(columns='hysteresis_cinema', errors='ignore')

In [329]:
rp = pd.concat([tapes_unique, books_unique, cinema_unique], axis = 0)

In [330]:
print(rp['iso_o'].nunique())
print(rp['iso_d'].nunique())
print(rp['TIME_PERIOD'].nunique())
print(rp['PRODUCT_HS'].nunique())
print(rp.groupby(['iso_o','iso_d']).ngroups)

201
201
24
3
33443


In [331]:
# One row per exporter, importer and year: total traded value plus the sorted tuple of distinct HS codes
# that contributed to it.
rp_agg = (
    rp
    .groupby(by=['iso_o', 'iso_d', 'TIME_PERIOD'])
    .agg({
        'OBS_VALUE': 'sum',
        'PRODUCT_HS': lambda codes: tuple(sorted(set(codes))),
    })
    .reset_index()
)

In [332]:
# Number of distinct HS codes in each row's PRODUCT_HS tuple.
rp_agg['product_count'] = rp_agg['PRODUCT_HS'].map(len)

In [333]:
rp_agg.head(2)

Unnamed: 0,iso_o,iso_d,TIME_PERIOD,OBS_VALUE,PRODUCT_HS,product_count
0,ABW,AFG,2019,0.012061,"(HS17_49, HS17_85)",2
1,ABW,AGO,2008,0.000165,"(HS17_85,)",1


In [334]:
print(rp_agg['iso_o'].nunique())
print(rp_agg['iso_d'].nunique())
print(rp_agg['TIME_PERIOD'].nunique())
print(rp_agg['PRODUCT_HS'].nunique())
print(rp_agg.groupby(['iso_o','iso_d']).ngroups)

201
201
24
7
33443


In [335]:
print(rp_agg['PRODUCT_HS'].value_counts())

PRODUCT_HS
(HS17_49, HS17_85)             185011
(HS17_85,)                     148323
(HS17_37, HS17_49, HS17_85)    132380
(HS17_49,)                      21362
(HS17_37, HS17_85)               4663
(HS17_37,)                       1122
(HS17_37, HS17_49)                360
Name: count, dtype: int64


In [336]:
print(rp_agg['product_count'].value_counts())

product_count
2    190034
1    170807
3    132380
Name: count, dtype: int64


In [337]:
# Hysteresis is accumulated separately for each exporter-importer pair, in the frame's row order.
rp_agg['hysteresis_repro'] = (
    rp_agg
    .groupby(by=['iso_o', 'iso_d'])['OBS_VALUE']
    .transform(compute_hysteresis)
)

In [338]:
tapes_unique

Unnamed: 0,iso_o,iso_d,PRODUCT_HS,TIME_PERIOD,OBS_VALUE,ADJUSTMENT
0,ABW,AFG,HS17_85,2019,0.008632,B_ADJ_RX
1,ABW,AGO,HS17_85,2008,0.000165,B_ADJ_RX
2,ABW,AGO,HS17_85,2011,0.000076,B_ADJ_RX
3,ABW,ARE,HS17_85,2016,0.018888,B_ADJ_RX
4,ABW,ARE,HS17_85,2017,0.016615,B_ADJ_RX
...,...,...,...,...,...,...
470372,ZWE,ZMB,HS17_85,2019,5.184637,B_ADJ_RX
470373,ZWE,ZMB,HS17_85,2020,4.691017,B_ADJ_RX
470374,ZWE,ZMB,HS17_85,2021,5.290171,B_ADJ_RX
470375,ZWE,ZMB,HS17_85,2022,5.784952,B_ADJ_RX


In [339]:
assert unique_unique['iso_o'].nunique() == rp_agg['iso_o'].nunique() == non_cultural_unique_grouped['iso_o'].nunique()

In [340]:
assert unique_unique['iso_d'].nunique() == rp_agg['iso_d'].nunique() == non_cultural_unique_grouped['iso_d'].nunique()

In [341]:
rp_agg.to_csv("../data/cleaned/reproducible2000_2023.csv", encoding='utf-8', index=False)