In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

### Read the monthly trade data from Jan 2000 to Sep 2023

The time range is that the trade data for most countries are updated till Sep 2023

In [2]:
# Specify the path to the folder containing CSV files
folder_path = '../data/raw/MonthlyTradeData'

# Get a list of all files in the folder
files = [file for file in os.listdir(folder_path)]

# Create an empty list to store DataFrames
dfs = []

# Loop through each CSV file and read it into a DataFrame
for file in files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path, encoding='unicode_escape')
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame if needed
monthly_TradeData = pd.concat(dfs)

monthly_TradeData = monthly_TradeData[(monthly_TradeData['Period']<=202309)]


# Keep the necessary columns
monthly_TradeData = monthly_TradeData[['Period','ReporterISO','ReporterDesc','PartnerISO','PartnerDesc','PrimaryValue']]

# Save to csv
monthly_TradeData.to_csv('../data/processed/monthly_TradeData.csv',index=False)

In [3]:
monthly_TradeData.info()

<class 'pandas.core.frame.DataFrame'>
Index: 564999 entries, 0 to 37901
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Period        564999 non-null  int64  
 1   ReporterISO   564999 non-null  object 
 2   ReporterDesc  564999 non-null  object 
 3   PartnerISO    564999 non-null  object 
 4   PartnerDesc   564999 non-null  object 
 5   PrimaryValue  564999 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 30.2+ MB


### Get trade volumn data

In [4]:
# Get the world monthly trade data

world_trade_data = monthly_TradeData[monthly_TradeData['PartnerDesc'] == 'World']
world_trade_data = world_trade_data[['Period','ReporterDesc','PrimaryValue']]

# Group by 'Period' and sum the 'PrimaryValue' within each period
trade_volume = world_trade_data.groupby('Period')['PrimaryValue'].sum().reset_index()

# Assuming 'Period' is currently an integer column
trade_volume['Period'] = pd.to_datetime(trade_volume['Period'], format='%Y%m')

# Extract year and month
trade_volume['year'] = trade_volume['Period'].dt.year
trade_volume['month'] = trade_volume['Period'].dt.month

# Format 'Period' in the desired way
trade_volume['period'] = trade_volume['Period'].dt.strftime('%YM%m')

# Drop unnecessary columns
trade_volume = trade_volume[['period', 'PrimaryValue']]

trade_volume.to_csv('../data/processed/trade_volume.csv',index=False)

### Monthly trade data

In [5]:
# Pre-processing data

# Define a function to strip whitespace for categorical values
def strip_categorical(value):
    if isinstance(value, str):
        return value.strip()
    return value

def replace_and_delete_countries(df):
    # Apply the function to the entire DataFrame
    df = df.applymap(strip_categorical)
    
    # Replace "Other Asia, nes" (S19) to Taiwan,China (TWM)
    df['ReporterISO'] = df['ReporterISO'].replace('S19', 'TWN')
    df.loc[df['ReporterISO'] == 'TWN', 'ReporterDesc'] = 'China, Taiwan'
    
    df['PartnerISO'] = df['PartnerISO'].replace('S19', 'TWN')
    df.loc[df['PartnerISO'] == 'TWN', 'PartnerDesc'] = 'China, Taiwan'
    
    # Delete other desc containing ', nes'
    # Replace values containing ', nes' with NaN
    df.loc[df['ReporterDesc'].str.contains(', nes', case=False, na=False), 'ReporterDesc'] = np.nan
    df.loc[df['PartnerDesc'].str.contains(', nes', case=False, na=False), 'PartnerDesc'] = np.nan
    # Delete rows containing Nan
    df = df.dropna()
    
    # Delete 'Free Zone', 'Bunkers', 'Special Categories', 'World', 'Neutral zone' 
    to_delete = ['Free Zone','Free Zones','Bunkers','Special Categories','World','Neutral Zone']
    
    # Delete rows where source is in the list of countries
    df = df[~df['ReporterDesc'].isin(to_delete)]
    # Delete rows where target is in the list of countries
    df = df[~df['PartnerDesc'].isin(to_delete)]

    return df


In [6]:
monthly_TradeData = replace_and_delete_countries(monthly_TradeData)
monthly_TradeData.info()

# Save to csv
monthly_TradeData.to_csv('../data/processed/monthly_TradeData.csv',index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 545537 entries, 0 to 37901
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Period        545537 non-null  int64  
 1   ReporterISO   545537 non-null  object 
 2   ReporterDesc  545537 non-null  object 
 3   PartnerISO    545537 non-null  object 
 4   PartnerDesc   545537 non-null  object 
 5   PrimaryValue  545537 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 29.1+ MB


### Get the country list

In [7]:
# Creaet Appendix for country name and ISO code
def get_country_list(df):
    df = df[['ReporterISO','ReporterDesc','PartnerISO','PartnerDesc']]

    # Obtain the countries in the links
    countries_code = pd.concat([df[['ReporterISO']].rename(columns={'ReporterISO': 'ISO'}),df[['PartnerISO']].rename(columns={'PartnerISO': 'ISO'})]).drop_duplicates()
    
    # Match company info with selected companies
    countries = pd.merge(countries_code,df,left_on='ISO', right_on='ReporterISO', how='left')
    countries = countries[['ISO','ReporterDesc']].rename(columns={'ReporterDesc':'Name'}).drop_duplicates()
    
    countries = pd.merge(countries, df, left_on='ISO',right_on='PartnerISO', how='left')
    countries = countries[['ISO','PartnerDesc']].rename(columns={'PartnerDesc':'Name'}).drop_duplicates()
    
    return countries



In [8]:
countries = get_country_list(monthly_TradeData)
countries.to_csv('../data/processed/countries.csv',index=False, encoding='utf')

In [9]:
countries.info()


<class 'pandas.core.frame.DataFrame'>
Index: 238 entries, 0 to 545525
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ISO     238 non-null    object
 1   Name    238 non-null    object
dtypes: object(2)
memory usage: 5.6+ KB


### Read EPU Data https://www.policyuncertainty.com

In [156]:
# Define the range of rows you want to read (37:321)
epu = pd.read_excel('../data/raw/Global_Policy_Uncertainty_Data.xlsx').iloc[36:321]

# Assuming df is your DataFrame
epu['period'] = epu['Year'].astype(str) + 'M' + epu['Month'].astype(int).apply(lambda x: f'{x:02d}')

# Select columns
epu = epu[["period", "GEPU_current"]]

epu.to_csv('../data/processed/epu.csv', index=False, encoding='utf')

In [157]:
epu

Unnamed: 0,period,GEPU_current
36,2000M01,63.738693
37,2000M02,58.820887
38,2000M03,61.289983
39,2000M04,64.557214
40,2000M05,87.164595
...,...,...
316,2023M05,228.814279
317,2023M06,231.438806
318,2023M07,228.340938
319,2023M08,213.460434


### Get feature data

TPU -- Trade policy uncertainty index https://www.policyuncertainty.com/global_monthly.html

In [186]:
tpu = pd.read_excel('../data/raw/tpu_web_latest.xlsx', sheet_name='TPU_MONTHLY', index_col=None)
tpu = tpu[['DATE','TPU']]

# rename 'month' to 'period'
tpu = tpu.rename(columns={'DATE': 'period'})

# Keep rows only when 'period' matches the format '2000M01' and is between 2000M01 and 2023M09
tpu['period'] = pd.to_datetime(tpu['period']).dt.strftime('%YM%m')
tpu = tpu[(tpu['period'] >= '2000M01') & (tpu['period'] <= '2023M09')]

In [196]:
tpu.info()

<class 'pandas.core.frame.DataFrame'>
Index: 285 entries, 480 to 764
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   period  285 non-null    object 
 1   TPU     285 non-null    float64
dtypes: float64(1), object(1)
memory usage: 6.7+ KB


GPR -- Geopolitical Risk Index https://www.matteoiacoviello.com/gpr.htm

In [184]:
gpr = pd.read_excel('../data/raw/data_gpr_export.xls', sheet_name='Data', index_col=None)

# rename 'month' to 'period'
gpr = gpr.rename(columns={'month': 'period'})

# Keep rows only when 'period' matches the format '2000M01' and is between 2000M01 and 2023M09
gpr['period'] = pd.to_datetime(gpr['period']).dt.strftime('%YM%m')
gpr = gpr[(gpr['period'] >= '2000M01') & (gpr['period'] <= '2023M09')]


In [198]:
gpr.info()

Unnamed: 0,period,GPR,GPRT,GPRA,GPRH,GPRHT,GPRHA,SHARE_GPR,N10,SHARE_GPRH,...,GPRHC_SWE,GPRHC_THA,GPRHC_TUN,GPRHC_TUR,GPRHC_TWN,GPRHC_UKR,GPRHC_USA,GPRHC_VEN,GPRHC_VNM,GPRHC_ZAF
1200,2000M01,64.457809,65.573601,64.250412,52.966461,60.844524,49.488827,1.933337,33362.0,1.910726,...,0.048170,0.040141,0.000000,0.056198,0.056198,0.024085,1.742132,0.056198,0.016057,0.072254
1201,2000M02,63.541721,57.651360,68.082169,54.460442,58.056553,52.426670,1.905860,32951.0,1.964620,...,0.119068,0.025515,0.000000,0.068039,0.136078,0.025515,1.828542,0.017010,0.034019,0.068039
1202,2000M03,50.101986,55.030735,40.816406,39.381897,52.421207,27.266571,1.502751,36533.0,1.420673,...,0.023038,0.015359,0.015359,0.023038,0.230379,0.038397,1.328521,0.015359,0.061434,0.099831
1203,2000M04,48.682739,53.304905,40.917027,39.660149,53.651951,27.073410,1.460182,33352.0,1.430710,...,0.041590,0.016636,0.008318,0.016636,0.058227,0.041590,1.380802,0.024954,0.049909,0.066545
1204,2000M05,79.482880,77.269699,83.227814,60.203789,68.131752,56.935181,2.383997,35193.0,2.171807,...,0.024221,0.000000,0.024221,0.024221,0.096884,0.032295,2.026482,0.016147,0.056515,0.137252
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1480,2023M05,109.578575,125.211044,92.322296,85.072197,109.005470,69.918442,3.286682,15791.0,3.068916,...,0.125628,0.000000,0.017947,0.520459,0.233309,1.758794,3.068916,0.053841,0.125628,0.179469
1481,2023M06,114.317734,135.593628,96.421173,102.859367,143.600098,90.577881,3.428828,15603.0,3.710575,...,0.204082,0.018553,0.000000,0.204082,0.445269,2.523191,3.673469,0.018553,0.111317,0.185529
1482,2023M07,107.562927,124.753227,91.610710,92.695450,127.522690,77.385178,3.226225,15405.0,3.343919,...,0.410984,0.037362,0.000000,0.597796,0.485709,2.036241,3.325238,0.018681,0.130768,0.112087
1483,2023M08,105.108551,123.208466,85.469650,75.499611,106.828003,54.451187,3.152609,15733.0,2.723592,...,0.110416,0.000000,0.036805,0.312845,0.184026,1.472212,2.686787,0.018403,0.055208,0.165624


GEM -- Global economic monitor index https://databank.worldbank.org/source/global-economic-monitor-(gem)#

In [202]:
gem = pd.read_excel('../data/raw/P_Data_Extract_From_Global_Economic_Monitor_(GEM).xlsx', sheet_name='Data', index_col=None)
gem = gem.transpose().iloc[3:]

# Make the first row the columns
gem.columns = gem.iloc[0]
gem = gem[1:]

# Move the existing index to a regular column
gem.reset_index(inplace=True)

# Rename the column 'Series Code' to 'period'
gem = gem.rename(columns={'index': 'period'})

In [203]:
# Keep only the part before the blank in the rows under column 'period'
gem['period'] = gem['period'].str.split().str[0]

# Keep rows only when 'period' matches the format '2000M01' and is between 2000M01 and 2023M09
gem = gem[gem['period'].str.match(r'^\d{4}M\d{2}$')]
gem = gem[(gem['period'] >= '2000M01') & (gem['period'] <= '2023M09')]

# Reset the index
gem = gem.reset_index(drop=True)

# Replace '..' to NA
gem.replace('..', np.nan, inplace=True)

# Keep the columns
gem = gem[['period','CPTOTSAXMZGY', 'CPTOTSAXN','DXGSRMRCHSACD', 'DMGSRMRCHSACD', 'IPTOTSAKD', 'IMPCOV', 'NEER', 'REER', 'RETSALESSA', 'TOTRESV' ,'UNEMPSA_']] 

In [204]:
gem.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   period         285 non-null    object 
 1   CPTOTSAXMZGY   285 non-null    float64
 2   CPTOTSAXN      285 non-null    float64
 3   DXGSRMRCHSACD  285 non-null    float64
 4   DMGSRMRCHSACD  285 non-null    float64
 5   IPTOTSAKD      285 non-null    float64
 6   IMPCOV         285 non-null    float64
 7   NEER           285 non-null    float64
 8   REER           285 non-null    float64
 9   RETSALESSA     285 non-null    float64
 10  TOTRESV        285 non-null    float64
 11  UNEMPSA_       285 non-null    float64
dtypes: float64(11), object(1)
memory usage: 26.8+ KB


Construct feature matrix

In [210]:
# merge tpu, gpr, gem into the features data
merged_data = pd.merge(tpu, gpr, on='period')
features = pd.merge(merged_data, gem, on='period')

# save to csv
features.to_csv('../data/processed/features.csv', index=False)

In [212]:
features.head()

Unnamed: 0,period,TPU,GPR,GPRT,GPRA,GPRH,GPRHT,GPRHA,SHARE_GPR,N10,...,CPTOTSAXN,DXGSRMRCHSACD,DMGSRMRCHSACD,IPTOTSAKD,IMPCOV,NEER,REER,RETSALESSA,TOTRESV,UNEMPSA_
0,2000M01,32.117348,64.457809,65.573601,64.250412,52.966461,60.844524,49.488827,1.933337,33362.0,...,76.089042,484771.067659,784349.502592,996722300000.0,2.476091,883.756875,104.048961,61.870157,1942120.0,8.614909
1,2000M02,28.175856,63.541721,57.65136,68.082169,54.460442,58.056553,52.42667,1.90586,32951.0,...,76.318397,496700.992153,943444.35281,1010232000000.0,2.063868,896.988312,104.329262,62.339465,1947145.0,8.620393
2,2000M03,36.838506,50.101986,55.030735,40.816406,39.381897,52.421207,27.266571,1.502751,36533.0,...,76.49801,497995.743462,846133.068963,1014447000000.0,2.330691,903.555596,104.515925,62.045241,1972075.0,8.500496
3,2000M04,35.28104,48.682739,53.304905,40.917027,39.660149,53.651951,27.07341,1.460182,33352.0,...,76.454511,482444.39569,860083.949923,1018283000000.0,2.299383,910.134906,104.679618,60.628472,1977663.0,8.448069
4,2000M05,49.327563,79.48288,77.269699,83.227814,60.203789,68.131752,56.935181,2.383997,35193.0,...,76.617165,506354.66755,894550.760808,1029954000000.0,2.225832,931.339312,105.262834,60.970381,1991119.0,8.425447
