In [397]:
import pandas as pd
import numpy as np
import os
from os import listdir

import matplotlib.pyplot as plt

### Read the monthly trade data from Jan 2000 to Sep 2023

The time range is that the trade data for most countries are updated till Sep 2023

In [398]:
# Specify the path to the folder containing CSV files
folder_path = '../data/raw/MonthlyTradeData'

# Get a list of all files in the folder
files = [file for file in os.listdir(folder_path)]

# Create an empty list to store DataFrames
dfs = []

# Loop through each CSV file and read it into a DataFrame
for file in files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path, encoding='unicode_escape')
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame if needed
monthly_TradeData = pd.concat(dfs)

monthly_TradeData = monthly_TradeData[(monthly_TradeData['Period']<=202309)]


# Keep the necessary columns
monthly_TradeData = monthly_TradeData[['Period','ReporterISO','ReporterDesc','PartnerISO','PartnerDesc','PrimaryValue']]

# Save to csv
monthly_TradeData.to_csv('../data/processed/monthly_TradeData.csv',index=False)

In [399]:
monthly_TradeData.info()

<class 'pandas.core.frame.DataFrame'>
Index: 564999 entries, 0 to 37901
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Period        564999 non-null  int64  
 1   ReporterISO   564999 non-null  object 
 2   ReporterDesc  564999 non-null  object 
 3   PartnerISO    564999 non-null  object 
 4   PartnerDesc   564999 non-null  object 
 5   PrimaryValue  564999 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 30.2+ MB


### Monthly trade data

In [401]:
# Pre-processing data

# Define a function to strip whitespace for categorical values
def strip_categorical(value):
    if isinstance(value, str):
        return value.strip()
    return value

def replace_and_delete_countries(df):
    # Apply the function to the entire DataFrame
    df = df.applymap(strip_categorical)
    
    # Replace "Other Asia, nes" (S19) to Taiwan,China (TWM)
    df['ReporterISO'] = df['ReporterISO'].replace('S19', 'TWN')
    df.loc[df['ReporterISO'] == 'TWN', 'ReporterDesc'] = 'China, Taiwan'
    
    df['PartnerISO'] = df['PartnerISO'].replace('S19', 'TWN')
    df.loc[df['PartnerISO'] == 'TWN', 'PartnerDesc'] = 'China, Taiwan'
    
    # Delete other desc containing ', nes'
    # Replace values containing ', nes' with NaN
    df.loc[df['ReporterDesc'].str.contains(', nes', case=False, na=False), 'ReporterDesc'] = np.nan
    df.loc[df['PartnerDesc'].str.contains(', nes', case=False, na=False), 'PartnerDesc'] = np.nan
    # Delete rows containing Nan
    df = df.dropna()
    
    # Delete 'Free Zone', 'Bunkers', 'Special Categories', 'World', 'Neutral zone' 
    to_delete = ['Free Zone','Free Zones','Bunkers','Special Categories','World','Neutral Zone']
    
    # Delete rows where source is in the list of countries
    df = df[~df['ReporterDesc'].isin(to_delete)]
    # Delete rows where target is in the list of countries
    df = df[~df['PartnerDesc'].isin(to_delete)]

    return df


In [402]:
monthly_TradeData = replace_and_delete_countries(monthly_TradeData)
monthly_TradeData.info()

# Save to csv
monthly_TradeData.to_csv('../data/processed/monthly_TradeData.csv',index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 545537 entries, 0 to 37901
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Period        545537 non-null  int64  
 1   ReporterISO   545537 non-null  object 
 2   ReporterDesc  545537 non-null  object 
 3   PartnerISO    545537 non-null  object 
 4   PartnerDesc   545537 non-null  object 
 5   PrimaryValue  545537 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 29.1+ MB


### Get the country list

In [403]:
# Creaet Appendix for country name and ISO code
def get_country_list(df):
    df = df[['ReporterISO','ReporterDesc','PartnerISO','PartnerDesc']]

    # Obtain the countries in the links
    countries_code = pd.concat([df[['ReporterISO']].rename(columns={'ReporterISO': 'ISO'}),df[['PartnerISO']].rename(columns={'PartnerISO': 'ISO'})]).drop_duplicates()
    
    # Match company info with selected companies
    countries = pd.merge(countries_code,df,left_on='ISO', right_on='ReporterISO', how='left')
    countries = countries[['ISO','ReporterDesc']].rename(columns={'ReporterDesc':'Name'}).drop_duplicates()
    
    countries = pd.merge(countries, df, left_on='ISO',right_on='PartnerISO', how='left')
    countries = countries[['ISO','PartnerDesc']].rename(columns={'PartnerDesc':'Name'}).drop_duplicates()
    
    return countries



In [404]:
countries = get_country_list(monthly_TradeData)
countries.to_csv('../data/processed/countries.csv',index=False, encoding='utf')

In [405]:
countries.info()


<class 'pandas.core.frame.DataFrame'>
Index: 238 entries, 0 to 545525
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ISO     238 non-null    object
 1   Name    238 non-null    object
dtypes: object(2)
memory usage: 5.6+ KB



Read EPU Data https://www.policyuncertainty.com/global_monthly.html

In [406]:
# Define the range of rows you want to read (37:321)
epu = pd.read_excel('../data/raw/Global_Policy_Uncertainty_Data.xlsx').iloc[36:321]

# Assuming df is your DataFrame
epu['period'] = epu['Year'].astype(str) + 'M' + epu['Month'].astype(int).apply(lambda x: f'{x:02d}')

# Select columns
epu = epu[["period", "GEPU_current"]]

epu.to_csv('../data/processed/epu.csv', index=False, encoding='utf')

In [407]:
epu

Unnamed: 0,period,GEPU_current
36,2000M01,63.738693
37,2000M02,58.820887
38,2000M03,61.289983
39,2000M04,64.557214
40,2000M05,87.164595
...,...,...
316,2023M05,228.814279
317,2023M06,231.438806
318,2023M07,228.340938
319,2023M08,213.460434


### Get feature data

TPU -- Trade policy uncertainty index https://www.policyuncertainty.com/trade_cimpr.html

In [408]:
tpu = pd.read_excel('../data/raw/tpu_web_latest.xlsx', sheet_name='TPU_MONTHLY', index_col=None)
tpu = tpu[['DATE','TPU']]

# rename 'month' to 'period'
tpu = tpu.rename(columns={'DATE': 'period'})

# Keep rows only when 'period' matches the format '2000M01' and is between 2000M01 and 2023M09
tpu['period'] = pd.to_datetime(tpu['period']).dt.strftime('%YM%m')
tpu = tpu[(tpu['period'] >= '2000M01') & (tpu['period'] <= '2023M09')]

In [409]:
tpu.info()

<class 'pandas.core.frame.DataFrame'>
Index: 285 entries, 480 to 764
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   period  285 non-null    object 
 1   TPU     285 non-null    float64
dtypes: float64(1), object(1)
memory usage: 6.7+ KB


GPR -- Geopolitical Risk Index https://www.matteoiacoviello.com/gpr.htm

In [484]:
gpr = pd.read_excel('../data/raw/data_gpr_export.xls', sheet_name='Data', index_col=None)

# rename 'month' to 'period'
gpr = gpr.rename(columns={'month': 'period'})

# Keep rows only when 'period' matches the format '2000M01' and is between 2000M01 and 2023M09
gpr['period'] = pd.to_datetime(gpr['period']).dt.strftime('%YM%m')
gpr = gpr[(gpr['period'] >= '2000M01') & (gpr['period'] <= '2023M09')]

# don't keep country specific index
gpr = gpr[['period', 'GPR']]

In [485]:
gpr

Unnamed: 0,month,GPR,GPRT,GPRA,GPRH,GPRHT,GPRHA,SHARE_GPR,N10,SHARE_GPRH,...,GPRHC_SWE,GPRHC_THA,GPRHC_TUN,GPRHC_TUR,GPRHC_TWN,GPRHC_UKR,GPRHC_USA,GPRHC_VEN,GPRHC_VNM,GPRHC_ZAF
0,1900-01-01,,,,87.927849,64.717491,110.453522,,,3.171932,...,0.012947,0.077680,0.000000,0.038840,0.000000,0.000000,2.718799,0.051787,0.012947,1.152253
1,1900-02-01,,,,86.566490,71.936844,96.250488,,,3.122822,...,0.000000,0.083647,0.000000,0.125471,0.000000,0.000000,2.732469,0.027882,0.000000,1.143176
2,1900-03-01,,,,72.140701,57.475853,84.499428,,,2.602422,...,0.012883,0.115949,0.000000,0.180366,0.000000,0.000000,2.151507,0.025767,0.000000,0.863180
3,1900-04-01,,,,54.419449,37.326603,65.858208,,,1.963141,...,0.000000,0.040064,0.000000,0.066774,0.000000,0.000000,1.776175,0.000000,0.000000,0.641026
4,1900-05-01,,,,64.405197,48.200008,74.373955,,,2.323370,...,0.000000,0.163043,0.000000,0.081522,0.000000,0.000000,1.970109,0.013587,0.000000,0.788043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1483,2023-08-01,105.108551,123.208466,85.469650,75.499611,106.828003,54.451187,3.152609,15733.0,2.723592,...,0.110416,0.000000,0.036805,0.312845,0.184026,1.472212,2.686787,0.018403,0.055208,0.165624
1484,2023-09-01,99.504120,127.283379,69.159340,69.533272,96.874405,56.809864,2.984511,15882.0,2.508361,...,0.148644,0.055741,0.074322,0.483092,0.204385,1.374954,2.508361,0.074322,0.074322,0.111483
1485,2023-10-01,201.619873,208.637039,224.572159,139.391083,170.954269,142.826645,6.047354,16007.0,5.028430,...,0.177683,0.017768,0.071073,0.390903,0.319829,1.812367,4.992893,0.071073,0.035537,0.017768
1486,2023-11-01,156.189804,145.355194,186.503296,137.351929,146.582382,162.898315,4.684732,15241.0,4.954868,...,0.019205,0.249664,0.000000,0.211254,0.345688,1.229115,4.954868,0.000000,0.230459,0.000000


GEM -- Global economic monitor index https://databank.worldbank.org/source/global-economic-monitor-(gem)#

In [482]:
gem = pd.read_excel('../data/raw/P_Data_Extract_From_Global_Economic_Monitor_(GEM).xlsx', sheet_name='Data', index_col=None)
gem = gem.transpose().iloc[3:]

# Make the first row the columns
gem.columns = gem.iloc[0]
gem = gem[1:]

# Move the existing index to a regular column
gem.reset_index(inplace=True)

# Rename the column 'Series Code' to 'period'
gem = gem.rename(columns={'index': 'period'})


In [483]:
# Keep only the part before the blank in the rows under column 'period'
gem['period'] = gem['period'].str.split().str[0]

# Keep rows only when 'period' matches the format '2000M01' and is between 2000M01 and 2023M09
gem = gem[gem['period'].str.match(r'^\d{4}M\d{2}$')]
gem = gem[(gem['period'] >= '2000M01') & (gem['period'] <= '2023M09')]

# Reset the index
gem = gem.reset_index(drop=True)

# Replace '..' to NA
gem.replace('..', np.nan, inplace=True)

# Keep the columns
gem = gem[['period','CPTOTSAXN','DXGSRMRCHSACD', 'DMGSRMRCHSACD', 'IPTOTSAKD', 'IMPCOV', 'NEER', 'REER', 'RETSALESSA', 'TOTRESV' ,'UNEMPSA_']] 


In [414]:
gem.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   period         285 non-null    object 
 1   CPTOTSAXN      285 non-null    float64
 2   DXGSRMRCHSACD  285 non-null    float64
 3   DMGSRMRCHSACD  285 non-null    float64
 4   IPTOTSAKD      285 non-null    float64
 5   IMPCOV         285 non-null    float64
 6   NEER           285 non-null    float64
 7   REER           285 non-null    float64
 8   RETSALESSA     285 non-null    float64
 9   TOTRESV        285 non-null    float64
 10  UNEMPSA_       285 non-null    float64
dtypes: float64(10), object(1)
memory usage: 24.6+ KB


In [415]:
gem.head()

Series Code,period,CPTOTSAXN,DXGSRMRCHSACD,DMGSRMRCHSACD,IPTOTSAKD,IMPCOV,NEER,REER,RETSALESSA,TOTRESV,UNEMPSA_
0,2000M01,76.089042,484771.067659,784349.502592,996722300000.0,2.476091,883.756875,104.048961,61.870157,1942120.0,8.614909
1,2000M02,76.318397,496700.992153,943444.35281,1010232000000.0,2.063868,896.988312,104.329262,62.339465,1947145.0,8.620393
2,2000M03,76.49801,497995.743462,846133.068963,1014447000000.0,2.330691,903.555596,104.515925,62.045241,1972075.0,8.500496
3,2000M04,76.454511,482444.39569,860083.949923,1018283000000.0,2.299383,910.134906,104.679618,60.628472,1977663.0,8.448069
4,2000M05,76.617165,506354.66755,894550.760808,1029954000000.0,2.225832,931.339312,105.262834,60.970381,1991119.0,8.425447


Construct feature matrix

In [416]:
# merge tpu, gpr, gem into the features data
merged_data = pd.merge(tpu, gpr, on='period')
features = pd.merge(merged_data, gem, on='period')

# save to csv
features.to_csv('../data/processed/features.csv', index=False)

In [417]:
features.head()

Unnamed: 0,period,TPU,GPR,CPTOTSAXN,DXGSRMRCHSACD,DMGSRMRCHSACD,IPTOTSAKD,IMPCOV,NEER,REER,RETSALESSA,TOTRESV,UNEMPSA_
0,2000M01,32.117348,64.457809,76.089042,484771.067659,784349.502592,996722300000.0,2.476091,883.756875,104.048961,61.870157,1942120.0,8.614909
1,2000M02,28.175856,63.541721,76.318397,496700.992153,943444.35281,1010232000000.0,2.063868,896.988312,104.329262,62.339465,1947145.0,8.620393
2,2000M03,36.838506,50.101986,76.49801,497995.743462,846133.068963,1014447000000.0,2.330691,903.555596,104.515925,62.045241,1972075.0,8.500496
3,2000M04,35.28104,48.682739,76.454511,482444.39569,860083.949923,1018283000000.0,2.299383,910.134906,104.679618,60.628472,1977663.0,8.448069
4,2000M05,49.327563,79.48288,76.617165,506354.66755,894550.760808,1029954000000.0,2.225832,931.339312,105.262834,60.970381,1991119.0,8.425447


In [418]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   period         285 non-null    object 
 1   TPU            285 non-null    float64
 2   GPR            285 non-null    float64
 3   CPTOTSAXN      285 non-null    float64
 4   DXGSRMRCHSACD  285 non-null    float64
 5   DMGSRMRCHSACD  285 non-null    float64
 6   IPTOTSAKD      285 non-null    float64
 7   IMPCOV         285 non-null    float64
 8   NEER           285 non-null    float64
 9   REER           285 non-null    float64
 10  RETSALESSA     285 non-null    float64
 11  TOTRESV        285 non-null    float64
 12  UNEMPSA_       285 non-null    float64
dtypes: float64(12), object(1)
memory usage: 29.1+ KB


test

In [506]:
df = pd.read_csv('../data/raw/IPHITEK2S.csv', encoding='unicode_escape')

# rename 'month' to 'period'
df = df.rename(columns={'DATE': 'period','IPHITEK2S':'ip'})

# Keep rows only when 'period' matches the format '2000M01' and is between 2000M01 and 2023M09
df['period'] = pd.to_datetime(df['period']).dt.strftime('%YM%m')
df = df[(df['period'] >= '2000M01') & (df['period'] <= '2023M09')]

df.to_csv('../data/processed/ip_index.csv', index = False)
df

Unnamed: 0,period,ip
396,2000M01,13.8021
397,2000M02,14.2730
398,2000M03,14.7545
399,2000M04,15.2583
400,2000M05,15.8426
...,...,...
676,2023M05,136.5650
677,2023M06,137.4773
678,2023M07,141.5320
679,2023M08,143.2088
