In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

### Read the monthly trade data from Jan 2000 to Sep 2023

The time range is that the trade data for most countries are updated till Sep 2023

In [2]:
# Specify the path to the folder containing CSV files
folder_path = '../data/raw/MonthlyTradeData'

# Get a list of all files in the folder
files = [file for file in os.listdir(folder_path)]

# Create an empty list to store DataFrames
dfs = []

# Loop through each CSV file and read it into a DataFrame
for file in files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path, encoding='unicode_escape')
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame if needed
monthly_TradeData = pd.concat(dfs)

monthly_TradeData = monthly_TradeData[(monthly_TradeData['Period']<=202309)]


# Keep the necessary columns
monthly_TradeData = monthly_TradeData[['Period','ReporterISO','ReporterDesc','PartnerISO','PartnerDesc','PrimaryValue']]

# Save to csv
monthly_TradeData.to_csv('../data/processed/monthly_TradeData.csv',index=False)

In [3]:
monthly_TradeData.info()

<class 'pandas.core.frame.DataFrame'>
Index: 564999 entries, 0 to 37901
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Period        564999 non-null  int64  
 1   ReporterISO   564999 non-null  object 
 2   ReporterDesc  564999 non-null  object 
 3   PartnerISO    564999 non-null  object 
 4   PartnerDesc   564999 non-null  object 
 5   PrimaryValue  564999 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 30.2+ MB


### Get trade volumn data

In [4]:
# Get the world monthly trade data

world_trade_data = monthly_TradeData[monthly_TradeData['PartnerDesc'] == 'World']
world_trade_data = world_trade_data[['Period','ReporterDesc','PrimaryValue']]

# Group by 'Period' and sum the 'PrimaryValue' within each period
trade_volume = world_trade_data.groupby('Period')['PrimaryValue'].sum().reset_index()

# Assuming 'Period' is currently an integer column
trade_volume['Period'] = pd.to_datetime(trade_volume['Period'], format='%Y%m')

# Extract year and month
trade_volume['year'] = trade_volume['Period'].dt.year
trade_volume['month'] = trade_volume['Period'].dt.month

# Format 'Period' in the desired way
trade_volume['period'] = trade_volume['Period'].dt.strftime('%YM%m')

# Drop unnecessary columns
trade_volume = trade_volume[['period', 'PrimaryValue']]

trade_volume.to_csv('../data/processed/trade_volume.csv',index=False)

### Monthly trade data

In [5]:
# Pre-processing data

# Define a function to strip whitespace for categorical values
def strip_categorical(value):
    if isinstance(value, str):
        return value.strip()
    return value

def replace_and_delete_countries(df):
    # Apply the function to the entire DataFrame
    df = df.applymap(strip_categorical)
    
    # Replace "Other Asia, nes" (S19) to Taiwan,China (TWM)
    df['ReporterISO'] = df['ReporterISO'].replace('S19', 'TWN')
    df.loc[df['ReporterISO'] == 'TWN', 'ReporterDesc'] = 'China, Taiwan'
    
    df['PartnerISO'] = df['PartnerISO'].replace('S19', 'TWN')
    df.loc[df['PartnerISO'] == 'TWN', 'PartnerDesc'] = 'China, Taiwan'
    
    # Delete other desc containing ', nes'
    # Replace values containing ', nes' with NaN
    df.loc[df['ReporterDesc'].str.contains(', nes', case=False, na=False), 'ReporterDesc'] = np.nan
    df.loc[df['PartnerDesc'].str.contains(', nes', case=False, na=False), 'PartnerDesc'] = np.nan
    # Delete rows containing Nan
    df = df.dropna()
    
    # Delete 'Free Zone', 'Bunkers', 'Special Categories', 'World', 'Neutral zone' 
    to_delete = ['Free Zone','Free Zones','Bunkers','Special Categories','World','Neutral Zone']
    
    # Delete rows where source is in the list of countries
    df = df[~df['ReporterDesc'].isin(to_delete)]
    # Delete rows where target is in the list of countries
    df = df[~df['PartnerDesc'].isin(to_delete)]

    return df


In [6]:
monthly_TradeData = replace_and_delete_countries(monthly_TradeData)
monthly_TradeData.info()

# Save to csv
monthly_TradeData.to_csv('../data/processed/monthly_TradeData.csv',index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 545537 entries, 0 to 37901
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Period        545537 non-null  int64  
 1   ReporterISO   545537 non-null  object 
 2   ReporterDesc  545537 non-null  object 
 3   PartnerISO    545537 non-null  object 
 4   PartnerDesc   545537 non-null  object 
 5   PrimaryValue  545537 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 29.1+ MB


### Get the country list

In [7]:
# Creaet Appendix for country name and ISO code
def get_country_list(df):
    df = df[['ReporterISO','ReporterDesc','PartnerISO','PartnerDesc']]

    # Obtain the countries in the links
    countries_code = pd.concat([df[['ReporterISO']].rename(columns={'ReporterISO': 'ISO'}),df[['PartnerISO']].rename(columns={'PartnerISO': 'ISO'})]).drop_duplicates()
    
    # Match company info with selected companies
    countries = pd.merge(countries_code,df,left_on='ISO', right_on='ReporterISO', how='left')
    countries = countries[['ISO','ReporterDesc']].rename(columns={'ReporterDesc':'Name'}).drop_duplicates()
    
    countries = pd.merge(countries, df, left_on='ISO',right_on='PartnerISO', how='left')
    countries = countries[['ISO','PartnerDesc']].rename(columns={'PartnerDesc':'Name'}).drop_duplicates()
    
    return countries



In [8]:
countries = get_country_list(monthly_TradeData)
countries.to_csv('../data/processed/countries.csv',index=False, encoding='utf')

In [9]:
countries.info()


<class 'pandas.core.frame.DataFrame'>
Index: 238 entries, 0 to 545525
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ISO     238 non-null    object
 1   Name    238 non-null    object
dtypes: object(2)
memory usage: 5.6+ KB


### Read EPU DATA

In [10]:
# Define the range of rows you want to read (37:321)
epu = pd.read_excel('../data/raw/Global_Policy_Uncertainty_Data.xlsx').iloc[36:321]

# Assuming df is your DataFrame
epu['period'] = epu['Year'].astype(str) + 'M' + epu['Month'].astype(int).apply(lambda x: f'{x:02d}')

# Select columns
epu = epu[["period", "GEPU_current","GEPU_ppp"]]

epu.to_csv('../data/processed/epu.csv', index=False, encoding='utf')

In [11]:
epu

Unnamed: 0,period,GEPU_current,GEPU_ppp
36,2000M01,63.738693,61.930168
37,2000M02,58.820887,53.231096
38,2000M03,61.289983,58.165654
39,2000M04,64.557214,61.171080
40,2000M05,87.164595,84.551517
...,...,...,...
316,2023M05,228.814279,227.546243
317,2023M06,231.438806,240.770257
318,2023M07,228.340938,238.605431
319,2023M08,213.460434,226.492244


### Get economical indicators

In [126]:
gem = pd.read_excel('../data/raw/P_Data_Extract_From_Global_Economic_Monitor_(GEM).xlsx', sheet_name='Data', index_col=None)
gem = gem.transpose().iloc[3:]

# Make the first row the columns
gem.columns = gem.iloc[0]
gem = gem[1:]

# Move the existing index to a regular column
gem.reset_index(inplace=True)

# Rename the column 'Series Code' to 'period'
gem = gem.rename(columns={'index': 'period'})

In [129]:
# Keep only the part before the blank in the rows under column 'period'
gem['period'] = gem['period'].str.split().str[0]

# Keep rows only when 'period' matches the format '2000M01' and is between 2000M01 and 2023M09
gem = gem[gem['period'].str.match(r'^\d{4}M\d{2}$')]
gem = gem[(gem['period'] >= '2000M01') & (gem['period'] <= '2023M09')]

# Reset the index
gem = gem.reset_index(drop=True)

# Replace '..' to NA
gem.replace('..', np.nan, inplace=True)

# Drop the columns with missing values
gem = gem.drop(gem.columns[[1, 2, 4, 7, 8, 15, 16, 17, 18, 29, 32, 33, 34, 37, 38, 39, 40, 41]], axis=1) 

In [130]:
gem.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   period         285 non-null    object 
 1   CPTOTSAXMZGY   285 non-null    float64
 2   CPTOTSAXN      285 non-null    float64
 3   CPTOTNSXN      285 non-null    float64
 4   DXGSRMRCHNSKD  284 non-null    float64
 5   DXGSRMRCHSAKD  284 non-null    float64
 6   DXGSRMRCHNSCD  284 non-null    float64
 7   DXGSRMRCHSACD  285 non-null    float64
 8   DXGSRMRCHNSXD  284 non-null    float64
 9   DXGSRMRCHSAXD  284 non-null    float64
 10  DMGSRMRCHNSKD  285 non-null    float64
 11  DMGSRMRCHSAKD  285 non-null    float64
 12  DMGSRMRCHNSCD  285 non-null    float64
 13  DMGSRMRCHSACD  285 non-null    float64
 14  DMGSRMRCHNSXD  285 non-null    float64
 15  DMGSRMRCHSAXD  285 non-null    float64
 16  IPTOTSAKD      285 non-null    float64
 17  IPTOTNSKD      285 non-null    float64
 18  IMPCOV    

In [118]:
gem

Series Code,period,CPTOTSAXMZGY,CPTOTSAXN,CPTOTNSXN,DXGSRMRCHNSKD,DXGSRMRCHSAKD,DXGSRMRCHNSCD,DXGSRMRCHSACD,DXGSRMRCHNSXD,DXGSRMRCHSAXD,...,DMGSRMRCHNSXD,DMGSRMRCHSAXD,IPTOTSAKD,IPTOTNSKD,IMPCOV,NEER,REER,RETSALESSA,TOTRESV,UNEMPSA_
0,2000M01,3.331233,76.089042,76.187607,603570.494027,646992.754705,549745.057492,484771.067659,0.910822,0.749268,...,1.158197,1.169108,996722297161.402954,1006717398677.819946,2.476091,883.756875,104.048961,61.870157,1942120.491463,8.614909
1,2000M02,3.217972,76.318397,76.498747,645874.06741,668974.209085,581562.534648,496700.992153,0.900427,0.742482,...,1.240184,1.33128,1010231591451.280029,1052488832529.719971,2.063868,896.988312,104.329262,62.339465,1947144.763624,8.620393
2,2000M03,3.349364,76.49801,76.722562,712067.80501,675380.405799,648577.497194,497995.743462,0.910837,0.737356,...,1.181816,1.183658,1014446901200.689941,1132079427767.77002,2.330691,903.555596,104.515925,62.045241,1972074.537912,8.500496
3,2000M04,3.309353,76.454511,76.79724,646830.553236,656800.385386,587409.248873,482444.39569,0.908135,0.734537,...,1.142846,1.249991,1018282923026.209961,1035701575222.280029,2.299383,910.134906,104.679618,60.628472,1977662.800024,8.448069
4,2000M05,3.26087,76.617165,76.893728,689188.09898,696843.801889,618193.695929,506354.66755,0.896988,0.72664,...,1.256758,1.203989,1029953678352.949951,1081396014216.920044,2.225832,931.339312,105.262834,60.970381,1991119.35196,8.425447
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,2023M05,6.162588,160.950844,161.714278,1672907.064792,1424598.443873,2578162.677328,1818976.591413,1.541127,1.276835,...,2.999744,1.694048,1850681471999.689941,1877598944154.570068,4.21073,103.076979,104.164061,101.669815,13319501.540237,5.756757
281,2023M06,5.505415,161.872466,162.730666,1747828.860253,1428851.360884,2609381.557185,1783450.870157,1.492927,1.248171,...,2.998843,1.743142,1857853598008.159912,1983728714628.689941,4.255415,103.082491,104.06007,101.581524,13367343.690657,5.67182
282,2023M07,4.663212,163.743367,164.561039,1715651.111318,1414302.241717,2482137.645621,1776108.339534,1.446761,1.25582,...,2.694866,1.545504,1854203312227.719971,1834557463089.27002,4.417223,102.642779,103.388379,101.394009,13443345.99006,5.74869
283,2023M08,4.83871,165.781302,166.253043,1784765.211186,1437152.288776,2488170.323064,1803101.451389,1.394116,1.254635,...,2.915684,1.643972,1867221764955.860107,1819872012895.429932,4.241427,103.092791,103.754428,101.496583,13327210.132698,5.781676


In [None]:
..