In [14]:
import pandas as pd
import numpy as np


In [15]:
# read IC trade data from 1996-2021
df1 = pd.read_csv('../data/raw/TradeData8542_1.csv',encoding='unicode_escape')
df2 = pd.read_csv('../data/raw/TradeData8542_2.csv',encoding='unicode_escape')
df3 = pd.read_csv('../data/raw/TradeData8542_3.csv',encoding='unicode_escape')

# Concate the data frames
df = pd.concat([df1,df2,df3])
df = df[(df['Period']<=2021)]

# Keep the necessary columns
df = df[['Period','ReporterISO','ReporterDesc','PartnerISO','PartnerDesc','PrimaryValue']]
df

Unnamed: 0,Period,ReporterISO,ReporterDesc,PartnerISO,PartnerDesc,PrimaryValue
0,1998,ALB,Albania,W00,World,30646.000
1,1998,ALB,Albania,DEU,Germany,30646.000
2,1998,DZA,Algeria,W00,World,1766.000
3,1998,DZA,Algeria,FRA,France,681.000
4,1998,DZA,Algeria,GRC,Greece,541.000
...,...,...,...,...,...,...
71868,2014,ZMB,Zambia,W00,World,2876.389
71869,2014,ZMB,Zambia,COD,Dem. Rep. of the Congo,565.609
71870,2014,ZMB,Zambia,ITA,Italy,51.359
71871,2014,ZMB,Zambia,ZAF,South Africa,1673.942


In [16]:
print(df['Period'].unique())

[1998 1997 1996 2010 2009 2008 2007 2006 2005 2004 2003 2002 2001 2000
 1999 2011 2012 2013 2021 2020 2019 2018 2017 2016 2015 2014]


In [17]:
# Pre-processing data

# Define a function to strip whitespace for categorical values
def strip_categorical(value):
    if isinstance(value, str):
        return value.strip()
    return value

# Apply the function to the entire DataFrame
df = df.applymap(strip_categorical)

# Replace "Other Asia, nes" (S19) to Taiwan,China (TWM)
df['ReporterISO'] = df['ReporterISO'].replace('S19', 'TWN')
df.loc[df['ReporterISO'] == 'TWN', 'ReporterDesc'] = 'China, Taiwan'

df['PartnerISO'] = df['PartnerISO'].replace('S19', 'TWN')
df.loc[df['PartnerISO'] == 'TWN', 'PartnerDesc'] = 'China, Taiwan'

# Delete other desc containing ', nes'
# Replace values containing ', nes' with NaN
df.loc[df['ReporterDesc'].str.contains(', nes', case=False, na=False), 'ReporterDesc'] = np.nan
df.loc[df['PartnerDesc'].str.contains(', nes', case=False, na=False), 'PartnerDesc'] = np.nan
# Delete rows containing Nan
df = df.dropna()

# Delete 'Free Zone', 'Bunkers', 'Special Categories', 'Southern African Customs Union (...1999)'
to_delete = ['Free Zone','Free Zones','Bunkers','Special Categories','Southern African Customs Union (...1999)']

# Delete rows where source is in the list of countries
df = df[~df['ReporterDesc'].isin(to_delete)]
# Delete rows where target is in the list of countries
df = df[~df['PartnerDesc'].isin(to_delete)]


In [5]:
# Extract total export trade statistics of countries
df_world = df[df['PartnerISO']=='W00']
df_world.to_csv('../data/processed/world_export.csv', index=False)

In [6]:
# Delete 'World'
to_delete = ['World']

# Delete rows where source is in the list of countries
df = df[~df['ReporterDesc'].isin(to_delete)]
# Delete rows where target is in the list of countries
df = df[~df['PartnerDesc'].isin(to_delete)]

In [7]:
# Extract Taiwan export data
df_taiwan = df[df['ReporterISO'] == 'TWN']
df_taiwan.to_csv('../data/processed/export_tw.csv', index=False)

In [8]:
df_taiwan['Period'].unique()

array([1998, 1997, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002,
       2001, 2000, 1999, 2011, 2012, 2013, 2021, 2020, 2019, 2018, 2017,
       2016, 2015, 2014])

In [9]:
df.info() #120324
df.to_csv('../data/processed/trade.csv',index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 130324 entries, 1 to 71872
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Period        130324 non-null  int64  
 1   ReporterISO   130324 non-null  object 
 2   ReporterDesc  130324 non-null  object 
 3   PartnerISO    130324 non-null  object 
 4   PartnerDesc   130324 non-null  object 
 5   PrimaryValue  130324 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 7.0+ MB


In [10]:
# Read gravity data and export gravity involving Taiwan
gravity =pd.read_csv('../data/raw//Gravity_V202211.csv')
# keep data after year 2014
gravity = gravity[(gravity['year']>=1996)]
gravity['year'] = gravity['year'].astype(str)

# consider gravity involving Taiwan
gravity_tw = gravity[gravity['iso3_o']=='TWN']

gravity_tw = gravity_tw[['year','iso3_d','dist','gdp_source_o', 'gdp_source_d','gdp_ppp_o','gdp_ppp_d','fta_wto']]
gravity_tw.to_csv('../data/processed/gravity_tw.csv', index=False)

In [11]:
# Creaet Appendix for country name and ISO code
df = df[['ReporterISO','ReporterDesc','PartnerISO','PartnerDesc']]

# Obtain the countries in the links
countries_code = pd.concat([df[['ReporterISO']].rename(columns={'ReporterISO': 'ISO'}),df[['PartnerISO']].rename(columns={'PartnerISO': 'ISO'})]).drop_duplicates()

# Match company info with selected companies
countries = pd.merge(countries_code,df,left_on='ISO', right_on='ReporterISO', how='left')
countries = countries[['ISO','ReporterDesc']].rename(columns={'ReporterDesc':'Name'}).drop_duplicates()

countries = pd.merge(countries, df, left_on='ISO',right_on='PartnerISO', how='left')
countries = countries[['ISO','PartnerDesc']].rename(columns={'PartnerDesc':'Name'}).drop_duplicates()

countries.to_csv('../data/processed/countries.csv', index=False)