This notebook cleans datasets on tariffs, trade, macroeconomic indicators, and income classifications, then merges them into a single country-year panel dataset.

The following datasets are used:  
Tariffs: World Bank's World Integrated Trade Solution (WITS), UNCTAD TRAINS database. The analysis uses weighted average tariffs, defined as import-value-weighted ad valorem averages by year.  
Trade: World Bank. Measures exports and imports of goods and services as a percentage of GDP.  
Macroeconomic indicators: Penn World Table (PWT) version 11.0. Provides macroeconomic variables such as real GDP and population.  
Income groups: World Bank's income classification. Assigns each country an annual income group based on its gross national income per capita.

In [44]:
# Import packages
import pandas as pd
import numpy as np
from pathlib import Path

In [45]:
# Load the four raw inputs from the raw-data folder.
# Tariffs, trade and income groups are CSV files; the Penn World Table is an Excel workbook.
data_dir = Path("data/raw")
tariffs = pd.read_csv(data_dir.joinpath("tariffs.csv"))
trade = pd.read_csv(data_dir.joinpath("trade.csv"))
pwt = pd.read_excel(data_dir.joinpath("pwt110.xlsx"))
income_groups = pd.read_csv(data_dir.joinpath("income_groups.csv"))

# Data Exploration

Inspect each dataset's countries, time range, and variables. The aim is to identify the countries and years common to all datasets, which determines the scope of the final analysis.

In [46]:
# Inspect the tariff data: start with the available columns
print(tariffs.columns)

# Distinct reporting countries
countries_trf = tariffs['Reporter Name'].unique()
print(f'Total countries: {len(countries_trf)}')

# Distinct tariff years in ascending order, then the span they cover
years_trf = tariffs['Tariff Year'].sort_values(ascending=True).unique()
print(f'Time range: {years_trf.min()}-{years_trf.max()}')
print(f'Total years: {len(years_trf)}')

Index(['Reporter', 'Product', 'Partner', 'Tariff Year', 'DutyType',
       'Selected Nomen', 'Native Nomen', 'Reporter Name', 'Product Name',
       'Partner Name', 'Trade Year', 'Trade Source', 'Simple Average',
       'Simple Tariff Line Average', 'Weighted Average', 'Variance',
       'Standard Deviation', 'Sum Of Rates', 'Sum Of SAvgRates',
       'Count_Of_SAvgRates_Cases', 'Sum_Of_Squared_Rates', 'Minimum Rate',
       'Maximum Rate', 'Nbr of AVE Lines', 'Nbr of NA Lines',
       'Nbr of Free Lines', 'Nbr of Dutiable Lines', 'Nbr of Total Lines',
       'Nbr of DomesticPeaks', 'Nbr of InternationalPeaks', 'Nbr Line 0 to 5',
       'Nbr Line 5 to 10', 'Nbr Line 10 to 20', 'Nbr Line 20 to 50',
       'Nbr Line 50 to 100', 'Nbr Line more than 100', 'SumRateByWghtTrdValue',
       'SumWghtTrdValue4NotNull', 'Imports Value in 1000 USD',
       'Free Imports in 1000 USD', 'Dutiable Imports in 1000 USD',
       'Specific Duty Imports in 1000 USD', 'Binding Coverage'],
      dtype='objec

In [47]:
# Inspect the trade data: start with the available columns
print(trade.columns)

# Distinct countries
countries_trd = trade['Country Name'].unique()
print(f'Total countries: {len(countries_trd)}')

# Year columns are labelled like '1975 [YR1975]': the first whitespace-separated
# token of the column name is the year, so keep the all-digit leading tokens.
leading_tokens = [col.split()[0] for col in trade.columns]
year_cols = [token for token in leading_tokens if token.isdigit()]
years_trd = sorted(map(int, year_cols))
print(f"Time range: {min(years_trd)}–{max(years_trd)}")
print(f"Total years: {len(years_trd)}")

Index(['Series Name', 'Series Code', 'Country Name', 'Country Code',
       '1975 [YR1975]', '1976 [YR1976]', '1977 [YR1977]', '1978 [YR1978]',
       '1979 [YR1979]', '1980 [YR1980]', '1981 [YR1981]', '1982 [YR1982]',
       '1983 [YR1983]', '1984 [YR1984]', '1985 [YR1985]', '1986 [YR1986]',
       '1987 [YR1987]', '1988 [YR1988]', '1989 [YR1989]', '1990 [YR1990]',
       '1991 [YR1991]', '1992 [YR1992]', '1993 [YR1993]', '1994 [YR1994]',
       '1995 [YR1995]', '1996 [YR1996]', '1997 [YR1997]', '1998 [YR1998]',
       '1999 [YR1999]', '2000 [YR2000]', '2001 [YR2001]', '2002 [YR2002]',
       '2003 [YR2003]', '2004 [YR2004]', '2005 [YR2005]', '2006 [YR2006]',
       '2007 [YR2007]', '2008 [YR2008]', '2009 [YR2009]', '2010 [YR2010]',
       '2011 [YR2011]', '2012 [YR2012]', '2013 [YR2013]', '2014 [YR2014]',
       '2015 [YR2015]', '2016 [YR2016]', '2017 [YR2017]', '2018 [YR2018]',
       '2019 [YR2019]', '2020 [YR2020]', '2021 [YR2021]', '2022 [YR2022]',
       '2023 [YR2023]', '2024 [

In [48]:
# Inspect the Penn World Table: start with the available columns
print(pwt.columns)

# Distinct countries (in order of first appearance)
countries_pwt = pd.unique(pwt['country'])
print(f'Total countries: {len(countries_pwt)}')

# Distinct years and the span they cover
years_pwt = pd.unique(pwt['year'])
print(f'Time range: {years_pwt.min()}-{years_pwt.max()}')
print(f'Total years: {len(years_pwt)}')

Index(['countrycode', 'country', 'currency_unit', 'year', 'rgdpe', 'rgdpo',
       'pop', 'emp', 'avh', 'hc', 'ccon', 'cda', 'cgdpe', 'cgdpo', 'cn', 'ck',
       'ctfp', 'cwtfp', 'rgdpna', 'rconna', 'rdana', 'rnna', 'rkna', 'rtfpna',
       'rwtfpna', 'labsh', 'irr', 'delta', 'xr', 'pl_con', 'pl_da', 'pl_gdpo',
       'i_cig', 'i_xm', 'i_xr', 'i_outlier', 'i_irr', 'cor_exp', 'csh_c',
       'csh_i', 'csh_g', 'csh_x', 'csh_m', 'csh_r', 'pl_c', 'pl_i', 'pl_g',
       'pl_x', 'pl_m', 'pl_n', 'pl_k'],
      dtype='object')
Total countries: 185
Time range: 1950-2023
Total years: 74


In [49]:
# Inspect the income classification data: start with the available columns
print(income_groups.columns)

# Distinct entities (in order of first appearance)
countries_inc = pd.unique(income_groups['Entity'])
print(f'Total countries: {len(countries_inc)}')

# Distinct classification years and the span they cover
years_inc = pd.unique(income_groups['Year'])
print(f'Time range: {years_inc.min()}-{years_inc.max()}')
print(f'Total years: {len(years_inc)}')

Index(['Entity', 'Code', 'Year', 'World Bank's income classification'], dtype='object')
Total countries: 226
Time range: 1987-2024
Total years: 38


In [50]:
# Countries present in all four datasets (matched on the country name as
# spelled in each source, so differently spelled names do not match).
countries_intersection = sorted(
    set(countries_trf) & set(countries_trd) & set(countries_pwt) & set(countries_inc)
)
print(f'Countries in all datasets: {len(countries_intersection)}')

# Years present in all four datasets. The income classification years are
# included here as well, so the reported period really is the overlap of
# every source rather than only tariffs, trade and PWT.
years_intersection = set(years_trf) & set(years_trd) & set(years_pwt) & set(years_inc)
years_intersection = sorted(int(y) for y in years_intersection)
# Only the first and last common year are printed; gaps inside the range are not reported.
print(f"Time period in all datasets: {years_intersection[0]}–{years_intersection[-1]}")

Countries in all datasets: 144
Time period in all datasets: 1988–2023


# Data Cleaning

In [51]:
# Clean tariff data: keep only the country/year keys and the two tariff
# measures used later, under short snake_case names.
tariff_column_names = {
    'Reporter Name': 'country',
    'Tariff Year': 'year',
    'Weighted Average': 'tariff',
    'Standard Deviation': 'tariff_sd',
}
tariffs = tariffs[list(tariff_column_names)].rename(columns=tariff_column_names)

In [52]:
# Clean trade data
# The raw file is wide: one column per year, labelled like '1980 [YR1980]'.
raw_year_cols = [col for col in trade.columns if '[' in col]

# Strip the bracketed suffix so each year column is named by its year only
clean_year_map = {col: col.split(' ')[0] for col in raw_year_cols}
trade = trade.rename(columns = clean_year_map)

# Year columns are now exactly the all-digit column names
year_columns = [col for col in trade.columns if col.isdigit()]

# Reshape from wide (one column per year) to long (one row per country-year)
trade = trade.melt(
    id_vars = 'Country Name',
    value_vars = year_columns,
    var_name = 'Year',
    value_name = 'trade_gdp'
)

trade['Year'] = trade['Year'].astype(int)

# Force the value column to numeric. World Bank DataBank exports typically mark
# missing observations with the string '..' (NOTE(review): confirm against the
# raw file). Left as text, those entries would not count as missing in a later
# dropna and the column would stay object-typed. Non-numeric entries become NaN;
# the call is a no-op if the column is already numeric.
trade['trade_gdp'] = pd.to_numeric(trade['trade_gdp'], errors='coerce')

In [53]:
# Clean income group data
income_groups = income_groups.rename(columns = {'Entity': 'country',
                                                'Year': 'year',
                                                'World Bank\'s income classification': 'income_group'})
income_groups = income_groups.drop(columns = ['Code'])

In [54]:
# Clean PWT data
pwt = pwt.filter(['country', 'year', 'rgdpe', 'rgdpo', 'pop'])
pwt = pwt.sort_values(['country', 'year'])

# Compute Real GDP per capita
pwt['rgdp_pc'] = pwt['rgdpe'] / pwt['pop']

# Compute GDP per capita growth
pwt['log_gdppc_growth'] = (np.log(pwt['rgdp_pc']) 
                                - np.log(pwt.groupby('country')['rgdp_pc'].shift(1))
                                ) * 100


# Merge datasets

In [55]:
# Rename columns
trade = trade.rename(columns={'Country Name': 'country', 'Year': 'year'})

# Merge datasets 
panel = pwt.merge(trade, on=['country', 'year'], how='left')
panel = panel.merge(tariffs, on=['country', 'year'], how='left')
panel = panel.merge(income_groups, on=['country', 'year'], how='left')

# Drop rows with missing data
panel = panel.dropna(subset=['log_gdppc_growth', 'tariff', 'trade_gdp', 'income_group'])
panel = panel.reset_index(drop=True)

# Keep relevant variables
panel = panel[['country', 'year', 'log_gdppc_growth', 'tariff', 'tariff_sd', 'trade_gdp', 'income_group']]

In [56]:
# Export dataset
output_dir = Path('data/processed')
output_dir.mkdir(parents=True, exist_ok=True)
panel.to_csv(output_dir / 'panel_data.csv', index=False)