In [1]:
import pandas as pd

In [2]:
# Trim the combined trade extract down to the five columns the rest of the
# notebook uses.
#
# NOTE: this cell rewrites data/combined_trade_data.csv IN PLACE (the next
# cell reads that same path), so the extra columns of the original extract
# are gone after the first run. Keep a copy of the raw download elsewhere if
# it needs to be recoverable.
TRADE_COLUMNS = ["period", "reporterCode", "partnerCode", "cmdCode", "primaryValue"]

df = pd.read_csv(
    "data/combined_trade_data.csv",
    # Parse only what we keep instead of loading every column and discarding.
    usecols=TRADE_COLUMNS,
    # Commodity codes are identifiers, not numbers: reading them as strings
    # keeps leading zeros intact when the file is written back out.
    dtype={"cmdCode": str},
)[TRADE_COLUMNS]  # usecols keeps file order; re-index to pin the column order

df.to_csv("data/combined_trade_data.csv", index=False)


In [6]:
# Attach a one-digit BEC category to every trade row via the HS -> BEC mapping.

# Read the mapping file; the HS code column is read as text so leading zeros
# survive, then both columns are renamed to match the trade data.
hsbec_df = pd.read_csv("HSBEC.csv", dtype={"From HS 2022": str})
hsbec_df.columns = ["cmdCode", "bec"]

# Normalise HS codes to 6 characters by left-padding with zeros
hsbec_df["cmdCode"] = hsbec_df["cmdCode"].str.zfill(6)

# First character of the BEC code is the top-level category
hsbec_df["bec_1digit"] = hsbec_df["bec"].astype(str).str[0]

# Build a lookup with exactly one row per cmdCode. If the mapping file lists
# an HS code more than once, merging on it directly would duplicate trade rows
# and inflate primaryValue in every later sum; the first listed category wins.
bec_lookup = hsbec_df[["cmdCode", "bec_1digit"]].drop_duplicates(subset="cmdCode")

# Read the trade data with cmdCode as text and pad it the same way
df = pd.read_csv("data/combined_trade_data.csv", dtype={"cmdCode": str})
df["cmdCode"] = df["cmdCode"].str.zfill(6)

# Left merge keeps every trade row; validate= makes pandas raise if the lookup
# ever stops being unique on cmdCode instead of silently multiplying rows.
df = df.merge(bec_lookup, on="cmdCode", how="left", validate="many_to_one")

# Keep only the columns we need
df = df[
    ["period", "reporterCode", "partnerCode", "cmdCode", "primaryValue", "bec_1digit"]
]

# Codes with no BEC match go to category 8. Fill with the string "8" so the
# column holds one type before the integer cast.
df["bec_1digit"] = df["bec_1digit"].fillna("8").astype(int)

# Save to csv
df.to_csv("data/combined_trade_data_with_bec.csv", index=False)

In [8]:
# Restrict the BEC-tagged trade data to periods from 2006 onwards and discard
# the commodity code, which is no longer needed once the BEC category is set.
df = (
    pd.read_csv("data/combined_trade_data_with_bec.csv")
    .loc[lambda frame: frame["period"] >= 2006]
    .drop(columns="cmdCode")
)

# Persist the filtered data
df.to_csv("data/trade_data_06_23.csv", index=False)

In [9]:
# Load the filtered trade rows
df = pd.read_csv("data/trade_data_06_23.csv")

# Total primaryValue per (period, partner, reporter, BEC category), ordered by
# those same four keys
group_keys = ["period", "partnerCode", "reporterCode", "bec_1digit"]
grouped_df = (
    df.groupby(group_keys, as_index=False)["primaryValue"]
    .sum()
    .sort_values(group_keys)
)

# Write the aggregated table
grouped_df.to_csv("data/trade_data_grouped.csv", index=False)

In [13]:
# Build one row per (period, partnerCode, reporterCode) holding the share of
# each one-digit BEC category in that pair's trade, plus the pair's total.
pair_keys = ["period", "partnerCode", "reporterCode"]
bec_categories = list(range(1, 9))  # BEC categories 1-8 always get a column

df = pd.read_csv("data/trade_data_grouped.csv")

# Total trade for each country pair and year
totals = (
    df.groupby(pair_keys)["primaryValue"]
    .sum()
    .reset_index()
    .rename(columns={"primaryValue": "total_trade"})
)

# Attach each pair's total to its category rows; totals has one row per pair,
# and validate= makes pandas raise if that ever stops being true.
df = df.merge(totals, on=pair_keys, validate="many_to_one")

# Share of the pair's trade in each category.
# NOTE(review): a pair whose total is 0 produces NaN here (0/0); with the sum
# aggregation below it ends up with all-zero shares.
df["proportion"] = df["primaryValue"] / df["total_trade"]

# Spread categories into columns. aggfunc="sum" (not the default mean) so that
# shares are added, never averaged, should the input hold repeated rows for
# the same pair and category.
pivot_df = df.pivot_table(
    index=pair_keys,
    columns="bec_1digit",
    values="proportion",
    aggfunc="sum",
    fill_value=0,
)

# Fix the column layout: categories 1-8 in order (absent ones filled with 0),
# followed by any other category values found in the data, left unrenamed.
other_categories = [cat for cat in pivot_df.columns if cat not in bec_categories]
pivot_df = (
    pivot_df.reindex(columns=bec_categories + other_categories, fill_value=0)
    .rename(columns={cat: f"bec_{cat}_prop" for cat in bec_categories})
    .reset_index()
    # Add the pair total back and give it its final name
    .merge(totals, on=pair_keys)
    .rename(columns={"total_trade": "total_trade_volume"})
    .sort_values(pair_keys)
)

# Save to csv
pivot_df.to_csv("data/trade_data_bec_proportions.csv", index=False)

In [17]:
# Numeric reporter/partner code (as text) -> ISO3 country code
country_codes = {
    "702": "SGP",  # Singapore
    "156": "CHN",  # China
    "458": "MYS",  # Malaysia
    "842": "USA",  # United States
    "344": "HKG",  # Hong Kong
    "360": "IDN",  # Indonesia
    "410": "KOR",  # Republic of Korea
    "392": "JPN",  # Japan
    "764": "THA",  # Thailand
    "36": "AUS",  # Australia
    "704": "VNM",  # Vietnam
    "699": "IND",  # India
    "784": "ARE",  # United Arab Emirates
    "608": "PHL",  # Philippines
    "276": "DEU",  # Germany
    "251": "FRA",  # France
    "757": "CHE",  # Switzerland
    "528": "NLD",  # Netherlands
}

df = pd.read_csv("data/trade_data_bec_proportions.csv")

# Translate the numeric codes to ISO3; a code missing from country_codes
# becomes NaN in the ISO3 column.
df = df.assign(
    reporter_iso3=df["reporterCode"].astype(str).map(country_codes),
    partner_iso3=df["partnerCode"].astype(str).map(country_codes),
)

# Final layout: period, the two ISO3 columns, then every BEC share column in
# its existing order. The numeric codes and the total volume are removed.
bec_cols = [name for name in df.columns if "bec_" in name]
df = df.drop(columns=["reporterCode", "partnerCode", "total_trade_volume"])[
    ["period", "partner_iso3", "reporter_iso3", *bec_cols]
]

# Write the cleaned dataset
df.to_csv("data/cleaned_comtrade_data.csv", index=False)