In [1]:
import pandas as pd
import os

In [2]:
# Folder that holds the raw CSV exports; change it to match your local layout
DATA_DIR = "ForParticipants/csv_data_extracted"

# Sample label -> source CSV file name (insertion order is the processing order)
FILES = dict([
    ('usa_2023', 'trade_s_usa_state_m_hs_2023.csv'),
    ('usa_2024', 'trade_s_usa_state_m_hs_2024.csv'),
    ('chn_2023', 'trade_s_chn_m_hs_2023.csv'),
    ('chn_2024', 'trade_s_chn_m_hs_2024.csv'),
])

# How many rows each sample file should contain
SAMPLE_ROWS = 5000

In [4]:
# Default sample size used by extract_sample when no explicit count is given
SAMPLE_ROWS = 5000

def extract_sample(file_path, sample_rows=SAMPLE_ROWS):
    """Return the first ``sample_rows`` parseable rows of a CSV file.

    Every column is read as ``str`` so codes with leading zeros are kept
    verbatim. Malformed lines are skipped rather than raising.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to sample.
    sample_rows : int, default SAMPLE_ROWS
        Maximum number of rows to return; must be non-negative
        (pandas raises ``ValueError`` for a negative ``nrows``).

    Returns
    -------
    pandas.DataFrame
        At most ``sample_rows`` rows with a fresh 0..n-1 index.
    """
    # nrows stops the parser as soon as enough rows were read, so only the
    # sample itself is parsed (instead of whole 100,000-row chunks) and the
    # file handle is closed by read_csv before returning.
    df_sample = pd.read_csv(
        file_path,
        nrows=sample_rows,
        dtype=str,
        low_memory=False,
        on_bad_lines='skip',  # Skips corrupt lines
        engine='c'
    )
    # Drop any index inferred from the file so callers always get a plain
    # positional index.
    return df_sample.reset_index(drop=True)

In [5]:
# Make sure the destination folder for the sample files exists
os.makedirs("samples", exist_ok=True)

# Produce one sample CSV per configured source file
for label in FILES:
    filename = FILES[label]
    full_path = os.path.join(DATA_DIR, filename)
    print(f"Extracting from {filename} ...")
    df_sample = extract_sample(full_path)
    output_file = f"samples/sample_{label}.csv"
    # index=False keeps the positional index out of the written file
    df_sample.to_csv(output_file, index=False)
    print(f"Saved sample to {output_file}")

Extracting from trade_s_usa_state_m_hs_2023.csv ...
Saved sample to samples/sample_usa_2023.csv
Extracting from trade_s_usa_state_m_hs_2024.csv ...
Saved sample to samples/sample_usa_2024.csv
Extracting from trade_s_chn_m_hs_2023.csv ...
Saved sample to samples/sample_chn_2023.csv
Extracting from trade_s_chn_m_hs_2024.csv ...
Saved sample to samples/sample_chn_2024.csv


In [7]:
# Load the harmonized trade table from its parquet file
df_data = pd.read_parquet(
    "ForParticipants/csv_data_extracted/harmonized_trade_data.parquet"
)
# Preview the frame with its final five rows left out
df_data.iloc[:-5]

Unnamed: 0,origin,destination,hs6,hs4,trade_flow,month,value
0,CHN,\N,030359,0303,Exports,2023-01-01,45000
1,CHN,\N,070310,0703,Exports,2023-01-01,2200
2,CHN,\N,070320,0703,Exports,2023-01-01,600
3,CHN,\N,190230,1902,Exports,2023-01-01,28917
4,CHN,\N,200870,2008,Exports,2023-01-01,11520
...,...,...,...,...,...,...,...
29982696,USA,ZWE,880710,8807,Exports,2024-12-01,0
29982697,USA,ZWE,901480,9014,Exports,2024-12-01,0
29982698,USA,ZWE,901819,9018,Exports,2024-12-01,0
29982699,USA,ZWE,902300,9023,Exports,2024-12-01,0


In [21]:
# Number of rows per HS4 code
print(
    df_data["hs4"]
    .value_counts()
)

hs4
8708    253090
6204    243772
9405    198836
8536    192985
9403    187561
         ...  
5104       144
2705       140
8908       132
0205       120
1203        56
Name: count, Length: 1230, dtype: int64


In [14]:
# Total number of cells (rows x columns) -- note this is not the row count
len(df_data.index) * len(df_data.columns)

209878942

In [12]:
# Distinct destination codes, in order of first appearance
print(pd.unique(df_data["destination"]))

['\\N' 'ABW' 'AGO' 'ANT' 'ARE' 'ARG' 'ARM' 'ATG' 'AUT' 'AZE' 'BDI' 'BEL'
 'BEN' 'BFA' 'BGD' 'BGR' 'BHR' 'BHS' 'BIH' 'BLR' 'BLZ' 'BMU' 'BOL' 'BRA'
 'BRB' 'BRN' 'BTN' 'BWA' 'CAF' 'CAN' 'CHE' 'CHN' 'CIV' 'CMR' 'COD' 'COG'
 'COM' 'CPV' 'CRI' 'CUB' 'CUW' 'CYM' 'DJI' 'DMA' 'DNK' 'DOM' 'DZA' 'ESP'
 'EST' 'FIN' 'FJI' 'FRA' 'FSM' 'GLP' 'GRC' 'HRV' 'HUN' 'IDN' 'IND' 'IRL'
 'IRN' 'ISL' 'ITA' 'JPN' 'KAZ' 'KGZ' 'KHM' 'KIR' 'LAO' 'LBN' 'LKA' 'MAC'
 'MDG' 'MDV' 'MHL' 'MLI' 'MMR' 'MNG' 'MOZ' 'MRT' 'MUS' 'MWI' 'MYT' 'NAM'
 'NCL' 'NER' 'NFK' 'NGA' 'NOR' 'NRU' 'NZL' 'PAK' 'PHL' 'PLW' 'PNG' 'POL'
 'PRK' 'PRT' 'PSE' 'PYF' 'QAT' 'REU' 'ROU' 'RUS' 'RWA' 'SDN' 'SEN' 'SLB'
 'SLV' 'SPM' 'SWE' 'TCA' 'THA' 'TJK' 'TKM' 'TLS' 'TON' 'TTO' 'TUR' 'TUV'
 'UKR' 'USA' 'VAT' 'VUT' 'WLF' 'WSM' 'AUS' 'CHL' 'COL' 'CYP' 'CZE' 'DEU'
 'EGY' 'ERI' 'ESH' 'ETH' 'FRO' 'GAB' 'GBR' 'GEO' 'GHA' 'GIN' 'GRD' 'GRL'
 'GTM' 'HKG' 'HND' 'HTI' 'IRQ' 'ISR' 'JAM' 'JOR' 'KNA' 'KOR' 'LCA' 'LIE'
 'LTU' 'LUX' 'LVA' 'MAF' 'MCO' 'MDA' 'MEX' 'MKD' 'M