In [1]:
import pandas as pd
import os

In [2]:
# Set this to the folder where your CSVs are stored
DATA_DIR = "ForParticipants/csv_data_extracted"

# File names
FILES = {
    'usa_2023': 'trade_s_usa_state_m_hs_2023.csv',
    'usa_2024': 'trade_s_usa_state_m_hs_2024.csv',
    'chn_2023': 'trade_s_chn_m_hs_2023.csv',
    'chn_2024': 'trade_s_chn_m_hs_2024.csv'
}

# Number of rows to extract for sample
SAMPLE_ROWS = 5000

In [4]:
SAMPLE_ROWS = 5000

def extract_sample(file_path, sample_rows=SAMPLE_ROWS):
    chunks = pd.read_csv(
        file_path,
        chunksize=100000,
        dtype=str,
        low_memory=False,
        on_bad_lines='skip',  # Skips corrupt lines
        engine='c'            # Revert to default C engine for chunking
    )
    
    sample = []
    total_rows = 0
    for chunk in chunks:
        sample.append(chunk)
        total_rows += len(chunk)
        if total_rows >= sample_rows:
            break
    df_sample = pd.concat(sample).head(sample_rows).reset_index(drop=True)
    return df_sample

In [5]:
# Output directory for sample files
os.makedirs("samples", exist_ok=True)

# Extract samples
for label, filename in FILES.items():
    full_path = os.path.join(DATA_DIR, filename)
    print(f"Extracting from {filename} ...")
    df_sample = extract_sample(full_path)
    output_file = f"samples/sample_{label}.csv"
    df_sample.to_csv(output_file, index=False)
    print(f"Saved sample to {output_file}")

Extracting from trade_s_usa_state_m_hs_2023.csv ...
Saved sample to samples/sample_usa_2023.csv
Extracting from trade_s_usa_state_m_hs_2024.csv ...
Saved sample to samples/sample_usa_2024.csv
Extracting from trade_s_chn_m_hs_2023.csv ...
Saved sample to samples/sample_chn_2023.csv
Extracting from trade_s_chn_m_hs_2024.csv ...
Saved sample to samples/sample_chn_2024.csv


In [11]:
df_data = pd.read_parquet("harmonized_trade_data.parquet")
df_data.head()

Unnamed: 0,origin,destination,hs6,hs4,trade_flow,month,value
0,,JPN,710410,7104,Exports,2023-01-01,646
1,,JPN,710491,7104,Exports,2023-01-01,3426
2,,JPN,711311,7113,Exports,2023-01-01,26
3,,JPN,711419,7114,Exports,2023-01-01,124
4,,JPN,711590,7115,Exports,2023-01-01,50


In [9]:
print(df_data["origin"].value_counts())

Series([], Name: count, dtype: int64)


In [12]:
df_data.size

585775904