In [27]:
import pandas as pd
import numpy as np
import os

In [28]:
dta_dir = "./src/xx/"
data_name = []
for i in range(1,11):
    if i < 10:
        i = "0" + str(i)
    data_name.append(os.path.join(dta_dir, str(i) + "_.csv"))

In [29]:
def process_code(code):
    try:
        return str(code)[:2]
    except Exception:
        return None


def sord(df):
    new_df = pd.DataFrame(columns=['time', 'code', 'IM_PQ', 'IM_Q', 'EX_PQ', 'EX_Q'])

    pd.DataFrame(columns=['time', 'code', 'IM_PQ', 'IM_Q', 'EX_PQ', 'EX_Q'])
    
    # 处理进口金额
    import_amount = df[df['指标'] == '进口金额（美元）']
    import_amount = import_amount[['时间', '商品', '数值']].rename(columns={'时间': 'time', '商品': 'code', '数值': 'IM_PQ'})
    
    # 处理进口量（仅第一数量）
    import_quantity = df[df['指标'] == '进口数量（第一数量）']
    import_quantity = import_quantity[['时间', '商品', '数值']].rename(columns={'时间': 'time', '商品': 'code', '数值': 'IM_Q'})
    
    # 处理出口金额
    export_amount = df[df['指标'] == '出口金额（美元）']
    export_amount = export_amount[['时间', '商品', '数值']].rename(columns={'时间': 'time', '商品': 'code', '数值': 'EX_PQ'})
    
    # 处理出口量（仅第一数量）
    export_quantity = df[df['指标'] == '出口数量（第一数量）']
    export_quantity = export_quantity[['时间', '商品', '数值']].rename(columns={'时间': 'time', '商品': 'code', '数值': 'EX_Q'})
    
    # 合并数据
    new_df = (
        import_amount
        .merge(export_amount, on=['time', 'code'], how='outer')
        .merge(import_quantity, on=['time', 'code'], how='outer')
        .merge(export_quantity, on=['time', 'code'], how='outer')
    )
    
    # 填充缺失值为 "空"
    new_df = new_df.fillna(pd.NA)    
    new_df.head()
    return new_df

def remove2024(df):
    df['year'] = df['year'].astype(int)
    df = df[df['year'] != 2024]
    return df

In [30]:
# 初始化一个空列表来存储每个 DataFrame
dataframes = []

# 循环读取每个文件并处理
for file in data_name:
    df = pd.read_csv(file, encoding='gbk')[:-2]  # 读取文件并删除最后两行
    df['商品'] = df['商品'].str[:2]  # 仅保留 '商品' 列的前两位
    p_df = sord(df)
    dataframes.append(p_df)  # 将处理后的 DataFrame 添加到列表中

# 合并所有 DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

In [31]:
# 防止除以零或 NaN 的情况，计算 IM_P
combined_df["IM_P"] = combined_df.apply(
    lambda row: row["IM_PQ"] / row["IM_Q"] if row["IM_Q"] not in [0, np.nan] else np.nan, axis=1
)

# 防止除以零或 NaN 的情况，计算 EX_P
combined_df["EX_P"] = combined_df.apply(
    lambda row: row["EX_PQ"] / row["IM_Q"] if row["IM_Q"] not in [0, np.nan] else np.nan, axis=1
)

# 从 time 列提取年份并创建 year 列
combined_df["year"] = combined_df["time"].str[-4:]
combined_df = remove2024(combined_df)

In [32]:
gdp = pd.read_csv(os.path.join(dta_dir, "gdp.csv"), encoding='utf-8')

In [33]:
# 确保 'year' 列为整数类型
combined_df['year'] = combined_df['year'].astype(int)
gdp['year'] = gdp['year'].astype(int)
df = combined_df.merge(gdp, on='year', how='left')

In [34]:
df["lgdp"] = df["gdp"].apply(np.log)
df["lpopu"] = df["population"].apply(np.log)
df["lexp"] = df['EX_P'].apply(np.log)
df["limp"] = df['IM_P'].apply(np.log)
df["lexq"] = df['EX_Q'].apply(np.log)
df["limq"] = df['IM_Q'].apply(np.log)


In [35]:
df.to_csv(os.path.join(dta_dir, "combined.csv"), encoding='utf-8')

In [36]:
os.path.join(dta_dir, "combined.csv")

'./src/xx/combined.csv'

In [37]:
df

Unnamed: 0,time,code,IM_PQ,EX_PQ,IM_Q,EX_Q,IM_P,EX_P,year,gdp,population,lgdp,lpopu,lexp,limp,lexq,limq
0,01-2017,01,2.581732e+07,7423557.0,3.074607e+06,1431711.0,8.396949,2.414473,2017,8.155096e+13,7577110140,32.032249,22.748398,0.881481,2.127868,14.174381,14.938688
1,01-2018,01,4.649614e+07,10660629.0,6.565793e+06,1839541.0,7.081572,1.623662,2018,8.668687e+13,7661177849,32.093324,22.759432,0.484684,1.957496,14.425027,15.697384
2,01-2019,01,6.180629e+07,33634859.0,1.324855e+07,9658840.0,4.665137,2.538758,2019,8.794557e+13,7742724795,32.107739,22.770020,0.931675,1.540117,16.083384,16.399399
3,01-2020,01,5.984081e+07,7975401.0,8.928196e+06,875300.0,6.702452,0.893282,2020,8.557772e+13,7821271846,32.080446,22.780113,-0.112852,1.902473,13.682322,16.004725
4,01-2021,01,1.175894e+08,4399592.0,1.575820e+07,547633.0,7.462108,0.279194,2021,9.752703e+13,7888963821,32.211151,22.788731,-1.275849,2.009838,13.213361,16.572872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,12-2019,10,5.698233e+08,97690379.0,1.870392e+09,231614894.0,0.304654,0.052230,2019,8.794557e+13,7742724795,32.107739,22.770020,-2.952100,-1.188577,19.260587,21.349414
836,12-2020,10,1.516607e+09,69288249.0,5.392723e+09,113192217.0,0.281232,0.012848,2020,8.557772e+13,7821271846,32.080446,22.780113,-4.354530,-1.268575,18.544598,22.408316
837,12-2021,10,1.540750e+09,110000249.0,4.587456e+09,235822702.0,0.335861,0.023978,2021,9.752703e+13,7888963821,32.211151,22.788731,-3.730598,-1.091057,19.278591,22.246592
838,12-2022,10,1.191414e+09,93209822.0,2.961046e+09,160532758.0,0.402363,0.031479,2022,1.012251e+14,7951595433,32.248367,22.796638,-3.458445,-0.910402,18.894009,21.808808
