<a href="https://colab.research.google.com/github/RICQW/pdf-input-url/blob/main/clean_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install camelot-py
!pip install numpy
!pip install pandas

In [None]:
import camelot
import numpy as np
import pandas as pd

In [None]:
# Extract every table from every page of the PDF.
tables = camelot.read_pdf("https://cdn.jsdelivr.net/gh/RICQW/pdf-input-url/11.pdf", pages="all")

# Cell text that identifies a header row (compared after whitespace removal).
HEADER_MARKER = "序号"

# 1) Read and clean the cells.
#    `rows` holds one list of (cell_id, text) tuples per source row, so rows stay
#    aligned even when tables differ in width. `result` is the same data flattened.
result = []
rows = []
for table in tables:
    for values in table.df.itertuples(index=False, name=None):
        row_number = len(rows) + 1  # continuous numbering across all tables
        row = [
            (f"R{row_number}C{col}", value.replace(" ", "").replace("\n", ""))
            for col, value in enumerate(values, start=1)
        ]
        rows.append(row)
        result.extend(row)

# Widest row seen. Informational only: the grouping above does not depend on it.
num_cols = max((len(row) for row in rows), default=0)

# 2) Remove header rows that appear after the first row. When the row right
#    after such a header has an empty first cell, it is treated as the tail of
#    the row before the header: its text is appended column by column and the
#    tail row is removed as well.
i = 1  # row 0 is never examined, so a header in the first row is kept
while i < len(rows):
    if not any(text == HEADER_MARKER for _, text in rows[i]):
        i += 1
        continue

    prev_row = rows[i - 1]
    next_row = rows[i + 1] if i + 1 < len(rows) else None
    if next_row and next_row[0][1] == "":
        for j, (_, text) in enumerate(next_row):
            if j < len(prev_row):
                prev_row[j] = (prev_row[j][0], prev_row[j][1] + text)
            else:
                # The tail row is wider than its parent: keep the extra cell
                # instead of dropping it (ids are rebuilt in step 3).
                prev_row.append(next_row[j])
        rows.pop(i + 1)
    rows.pop(i)
    # Do not advance i: the row that followed has shifted into position i.

# 3) Renumber the cells so ids are contiguous after the removals.
final_result = [
    (f"R{row_number}C{col}", text)
    for row_number, row in enumerate(rows, start=1)
    for col, (_, text) in enumerate(row, start=1)
]

# 4) Show the final list of (cell_id, text) pairs.
print(final_result)


In [None]:
# Turn the flat [("R<row>C<col>", text), ...] list into (row, col, value) records.
records = []
for cell, value in final_result:
    # "R12C3" -> drop the leading "R", then split on "C" -> ("12", "3")
    row_str, col_str = cell[1:].split("C")
    records.append((int(row_str), int(col_str), value))

df = pd.DataFrame(records, columns=['row', 'col', 'value'])

# Pivot into a grid: one line per row number, one column per column number.
table = df.pivot(index='row', columns='col', values='value')

# Order rows and columns by their numeric labels.
table = table.sort_index().sort_index(axis=1)

# Save as CSV without the row index. The column labels are only the synthetic
# column numbers, so they are not written either, which matches the later
# writes of this same file.
table.to_csv("final_result.csv", index=False, header=False)

In [None]:
col_to_check = 2  # zero-based position of the third column


def _is_blank(value):
    """Return True when a cell holds NaN/None or an empty string."""
    return pd.isna(value) or value == ""


# A row whose third column is blank is treated as the continuation of the
# closest preceding row whose third column is filled (the "anchor"). Its
# non-blank cells are appended to the anchor's cells and then cleared.
# Tracking the anchor, rather than always using row i-1, lets several
# continuation rows in a row all merge into the same parent.
anchor = 0
for i in range(1, len(table)):
    if not _is_blank(table.iat[i, col_to_check]):
        anchor = i
        continue

    for c in range(table.shape[1]):
        curr_val = table.iat[i, c]
        if _is_blank(curr_val):
            continue
        upper_val = table.iat[anchor, c]
        # A NaN parent cell must contribute nothing, not the text "nan".
        prefix = "" if _is_blank(upper_val) else str(upper_val)
        table.iat[anchor, c] = prefix + str(curr_val)
        table.iat[i, c] = ""  # "cut": clear the cell that was moved up

# Normalise empty strings to NaN so fully emptied rows can be detected.
table.replace("", np.nan, inplace=True)

# Drop rows in which every cell is empty, then renumber the index.
table = table.dropna(how='all').reset_index(drop=True)

# Write the CSV.
table.to_csv("final_result.csv", index=False, header=False)


In [None]:
# ====== Fill blanks in the first column downwards ("merged cell" effect) ======
# Every blank cell in the first column takes the most recent non-blank value
# above it; blanks that come before any value become the empty string.
first_col = table.iloc[:, 0]
filled_col = first_col.copy()

carried = ""  # latest non-blank value seen so far
for pos, value in enumerate(first_col):
    if pd.isna(value) or value == "":
        # Blank cell: repeat the value carried down from above.
        filled_col.iat[pos] = carried
    else:
        # Non-blank cell: it starts a new group.
        carried = value

# Put the filled column back into the table.
table.iloc[:, 0] = filled_col

# Write the CSV.
table.to_csv("final_result.csv", index=False, header=False)