In [14]:
from datasets import load_dataset, Features, Value

# 定义预期的 schema
features = Features({
    "prompt": Value("string"),
    "response": Value("string"),
    "csv_path": Value("string"),
})

# 加载数据集并指定 schema
dataset = load_dataset("RUCKBReasoning/TableLLM-SFT", features=features)
ttl_size = 0
for split, split_data in dataset.items():
    print(f"Split: {split}, Size: {len(split_data)}")
    ttl_size += len(split_data)
print(ttl_size)

Split: fetaqa, Size: 10297
Split: table_op, Size: 11587
Split: spider, Size: 3374
Split: tatqa, Size: 8425
Split: wikisql, Size: 28699
Split: wtq, Size: 18216
80598


In [3]:
from datasets import load_dataset

# 加载数据集（以 "imdb" 数据集为例）
dataset = load_dataset("Multilingual-Multimodal-NLP/TableInstruct")
# 打印数据集的大小
for split, split_data in dataset.items():
    print(f"Split: {split}, Size: {len(split_data)}")


Generating train split: 100%|██████████| 19661/19661 [00:00<00:00, 83385.62 examples/s]

Split: train, Size: 19661





In [5]:
from datasets import load_dataset

# 加载数据集（以 "imdb" 数据集为例）
dataset = load_dataset("Multilingual-Multimodal-NLP/TableBench")
# 打印数据集的大小
for split, split_data in dataset.items():
    print(f"Split: {split}, Size: {len(split_data)}")


Generating test split: 100%|██████████| 3544/3544 [00:00<00:00, 92508.94 examples/s]

Split: test, Size: 3544





In [None]:
import re

def extract_placeholders(text):
    # 定义正则表达式模式
    patterns = {
        "table_description": r"### \[Table Description\]\s*(.*?)\s*### \[Table\]",
        "table_in_csv":  r"### \[Table\]\s*```(.*?)```",
        'csv_data': r"Header and first few lines of CSV file:\s*(.*?)\s*Question:",
        'csv_data1': r"Header and first few lines of CSV file 1:\s*(.*?)\s*Header and first few lines of CSV file 2:",
        'csv_data2': r"Header and first few lines of CSV file 2:\s*(.*?)\s*Question:",
        "question": [
            r"### \[Question\]\s*(.*?)\s*### \[Solution\]",
            r"Question:\s*(.*)"
        ]
    }

    extracted = {}

    # 遍历 patterns 并处理单个模式和列表模式
    for key, pattern in patterns.items():
        if isinstance(pattern, list):  # 如果是列表模式
            for sub_pattern in pattern:
                match = re.search(sub_pattern, text, re.DOTALL)
                if match:
                    extracted[key] = match.group(1).strip()
                    break  # 找到第一个匹配后停止
        else:  # 如果是单个模式
            match = re.search(pattern, text, re.DOTALL)
            if match:
                extracted[key] = match.group(1).strip()

    return extracted

# 示例字符串
input_file = "TableLLM-SFT/fetaqa.jsonl"
with open(input_file, 'r', encoding='utf-8') as infile:
    for line in infile:
        # Load each JSON line from the first dataset
        text = json.loads(line.strip())["prompt"]
        print(text)
        break
            
# 提取内容
extracted_data = extract_placeholders(text)

# 打印结果
for key, value in extracted_data.items():
    print(f"{key}:")
    print(value)
    print("-" * 40)


In [57]:
import json
import os
import csv

def convert_format(input_path, output_path):
    """
    Convert the format of the first dataset to match the format of the second dataset.
    Args:
        input_path (str): Path to the input JSONL file (first dataset).
        output_path (str): Path to the output JSONL file (converted dataset).
    """
    with open(input_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Load each JSON line from the first dataset
            original_example = json.loads(line.strip())
            example = extract_placeholders(original_example["prompt"])
            
            # Unified format
            if 'csv_data' in example:  # Single table case
                unified_example = {
                    "instruction": f"You are a data analyst proficient in Python. Below are the first few lines of a table. You need to write a Python program to solve the provided question. \n\nRead the table below in JSON format: \n[TABLE] \n{convert_csv_to_json(example['csv_data'])} \n\nLet's get start! \nQuestion: {example['question']}",
                    "response": original_example["response"],
                }
                outfile.write(json.dumps(unified_example, ensure_ascii=False) + '\n')
            elif 'csv_data1' in example and 'csv_data2' in example:  # Two table case
                pass
            elif 'table_in_csv' in example:  # Single table case
                unified_example = {
                    "instruction": f"You are a table analyst. Your task is to answer questions based on the table content. Offer a thorough and accurate solution that directly addresses the Question outlined in the [Question]. \n\nRead the table below in JSON format: \n[TABLE] \n{convert_csv_to_json(example['table_in_csv'])} \n\nLet's get start! \nQuestion: {example['question']}",
                    "response": original_example["response"],
                }
                outfile.write(json.dumps(unified_example, ensure_ascii=False) + '\n')
        if 'csv_data' in example:  # Single table case
            print(example["csv_data"],end="\n\n")
        elif 'table_in_csv' in example:  # Single table case
            print(example["table_in_csv"],end="\n\n")
        print(unified_example["instruction"])


def convert_csv_to_json(csv_text):
    # 初始化解析器，处理引号内的数据
    reader = csv.reader(csv_text.strip().split("\n"), skipinitialspace=True)
    
    # 提取列名（第一行）
    columns = next(reader)
    
    # 提取数据（后续行），并移除 "\r" 和额外空白
    data = []
    for row in reader:
        cleaned_row = [cell.strip().replace("\r", "") for cell in row]
        data.append(cleaned_row)
    
    # 构造输出格式
    converted = {
        "columns": columns,
        "data": data
    }
    return converted


if __name__ == "__main__":
    for file in os.listdir("TableLLM-SFT"):

        input_file = f"TableLLM-SFT/{file}"
        file_name=file.split(".")[0]
        if file_name!="TableInstruct_instructions" and "converted" not in file_name:
            output_file = f"TableLLM-SFT/{file_name}_converted.jsonl"
        
            # Convert the dataset
            convert_format(input_file, output_file)
            print(f"Dataset converted and saved to {output_file}")


movie_id,Title,Year,Director,Budget_million,Gross_worldwide
1,The Boondock Saints,1999,Troy Duffy,6.0,30471
2,The Big Kahuna,1999,John Swanbeck,7.0,3728888
3,Storm Catcher,1999,Anthony Hickox,5.0,40500
4,Jill Rips,2000,Anthony Hickox,4.0,456774
5,The Whole Nine Yards,2000,Jonathan Lynn,41.3,106371651
6,Battlefield Earth,2000,Roger Christian,44.0,29725663
7,Get Carter,2000,Stephen Kay,63.6,19412993
8,The Art of War,2000,Christian Duguay,60.0,40400425
9,Agent Red,2000,Damian Lee,47.0,543356
10,3000 Miles to Graceland,2001,Demian Lichtenstein,62.0,18720175

You are a table analyst. Your task is to answer questions based on the table content. Offer a thorough and accurate solution that directly addresses the Question outlined in the [Question]. 

Read the table below in JSON format: 
[TABLE] 
{'columns': ['movie_id', 'Title', 'Year', 'Director', 'Budget_million', 'Gross_worldwide'], 'data': [['1', 'The Boondock Saints', '1999', 'Troy Duffy', '6.0', '30471'], ['2', 'The Big Kahuna', '1999',

In [39]:
input_file = "TableLLM-SFT/TableInstruct_instructions.jsonl"
n=0
with open(input_file, 'r', encoding='utf-8') as infile:
    for line in infile:
        # Load each JSON line from the first dataset
        text = json.loads(line.strip())["instruction"]
        print(text)
        n+=1
        if n>3: 
            break

You are a data analyst proficient in Python. Your task is to write executable Python code to analyze the table and then answer questions.

[Guidelines]
You should act following requirements below:
1. based on the question, write out your analytical approach, and then write Python code according to this approach.
2. The code needs to be concise and easy to understand, and if necessary, add comments for clarification.
3. Code blocks need to strictly start with ```python and end with ```
4. Your analysis must be based entirely on the above data. If the user's question is not related to data analysis, please politely refuse.
5. You need to generate executable code. If there are results to be presented, please use the print function; if there are charts, please use the matplotlib library to draw them.
6. Ensure to load the table with command ```df = pd.read_csv('table.csv')```


The generated Python code should follow the format below, and ensure the first two code lines is exactly the same