In [None]:
import pandas as pd
from pathlib import Path

def split_parquet_files(data_dir='data', random_seed=42):
    """
    Randomly split every source parquet file under ``data_dir`` into two halves.

    For each ``<stem>.parquet`` found recursively, two sibling files are
    written next to it: ``<stem>_inference.parquet`` (a random 50% sample)
    and ``<stem>_evaluation.parquet`` (all remaining rows). The source file
    itself is not modified.

    Files whose stem already ends in ``_inference`` or ``_evaluation`` are
    treated as outputs of an earlier run and skipped, so calling this function
    repeatedly does not produce ``*_inference_inference.parquet`` and friends.

    Args:
        data_dir (str): Root directory searched recursively. Defaults to 'data'.
        random_seed (int): Seed passed to ``DataFrame.sample`` so the split is
            repeatable. Defaults to 42.
    """
    split_suffixes = ("_inference", "_evaluation")
    data_path = Path(data_dir)
    parquet_files = [
        candidate
        for candidate in data_path.rglob('*.parquet')
        if not candidate.stem.endswith(split_suffixes)
    ]

    for file_path in parquet_files:
        # Load the source table.
        df = pd.read_parquet(file_path)

        # Split by row position instead of by index label: with duplicate
        # labels, dropping ``split1.index`` would also remove rows that were
        # never sampled. The index is not written out (index=False below), so
        # swapping it for a positional one does not change the saved data.
        rows = df.reset_index(drop=True)
        split1 = rows.sample(frac=0.5, random_state=random_seed)
        split2 = rows.drop(split1.index)

        # Output files live next to the source file.
        parent_dir = file_path.parent
        stem = file_path.stem
        inf_path = parent_dir / f"{stem}_inference.parquet"
        eval_path = parent_dir / f"{stem}_evaluation.parquet"

        # Persist both halves and report what was done.
        split1.to_parquet(inf_path, index=False)
        split2.to_parquet(eval_path, index=False)
        print(f"处理完成：{file_path} -> {inf_path}, {eval_path}")
        print(f"划分数量：Total-{len(df)} infer-{len(split1)} eval-{len(split2)}")

# Entry point: split every parquet file under the default 'data' directory
# (default seed 42) when this code is executed directly rather than imported.
if __name__ == "__main__":
    split_parquet_files()

In [None]:
# Sanity check: load the unsplit CCEval Java test set, then show its row
# count followed by the first rows.
df_temp = pd.read_parquet("data/cceval/java/test.parquet")
print(len(df_temp), df_temp.head(), sep="\n")

In [None]:
# Sanity check: load the inference half of the CCEval Java test set, then show
# its row count followed by the first rows.
df_temp1 = pd.read_parquet("data/cceval/java/test_inference.parquet")
print(len(df_temp1), df_temp1.head(), sep="\n")

In [None]:
# Sanity check: load the evaluation half of the CCEval Java test set, then show
# its row count followed by the first rows.
df_temp2 = pd.read_parquet("data/cceval/java/test_evaluation.parquet")
print(len(df_temp2), df_temp2.head(), sep="\n")

In [1]:
import pandas as pd
import random
import os

class CodeBlock(object):
    def __init__(self, file_path, description, code_content, language, _type):
        """
        A piece of code plus a short textual description of it.
        :param file_path: The path to the code file.
        :param description: The description of the code block.
        :param code_content: The content of the code block.
        :param language: Language name; "python" makes ``__str__`` use "#" as
            the comment marker, any other value uses "//".
        :param _type: Caller-supplied tag, stored unchanged.
        """
        self.file_path = file_path
        self.description = description
        self.code_content = code_content
        self.language = language
        self._type = _type

    def __str__(self):
        """
        Render the description followed by the code, every non-empty line
        prefixed with the language's line-comment marker.
        """
        marker = "#" if self.language == "python" else "//"

        def as_comment(text):
            # Empty lines are dropped; whitespace-only lines are kept.
            return "\n".join(f"{marker} {line}" for line in text.split('\n') if line)

        description_part = as_comment(self.description.strip())
        code_part = as_comment(self.code_content)
        return (description_part + "\n" + code_part).strip()

class Example(object):
    def __init__(self, task_id, file_path, left_context, right_context, related_files, target_code, language):
        """
        One completion sample: the code to predict plus the context around it.
        :param task_id: Task ID.
        :param file_path: File path.
        :param left_context: The context to the left of the target code.
        :param right_context: The context to the right of the target code.
        :param related_files: A list of related files (only its length is used by ``__str__``).
        :param target_code: The target code snippet.
        :param language: Language label, stored unchanged.
        """
        self.task_id = task_id
        self.file_path = file_path
        self.language = language
        self.left_context = left_context
        self.target_code = target_code
        self.right_context = right_context
        self.related_files = related_files

    def __str__(self):
        """
        Multi-line, human-readable dump: one bracketed header per field followed
        by its value; related files are summarised by their count.
        """
        sections = (
            ("[Task ID]", self.task_id),
            ("[Path]", self.file_path),
            ("[Left Context]", self.left_context),
            ("[Target Code]", self.target_code),
            ("[Right Context]", self.right_context),
            ("[Related Files]", f"{len(self.related_files)} files"),
        )
        rendered = ["[Example]:"]
        for header, body in sections:
            rendered.append(f"{header}:\n{body}")
        return "\n".join(rendered) + "\n"
    
# def load_test_dataset(args, datasetname, language):
#     """
#     Loads a dataset.
#     :param args: Parameters containing various configurations.
#     :param datasetname: The name of the dataset to load.
#     :param language: The language of the data to load.
#     :return: The loaded dataset.
#     """
#     if datasetname == 'repoeval' and language != 'func_level':
#         data_frame1 = pd.read_parquet(f"data/{datasetname}/{language}/test_0.parquet")
#         data_frame2 = pd.read_parquet(f"data/{datasetname}/{language}/test_1.parquet")
#         data_frame = pd.concat([data_frame1, data_frame2], ignore_index=True)
#     else:
#         data_frame = pd.read_parquet(f"data/{datasetname}/{language}/test.parquet")

#     # data_frame = data_frame.loc[data_frame['task_id'] == 'project_cc_python/210']
    
#     if datasetname == 'repoeval':
#         language = 'python'

#     if args.debug:
#         data_frame = data_frame.sample(100)
#     dataset = []
#     for item in data_frame[["task_id", "path", "left_context", "right_context", "crossfile_context", "groundtruth"]].values:
#         cross_files = item[4] if len(item[4]) > 0 else [{'path': "", "text": "Don't need cross file context for completion"}]
#         cross_files = [CodeBlock(x["path"], f"file path: {x['path']}\nlines: {0}-{len(x['text'].splitlines())}", x["text"], language, '') for x in cross_files]
#         dataset.append(Example(item[0], item[1], item[2], item[3], cross_files, item[5], language))
    
#     return dataset

def load_test_dataset(args, datasetname, language):
    """
    Loads the inference split of a test dataset as a list of ``Example`` objects.

    For ``repoeval`` (any language other than ``func_level``) the two shards
    ``test_0_inference.parquet`` and ``test_1_inference.parquet`` are
    concatenated; every other combination reads ``test_inference.parquet``.
    All ``repoeval`` examples are labelled with language ``python``.

    :param args: Parameters containing various configurations; only
        ``args.debug`` is read. When truthy, at most 100 randomly chosen rows
        are kept.
    :param datasetname: The name of the dataset to load.
    :param language: The language of the data to load (used as a sub-directory).
    :return: The loaded dataset, a list of ``Example``.
    """
    base_dir = f"data/{datasetname}/{language}"
    if datasetname == 'repoeval' and language != 'func_level':
        shards = [pd.read_parquet(f"{base_dir}/test_{shard}_inference.parquet") for shard in range(2)]
        data_frame = pd.concat(shards, ignore_index=True)
    else:
        data_frame = pd.read_parquet(f"{base_dir}/test_inference.parquet")

    # Debug aid: uncomment to restrict the run to a single task.
    # data_frame = data_frame.loc[data_frame['task_id'] == 'project_cc_python/210']

    if datasetname == 'repoeval':
        language = 'python'

    if args.debug:
        # Cap at the population size: sample(100) raises ValueError when the
        # frame holds fewer than 100 rows.
        data_frame = data_frame.sample(min(100, len(data_frame)))

    columns = ["task_id", "path", "left_context", "right_context", "crossfile_context", "groundtruth"]
    dataset = []
    for task_id, path, left_context, right_context, crossfile_context, groundtruth in data_frame[columns].values:
        # A missing (None) or empty context is replaced by a single placeholder entry.
        if crossfile_context is not None and len(crossfile_context) > 0:
            cross_files = crossfile_context
        else:
            cross_files = [{'path': "", "text": "Don't need cross file context for completion"}]
        cross_files = [
            CodeBlock(x["path"], f"file path: {x['path']}\nlines: {0}-{len(x['text'].splitlines())}", x["text"], language, '')
            for x in cross_files
        ]
        dataset.append(Example(task_id, path, left_context, right_context, cross_files, groundtruth, language))

    return dataset

def load_train_and_valid_dataset():
    """
    Loads the raw training and validation groups for Python and Java.

    Each language's ``train.parquet`` is read as a sequence of
    (path, content, first) rows. A row with a truthy ``first`` starts a new
    group of files (presumably one repository - confirm against the data
    producer). Only groups with more than one file are kept. Per language,
    groups 0-1999 go to training and groups 2000-2199 to validation; both
    lists are shuffled with the global ``random`` state before returning.

    :return: ``(training_datasets, validation_datasets)``; each item is a
        ``(files, language)`` tuple where ``files`` is a list of
        ``[path, content]`` pairs.
    """
    training_datasets = []
    validation_datasets = []
    for language in ["python", "java"]:
        data_frame = pd.read_parquet(f"data/github_repos/{language}/train.parquet")
        all_data = []
        temp_data = []
        for path, content, first in data_frame[["path", "content", "first"]].values:
            if first:  # A new group starts here: flush the one collected so far
                if len(temp_data) > 1:
                    all_data.append((temp_data, language))
                temp_data = []
            temp_data.append([path, content])
        # The final group has no following `first` row to trigger the flush
        # above, so it must be flushed explicitly or it is silently lost.
        if len(temp_data) > 1:
            all_data.append((temp_data, language))
        training_datasets.extend(all_data[:2000])
        validation_datasets.extend(all_data[2000:2200])
    random.shuffle(training_datasets)
    random.shuffle(validation_datasets)

    return training_datasets, validation_datasets

def construct_dataset(raw_data, num_samples):
    """
    Builds completion examples by cutting a random span out of a file.

    ``raw_data`` is walked round-robin. For each ``(files, language)`` item one
    file is picked at random from ``files[1:]`` (the first file is never the
    target); every file with a different path becomes a related ``CodeBlock``.
    The picked file is split on single spaces and up to 10 random cut points
    are tried: the cut lies between 20% and 80% of the tokens and the target
    spans 32-64 tokens. A cut is accepted when the left context has more than
    80 whitespace-separated words and the target more than 8. The thresholds
    are the same for every language.

    :param raw_data: Sequence of ``(files, language)`` items, ``files`` being a
        list of ``[path, content]`` pairs.
    :param num_samples: The number of samples to generate.
    :return: The list of constructed samples (``task_id`` is the list position).
    :raises ValueError: If samples are requested from empty ``raw_data``, or if
        no item yields an acceptable cut for a long run of consecutive items
        (previously a ZeroDivisionError and an endless loop, respectively).
    """
    examples = []
    if num_samples > 0 and len(raw_data) == 0:
        raise ValueError("construct_dataset: raw_data is empty but num_samples > 0")

    # Give up after this many consecutive items without a single accepted cut
    # (100 full passes over raw_data, 10 attempts per item). Without a bound,
    # data whose files are all too short would spin forever.
    max_stalled_items = 100 * len(raw_data)
    stalled_items = 0

    data_index = 0
    while len(examples) < num_samples:
        example, language = raw_data[data_index % len(raw_data)]
        data_index += 1
        selected_file = random.choice(example[1:])
        related_files = [
            CodeBlock(x[0], f"file path: {x[0]}\nlines: {0}-{len(x[1].splitlines())}", x[1], language, '')
            for x in example if x[0] != selected_file[0]
        ]
        path = selected_file[0]
        selected_file_content = selected_file[1].split(" ")

        for _ in range(10):
            # Draw order (uniform, then randint) matches the original so a
            # seeded run produces the same cuts.
            end_line_number = int(len(selected_file_content) * random.uniform(0.2, 0.8))
            left_context = " ".join(selected_file_content[:end_line_number])
            target_length = random.randint(32, 64)
            target = " ".join(selected_file_content[end_line_number:end_line_number + target_length])
            right_context = " ".join(selected_file_content[end_line_number + target_length:])
            if len(left_context.split()) > 80 and len(target.split()) > 8:
                examples.append(
                    Example(len(examples), path, left_context, right_context, related_files, target, language)
                )
                stalled_items = 0
                break
        else:
            # All 10 attempts on this item failed; move on to the next item.
            stalled_items += 1
            if stalled_items >= max_stalled_items:
                raise ValueError(
                    "construct_dataset: no file in raw_data yields a valid example "
                    "(left context > 80 words and target > 8 words)"
                )

    return examples


def save_train_for_data_synthesis(examples, file_path="data/github_repos/data_for_synthesis/train.parquet"):
    """
    Writes a list of Example objects to a Parquet file, one row per example.

    Scalar attributes become plain columns; ``related_files`` becomes a column
    holding, per row, a list of dicts with each code block's attributes.
    :param examples: List of Example objects.
    :param file_path: Path to save the Parquet file.
    """
    example_fields = ('task_id', 'file_path', 'left_context', 'right_context', 'target_code', 'language')
    block_fields = ('file_path', 'description', 'code_content', 'language', '_type')

    # Column-wise layout: column name -> list of per-example values.
    columns = {
        field: [getattr(example, field) for example in examples]
        for field in example_fields
    }
    columns['related_files'] = [
        [{field: getattr(block, field) for field in block_fields} for block in example.related_files]
        for example in examples
    ]

    pd.DataFrame(columns).to_parquet(file_path)

def load_and_construct_train_for_data_synthesis():
    """
    Returns the training examples used for data synthesis, building them once.

    If ``data/github_repos/data_for_synthesis/train.parquet`` exists, it is
    read back and converted to ``Example`` / ``CodeBlock`` objects. Otherwise
    the examples are constructed from the Python and Java ``train.parquet``
    files (all groups with more than one file, one example per group
    requested), saved to that path, and returned.

    The cache check looks at the parquet file itself, not just its directory,
    so a directory left behind without ``train.parquet`` triggers a rebuild
    instead of a failed read.

    :return: List of ``Example`` objects (no validation split is held out).
    """
    cache_dir = "data/github_repos/data_for_synthesis"
    cache_file = f"{cache_dir}/train.parquet"

    if os.path.exists(cache_file):
        # Read the Parquet file into a DataFrame
        df = pd.read_parquet(cache_file)

        # Convert the DataFrame back to a list of Example objects
        training_datasets_examples = []
        for _, row in df.iterrows():
            related_files = [
                CodeBlock(
                    file_path=cb['file_path'],
                    description=cb['description'],
                    code_content=cb['code_content'],
                    language=cb['language'],
                    _type=cb['_type']
                )
                for cb in row['related_files']
            ]
            training_datasets_examples.append(
                Example(
                    task_id=row['task_id'],
                    file_path=row['file_path'],
                    left_context=row['left_context'],
                    right_context=row['right_context'],
                    related_files=related_files,
                    target_code=row['target_code'],
                    language=row['language']
                )
            )
        return training_datasets_examples

    # No cached file: load the raw training data and construct the examples.
    training_datasets = []
    for language in ["python", "java"]:
        data_frame = pd.read_parquet(f"data/github_repos/{language}/train.parquet")
        temp_data = []
        for path, content, first in data_frame[["path", "content", "first"]].values:
            if first:  # A new group starts here: flush the one collected so far
                if len(temp_data) > 1:
                    training_datasets.append((temp_data, language))
                temp_data = []
            temp_data.append([path, content])
        # The final group has no following `first` row to trigger the flush
        # above, so it must be flushed explicitly or it is silently lost.
        if len(temp_data) > 1:
            training_datasets.append((temp_data, language))

    training_datasets_examples = construct_dataset(training_datasets, len(training_datasets))
    # Make sure the target directory exists, then save to the default path
    # (the same file checked above).
    os.makedirs(cache_dir, exist_ok=True)
    save_train_for_data_synthesis(training_datasets_examples)
    return training_datasets_examples

# Runs at cell execution time: returns the synthesis training examples, either
# read back from the saved parquet file or freshly constructed and saved.
training_datasets_examples = load_and_construct_train_for_data_synthesis()

In [2]:
# Load the data_for_synthesis training set.
df = pd.read_parquet("data/github_repos/data_for_synthesis/train.parquet")

# Random 50/50 split: sample half of the rows with a fixed seed; the rows that
# were not sampled form the other half.
split1 = df.sample(frac=0.5, random_state=42)
split2 = df.drop(index=split1.index)

# Output locations for the two halves.
inf_path, eval_path = (
    "data/github_repos/data_for_synthesis/train_inference.parquet",
    "data/github_repos/data_for_synthesis/train_evaluation.parquet",
)

# Write both halves without the index column.
split1.to_parquet(inf_path, index=False)
split2.to_parquet(eval_path, index=False)

In [3]:
from pathlib import Path
import pyarrow.parquet as pq

def get_parquet_row_counts(folder_path):
    """
    Report the row count of every parquet file under ``folder_path``.

    The folder is searched recursively. Only each file's metadata is read, not
    its data. A file that cannot be read is reported on stdout and recorded
    with a count of ``None`` instead of aborting the scan.

    :param folder_path: Directory to scan (str or path-like).
    :return: List of ``(file path as str, row count or None)`` tuples, in the
        order the files were discovered.
    """
    folder = Path(folder_path)

    results = []
    for file in folder.glob('**/*.parquet'):  # recursive search
        try:
            # read_metadata parses only the file footer and does not leave a
            # reader open per file, unlike holding on to a ParquetFile object
            # (lingering handles can block later access to the file on Windows).
            row_count = pq.read_metadata(str(file)).num_rows
        except Exception as e:
            # Best effort: keep scanning and mark this file as unreadable.
            print(f"读取 {file} 失败: {str(e)}")
            row_count = None
        results.append((str(file), row_count))

    return results

# Example usage: collect the row count of every parquet file under ./data.
folder_path = "./data"
results = get_parquet_row_counts(folder_path)

# Print one line per file; the count is None for files that could not be read.
for file, count in results:
    print(f"文件: {file} \t 行数: {count}")

读取 data\repoeval\api_level\test_1.parquet 失败: [WinError 32] Failed to open local file 'data/repoeval/api_level/test_1.parquet'. Detail: [Windows error 32] 另一个程序正在使用此文件，进程无法访问。

文件: data\cceval\java\test.parquet 	 行数: 2139
文件: data\cceval\java\test_evaluation.parquet 	 行数: 1069
文件: data\cceval\java\test_inference.parquet 	 行数: 1070
文件: data\cceval\python\test.parquet 	 行数: 2665
文件: data\cceval\python\test_evaluation.parquet 	 行数: 1333
文件: data\cceval\python\test_inference.parquet 	 行数: 1332
文件: data\codereval\java\test.parquet 	 行数: 230
文件: data\codereval\java\test_evaluation.parquet 	 行数: 115
文件: data\codereval\java\test_inference.parquet 	 行数: 115
文件: data\codereval\python\test.parquet 	 行数: 230
文件: data\codereval\python\test_evaluation.parquet 	 行数: 115
文件: data\codereval\python\test_inference.parquet 	 行数: 115
文件: data\github_repos\data_for_synthesis\train_evaluation.parquet 	 行数: 3368
文件: data\github_repos\data_for_synthesis\train_inference.parquet 	 行数: 3368
文件: data\github_repos\