In [1]:
import json
import csv
from pathlib import Path
from tqdm import tqdm
import random
import copy

In [2]:
def load_json(json_path):
    """
    Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        json_path: Path of the JSON file to open (read-only).

    Returns:
        The parsed JSON value, or None when opening or parsing the file
        raised any exception (the path and the exception are printed).

    """
    try:
        with open(json_path, mode='r', encoding='UTF-8') as handle:
            parsed = json.load(handle)
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None.
        print("Error loading json file: {}".format(json_path), err, sep="\n")
        return None
    return parsed
    
def save_json(save_path, data):
    """
    Serialize ``data`` as JSON and write it to ``save_path``.

    The file is opened for writing with UTF-8 encoding (an existing file is
    overwritten); non-ASCII characters are written as-is and the output is
    indented by four spaces.

    Args:
        :param save_path: Path of the file to write
        :param data: The JSON-serializable data to be saved

    """
    with open(save_path, mode='w', encoding='UTF-8') as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)



class MAKE_NFT1000_DATASET():
    """Build the full NFT1000 index files from every project's caption dictionary.

    For each project named in ``dataset_dict`` the file
    ``<base_path>/<project>/caption/_caption_dict.json`` is read and its
    ``"caption_dict"`` entry (keyed by image file name) is merged into one of
    three buckets: ``training_dict``, ``validation_dict`` or ``test_dict``.
    Two JSON files are written to ``<out_path>/_index``:

    * ``NFT1000_img_caption_dict.json`` maps image paths to the stored caption values
    * ``NFT1000_img_txt_dict.json`` maps image paths to caption ``.txt`` paths

    Args:
        len_per_project: Stored on the instance; this class never reads it.
        dataset_dict: Mapping with the keys ``"training_list"``,
            ``"validation_list"`` and ``"test_list"``, each an iterable of
            project names.  It is embedded as-is under ``"project_name_list"``
            in the written files.
        base_path: Root folder (``str`` or ``Path``) holding one sub-folder per project.
        out_path: Folder (``str`` or ``Path``) under which ``_index`` is written.
    """

    # (key in dataset_dict, bucket key in the generated index), in processing order.
    _SPLIT_TO_DICT_KEY = (
        ("training_list", "training_dict"),
        ("validation_list", "validation_dict"),
        ("test_list", "test_dict"),
    )

    def __init__(self, len_per_project, dataset_dict, base_path, out_path) -> None:
        self.len_per_project = len_per_project
        self.dataset_dict = dataset_dict
        self.base_path = base_path
        self.out_path = out_path

    def make_dataset_index(self):
        """Write both index files: image -> caption first, then image -> txt path."""
        self.make_img_caption_dict()
        self.make_img_txt_dict()

    def make_img_txt_dict(self):
        """
        Build and save the image-path -> caption-text-file-path index.

        The txt path is derived from the image name by replacing '.png' with '.txt'.
        """
        self._build_and_save_index(
            lambda project_name, img_name, caption:
                f"NFT1000/{project_name}/caption/{img_name.replace('.png', '.txt')}",
            "NFT1000_img_txt_dict.json",
            "img_txt_dict",
        )

    def make_img_caption_dict(self):
        """Build and save the image-path -> caption index."""
        self._build_and_save_index(
            lambda project_name, img_name, caption: caption,
            "NFT1000_img_caption_dict.json",
            "img_caption_dict",
        )

    def _build_and_save_index(self, value_for, file_name, label):
        """
        Shared worker behind both ``make_*`` methods.

        Args:
            value_for: Callable ``(project_name, img_name, caption)`` returning
                the value stored for that image.
            file_name: Name of the JSON file written inside ``<out_path>/_index``.
            label: Name used in the "saved successfully" message.

        Raises:
            ValueError: If a project's caption dictionary cannot be loaded.
        """
        index = {
            "length": {},
            "project_name_list": self.dataset_dict,
            "training_dict": {},
            "validation_dict": {},
            "test_dict": {},
        }
        for list_key, dict_key in self._SPLIT_TO_DICT_KEY:
            bucket = index[dict_key]
            for project_name in tqdm(self.dataset_dict[list_key]):
                caption_dict = self._read_caption_dict(project_name)
                for img_name, caption in caption_dict.items():
                    img_path = f"NFT1000/{project_name}/img/{img_name}"
                    bucket[img_path] = value_for(project_name, img_name, caption)

        # Record how many entries each bucket ended up with.
        for _, dict_key in self._SPLIT_TO_DICT_KEY:
            index["length"][dict_key] = len(index[dict_key])

        print("\n##########  saving……  ##########\n")
        self.save_json(Path(self.out_path).joinpath("_index", file_name), index)
        print(f"\n##########  {label} is saved successfully!  ##########\n")

    def _read_caption_dict(self, project_name):
        """
        Load ``<base_path>/<project_name>/caption/_caption_dict.json`` and return
        its ``"caption_dict"`` entry.

        Raises:
            ValueError: If the file cannot be loaded or has no dict under
                ``"caption_dict"`` (instead of failing later on ``None``).
        """
        caption_path = Path(self.base_path).joinpath(project_name, "caption", "_caption_dict.json")
        loaded = self.load_json(caption_path)
        caption_dict = loaded.get("caption_dict") if isinstance(loaded, dict) else None
        if not isinstance(caption_dict, dict):
            raise ValueError(
                f"No usable 'caption_dict' for project '{project_name}' in {caption_path}"
            )
        return caption_dict

    def save_json(self, save_path, data):
        """
        Write ``data`` as indented UTF-8 JSON to the file ``save_path``.

        Args:
            :param save_path: Path of the file to write; missing parent folders are created
            :param data: The JSON-serializable data to be saved

        """
        save_path = Path(save_path)
        # The "_index" folder may not exist yet under a fresh out_path.
        save_path.parent.mkdir(parents=True, exist_ok=True)
        with open(save_path, 'w', encoding='UTF-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=4)

    def load_json(self, json_path):
        """
        Open a JSON file read-only and return its parsed content.

        Args:
            json_path: Path of the JSON file

        Returns:
            The parsed JSON value, or None if reading or parsing failed
            (the path and the exception are printed).

        """
        try:
            with open(json_path, 'r', encoding='UTF-8') as f:
                return json.load(f)
        except Exception as e:
            print("Error loading json file: {}".format(json_path))
            print(e)
            return None


class MAKE_NFT1000_mini_DATASET(MAKE_NFT1000_DATASET):
    """Build the "NFT1000 mini" index files from a random sample of every project.

    Works like the parent class, but at most ``len_per_project`` entries are
    drawn at random from each project's ``"caption_dict"``.  The same sample
    feeds both indexes, which are accumulated on the instance
    (``img_caption_dict`` and ``img_txt_dict``) and written by
    ``save_finally`` to ``<out_path>/_index``.
    """

    # Key of a split in dataset_dict -> bucket it fills in the two index dicts.
    _BUCKET_FOR_SPLIT = {
        "training_list": "training_dict",
        "validation_list": "validation_dict",
        "test_list": "test_dict",
    }

    def __init__(self, len_per_project, dataset_dict, base_path, out_path) -> None:
        super().__init__(len_per_project, dataset_dict, base_path, out_path)
        # Each index gets its own copy of dataset_dict, so later edits to
        # "project_name_list" can never leak into the caller's dataset_dict.
        self.img_caption_dict = self._new_index_skeleton()
        self.img_txt_dict = self._new_index_skeleton()

    def _new_index_skeleton(self):
        """Return an empty index dict carrying a private copy of ``dataset_dict``."""
        return {
            "length": {},
            "project_name_list": copy.deepcopy(self.dataset_dict),
            "training_dict": {},
            "validation_dict": {},
            "test_dict": {},
        }

    def make_dataset_index(self):
        """
        Sample every project of every split, fill both indexes and save them.

        Raises:
            ValueError: If a project's caption dictionary cannot be loaded.
        """
        # Start from empty buckets so that calling this twice on one instance
        # does not merge two different random samples.
        for index in (self.img_caption_dict, self.img_txt_dict):
            for bucket in self._BUCKET_FOR_SPLIT.values():
                index[bucket].clear()

        for dataset_item in ["training_list", "validation_list", "test_list"]:
            for project_name in tqdm(self.dataset_dict[dataset_item]):
                caption_dict = self._load_project_captions(project_name)
                # Randomly sample the project's entries.
                target_caption_dict = self.random_sample(caption_dict)

                self.make_img_caption_dict(project_name, target_caption_dict, dataset_item)
                self.make_img_txt_dict(project_name, target_caption_dict, dataset_item)
        self.save_finally()

    def _load_project_captions(self, project_name):
        """
        Return the ``"caption_dict"`` entry of
        ``<base_path>/<project_name>/caption/_caption_dict.json``.

        Raises:
            ValueError: If the file cannot be loaded or holds no dict under
                ``"caption_dict"`` (instead of failing later on ``None``).
        """
        project_path = self.base_path.joinpath(project_name, "caption", "_caption_dict.json")
        loaded = load_json(project_path)
        caption_dict = loaded.get("caption_dict") if isinstance(loaded, dict) else None
        if not isinstance(caption_dict, dict):
            raise ValueError(
                f"No usable 'caption_dict' for project '{project_name}' in {project_path}"
            )
        return caption_dict

    def make_img_txt_dict(self, project_name, caption_dict, dataset_item):
        """
        Add image-path -> caption-txt-path entries for one project.

        Args:
            project_name: Project the image names belong to.
            caption_dict: Mapping keyed by image file name; only the keys are used.
            dataset_item: One of "training_list", "validation_list", "test_list";
                any other value is ignored.
        """
        bucket = self._BUCKET_FOR_SPLIT.get(dataset_item)
        if bucket is None:
            return
        self.img_txt_dict[bucket].update({
            f"NFT1000/{project_name}/img/{img_name}":
                f"NFT1000/{project_name}/caption/{img_name.replace('.png', '.txt')}"
            for img_name in caption_dict
        })

    def make_img_caption_dict(self, project_name, target_caption_dict, dataset_item):
        """
        Add image-path -> caption entries for one project.

        Args:
            project_name: Project the image names belong to.
            target_caption_dict: Mapping of image file name to its stored caption value.
            dataset_item: One of "training_list", "validation_list", "test_list";
                any other value is ignored.
        """
        bucket = self._BUCKET_FOR_SPLIT.get(dataset_item)
        if bucket is None:
            return
        self.img_caption_dict[bucket].update({
            f"NFT1000/{project_name}/img/{img_name}": caption
            for img_name, caption in target_caption_dict.items()
        })

    def random_sample(self, _dict):
        """
        Return at most ``len_per_project`` randomly chosen items of ``_dict``.

        A dict that is already small enough is returned unchanged (same
        object); otherwise a new dict built from ``random.sample`` is returned.
        """
        if len(_dict) <= self.len_per_project:
            return _dict
        return dict(random.sample(list(_dict.items()), self.len_per_project))

    def save_finally(self):
        """Fill in the "length" section of both indexes and write them to ``<out_path>/_index``."""
        outputs = (
            (self.img_caption_dict, "NFT1000_mini_img_caption_dict.json", "img_caption_dict"),
            (self.img_txt_dict, "NFT1000_mini_img_txt_dict.json", "img_txt_dict"),
        )
        for index, file_name, label in outputs:
            for bucket in ("training_dict", "validation_dict", "test_dict"):
                index["length"][bucket] = len(index[bucket])

            print("\n##########  saving……  ##########\n")
            self.save_json(self.out_path.joinpath("_index", file_name), index)
            print(f"\n##########  {label} is saved successfully!  ##########\n")


class MAKE_NFT100_DATASET(MAKE_NFT1000_mini_DATASET):
    """Build the "NFT100 mini" index files from a random subset of projects.

    A fixed number of projects is drawn at random from each split of
    ``dataset_dict`` (see ``PROJECTS_PER_SPLIT``); each chosen project is then
    sampled down to at most ``len_per_project`` entries.  The chosen project
    names are recorded under ``"project_name_list"`` in both written files.
    """

    # How many projects are drawn from each split (80 + 5 + 15 = 100).
    PROJECTS_PER_SPLIT = {
        "training_list": 80,
        "validation_list": 5,
        "test_list": 15,
    }

    def __init__(self, len_per_project, dataset_dict, base_path, out_path) -> None:
        super().__init__(len_per_project, dataset_dict, base_path, out_path)
        # Make sure this index owns its "project_name_list": recording the
        # sampled project names below must never rewrite the caller's
        # dataset_dict (which is also what the sampling itself reads from).
        self.img_caption_dict["project_name_list"] = copy.deepcopy(self.dataset_dict)

    def make_dataset_index(self):
        """
        Sample projects per split, sample entries per project, then save.

        Raises:
            ValueError: If a chosen project's caption dictionary cannot be loaded.
        """
        for dataset_item, wanted in self.PROJECTS_PER_SPLIT.items():
            candidates = self.dataset_dict[dataset_item]
            if len(candidates) < wanted:
                # random.sample would raise here; fall back to using every project.
                print(f"Only {len(candidates)} projects in {dataset_item}; "
                      f"using all of them instead of {wanted}.")
            project_name_list = random.sample(candidates, min(wanted, len(candidates)))

            for project_name in tqdm(project_name_list):
                caption_dict = self._captions_for_project(project_name)
                # Randomly sample the project's entries.
                target_caption_dict = self.random_sample(caption_dict)

                self.make_img_caption_dict(project_name, target_caption_dict, dataset_item)
                self.make_img_txt_dict(project_name, target_caption_dict, dataset_item)

            # Record which projects were actually used for this split.
            self.img_caption_dict["project_name_list"][dataset_item] = project_name_list
            self.img_txt_dict["project_name_list"][dataset_item] = project_name_list
        self.save_finally()

    def _captions_for_project(self, project_name):
        """
        Return the ``"caption_dict"`` entry of
        ``<base_path>/<project_name>/caption/_caption_dict.json``.

        Raises:
            ValueError: If the file cannot be loaded or holds no dict under
                ``"caption_dict"`` (instead of failing later on ``None``).
        """
        project_path = self.base_path.joinpath(project_name, "caption", "_caption_dict.json")
        loaded = load_json(project_path)
        caption_dict = loaded.get("caption_dict") if isinstance(loaded, dict) else None
        if not isinstance(caption_dict, dict):
            raise ValueError(
                f"No usable 'caption_dict' for project '{project_name}' in {project_path}"
            )
        return caption_dict

    def save_finally(self):
        """Fill in the "length" section of both indexes and write them to ``<out_path>/_index``."""
        outputs = (
            (self.img_caption_dict, "NFT100_mini_img_caption_dict.json", "img_caption_dict"),
            (self.img_txt_dict, "NFT100_mini_img_txt_dict.json", "img_txt_dict"),
        )
        for index, file_name, label in outputs:
            for bucket in ("training_dict", "validation_dict", "test_dict"):
                index["length"][bucket] = len(index[bucket])

            print("\n##########  saving……  ##########\n")
            self.save_json(self.out_path.joinpath("_index", file_name), index)
            print(f"\n##########  {label} is saved successfully!  ##########\n")


In [None]:
# Print the total number of projects across the three splits of the dataset index.
# The path keeps its own name so `dataset_index` only ever refers to the loaded dict.
dataset_index_path = "/ShuXun_SSD/NFT1000/_index/dataset_index.json"
dataset_index = load_json(dataset_index_path)
print(sum(len(dataset_index[split]) for split in ("training_list", "validation_list", "test_list")))

In [None]:
# Keys of the three split buckets found in the generated index files.
task_list = [
    "training_dict",
    "validation_dict",
    "test_dict",
]

In [3]:


if __name__ == "__main__":

    # ---- Machine-specific settings: adjust when running on another machine ----

    # Root folder of the NFT1000 dataset (also used as the output root).
    source_path = Path("/ShuXun_SSD/NFT1000")
    # Seed for the random sampling done by the mini / NFT100 makers, so that
    # re-running this cell regenerates the same subsets.  Use None to seed
    # from system entropy and get a fresh draw on every run.
    RANDOM_SEED = 42

    # ---- End of machine-specific settings ----

    random.seed(RANDOM_SEED)

    dataset_dict = load_json(source_path.joinpath("_index", "dataset_index.json"))
    if dataset_dict is None:
        # load_json already printed the cause; stop before the makers fail obscurely.
        raise RuntimeError("Could not load the dataset index from {}".format(
            source_path.joinpath("_index", "dataset_index.json")))

    # (maker class, len_per_project), run in this order.  The full NFT1000
    # maker stores len_per_project but never reads it.
    build_plan = (
        (MAKE_NFT1000_DATASET, 10000),
        (MAKE_NFT1000_mini_DATASET, 1000),
        (MAKE_NFT100_DATASET, 500),
    )
    for maker_cls, len_per_project in build_plan:
        maker = maker_cls(len_per_project, dataset_dict, source_path, source_path)
        maker.make_dataset_index()


100%|██████████| 801/801 [00:45<00:00, 17.79it/s]
100%|██████████| 50/50 [00:02<00:00, 20.78it/s]
100%|██████████| 150/150 [00:07<00:00, 19.76it/s]



##########  saving……  ##########


##########  img_caption_dict is saved successfully!  ##########



100%|██████████| 801/801 [00:36<00:00, 21.76it/s]
100%|██████████| 50/50 [00:02<00:00, 23.84it/s]
100%|██████████| 150/150 [00:06<00:00, 23.95it/s]



##########  saving……  ##########


##########  img_txt_dict is saved successfully!  ##########



100%|██████████| 801/801 [00:33<00:00, 23.94it/s]
100%|██████████| 50/50 [00:01<00:00, 25.87it/s]
100%|██████████| 150/150 [00:05<00:00, 25.35it/s]



##########  saving……  ##########


##########  img_caption_dict is saved successfully!  ##########


##########  saving……  ##########


##########  img_txt_dict is saved successfully!  ##########



100%|██████████| 80/80 [00:03<00:00, 24.54it/s]
100%|██████████| 5/5 [00:00<00:00, 28.06it/s]
100%|██████████| 15/15 [00:00<00:00, 35.35it/s]



##########  saving……  ##########


##########  img_caption_dict is saved successfully!  ##########


##########  saving……  ##########


##########  img_txt_dict is saved successfully!  ##########



In [6]:
# Export each split of an image -> caption index file as a two-column CSV
# (absolute image path, caption) next to the index files.
NFT100_mini_img_caption_dict_path = "/ShuXun_SSD/NFT1000/_index/NFT100_mini_img_caption_dict.json"
NFT1000_img_caption_dict_path = "/ShuXun_SSD/NFT1000/_index/NFT1000_img_caption_dict.json"
NFT1000_mini_img_caption_dict_path = "/ShuXun_SSD/NFT1000/_index/NFT1000_mini_img_caption_dict.json"
# Prefix prepended to the relative image paths stored in the index; it must end with "/".
base_path = "/ShuXun_SSD/"

# path_list = [NFT100_mini_img_caption_dict_path, NFT1000_mini_img_caption_dict_path, NFT1000_img_caption_dict_path]
path_list = [NFT1000_img_caption_dict_path]
task_list = ["training_dict", "validation_dict", "test_dict"]
for path_item in path_list:
    dict_info = load_json(path_item)
    if dict_info is None:
        # load_json already printed the cause; fail with a clear message.
        raise RuntimeError(f"Could not load index file: {path_item}")
    # Name the CSVs after their source index so that exporting several index
    # files does not overwrite the same three CSV files.
    csv_stem = Path(path_item).stem
    for task_item in task_list:
        task_data = dict_info[task_item]
        csv_path = Path(base_path).joinpath("NFT1000", "_index", f"{csv_stem}_path_{task_item}.csv")
        with open(csv_path, "w", encoding="utf-8", newline='') as f:
            csv_writer = csv.writer(f)
            csv_writer.writerow(["filepath", "caption"])
            csv_writer.writerows(
                (base_path + img_path, caption) for img_path, caption in task_data.items()
            )