In [1]:
import os
import json
import pandas as pd
import csv

In [2]:
data_count = dict()

def pcqc2csv(main_folder_path):

    data_list = []

    range_p = main_folder_path[:-1].replace("Compound_", '',)  # 폴더 경로에서 이름을 추출하고 range 들어갈 숫자를 추출합니다.
    rlist = range_p.split('_')
    rlist = [int(i.lstrip('0')) for i in rlist]

    for i in range(rlist[0], rlist[1]):    # 위에서 추출한 숫자를 사용합니다.  예) 121475001 121500000
        subfolder_name = f"{i:09d}"
        subfolder_path = os.path.join(main_folder_path, subfolder_name)

        json_file_name = f"{subfolder_name}.B3LYP@PM6.S0.json"
        json_file_path = os.path.join(subfolder_path, json_file_name)

        if os.path.exists(json_file_path):
            with open(json_file_path, 'r') as json_file:
                try:                                     # 말씀하신 Decode 에러 처리 부분입니다.
                    json_data = json.load(json_file)  # 해당 코드를 실행시
                except json.decoder.JSONDecodeError:  # except xxx: xxx에 해당하는 에러가 발생하면 밑에 코드(여기서는 print())를 실행하고 패스합니다.
                    print('Invalid Datatype')

                molecular_formula = json_data["pubchem"]["molecular formula"]
                molecular_weight = json_data["pubchem"]["molecular weight"]
                atom_count = json_data["pubchem"]["B3LYP@PM6"]["atoms"]["elements"]["atom count"]
                heavy_atom_count = json_data["pubchem"]["B3LYP@PM6"]["atoms"]["elements"]["heavy atom count"]
                total_dipole_moment = json_data["pubchem"]["B3LYP@PM6"]["properties"]["total dipole moment"]
                homo_alpha = json_data["pubchem"]["B3LYP@PM6"]["properties"]["energy"]["alpha"]["homo"]
                lumo_alpha = json_data["pubchem"]["B3LYP@PM6"]["properties"]["energy"]["alpha"]["lumo"]
                gap_alpha = json_data["pubchem"]["B3LYP@PM6"]["properties"]["energy"]["alpha"]["gap"]
                isomeric_smiles = json_data["pubchem"]["Isomeric SMILES"]

                # Append the extracted information as a dictionary to the data_list
                data_list.append({
                    "i": i,
                    "molecular_formula": molecular_formula,
                    "molecular_weight" : molecular_weight,
                    "atom_count": atom_count,
                    "heavy_atom_count": heavy_atom_count,
                    "total_dipole_moment": total_dipole_moment,
                    "homo": homo_alpha,
                    "lumo": lumo_alpha,
                    "gap": gap_alpha,
                    "Isomeric_SMILES": isomeric_smiles
                })

    # Create a pandas DataFrame from the data_list
    df = pd.DataFrame(data_list)

    # Set the "i" column as the index of the DataFrame
    df.set_index('i', inplace=True)

    df.to_csv(main_folder_path[:-1] + '.csv')

    data_count[main_folder_path[:-1]] = len(df)

In [3]:
file_list = os.listdir('.')
folder_list = [file + '/' for file in file_list if '.' not in file]
folder_list

['Compound_045250001_045275000/',
 'Compound_018075001_018100000/',
 'Compound_034700001_034725000/',
 'Compound_043650001_043675000/',
 'Compound_012450001_012475000/',
 'Compound_051925001_051950000/',
 'Compound_032225001_032250000/',
 'Compound_044100001_044125000/',
 'Compound_066000001_066025000/',
 'Compound_038775001_038800000/',
 'Compound_042400001_042425000/',
 'Compound_056850001_056875000/',
 'Compound_025275001_025300000/',
 'Compound_003875001_003900000/',
 'Compound_036200001_036225000/',
 'Compound_062650001_062675000/',
 'Compound_011150001_011175000/',
 'Compound_053275001_053300000/',
 'Compound_033575001_033600000/',
 'Compound_038450001_038475000/',
 'Compound_041675001_041700000/',
 'Compound_039725001_039750000/',
 'Compound_055100001_055125000/',
 'Compound_014250001_014275000/',
 'Compound_041225001_041250000/',
 'Compound_048175001_048200000/',
 'Compound_061500001_061525000/',
 'Compound_015375001_015400000/',
 'Compound_008300001_008325000/',
 'Compound_007

In [4]:
for i in folder_list:  # 처리한 에러 메세지 "Invalid Datatype" 가 한번만 뜬걸보니 공백 json은 1개 밖에 없었나 봅니다!
    pcqc2csv(i)

In [5]:
data_count

{'Compound_045250001_045275000': 1597,
 'Compound_018075001_018100000': 2422,
 'Compound_034700001_034725000': 148,
 'Compound_043650001_043675000': 18550,
 'Compound_012450001_012475000': 4554,
 'Compound_051925001_051950000': 1942,
 'Compound_032225001_032250000': 119,
 'Compound_044100001_044125000': 1549,
 'Compound_066000001_066025000': 22019,
 'Compound_038775001_038800000': 91,
 'Compound_042400001_042425000': 220,
 'Compound_056850001_056875000': 2998,
 'Compound_025275001_025300000': 465,
 'Compound_003875001_003900000': 1635,
 'Compound_036200001_036225000': 73,
 'Compound_062650001_062675000': 21574,
 'Compound_011150001_011175000': 5756,
 'Compound_053275001_053300000': 2620,
 'Compound_033575001_033600000': 65,
 'Compound_038450001_038475000': 202,
 'Compound_041675001_041700000': 159,
 'Compound_039725001_039750000': 1537,
 'Compound_055100001_055125000': 22570,
 'Compound_014250001_014275000': 2541,
 'Compound_041225001_041250000': 198,
 'Compound_048175001_048200000': 2

In [6]:
sum(data_count.values())

221137