# In [3]:
import os
import re
import csv

def _join_continuation(previous, text):
    """Append *text* to *previous* with a single separating space."""
    return f"{previous} {text}".strip()


def parse_pdb_description(file_path):
    """Collect descriptive header records from a PDB file.

    The file is scanned line by line and the following keys are filled in
    when the matching record is present:

    - ``file``: base name of ``file_path`` (always set).
    - ``header``: text of the HEADER record from the 11th character on.
    - ``title``: text of all TITLE lines, joined with single spaces.
    - ``molecule``: the ``MOLECULE: ...;`` value found on a COMPND line
      (the last match wins).
    - ``organism``: the ``ORGANISM_SCIENTIFIC: ...;`` value found on a
      SOURCE line (the last match wins).
    - ``keywords``: text of all KEYWDS lines, joined with single spaces.

    Args:
        file_path: Path of the PDB file to read.

    Returns:
        A dict with the keys above; keys appear in the order their records
        are first seen in the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    description = {'file': os.path.basename(file_path)}
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('HEADER'):
                description['header'] = line[10:].strip()
            elif line.startswith('TITLE '):
                # Continuation lines carry the next words of the title, so
                # they must be joined with a space or the words fuse.
                description['title'] = _join_continuation(
                    description.get('title', ''), line[10:].strip()
                )
            elif line.startswith('COMPND'):
                m = re.search(r'MOLECULE: (.+?);', line)
                if m:
                    description['molecule'] = m.group(1)
            elif line.startswith('SOURCE'):
                s = re.search(r'ORGANISM_SCIENTIFIC: (.+?);', line)
                if s:
                    description['organism'] = s.group(1)
            elif line.startswith('KEYWDS'):
                # Accumulate instead of overwriting so keywords listed on
                # continuation lines are not lost.
                description['keywords'] = _join_continuation(
                    description.get('keywords', ''), line[10:].strip()
                )
    return description

def get_image_path(file_path):
    """Return the relative PNG path that pairs with a PDB file.

    Only the base name of ``file_path`` is used, and the result is always
    prefixed with ``./``. A trailing ``.pdb`` extension is swapped for
    ``.png``; any other name is returned unchanged.

    Args:
        file_path: Path of the PDB file.

    Returns:
        A string of the form ``./<name>.png``.
    """
    base_name = os.path.basename(file_path)
    pdb_suffix = '.pdb'
    # Swap only the trailing extension: str.replace would also rewrite a
    # '.pdb' that happens to occur in the middle of the file name.
    if base_name.endswith(pdb_suffix):
        base_name = base_name[:-len(pdb_suffix)] + '.png'
    return f"./{base_name}"

def process_pdb_files(directory):
    """Build one description row for every ``.pdb`` file in a directory.

    Each row is the dict returned by ``parse_pdb_description`` with an
    extra ``image_path`` key taken from ``get_image_path``. Only the
    directory itself is scanned; subdirectories are not searched.

    Args:
        directory: Path of the directory to scan.

    Returns:
        A list of description dicts, ordered by file name.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
    """
    dataset = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # resulting rows reproducible from run to run.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith('.pdb'):
            continue
        file_path = os.path.join(directory, file_name)
        # A directory whose name ends in '.pdb' cannot be opened as a file.
        if not os.path.isfile(file_path):
            continue
        description = parse_pdb_description(file_path)
        description['image_path'] = get_image_path(file_path)
        dataset.append(description)
    return dataset

def save_to_csv(dataset, output_file):
    """Write a list of row dicts to a CSV file with a header line.

    The columns are the union of the keys of all rows, in the order each
    key is first seen. A row that lacks a column gets an empty cell there.
    An empty ``dataset`` still creates the file, with an empty header line.

    Args:
        dataset: List of dicts, one per CSV row.
        output_file: Path of the CSV file to create or overwrite.

    Raises:
        OSError: If the output file cannot be opened for writing.
    """
    # Rows may carry different keys; taking the header from the first row
    # alone makes DictWriter raise ValueError on any later extra key.
    fieldnames = list(dict.fromkeys(key for row in dataset for key in row))
    with open(output_file, 'w', newline='') as output_csv:
        dict_writer = csv.DictWriter(output_csv, fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(dataset)

# Directory scanned for .pdb input files, and the CSV file that is written.
pdb_directory = './pdb'
output_csv_file = './dataset.csv'

# Build one description row per PDB file, then write all rows to the CSV.
# This runs at module level, so it executes as soon as the cell/script runs.
pdb_dataset = process_pdb_files(pdb_directory)
save_to_csv(pdb_dataset, output_csv_file)
