In [1]:
import pandas as pd

In [2]:
# NOTE(review): absolute, machine-specific path — this cell only runs where the file exists.
sdf_path = 'C:/Users/Enrique/Documents/New ProtoADME Models/CYP_1A2_substrate/raw_data/modified Zaretzki data set/SoM_dataset.sdf'

# Read the SDF as a list of lines (line endings kept).
# The citations contain Windows-1252 punctuation: byte 0x96 is an en dash in cp1252,
# whereas latin-1 decodes it to the invisible control character U+0096.
try:
    with open(sdf_path, 'r', encoding='cp1252') as file:
        sdf_content = file.readlines()
except UnicodeDecodeError:
    # cp1252 leaves a few byte values undefined; latin-1 accepts every byte,
    # so fall back to it rather than fail to load the data set.
    with open(sdf_path, 'r', encoding='latin-1') as file:
        sdf_content = file.readlines()

In [10]:
# SDF parser: one dictionary per record, built in a single pass over the lines
def parse_sdf_v2(sdf_lines):
    """Parse the lines of an SDF file into a list of record dictionaries.

    Parameters
    ----------
    sdf_lines : iterable of str
        The lines of the file, with or without line endings. A list is used
        as-is; any other iterable is materialised first because the parser
        looks back at earlier lines by position.

    Returns
    -------
    list of dict
        One dictionary per record, in file order. Each dictionary has:

        * ``'compound_name'`` - the stripped text of the line two lines above
          the line containing ``'V2000'`` (``''`` if there is no such line).
        * ``'properties'`` - only present when the record has at least one
          data item; maps each ``> <NAME>`` header to its value. A value that
          spans several lines is joined with ``'\\n'``; a blank line ends it.

        A record is closed by a ``$$$$`` line or by the end of the input.
        Records in which no ``'V2000'`` line was seen are skipped.
    """
    import re

    # Positional lookback needs random access; a list is reused without copying.
    lines = sdf_lines if isinstance(sdf_lines, list) else list(sdf_lines)
    header_pattern = re.compile(r'^>\s*<([^>]+)>')

    records = []
    record = {}
    properties = {}
    current_property_name = None
    value_started = False  # True once the first value line of the current data item was stored

    for position, line in enumerate(lines):
        text = line.strip()
        if text == '$$$$':  # End of a record
            if record:
                if properties:
                    record['properties'] = properties
                records.append(record)
            record = {}
            properties = {}
            current_property_name = None
        elif line.startswith('>'):  # Data item header
            prop_match = header_pattern.match(line)
            if prop_match:
                current_property_name = prop_match.group(1).strip()
                value_started = False
        elif current_property_name:  # Inside a data item value
            if not value_started:
                properties[current_property_name] = text
                value_started = True
                if not text:
                    current_property_name = None  # Empty value: the blank line is the whole item
            elif text:
                # Continuation line of a multi-line value
                properties[current_property_name] += '\n' + text
            else:
                current_property_name = None  # Blank line terminates the value
        elif 'V2000' in line:  # Counts line of the molecule block
            # Use this line's own position: searching for the text would return the
            # first equal line (counts lines repeat across molecules), and a negative
            # offset must not wrap around to the end of the list.
            # NOTE(review): two lines above the counts line is normally the second
            # header line, not the title line — confirm the intended offset.
            record['compound_name'] = lines[position - 2].strip() if position >= 2 else ''

    # Keep a final record that is not followed by a '$$$$' terminator
    if record:
        if properties:
            record['properties'] = properties
        records.append(record)

    return records

# Run the parser over the loaded lines; sdf_content is a list, so it can be indexed by position
sdf_parsed_v2 = parse_sdf_v2(sdf_lines=sdf_content)
sdf_parsed_v2[0:5]  # Preview the first 5 records to check the output structure

[{'compound_name': '3D',
  'properties': {'ID': '13_cis_retinoic_acid',
   'PRIMARY_SOM_1A2': '7',
   'PRIMARY_SOM_2A6': '7',
   'PRIMARY_SOM_2B6': '7',
   'PRIMARY_SOM_2C19': '7',
   'PRIMARY_SOM_2C8': '7',
   'PRIMARY_SOM_2C9': '7',
   'PRIMARY_SOM_2D6': '7',
   'PRIMARY_SOM_2E1': '7',
   'PRIMARY_SOM_3A4': '7',
   'Citation': 'Marill et al.,Biochem. Pharmacol.,63,933,2002'}},
 {'compound_name': '3D',
  'properties': {'ID': '1_2_4_trichlorobenzene',
   'PRIMARY_SOM_1A2': '3 4',
   'PRIMARY_SOM_2D6': '3',
   'PRIMARY_SOM_2E1': '3',
   'PRIMARY_SOM_3A4': '3',
   'SECONDARY_SOM_2E1': '4',
   'SECONDARY_SOM_3A4': '7',
   'Citation': 'Bogaards, J.J.P.; Omen, B.V.; Wolf, C.R.; Van Bladeren, P.J. Human Cytochrome P450 Enzyme Selectivities in the Oxidation of Chlorinated Benzenes. Toxicol. Appl. Pharamcol. 1995, 132, 44\x9652.'}},
 {'compound_name': '3D',
  'properties': {'ID': '2-acetylaminofluorene',
   'PRIMARY_SOM_1A2': '6',
   'SECONDARY_SOM_1A2': '1 15',
   'TERTIARY_SOM_1A2': '17',
  

In [11]:
# Build one row per record, using the 'ID' property as the compound name.
# Each record's properties are copied before 'ID' is removed, so sdf_parsed_v2 stays
# untouched: popping from the original dicts made a second run of this cell label
# every compound 'No ID provided'.
df_corrected_data = []
for record in sdf_parsed_v2:
    record_properties = dict(record.get('properties', {}))  # a record may have no 'properties' key
    compound_id = record_properties.pop('ID', 'No ID provided')
    df_corrected_data.append({
        'Compound Name': compound_id,
        'Properties': record_properties,
    })

# Two-column frame: the name plus the remaining properties as one dict per row
df_columns = ['Compound Name', 'Properties']
corrected_compound_properties_df = pd.DataFrame(df_corrected_data, columns=df_columns)

# Expand the 'Properties' dictionaries into one column per property name
expanded_df = pd.json_normalize(corrected_compound_properties_df['Properties'])

# Put the compound names next to the expanded properties and export as ';'-separated CSV
final_df = pd.concat([corrected_compound_properties_df['Compound Name'], expanded_df], axis=1)
final_df.to_csv('SoM_dataset.csv', sep=";", index=False)