# Buchwald-Hartwig Reaction Yield Dataset

Original data repository: https://github.com/doylelab/rxnpredict

Data from: https://github.com/reymond-group/drfp/tree/main/data

# Imports

In [21]:
import pandas as pd
import yaml
from rdkit import Chem # 2022.9.5
from rdkit.Chem import rdChemReactions

# Data processing

## Download data

In [7]:
fn_data_original = "Dreher_and_Doyle_input_data.csv"

In [8]:
data = pd.read_excel('https://github.com/reymond-group/drfp/raw/main/data/Dreher_and_Doyle_input_data.xlsx')

In [9]:
data.to_csv(fn_data_original, index=False)

In [10]:
!ls -lh

total 1344
-rw-r--r--  1 pschwllr  staff   618K Mar  8 22:34 Dreher_and_Doyle_input_data.csv
-rw-r--r--  1 pschwllr  staff    38K Mar  8 22:13 example_processing_and_templates.ipynb
-rw-r--r--  1 pschwllr  staff   1.6K Mar  8 22:13 meta.yaml
-rw-r--r--  1 pschwllr  staff   4.5K Mar  8 22:13 transform.py


## Load original data

In [11]:
!head -n 5 {fn_data_original}

Ligand,Additive,Base,Aryl halide,Output
CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[C@@H]4C5)C[C@H](C4)C[C@H]5C3)[C@]6(C7)C[C@@H](C[C@@H]7C8)C[C@@H]8C6)C(OC)=CC=C2OC,CC1=CC(C)=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,ClC1=NC=CC=C1,70.41045785
CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[C@@H]4C5)C[C@H](C4)C[C@H]5C3)[C@]6(C7)C[C@@H](C[C@@H]7C8)C[C@@H]8C6)C(OC)=CC=C2OC,O=C(OC)C1=CC=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,BrC1=NC=CC=C1,11.06445724
CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C3CCCCC3)C4CCCCC4)C=CC=C2,O=C(OC)C1=CC=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,IC1=CC=C(CC)C=C1,10.22354965
CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C(C)(C)C)C(C)(C)C)C(OC)=CC=C2OC,CCOC(C1=CON=C1)=O,CN1CCCN2C1=NCCC2,ClC1=CC=C(C(F)(F)F)C=C1,20.0833829


In [50]:
df = pd.read_csv(fn_data_original, delimiter=",")

In [51]:
df.head()

Unnamed: 0,Ligand,Additive,Base,Aryl halide,Output
0,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[...,CC1=CC(C)=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,ClC1=NC=CC=C1,70.410458
1,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[...,O=C(OC)C1=CC=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,BrC1=NC=CC=C1,11.064457
2,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C3CCCCC3)...,O=C(OC)C1=CC=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,IC1=CC=C(CC)C=C1,10.22355
3,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C(C)(C)C)...,CCOC(C1=CON=C1)=O,CN1CCCN2C1=NCCC2,ClC1=CC=C(C(F)(F)F)C=C1,20.083383
4,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[...,CC1=CC(C)=NO1,CN1CCCN2C1=NCCC2,ClC1=CC=C(OC)C=C1,0.492663


## Add column = field names
Clean column names (`fields_clean`) and keep original names (`fields_orig`)

In [52]:
fields_orig = df.columns.tolist()
fields_orig

['Ligand', 'Additive', 'Base', 'Aryl halide', 'Output']

In [53]:
assert fields_orig == ['Ligand', 'Additive', 'Base', 'Aryl halide', 'Output']

In [54]:
fields_clean = [
    "ligand",
    "additive",
    "base",
    "aryl_halide",
    'yield'
]

In [55]:
df.columns = fields_clean

In [56]:
assert fields_orig != fields_clean

In [57]:
df.head()

Unnamed: 0,ligand,additive,base,aryl_halide,yield
0,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[...,CC1=CC(C)=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,ClC1=NC=CC=C1,70.410458
1,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[...,O=C(OC)C1=CC=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,BrC1=NC=CC=C1,11.064457
2,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C3CCCCC3)...,O=C(OC)C1=CC=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,IC1=CC=C(CC)C=C1,10.22355
3,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C(C)(C)C)...,CCOC(C1=CON=C1)=O,CN1CCCN2C1=NCCC2,ClC1=CC=C(C(F)(F)F)C=C1,20.083383
4,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[...,CC1=CC(C)=NO1,CN1CCCN2C1=NCCC2,ClC1=CC=C(OC)C=C1,0.492663


## Data cleaning

In [60]:
def generate_buchwald_hartwig_rxns(df):
    """
    Converts the entries in the excel files to reaction SMILES.
    From: https://github.com/reymond-group/drfp/blob/main/scripts/encoding/encode_buchwald_hartwig_reactions.py
    and https://github.com/rxn4chemistry/rxn_yields/blob/master/rxn_yields/data.py
    """
    df = df.copy()
    fwd_template = "[F,Cl,Br,I]-[c;H0;D3;+0:1](:[c,n:2]):[c,n:3].[NH2;D1;+0:4]-[c:5]>>[c,n:2]:[c;H0;D3;+0:1](:[c,n:3])-[NH;D2;+0:4]-[c:5]"
    methylaniline = "Cc1ccc(N)cc1"
    pd_catalyst = "O=S(=O)(O[Pd]1~[NH2]C2C=CC=CC=2C2C=CC=CC1=2)C(F)(F)F"
    methylaniline_mol = Chem.MolFromSmiles(methylaniline)
    rxn = rdChemReactions.ReactionFromSmarts(fwd_template)
    products = []

    for i, row in df.iterrows():
        reacts = (Chem.MolFromSmiles(row["aryl_halide"]), methylaniline_mol)
        rxn_products = rxn.RunReactants(reacts)

        rxn_products_smiles = set([Chem.MolToSmiles(mol[0]) for mol in rxn_products])
        assert len(rxn_products_smiles) == 1
        products.append(list(rxn_products_smiles)[0])

    df["product"] = products
    rxns = []

    for i, row in df.iterrows():
        reactants = Chem.MolToSmiles(
            Chem.MolFromSmiles(
                f"{row['aryl_halide']}.{methylaniline}.{pd_catalyst}.{row['ligand']}.{row['base']}.{row['additive']}"
            )
        )
        rxns.append(f"{reactants.replace('N~', '[NH2]')}>>{row['product']}")

    return rxns

reaction_SMILES = generate_buchwald_hartwig_rxns(df)
df.insert(4, 'reaction_SMILES', reaction_SMILES)

In [62]:
assert not df.duplicated().sum()

## Save to csv

In [63]:
fn_data_csv = "data_clean.csv"

In [65]:
df.to_csv(fn_data_csv, index=False)

In [66]:
!ls -lh {fn_data_csv}

-rw-r--r--  1 pschwllr  staff   1.4M Mar  8 23:00 data_clean.csv


In [67]:
!head -n 5 {fn_data_csv}

ligand,additive,base,aryl_halide,reaction_SMILES,yield
CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[C@@H]4C5)C[C@H](C4)C[C@H]5C3)[C@]6(C7)C[C@@H](C[C@@H]7C8)C[C@@H]8C6)C(OC)=CC=C2OC,CC1=CC(C)=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,ClC1=NC=CC=C1,CCN=P(N=P(N(C)C)(N(C)C)N(C)C)(N(C)C)N(C)C.COc1ccc(OC)c(P([C@]23C[C@H]4C[C@H](C[C@H](C4)C2)C3)[C@]23C[C@H]4C[C@H](C[C@H](C4)C2)C3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C.Cc1cc(C)on1.Cc1ccc(N)cc1.Clc1ccccn1.O=S(=O)(O[Pd]1c2ccccc2-c2ccccc2[NH2]1)C(F)(F)F>>Cc1ccc(Nc2ccccn2)cc1,70.41045785
CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[C@@H]4C5)C[C@H](C4)C[C@H]5C3)[C@]6(C7)C[C@@H](C[C@@H]7C8)C[C@@H]8C6)C(OC)=CC=C2OC,O=C(OC)C1=CC=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,BrC1=NC=CC=C1,Brc1ccccn1.CCN=P(N=P(N(C)C)(N(C)C)N(C)C)(N(C)C)N(C)C.COC(=O)c1ccno1.COc1ccc(OC)c(P([C@]23C[C@H]4C[C@H](C[C@H](C4)C2)C3)[C@]23C[C@H]4C[C@H](C[C@H](C4)C2)C3)c1-c1c(C(C)C)cc(C(C)C)cc1C(C)C.Cc1ccc(N)cc1.O=S(=O)(O[Pd]1c2ccccc2-c2ccccc2[NH2]1)C(F)(F)F>>Cc1ccc(Nc2ccccn2)cc1,1

In [68]:
df.head()

Unnamed: 0,ligand,additive,base,aryl_halide,reaction_SMILES,yield
0,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[...,CC1=CC(C)=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,ClC1=NC=CC=C1,CCN=P(N=P(N(C)C)(N(C)C)N(C)C)(N(C)C)N(C)C.COc1...,70.410458
1,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[...,O=C(OC)C1=CC=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,BrC1=NC=CC=C1,Brc1ccccn1.CCN=P(N=P(N(C)C)(N(C)C)N(C)C)(N(C)C...,11.064457
2,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C3CCCCC3)...,O=C(OC)C1=CC=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,IC1=CC=C(CC)C=C1,CC(C)c1cc(C(C)C)c(-c2ccccc2P(C2CCCCC2)C2CCCCC2...,10.22355
3,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C(C)(C)C)...,CCOC(C1=CON=C1)=O,CN1CCCN2C1=NCCC2,ClC1=CC=C(C(F)(F)F)C=C1,CCOC(=O)c1cnoc1.CN1CCCN2CCCN=C12.COc1ccc(OC)c(...,20.083383
4,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[...,CC1=CC(C)=NO1,CN1CCCN2C1=NCCC2,ClC1=CC=C(OC)C=C1,CN1CCCN2CCCN=C12.COc1ccc(Cl)cc1.COc1ccc(OC)c(P...,0.492663


## Load from csv

In [69]:
fn_data_csv = "data_clean.csv"

In [70]:
df = pd.read_csv(fn_data_csv)

In [71]:
df.head()

Unnamed: 0,ligand,additive,base,aryl_halide,reaction_SMILES,yield
0,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[...,CC1=CC(C)=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,ClC1=NC=CC=C1,CCN=P(N=P(N(C)C)(N(C)C)N(C)C)(N(C)C)N(C)C.COc1...,70.410458
1,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[...,O=C(OC)C1=CC=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,BrC1=NC=CC=C1,Brc1ccccn1.CCN=P(N=P(N(C)C)(N(C)C)N(C)C)(N(C)C...,11.064457
2,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C3CCCCC3)...,O=C(OC)C1=CC=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,IC1=CC=C(CC)C=C1,CC(C)c1cc(C(C)C)c(-c2ccccc2P(C2CCCCC2)C2CCCCC2...,10.22355
3,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C(C)(C)C)...,CCOC(C1=CON=C1)=O,CN1CCCN2C1=NCCC2,ClC1=CC=C(C(F)(F)F)C=C1,CCOC(=O)c1cnoc1.CN1CCCN2CCCN=C12.COc1ccc(OC)c(...,20.083383
4,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[...,CC1=CC(C)=NO1,CN1CCCN2C1=NCCC2,ClC1=CC=C(OC)C=C1,CN1CCCN2CCCN=C12.COc1ccc(Cl)cc1.COc1ccc(OC)c(P...,0.492663


# meta YAML

In [72]:
df.head()

Unnamed: 0,ligand,additive,base,aryl_halide,reaction_SMILES,yield
0,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[...,CC1=CC(C)=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,ClC1=NC=CC=C1,CCN=P(N=P(N(C)C)(N(C)C)N(C)C)(N(C)C)N(C)C.COc1...,70.410458
1,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[...,O=C(OC)C1=CC=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,BrC1=NC=CC=C1,Brc1ccccn1.CCN=P(N=P(N(C)C)(N(C)C)N(C)C)(N(C)C...,11.064457
2,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C3CCCCC3)...,O=C(OC)C1=CC=NO1,CN(C)P(N(C)C)(N(C)C)=NP(N(C)C)(N(C)C)=NCC,IC1=CC=C(CC)C=C1,CC(C)c1cc(C(C)C)c(-c2ccccc2P(C2CCCCC2)C2CCCCC2...,10.22355
3,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P(C(C)(C)C)...,CCOC(C1=CON=C1)=O,CN1CCCN2C1=NCCC2,ClC1=CC=C(C(F)(F)F)C=C1,CCOC(=O)c1cnoc1.CN1CCCN2CCCN=C12.COc1ccc(OC)c(...,20.083383
4,CC(C)C(C=C(C(C)C)C=C1C(C)C)=C1C2=C(P([C@@]3(C[...,CC1=CC(C)=NO1,CN1CCCN2C1=NCCC2,ClC1=CC=C(OC)C=C1,CN1CCCN2CCCN=C12.COc1ccc(Cl)cc1.COc1ccc(OC)c(P...,0.492663


In [73]:
meta = {
    "name": "buchwald_hartwig_doyle",  # unique identifier, we will also use this for directory names
    "description": """High-throughput experimentation palladium-catalyzed Buchwald Hardwig data set with yields.""",
    "targets": [
        {
            "id": "yield",  # name of the column in a tabular dataset
            "description": "Reaction yields analyzed by UPLC",  # description of what this column means
            "units": "%",  # units of the values in this column (leave empty if unitless)
            "type": "continuous",  # can be "categorical", "ordinal", "continuous"
            "names": [  # names for the property (to sample from for building the prompts)
                "Reaction yield",
                "yield",
            ],
        },
    ],
    "identifiers": [
        {
            "id": "reaction_SMILES",  # column name
            "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
            "description": "SMILES",  # description (optional, except for "Other")
        },
    ],
    "license": "MIT license",  # license under which the original dataset was published
    "links": [  # list of relevant links (original dataset, other uses, etc.)
        {
            "url": "https://doi.org/10.1126/science.aar5169",
            "description": "corresponding publication",
        },
        {
            "url": "https://www.sciencedirect.com/science/article/pii/S2451929420300851",
            "description": "publication with data processing",
        },
        {
            "url": "https://github.com/rxn4chemistry/rxn_yields/blob/master/rxn_yields/data.py",
            "description": "preprocessing",
        },
        {
            "url": "https://github.com/reymond-group/drfp/tree/main/data",
            "description": "dataset",
        }
    ],
    "num_points": len(df),  # number of datapoints in this dataset
    "url": "https://tdcommons.ai/single_pred_tasks/tox/#acute-toxicity-ld50",
    "bibtex": [
        """@article{ahneman2018predicting,
  title={Predicting reaction performance in C--N cross-coupling using machine learning},
  author={Ahneman, Derek T and Estrada, Jes{\'u}s G and Lin, Shishi and Dreher, Spencer D and Doyle, Abigail G},
  journal={Science},
  volume={360},
  number={6385},
  pages={186--190},
  year={2018},
  publisher={American Association for the Advancement of Science}
}
}""",
    ],
}

In [74]:
def str_presenter(dumper, data):
    """configures yaml for dumping multiline strings
    Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
    """
    if data.count("\n") > 0:  # check for multiline string
        return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
    return dumper.represent_scalar("tag:yaml.org,2002:str", data)


yaml.add_representer(str, str_presenter)
yaml.representer.SafeRepresenter.add_representer(
    str, str_presenter
)  # to use with safe_dum

In [75]:
fn_meta = "meta.yaml"

In [76]:
with open(fn_meta, "w") as f:
    yaml.dump(meta, f, sort_keys=False)

In [77]:
!ls -lh {fn_meta}

-rw-r--r--  1 pschwllr  staff   975B Mar  8 23:36 meta.yaml


In [78]:
!cat {fn_meta}

name: buchwald_hartwig_doyle
description: High-throughput experimentation palladium-catalyzed Buchwald Hardwig
  data set with yields.
targets:
- id: yield
  description: Reaction yields analyzed by UPLC
  units: '%'
  type: continuous
  names:
  - Reaction yield
  - yield
identifiers:
- id: reaction_SMILES
  type: SMILES
  description: SMILES
license: MIT license
links:
- url: https://doi.org/10.1126/science.aar5169
  description: corresponding publication
num_points: 3955
url: https://tdcommons.ai/single_pred_tasks/tox/#acute-toxicity-ld50
bibtex:
- |-
  @article{ahneman2018predicting,
    title={Predicting reaction performance in C--N cross-coupling using machine learning},
    author={Ahneman, Derek T and Estrada, Jes{'u}s G and Lin, Shishi and Dreher, Spencer D and Doyle, Abigail G},
    journal={Science},
    volume={360},
    number={6385},
    pages={186--190},
    year={2018},
    publisher={American Association for the Advancement of Science}
  }
  }


# create transform.py

In [32]:
path_file = "transform.py"

In [33]:
%%writefile $path_file
import pandas as pd
import yaml
from tdc.single_pred import Tox


def get_and_transform_data():
    # get raw data
    data = Tox(name = 'LD50_Zhu')
    fn_data_original = "data_original.csv"
    data.get_data().to_csv(fn_data_original, index=False)

    # create dataframe
    df = pd.read_csv(
        fn_data_original,
        delimiter=",",
    )  # not necessary but ensure we can load the saved data

    # check if fields are the same
    fields_orig = df.columns.tolist()
    assert fields_orig == [
        "Drug_ID",
        "Drug",
        "Y",
    ]

    # overwrite column names = fields
    fields_clean = [
        "compound_name",
        "SMILES",
        "acute_toxicity",
    ]
    df.columns = fields_clean

    # data cleaning
    df.compound_name = (
        df.compound_name.str.strip()
    )  # remove leading and trailing white space characters

    assert not df.duplicated().sum()

    # save to csv
    fn_data_csv = "data_clean.csv"
    df.to_csv(fn_data_csv, index=False)

    # create meta yaml
    meta = {
            "name": "ld50_zhu",  # unique identifier, we will also use this for directory names
            "description": """Acute toxicity LD50 measures the most conservative dose that can lead to lethal adverse effects. The higher the dose, the more lethal of a drug.""",
            "targets": [
                {
                    "id": "acute_toxicity",  # name of the column in a tabular dataset
                    "description": "Acute Toxicity LD50.",  # description of what this column means
                    "units": "ld50",  # units of the values in this column (leave empty if unitless)
                    "type": "continuous",  # can be "categorical", "ordinal", "continuous"
                    "names": [  # names for the property (to sample from for building the prompts)
                        "Acute Toxicity LD50",
                        "ld50",
                        "conservative dose that can lead to lethal adverse effects.",
                        "Rat Acute Toxicity by Oral Exposure",
                        "Toxicity",
                    ],
                },
            ],
            "identifiers": [
                {
                    "id": "SMILES",  # column name
                    "type": "SMILES",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
                    "description": "SMILES",  # description (optional, except for "Other")
                },
                {
                    "id": "compound_name",
                    "type": "Synonyms",
                    "description": "compound name",
                    "names": [
                        "compound",
                        "compound name",
                        "drug",
                    ],
                },
            ],
            "license": "CC BY 4.0",  # license under which the original dataset was published
            "links": [  # list of relevant links (original dataset, other uses, etc.)
                {
                    "url": "https://doi.org/10.1021/tx900189p",
                    "description": "corresponding publication",
                },
            ],
            "num_points": len(df),  # number of datapoints in this dataset
            "url": "https://tdcommons.ai/single_pred_tasks/tox/#acute-toxicity-ld50",
            "bibtex": [
                """@article{Zhu2009,
              doi = {10.1021/tx900189p},
              url = {https://doi.org/10.1021/tx900189p},
              year = {2009},
              month = oct,
              publisher = {American Chemical Society ({ACS})},
              volume = {22},
              number = {12},
              pages = {1913--1921},
              author = {Hao Zhu and Todd M. Martin and Lin Ye and Alexander Sedykh and Douglas M. Young and Alexander Tropsha},
              title = {Quantitative Structure-Activity Relationship Modeling of Rat Acute Toxicity by Oral Exposure},
              journal = {Chemical Research in Toxicology}}""",
            ],
        }

    def str_presenter(dumper, data):
        """configures yaml for dumping multiline strings
        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
        """
        if data.count("\n") > 0:  # check for multiline string
            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
        return dumper.represent_scalar("tag:yaml.org,2002:str", data)

    yaml.add_representer(str, str_presenter)
    yaml.representer.SafeRepresenter.add_representer(
        str, str_presenter
    )  # to use with safe_dum
    fn_meta = "meta.yaml"
    with open(fn_meta, "w") as f:
        yaml.dump(meta, f, sort_keys=False)

    print(f"Finished processing {meta['name']} dataset!")


if __name__ == "__main__":
    get_and_transform_data()

Overwriting transform.py


In [34]:
!python3 transform.py

Found local copy...
Loading...
Done!
Finished processing ld50_zhu dataset!


In [35]:
ls -lh  # fmt: skip

total 1.4M
drwxrwxr-x 2 melo melo 4.0K مار  2 16:58 [0m[01;34mdata[0m/
-rw-rw-r-- 1 melo melo 661K مار  2 16:58 data_clean.csv
-rw-rw-r-- 1 melo melo 661K مار  2 16:58 data_original.csv
-rw-rw-r-- 1 melo melo  39K مار  1 22:23 example_processing_and_templates.ipynb
-rw-rw-r-- 1 melo melo 1.5K مار  2 16:58 meta.yaml
-rw-rw-r-- 1 melo melo 4.8K مار  2 16:58 transform.py


# End