# Text sampling introduction
This notebook shows a text sampling process from the cleaned data to the sample text prompts.<br>
The outlined setup is based on langchain and its prompt template setup:<br>
* https://github.com/hwchase17/langchain
* https://langchain.readthedocs.io/en/latest/modules/prompts.html

# Setup
## Installation

In [None]:
#!pip install langchain

## Imports

In [1]:
import random

In [2]:
import pandas as pd

In [3]:
import yaml

In [4]:
from langchain import PromptTemplate

In [5]:
from langchain.prompts import load_prompt

In [6]:
from functools import partial

# Raw data

In [7]:
path_data_dir = "data/lipophilicity"

In [8]:
!python {path_data_dir}/transform.py

Finished processing lipophilicity dataset!


In [9]:
path_data_csv = path_data_dir+"/data_clean.csv"

In [10]:
df = pd.read_csv(path_data_csv)

In [11]:
df.head()

Unnamed: 0,CMPD_CHEMBLID,exp,SMILES
0,CHEMBL596271,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14
1,CHEMBL1951080,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...
2,CHEMBL1771,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl
3,CHEMBL234951,3.37,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...
4,CHEMBL565079,3.1,Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...


In [12]:
assert not df.duplicated().sum()  #  check for duplicates

In [13]:
path_data_meta = path_data_dir+"/meta.yaml"

In [14]:
with open(path_data_meta, "r") as stream:
    try:
        meta = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)

In [15]:
meta

{'name': 'lipophilicity',
 'description': 'Experimental results of octanol/water distribution coefficient (logD at pH 7.4).',
 'targets': [{'id': 'exp',
   'description': 'experimental results of octanol/water distribution coefficient (logD at pH 7.4)',
   'units': 'logD',
   'type': 'continuous',
   'names': ['octanol/water distribution coefficient (logD at pH 7.4)',
    'octanol/water distribution coefficient']}],
 'identifiers': [{'id': 'SMILES', 'type': 'SMILES', 'description': 'SMILES'}],
 'license': 'CC BY-SA 3.0',
 'links': [{'url': 'https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv',
   'description': 'original dataset link'},
  {'url': 'https://github.com/cheminfo/molecule-features/blob/main/data/lipophilicity/meta.yaml',
   'description': 'original meta data'},
  {'url': 'https://deepchem.readthedocs.io/en/latest/api_reference/moleculenet.html#lipo-datasets',
   'description': 'original dataset link from moleculenet'},
  {'url': 'https://www.ebi.ac.uk

In [16]:
all_identifiers = [x["id"] for x in meta["identifiers"]] + [x["id"] for x in meta["targets"]]
all_identifiers

['SMILES', 'exp']

In [18]:
for i in all_identifiers:
    cols = df.columns.tolist()
    assert i in cols, f"target or identifier {i} not in columns {cols}!"
print("All targets and identifiers found in the cleaned data!")

All targets and identifiers found in the cleaned data!


# Templates

## Basic

### Setup

In [19]:
template_basic = """
The molecule whith the {SMILES_description} representation {SMILES_data} has a {target_name} of {target_value} {target_units}.
"""

In [20]:
template_basic

'\nThe molecule whith the {SMILES_description} representation {SMILES_data} has a {target_name} of {target_value} {target_units}.\n'

In [21]:
prompt_basic = PromptTemplate(
    input_variables=[
        "SMILES_description",
        "SMILES_data",
        "target_name",
        "target_value",
        "target_units",
    ],
    template=template_basic,
)

In [22]:
prompt_basic

PromptTemplate(input_variables=['SMILES_description', 'SMILES_data', 'target_name', 'target_value', 'target_units'], output_parser=None, partial_variables={}, template='\nThe molecule whith the {SMILES_description} representation {SMILES_data} has a {target_name} of {target_value} {target_units}.\n', template_format='f-string', validate_template=True)

### Create sample

In [23]:
sample = df.sample()

In [24]:
sample

Unnamed: 0,CMPD_CHEMBLID,exp,SMILES
3779,CHEMBL5,-0.1,CCN1C=C(C(=O)O)C(=O)c2ccc(C)nc12


In [40]:
class RandomVariable():
    def __init__(self, name, data, sampler):
        self.name = name
        self.data = data
        self.sampler = sampler
    
    def __repr__(self):
        return f"RandomVariable: {self.name}, {self.data}, {self.sampler}"
    
    def __call__(self):
        sample = self.sampler(self.data)
        if isinstance(sample, list):
            assert len(sample) == 1
            return sample[0]
        else:
            return sample

In [41]:
target_name_rv = RandomVariable(
    "target_name",
    meta["targets"][0]["names"],
    partial(random.sample, k=1),
)

In [42]:
target_name_rv

RandomVariable: target_name, ['octanol/water distribution coefficient (logD at pH 7.4)', 'octanol/water distribution coefficient'], functools.partial(<bound method Random.sample of <random.Random object at 0x7fce6d01ce10>>, k=1)

In [45]:
for _ in range(10):
    print(target_name_rv())

octanol/water distribution coefficient
octanol/water distribution coefficient (logD at pH 7.4)
octanol/water distribution coefficient
octanol/water distribution coefficient (logD at pH 7.4)
octanol/water distribution coefficient (logD at pH 7.4)
octanol/water distribution coefficient
octanol/water distribution coefficient
octanol/water distribution coefficient (logD at pH 7.4)
octanol/water distribution coefficient (logD at pH 7.4)
octanol/water distribution coefficient (logD at pH 7.4)


In [46]:
def get_sample_prompt_basic(df, meta, idx=None):
    if idx:
        sample = df.iloc[idx]
    else: # random sample
        sample = df.sample().iloc[0] # iloc gives us a pd series
    
    return {
        "SMILES_description": meta["identifiers"][0]["description"],
        "SMILES_data": sample.SMILES,
        #"target_name": random.sample(meta["targets"][0]["names"], k=1)[0],
        "target_name": target_name_rv(),
        "target_value": sample.exp,
        "target_units": meta["targets"][0]["units"],
    }

In [47]:
sample_dict = get_sample_prompt_basic(df, meta, idx=777)
sample_dict

{'SMILES_description': 'SMILES',
 'SMILES_data': 'COC(=O)CCC(=O)Nc1ccc2C(=O)C(=O)c3ccccc3c2c1',
 'target_name': 'octanol/water distribution coefficient',
 'target_value': 2.72,
 'target_units': 'logD'}

In [48]:
prompt_basic.format(**sample_dict)

'\nThe molecule whith the SMILES representation COC(=O)CCC(=O)Nc1ccc2C(=O)C(=O)c3ccccc3c2c1 has a octanol/water distribution coefficient of 2.72 logD.\n'

### Create samples from df

In [49]:
def get_sample_prompt_basic_from_row(sample, meta):
    return prompt_basic.format(
        SMILES_description=meta["identifiers"][0]["description"],
        SMILES_data=sample.SMILES,
        #target_name=random.sample(meta["targets"][0]["names"], k=1)[0], # we sample from list of names
        target_name=target_name_rv(),
        target_value=sample.exp,
        target_units=meta["targets"][0]["units"],
    )

In [50]:
get_sample_prompt_basic_from_row(df.sample().iloc[0], meta)

'\nThe molecule whith the SMILES representation O=C1COc2ccccc2N1CCN3CCC(CC3)NCc4cc5OCCOc5cn4 has a octanol/water distribution coefficient (logD at pH 7.4) of 1.04 logD.\n'

In [51]:
df["prompt"] = df.apply(lambda x: get_sample_prompt_basic_from_row(x, meta), axis=1)

In [52]:
df.head()

Unnamed: 0,CMPD_CHEMBLID,exp,SMILES,prompt
0,CHEMBL596271,3.54,Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14,\nThe molecule whith the SMILES representation...
1,CHEMBL1951080,-1.18,COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)...,\nThe molecule whith the SMILES representation...
2,CHEMBL1771,3.69,COC(=O)[C@@H](N1CCc2sccc2C1)c3ccccc3Cl,\nThe molecule whith the SMILES representation...
3,CHEMBL234951,3.37,OC[C@H](O)CN1C(=O)C(Cc2ccccc12)NC(=O)c3cc4cc(C...,\nThe molecule whith the SMILES representation...
4,CHEMBL565079,3.1,Cc1cccc(C[C@H](NC(=O)c2cc(nn2C)C(C)(C)C)C(=O)N...,\nThe molecule whith the SMILES representation...


In [53]:
df.iloc[0].prompt

'\nThe molecule whith the SMILES representation Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14 has a octanol/water distribution coefficient (logD at pH 7.4) of 3.54 logD.\n'

### Save prompt template

In [54]:
prompt_basic.dict()

{'input_variables': ['SMILES_description',
  'SMILES_data',
  'target_name',
  'target_value',
  'target_units'],
 'output_parser': None,
 'partial_variables': {},
 'template': '\nThe molecule whith the {SMILES_description} representation {SMILES_data} has a {target_name} of {target_value} {target_units}.\n',
 'template_format': 'f-string',
 'validate_template': True,
 '_type': 'prompt'}

In [55]:
path_data_prompt_basic = path_data_dir+"/prompt_basic.yaml"

In [56]:
prompt_basic.save(path_data_prompt_basic)

In [57]:
!cat {path_data_prompt_basic}

_type: prompt
input_variables:
- SMILES_description
- SMILES_data
- target_name
- target_value
- target_units
output_parser: null
partial_variables: {}
template: '

  The molecule whith the {SMILES_description} representation {SMILES_data} has a {target_name}
  of {target_value} {target_units}.

  '
template_format: f-string
validate_template: true


### Load prompt template

In [58]:
prompt_basic = load_prompt(path_data_prompt_basic)
prompt_basic

PromptTemplate(input_variables=['SMILES_description', 'SMILES_data', 'target_name', 'target_value', 'target_units'], output_parser=None, partial_variables={}, template='\nThe molecule whith the {SMILES_description} representation {SMILES_data} has a {target_name} of {target_value} {target_units}.\n', template_format='f-string', validate_template=True)

## Sentence part sampling (WIP)

# Ideas & to dos:

General
* Check also https://langchain.readthedocs.io/en/latest/modules/prompts/examples/partial.html as this could make the sampling maybe easier and cleaner?


Tabular
* We now create a sample per df row. When we aggregate all tabular, KG, etc. data we need to rethink that approach.

KG
* Here we would sample the template setup before based on the walk over the KG, i.e., 1.) get graph in to text, 2.) sample input variables

# End 