# Data processing

In this notebook, I am loading a list smiles molecules I obtained from Ersilia summer 2024 internship slack channel and processing the
- Checking all 1000 smiles are valid smiles
- Standardzing the smiles

In [None]:
#import necessary files
import sys
import os
import pandas as pd
import json
import csv

In [None]:
# Add the src directory to the Python module search path
sys.path.append(os.path.abspath("../src"))

#add the input files path
unprocessed = os.path.abspath('../data/reference_library.csv')
input = os.path.abspath('../data/input.csv')
standardize = os.path.abspath('../data/standardized_smiles.csv')
sample = os.path.abspath('../data/sample_smiles.csv')
output = os.path.abspath('../data/output.csv')
tsion = os.path.abspath('../data/sample_input_inchkey.csv')

In [None]:
#count valid smiles from 1000 molecules
#import the python function thats checks smiles validity
from src.processing import validate_smiles_in_csv
valid_count, invalid_count = validate_smiles_in_csv(unprocessed)
print(f"Number of valid SMILES: {valid_count}")
print(f"Number of invalid SMILES: {invalid_count}")

Number of valid SMILES: 1000
Number of invalid SMILES: 0


In [None]:
#import the python function thats will standardize the smiles
from src.processing import standardise_smiles_from_csv
#standardized and save file
standardise_smiles_from_csv(unprocessed, standardize)


[09:17:17] Can't kekulize mol.  Unkekulized atoms: 3 7


Number of standardized SMILES: 997


In [None]:

df = pd.read_csv(standardize)
print(df.head())


                                              smiles  \
0                       CCCCNC(=S)N/N=C/C1=C(C)C=CS1   
1         CN1C(SCC2=NC(C3=CC=CS3)=NO2)=NN=C1C1CCCCC1   
2           O=C(O)CC(NC(=O)C1=CN=CC=N1)C1=CC=CC=C1Cl   
3  O=S(=O)(C1=CC=CC=C1)N1CCN(C2=NOC3=CC=CC(Cl)=C2...   
4  CCC[C@@H](C)N(C1=CC(Cl)=CC=C1CO)S(=O)(=O)C1=CC...   

                                 Standardized_SMILES  
0                           CCCCNC(=S)N/N=C/c1sccc1C  
1              Cn1c(SCc2nc(-c3cccs3)no2)nnc1C1CCCCC1  
2                 O=C(O)CC(NC(=O)c1cnccn1)c1ccccc1Cl  
3       O=S(=O)(c1ccccc1)N1CCN(c2noc3cccc(Cl)c23)CC1  
4  CCC[C@@H](C)N(c1cc(Cl)ccc1CO)S(=O)(=O)c1ccc(C)cc1  


In [None]:
# Drop the 'smiles' column
df = df.drop('smiles', axis=1)

# Rename the 'Standardized_SMILES' column to 'smiles'
df = df.rename(columns={'Standardized_SMILES': 'smiles'})
#save teh process file as input
df.to_csv(input, index=False)
print(df.head())


                                              smiles
0                           CCCCNC(=S)N/N=C/c1sccc1C
1              Cn1c(SCc2nc(-c3cccs3)no2)nnc1C1CCCCC1
2                 O=C(O)CC(NC(=O)c1cnccn1)c1ccccc1Cl
3       O=S(=O)(c1ccccc1)N1CCN(c2noc3cccc(Cl)c23)CC1
4  CCC[C@@H](C)N(c1cc(Cl)ccc1CO)S(=O)(=O)c1ccc(C)cc1


# Running Model

Here I will run the model I've fetched using ersilia -v fetch eos30gr --from_git , since importing the model is giving error(detail can be found in the dedugging ersilia folder) I will be using the terminal input to serve and run the model:
- Test model with smiles
- Test model with InChiKey
- Test model out put to csv file

In [None]:
!ersilia serve eos30gr

[32m🚀 Serving model eos30gr: deepherg[0m
[0m
[33m   URL: http://127.0.0.1:40343[0m
[33m   PID: 9909[0m
[33m   SRV: conda[0m
[0m
[34m👉 To run model:[0m
[34m   - run[0m
[0m
[34m💁 Information:[0m
[34m   - info[0m


In [None]:
#test model using sample smile file
!ersilia run -i sample_input_smiles.csv



{
    "input": {
        "key": "SUTWUYBMBWPLMW-MDWZMJQESA-N",
        "input": "CCCCNC(=S)N/N=C/c1sccc1C",
        "text": "CCCCNC(=S)N/N=C/c1sccc1C"
    },
    "output": {
        "outcome": 0.3471122086048126
    }
}[0m


In [None]:
#outputting result to file
!ersilia api run -i sample_input_smiles.csv -o output.csv

Traceback (most recent call last):
  File "/opt/conda/envs/ersilia/bin/ersilia", line 8, in <module>
    sys.exit(cli())
  File "/opt/conda/envs/ersilia/lib/python3.10/site-packages/click/core.py", line 1157, in __call__
    return self.main(*args, **kwargs)
  File "/opt/conda/envs/ersilia/lib/python3.10/site-packages/click/core.py", line 1078, in main
    rv = self.invoke(ctx)
  File "/opt/conda/envs/ersilia/lib/python3.10/site-packages/click/core.py", line 1688, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/opt/conda/envs/ersilia/lib/python3.10/site-packages/click/core.py", line 1434, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/opt/conda/envs/ersilia/lib/python3.10/site-packages/click/core.py", line 783, in invoke
    return __callback(*args, **kwargs)
  File "/workspaces/codespaces-jupyter/ersilia/ersilia/cli/commands/__init__.py", line 22, in wrapper
    return func(*args, **kwargs)
  File "/workspaces/codespaces-jupyter/er

Since running output to file is not working i decided to check if i have isaura and confirmed its already installed

In [None]:
#checkin if isaura is installed
!python -m pip install isaura==0.1



out putting to file is giving type error so i decided to use InChKey and test

In [None]:
#testing otputting to file with InChKey
!ersilia api run -i sample_input_inchkey.csv -o output.csv

Traceback (most recent call last):
  File "/opt/conda/envs/ersilia/bin/ersilia", line 8, in <module>
    sys.exit(cli())
  File "/opt/conda/envs/ersilia/lib/python3.10/site-packages/click/core.py", line 1157, in __call__
    return self.main(*args, **kwargs)
  File "/opt/conda/envs/ersilia/lib/python3.10/site-packages/click/core.py", line 1078, in main
    rv = self.invoke(ctx)
  File "/opt/conda/envs/ersilia/lib/python3.10/site-packages/click/core.py", line 1688, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "/opt/conda/envs/ersilia/lib/python3.10/site-packages/click/core.py", line 1434, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/opt/conda/envs/ersilia/lib/python3.10/site-packages/click/core.py", line 783, in invoke
    return __callback(*args, **kwargs)
  File "/workspaces/codespaces-jupyter/ersilia/ersilia/cli/commands/__init__.py", line 22, in wrapper
    return func(*args, **kwargs)
  File "/workspaces/codespaces-jupyter/er

The model seems to work file when running with out output option but gives TypeError when trying to save to output

In [None]:
#testing with out outputting to file
!ersilia api run -i sample_input_inchkey.csv

{
    "input": {
        "key": "BLGXFZZNTVWLAY-SCYLSFHTSA-N",
        "input": "COC(=O)[C@@H]1[C@H]2C[C@H]3c4[nH]c5ccccc5c4CCN3C[C@@H]2CC[C@@H]1O",
        "text": "COC(=O)[C@@H]1[C@H]2C[C@H]3c4[nH]c5ccccc5c4CCN3C[C@@H]2CC[C@@H]1O"
    },
    "output": {
        "outcome": 0.39273926615715027
    }
}[0m
{
    "input": {
        "key": "BLGXFZZNTVWLAY-SCYLSFHTSA-N",
        "input": "COC(=O)[C@@H]1[C@H]2C[C@H]3c4[nH]c5ccccc5c4CCN3C[C@@H]2CC[C@@H]1O",
        "text": "COC(=O)[C@@H]1[C@H]2C[C@H]3c4[nH]c5ccccc5c4CCN3C[C@@H]2CC[C@@H]1O"
    },
    "output": {
        "outcome": 0.39273926615715027
    }
}[0m
{
    "input": {
        "key": "BLGXFZZNTVWLAY-SCYLSFHTSA-N",
        "input": "COC(=O)[C@@H]1[C@H]2C[C@H]3c4[nH]c5ccccc5c4CCN3C[C@@H]2CC[C@@H]1O",
        "text": "COC(=O)[C@@H]1[C@H]2C[C@H]3c4[nH]c5ccccc5c4CCN3C[C@@H]2CC[C@@H]1O"
    },
    "output": {
        "outcome": 0.39273926615715027
    }
}[0m


# Since -o to output.csv is not wokring i decided to redirect the whole output to a txt file and then process the data

In [None]:
import os
#redirecting output to file
!ersilia api run -i sample_input_inchkey.csv > ../data/row_output.txt
file_path = os.path.abspath('../data/row_output.txt')
#preview file
with open(file_path, 'r') as file:
    lines = [next(file) for _ in range(10)]
for line in lines:
    print(line)



{

    "input": {

        "key": "BLGXFZZNTVWLAY-SCYLSFHTSA-N",

        "input": "COC(=O)[C@@H]1[C@H]2C[C@H]3c4[nH]c5ccccc5c4CCN3C[C@@H]2CC[C@@H]1O",

        "text": "COC(=O)[C@@H]1[C@H]2C[C@H]3c4[nH]c5ccccc5c4CCN3C[C@@H]2CC[C@@H]1O"

    },

    "output": {

        "outcome": 0.39273926615715027

    }

}



In [None]:

!ersilia api run -i sample_input_inchkey.csv > ../data/row_output.txt


In [None]:
#now i will extract the inchkey, smiles and outcome and creat a csv file
from src import process_json_file
process_json_file('../data/output.txt', '../fromtext.csv')

Error decoding JSON: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
Error decoding JSON: Extra data: line 1 column 12 (char 11)
Error decoding JSON: Extra data: line 1 column 14 (char 13)
Error decoding JSON: Extra data: line 1 column 16 (char 15)
Error decoding JSON: Extra data: line 1 column 15 (char 14)
Error decoding JSON: Expecting value: line 1 column 5 (char 4)
Error decoding JSON: Extra data: line 1 column 13 (char 12)
Error decoding JSON: Extra data: line 1 column 18 (char 17)
Error decoding JSON: Expecting value: line 1 column 5 (char 4)
Error decoding JSON: Expecting value: line 1 column 1 (char 0)
Error decoding JSON: Expecting property name enclosed in double quotes: line 2 column 1 (char 2)
Error decoding JSON: Extra data: line 1 column 12 (char 11)
Error decoding JSON: Extra data: line 1 column 14 (char 13)
Error decoding JSON: Extra data: line 1 column 16 (char 15)
Error decoding JSON: Extra data: line 1 column 15 (char 14)
Error decoding JS

I kept getting json format errror
- I validated the json files adn the isseu seem to be that multiple json files are stored in the output.txt file so I tried to use regular expression instead

In [4]:
import re
import os


def extract_keys_and_values_from_text(file_path):
    key_value_pairs = []
    with open(file_path, 'r') as file:
        text = file.read()
        # Define a regular expression pattern to find key-value pairs
        pattern = r'"key":\s*"(.*?)".*?"input":\s*"(.*?)".*?"outcome":\s*\[\s*(\d+\.\d+)\s*\]'
        matches = re.findall(pattern, text, re.DOTALL)
        for match in matches:
            key_value_pairs.append(("key", match[0]))
            key_value_pairs.append(("input", match[1]))
            key_value_pairs.append(("outcome", float(match[2])))
    return key_value_pairs

file_path = os.path.abspath('../data/row_output.txt')
key_value_pairs = extract_keys_and_values_from_text(file_path)
for key, value in key_value_pairs:
    print(f"Key: {key}, Value: {value}")


Key: key, Value: SUTWUYBMBWPLMW-MDWZMJQESA-N
Key: input, Value: CCCCNC(=S)N/N=C/c1sccc1C
Key: outcome, Value: 0.5726072607260726
Key: key, Value: SDKIBDZIDPFNHT-UHFFFAOYSA-N
Key: input, Value: CC(C)Cc1cc(C(=O)NCc2cccnc2)no1
Key: outcome, Value: 0.19801980198019803
Key: key, Value: CGVTXIMVAPAPNK-UHFFFAOYSA-N
Key: input, Value: COC(=O)CC1NN=C2N(CCN2c2ccc(Cl)cc2)C1=O
Key: outcome, Value: 0.416996699669967
Key: key, Value: LWTPUALMLDLRFE-UHFFFAOYSA-N
Key: input, Value: CCN1c2ncc(COc3cccc(C(=O)O)c3)cc2C(=O)N(C)c2ccc(Cl)nc21
Key: outcome, Value: 0.34488448844884484
Key: key, Value: OAGNFKCWXVXZJD-NDENLUEZSA-N
Key: input, Value: Cc1c(NC(=S)N/N=C\\c2ccccc2O)c(=O)n(-c2ccccc2)n1C
Key: outcome, Value: 0.5016501650165017
Key: key, Value: YBVUWGYHDUMFEA-ZCFIWIBFSA-N
Key: input, Value: C[C@@H](O)c1nc(-c2nc(-c3nc(CO)cs3)cs2)cs1
Key: outcome, Value: 0.30544554455445544
Key: key, Value: GASJOQPNNSMIFF-UHFFFAOYSA-N
Key: input, Value: CCCCCCCCCCCC(=O)CCCC
Key: outcome, Value: 0.5997760490334747
Key: key

In [5]:
import csv
import re

def extract_keys_and_values_from_text(file_path):
    key_value_pairs = []
    with open(file_path, 'r') as file:
        text = file.read()
        # Define a regular expression pattern to find key-value pairs
        pattern = r'"key":\s*"(.*?)".*?"input":\s*"(.*?)".*?"outcome":\s*\[\s*(\d+\.\d+)\s*\]'
        matches = re.findall(pattern, text, re.DOTALL)
        for match in matches:
            key_value_pairs.append(("key", match[0]))
            key_value_pairs.append(("input", match[1]))
            key_value_pairs.append(("outcome", float(match[2])))
    return key_value_pairs

def write_to_csv(key_value_pairs, output_file):
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Key', 'Input', 'Outcome'])
        for i in range(0, len(key_value_pairs), 3):
            writer.writerow([key_value_pairs[i][1], key_value_pairs[i+1][1], key_value_pairs[i+2][1]])

file_path = os.path.abspath('../data/row_output.txt')
output_file = os.path.abspath('../data/final_output.scv')
key_value_pairs = extract_keys_and_values_from_text(file_path)
write_to_csv(key_value_pairs, output_file)

In [9]:
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m233.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: seaborn
Successfully installed seaborn-0.13.2


# Test output and analyse

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
output_file = os.path.abspath('../data/final_output.scv')

# Assuming your CSV file is named 'predictions.csv' and the predicted values are in the third column
df = pd.read_csv(output_file, header=None, names=['Key', 'Input', 'Outcome'])

# Plotting a histogram
plt.figure(figsize=(10, 6))
plt.hist(df['Outcome'], bins=20, color='skyblue', edgecolor='black')
plt.xlabel('Predicted Values')
plt.ylabel('Frequency')
plt.title('Histogram of Predicted Values')
plt.grid(axis='y', alpha=0.75)
plt.show()

# Plotting a density plot
plt.figure(figsize=(10, 6))
sns.kdeplot(df['Prediction'], shade=True, color='skyblue')
plt.xlabel('Predicted Values')
plt.ylabel('Density')
plt.title('Density Plot of Predicted Values')
plt.grid(axis='y', alpha=0.75)
plt.show()

ValueError: Unable to parse string "Outcome" at position 0