In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression  # LogisticRegression is not used for regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [36]:
def get_unique_atoms(smiles_list):
    unique_atoms = set()
    
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:  # Check if the molecule was created successfully
            for atom in mol.GetAtoms():
                unique_atoms.add(atom.GetSymbol())  # Add atomic symbol to the set
    
    return unique_atoms

In [37]:
def get_atomic_composition_frequency(smiles_list):
    atomic_frequencies = []
    
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        composition = {'Br': 0, 'C': 0, 'Cl': 0, 'F': 0, 'N': 0, 'O': 0, 'S': 0}  
        
        if mol is not None:  # Check if the molecule was created successfully
            total_atoms = mol.GetNumAtoms()  # Get total number of atoms in the molecule
            
            for atom in mol.GetAtoms():
                composition[atom.GetSymbol()] += 1  # Count occurrences of each atom
            
            # Calculate frequency
            frequency = {atom: count / total_atoms for atom, count in composition.items()}
            atomic_frequencies.append(frequency)  # Append the frequency for this SMILES
    
    return atomic_frequencies

In [38]:
#Degree of atoms
target_atoms = {'Br', 'C', 'Cl', 'F', 'N', 'O', 'S'}
def compute_target_atom_degrees(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None  # Handle invalid SMILES

    # Initialize a dictionary to hold the highest degrees for target atoms
    degrees_dict = {atom: 0 for atom in target_atoms}

    # Get the degree of each atom and update the dictionary for target atoms
    for atom in mol.GetAtoms():
        atom_symbol = atom.GetSymbol()
        if atom_symbol in target_atoms:
            current_degree = atom.GetDegree()
            # Update the degree if the current one is higher
            if current_degree > degrees_dict[atom_symbol]:
                degrees_dict[atom_symbol] = current_degree

    return degrees_dict

In [39]:
#Bond type
def compute_bond_types_for_cyclic_peptides(df):
    
    def compute_bond_types(smiles):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None  # Handle invalid SMILES

        bond_types = {
            'Single': 0,
            'Double': 0,
            'Triple': 0,
            'Aromatic': 0,
            'Conjugated': 0,
            'No-bond': 0
        }

        # Iterate through the bonds in the molecule
        for bond in mol.GetBonds():
            bond_order = bond.GetBondTypeAsDouble()
            if bond_order == 1.0:
                bond_types['Single'] += 1
            elif bond_order == 2.0:
                bond_types['Double'] += 1
            elif bond_order == 3.0:
                bond_types['Triple'] += 1
            elif bond_order == 1.5:
                bond_types['Aromatic'] += 1
            elif bond_order == 1.4:
                bond_types['Conjugated'] += 1
            else:
                bond_types['No-bond'] += 1

        return bond_types

    df['Bond_Types'] = df['SMILES'].apply(compute_bond_types)

    bond_types_df = df['Bond_Types'].apply(pd.Series)

    df = pd.concat([df, bond_types_df], axis=1)

    df.drop(columns=['Bond_Types'], inplace=True)

    return df

In [40]:
#Formal charges
def calculate_overall_formal_charge(smiles):
   
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None  # Handle invalid SMILES

    overall_charge = 0
    
    # Define valence electrons for common atoms
    valence_electrons = {
        'C': 4,
        'N': 5,
        'O': 6,
        'S': 6,
        'P': 5,
        'F': 7,
        'Cl': 7,
        'Br': 7,
        'I': 7,
    }

    for atom in mol.GetAtoms():
        atom_symbol = atom.GetSymbol()
        valence = valence_electrons.get(atom_symbol, 0)
        non_bonding = atom.GetNumImplicitHs() 
        bonding = atom.GetDegree() * 2  # 2 electrons for each bond

        # Calculate formal charge
        formal_charge = valence - (non_bonding + bonding // 2)
        overall_charge += formal_charge

    return overall_charge

In [41]:
def check_aromatic_and_ring(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return (None, None)  # Handle invalid SMILES

    # Check if the molecule is aromatic
    is_aromatic = any(atom.GetIsAromatic() for atom in mol.GetAtoms())

    # Check if the molecule contains a ring
    ring_info = mol.GetRingInfo()
    is_in_ring = ring_info.NumRings() > 0

    return (int(is_aromatic), int(is_in_ring))

In [None]:
df_train = pd.read_csv('data/Train.csv')
df_train

Unnamed: 0,ID,SMILES,Permeability,Sequence,MolWt
0,915,CC[C@H](C)[C@H](NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2...,-7.0,"['F', 'meL', 'A', 'meL', 'Ser(tBu)', 'meL', 'm...",1773.325
1,888,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,-7.0,"['F', 'meF', 'P', 'L', 'Me_Phe(3-Cl)', 'T', 'L...",1745.057
2,593,C/N=C(\NC)NCCC[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C...,-7.0,"['Sar', 'F', 'meL', 'meL', 'F', 'meL', 'L', 'm...",1733.267
3,916,CC[C@H](C)[C@H](NC(=O)[C@@H](NC(=O)[C@@H]1CC(=...,-7.0,"['Me_Bal', 'F', 'meF', 'meL', 'T', 'meA', 'meL...",1725.281
4,900,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-7.0,"['meA', 'meL', 'meF', 'T', 'meL', 'meL', 'I', ...",1723.309
...,...,...,...,...,...
5563,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.7,"['A', 'L', 'Mono76']",402.539
5564,2467,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-5.6,"['A', 'L', 'H2NEt_Phe']",374.485
5565,2512,CC(C)C[C@H]1NC(=O)[C@@H](C)NCCCCCCNC(=O)[C@H](...,-5.5,"['dL', 'S', 'Mono85']",370.494
5566,2511,CC(C)[C@@H]1NC(=O)[C@@H](CO)NC(=O)[C@@H](C)NCC...,-6.3,"['dS', 'V', 'Mono85']",356.467


In [43]:
unique_atoms = get_unique_atoms(df_train['SMILES'])
unique_atoms

{'Br', 'C', 'Cl', 'F', 'N', 'O', 'S'}

In [44]:
atomic_frequencies = get_atomic_composition_frequency(df_train['SMILES'])
frequency_df = pd.DataFrame(atomic_frequencies)
frequency_df.fillna(0, inplace=True)
frequency_df

Unnamed: 0,Br,C,Cl,F,N,O,S
0,0.0,0.748031,0.00000,0.0,0.118110,0.133858,0.0
1,0.0,0.747967,0.01626,0.0,0.113821,0.121951,0.0
2,0.0,0.752000,0.00000,0.0,0.136000,0.112000,0.0
3,0.0,0.739837,0.00000,0.0,0.121951,0.138211,0.0
4,0.0,0.747967,0.00000,0.0,0.121951,0.130081,0.0
...,...,...,...,...,...,...,...
5563,0.0,0.758621,0.00000,0.0,0.137931,0.103448,0.0
5564,0.0,0.740741,0.00000,0.0,0.148148,0.111111,0.0
5565,0.0,0.692308,0.00000,0.0,0.153846,0.153846,0.0
5566,0.0,0.680000,0.00000,0.0,0.160000,0.160000,0.0


In [45]:
df_train_atomic_comp = pd.concat([df_train[['ID','SMILES','Permeability']], frequency_df], axis=1)
df_train_atomic_comp

Unnamed: 0,ID,SMILES,Permeability,Br,C,Cl,F,N,O,S
0,915,CC[C@H](C)[C@H](NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2...,-7.0,0.0,0.748031,0.00000,0.0,0.118110,0.133858,0.0
1,888,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,-7.0,0.0,0.747967,0.01626,0.0,0.113821,0.121951,0.0
2,593,C/N=C(\NC)NCCC[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C...,-7.0,0.0,0.752000,0.00000,0.0,0.136000,0.112000,0.0
3,916,CC[C@H](C)[C@H](NC(=O)[C@@H](NC(=O)[C@@H]1CC(=...,-7.0,0.0,0.739837,0.00000,0.0,0.121951,0.138211,0.0
4,900,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-7.0,0.0,0.747967,0.00000,0.0,0.121951,0.130081,0.0
...,...,...,...,...,...,...,...,...,...,...
5563,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.7,0.0,0.758621,0.00000,0.0,0.137931,0.103448,0.0
5564,2467,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-5.6,0.0,0.740741,0.00000,0.0,0.148148,0.111111,0.0
5565,2512,CC(C)C[C@H]1NC(=O)[C@@H](C)NCCCCCCNC(=O)[C@H](...,-5.5,0.0,0.692308,0.00000,0.0,0.153846,0.153846,0.0
5566,2511,CC(C)[C@@H]1NC(=O)[C@@H](CO)NC(=O)[C@@H](C)NCC...,-6.3,0.0,0.680000,0.00000,0.0,0.160000,0.160000,0.0


In [None]:
df_train_atomic_comp.to_csv('Atomic_features/Train_atomic_comp.csv', index=False)

In [None]:
df_test = pd.read_csv('data/Test.csv')
atomic_frequencies = get_atomic_composition_frequency(df_test['SMILES'])
frequency_df = pd.DataFrame(atomic_frequencies)
frequency_df.fillna(0, inplace=True)
frequency_df

Unnamed: 0,Br,C,Cl,F,N,O,S
0,0.0,0.746032,0.007937,0.0,0.119048,0.126984,0.0
1,0.0,0.739837,0.000000,0.0,0.121951,0.138211,0.0
2,0.0,0.745902,0.000000,0.0,0.122951,0.131148,0.0
3,0.0,0.754098,0.000000,0.0,0.131148,0.114754,0.0
4,0.0,0.731092,0.000000,0.0,0.126050,0.142857,0.0
...,...,...,...,...,...,...,...
1387,0.0,0.774194,0.000000,0.0,0.129032,0.096774,0.0
1388,0.0,0.774194,0.000000,0.0,0.129032,0.096774,0.0
1389,0.0,0.741935,0.000000,0.0,0.129032,0.129032,0.0
1390,0.0,0.741935,0.000000,0.0,0.129032,0.129032,0.0


In [48]:
df_test_atomic_comp = pd.concat([df_test[['ID','SMILES','Permeability']], frequency_df], axis=1)
df_test_atomic_comp

Unnamed: 0,ID,SMILES,Permeability,Br,C,Cl,F,N,O,S
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,0.0,0.746032,0.007937,0.0,0.119048,0.126984,0.0
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,0.0,0.739837,0.000000,0.0,0.121951,0.138211,0.0
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,0.0,0.745902,0.000000,0.0,0.122951,0.131148,0.0
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,0.0,0.754098,0.000000,0.0,0.131148,0.114754,0.0
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,0.0,0.731092,0.000000,0.0,0.126050,0.142857,0.0
...,...,...,...,...,...,...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,0.0,0.774194,0.000000,0.0,0.129032,0.096774,0.0
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,0.0,0.774194,0.000000,0.0,0.129032,0.096774,0.0
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,0.0,0.741935,0.000000,0.0,0.129032,0.129032,0.0
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,0.0,0.741935,0.000000,0.0,0.129032,0.129032,0.0


In [None]:
df_test_atomic_comp.to_csv('Atomic_features/Test_atomic_comp.csv', index=False)

In [50]:
#Degree of atoms

In [None]:
df_train = pd.read_csv('Atomic_features/Train_atomic_comp.csv')
df_test = pd.read_csv('Atomic_features/Test_atomic_comp.csv')


In [None]:
df_train['Atom_Degrees'] = df_train['SMILES'].apply(compute_target_atom_degrees)
degrees_df = df_train['Atom_Degrees'].apply(pd.Series)

degrees_df.columns = [f'Degree_{atom}' for atom in target_atoms]

df_train = pd.concat([df_train, degrees_df], axis=1)
df_train = df_train.drop('Atom_Degrees', axis=1)
df_train.to_csv("Atomic_features/Train_atomic_comp_and_degree.csv",index=False)
df_train

Unnamed: 0,ID,SMILES,Permeability,Br,C,Cl,F,N,O,S,Degree_Br,Degree_Cl,Degree_N,Degree_C,Degree_F,Degree_S,Degree_O
0,915,CC[C@H](C)[C@H](NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2...,-7.0,0.0,0.748031,0.00000,0.0,0.118110,0.133858,0.0,0,0,3,4,0,0,2
1,888,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,-7.0,0.0,0.747967,0.01626,0.0,0.113821,0.121951,0.0,0,1,3,3,0,0,1
2,593,C/N=C(\NC)NCCC[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C...,-7.0,0.0,0.752000,0.00000,0.0,0.136000,0.112000,0.0,0,0,3,3,0,0,1
3,916,CC[C@H](C)[C@H](NC(=O)[C@@H](NC(=O)[C@@H]1CC(=...,-7.0,0.0,0.739837,0.00000,0.0,0.121951,0.138211,0.0,0,0,3,4,0,0,2
4,900,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-7.0,0.0,0.747967,0.00000,0.0,0.121951,0.130081,0.0,0,0,3,3,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.7,0.0,0.758621,0.00000,0.0,0.137931,0.103448,0.0,0,0,2,3,0,0,1
5564,2467,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-5.6,0.0,0.740741,0.00000,0.0,0.148148,0.111111,0.0,0,0,2,3,0,0,1
5565,2512,CC(C)C[C@H]1NC(=O)[C@@H](C)NCCCCCCNC(=O)[C@H](...,-5.5,0.0,0.692308,0.00000,0.0,0.153846,0.153846,0.0,0,0,2,3,0,0,1
5566,2511,CC(C)[C@@H]1NC(=O)[C@@H](CO)NC(=O)[C@@H](C)NCC...,-6.3,0.0,0.680000,0.00000,0.0,0.160000,0.160000,0.0,0,0,2,3,0,0,1


In [None]:
df_test['Atom_Degrees'] = df_test['SMILES'].apply(compute_target_atom_degrees)
degrees_df = df_test['Atom_Degrees'].apply(pd.Series)

degrees_df.columns = [f'Degree_{atom}' for atom in target_atoms]

df_test = pd.concat([df_test, degrees_df], axis=1)
df_test = df_test.drop('Atom_Degrees', axis=1)
df_test.to_csv("Atomic_features/Test_atomic_comp_and_degree.csv",index=False)
df_test

Unnamed: 0,ID,SMILES,Permeability,Br,C,Cl,F,N,O,S,Degree_Br,Degree_Cl,Degree_N,Degree_C,Degree_F,Degree_S,Degree_O
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,0.0,0.746032,0.007937,0.0,0.119048,0.126984,0.0,0,1,3,3,0,0,1
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,0.0,0.739837,0.000000,0.0,0.121951,0.138211,0.0,0,0,3,4,0,0,2
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,0.0,0.745902,0.000000,0.0,0.122951,0.131148,0.0,0,0,3,3,0,0,1
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,0.0,0.754098,0.000000,0.0,0.131148,0.114754,0.0,0,0,3,3,0,0,1
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,0.0,0.731092,0.000000,0.0,0.126050,0.142857,0.0,0,0,3,3,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,0.0,0.774194,0.000000,0.0,0.129032,0.096774,0.0,0,0,2,3,0,0,1
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,0.0,0.774194,0.000000,0.0,0.129032,0.096774,0.0,0,0,2,3,0,0,1
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,0.0,0.741935,0.000000,0.0,0.129032,0.129032,0.0,0,0,3,3,0,0,1
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,0.0,0.741935,0.000000,0.0,0.129032,0.129032,0.0,0,0,3,3,0,0,1


In [None]:
#CSV with only degree columns
df_train = pd.read_csv('data/Train.csv')
df_train = df_train[['ID','SMILES','Permeability']]
df_train['Atom_Degrees'] = df_train['SMILES'].apply(compute_target_atom_degrees)
degrees_df = df_train['Atom_Degrees'].apply(pd.Series)

degrees_df.columns = [f'Degree_{atom}' for atom in target_atoms]

df_train = pd.concat([df_train, degrees_df], axis=1)
df_train = df_train.drop('Atom_Degrees', axis=1)
df_train.to_csv("Atomic_features/Train_atomic_degrees.csv",index=False)
df_train

Unnamed: 0,ID,SMILES,Permeability,Degree_Br,Degree_Cl,Degree_N,Degree_C,Degree_F,Degree_S,Degree_O
0,915,CC[C@H](C)[C@H](NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2...,-7.0,0,0,3,4,0,0,2
1,888,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,-7.0,0,1,3,3,0,0,1
2,593,C/N=C(\NC)NCCC[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C...,-7.0,0,0,3,3,0,0,1
3,916,CC[C@H](C)[C@H](NC(=O)[C@@H](NC(=O)[C@@H]1CC(=...,-7.0,0,0,3,4,0,0,2
4,900,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-7.0,0,0,3,3,0,0,1
...,...,...,...,...,...,...,...,...,...,...
5563,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.7,0,0,2,3,0,0,1
5564,2467,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-5.6,0,0,2,3,0,0,1
5565,2512,CC(C)C[C@H]1NC(=O)[C@@H](C)NCCCCCCNC(=O)[C@H](...,-5.5,0,0,2,3,0,0,1
5566,2511,CC(C)[C@@H]1NC(=O)[C@@H](CO)NC(=O)[C@@H](C)NCC...,-6.3,0,0,2,3,0,0,1


In [None]:
#CSV with only degree columns
df_test = pd.read_csv('data/Test.csv')
df_test = df_test[['ID','SMILES','Permeability']]
df_test['Atom_Degrees'] = df_test['SMILES'].apply(compute_target_atom_degrees)
degrees_df = df_test['Atom_Degrees'].apply(pd.Series)

# Rename the columns to include the atom symbols
degrees_df.columns = [f'Degree_{atom}' for atom in target_atoms]

# Concatenate the original DataFrame with the new degrees DataFrame
df_test = pd.concat([df_test, degrees_df], axis=1)
df_test = df_test.drop('Atom_Degrees', axis=1)
df_test.to_csv("Atomic_features/Test_atomic_degrees.csv",index=False)
df_test

Unnamed: 0,ID,SMILES,Permeability,Degree_Br,Degree_Cl,Degree_N,Degree_C,Degree_F,Degree_S,Degree_O
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,0,1,3,3,0,0,1
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,0,0,3,4,0,0,2
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,0,0,3,3,0,0,1
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,0,0,3,3,0,0,1
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,0,0,3,3,0,0,1
...,...,...,...,...,...,...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,0,0,2,3,0,0,1
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,0,0,2,3,0,0,1
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,0,0,3,3,0,0,1
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,0,0,3,3,0,0,1


In [56]:
#Bond_type

In [None]:
df_train = pd.read_csv('data/Train.csv')
df_train = df_train[['ID','SMILES','Permeability']]
df_train = compute_bond_types_for_cyclic_peptides(df_train)
df_train.to_csv('Atomic_features/Train_bonds_type.csv', index=False)

In [None]:
df_train = pd.read_csv('Atomic_features/Train_atomic_comp_and_degree.csv')
df_train = compute_bond_types_for_cyclic_peptides(df_train)
df_train.to_csv('Atomic_features/Train_Atomic_comp_atomic_degree_bonds_type.csv', index=False)
df_train

Unnamed: 0,ID,SMILES,Permeability,Br,C,Cl,F,N,O,S,...,Degree_C,Degree_F,Degree_S,Degree_O,Single,Double,Triple,Aromatic,Conjugated,No-bond
0,915,CC[C@H](C)[C@H](NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2...,-7.0,0.0,0.748031,0.00000,0.0,0.118110,0.133858,0.0,...,4,0,0,2,98,15,0,18,0,0
1,888,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,-7.0,0.0,0.747967,0.01626,0.0,0.113821,0.121951,0.0,...,3,0,0,1,91,14,0,24,0,0
2,593,C/N=C(\NC)NCCC[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C...,-7.0,0.0,0.752000,0.00000,0.0,0.136000,0.112000,0.0,...,3,0,0,1,91,15,0,24,0,0
3,916,CC[C@H](C)[C@H](NC(=O)[C@@H](NC(=O)[C@@H]1CC(=...,-7.0,0.0,0.739837,0.00000,0.0,0.121951,0.138211,0.0,...,4,0,0,2,99,15,0,12,0,0
4,900,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-7.0,0.0,0.747967,0.00000,0.0,0.121951,0.130081,0.0,...,3,0,0,1,99,15,0,12,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5563,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.7,0.0,0.758621,0.00000,0.0,0.137931,0.103448,0.0,...,3,0,0,1,21,3,0,6,0,0
5564,2467,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-5.6,0.0,0.740741,0.00000,0.0,0.148148,0.111111,0.0,...,3,0,0,1,19,3,0,6,0,0
5565,2512,CC(C)C[C@H]1NC(=O)[C@@H](C)NCCCCCCNC(=O)[C@H](...,-5.5,0.0,0.692308,0.00000,0.0,0.153846,0.153846,0.0,...,3,0,0,1,23,3,0,0,0,0
5566,2511,CC(C)[C@@H]1NC(=O)[C@@H](CO)NC(=O)[C@@H](C)NCC...,-6.3,0.0,0.680000,0.00000,0.0,0.160000,0.160000,0.0,...,3,0,0,1,22,3,0,0,0,0


In [None]:
df_test = pd.read_csv('data/Test.csv')
df_test = df_test[['ID','SMILES','Permeability']]
df_test = compute_bond_types_for_cyclic_peptides(df_test)
df_test.to_csv('Atomic_features/Test_bonds_type.csv', index=False)
df_test

Unnamed: 0,ID,SMILES,Permeability,Single,Double,Triple,Aromatic,Conjugated,No-bond
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,97,15,0,18,0,0
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,99,15,0,12,0,0
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,93,15,0,18,0,0
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,85,14,0,29,0,0
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,95,15,0,12,0,0
...,...,...,...,...,...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,23,3,0,6,0,0
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,23,3,0,6,0,0
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,22,4,0,6,0,0
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,24,3,0,6,0,0


In [None]:
df_test = pd.read_csv('Atomic_features/Test_atomic_comp_and_degree.csv')
df_test = compute_bond_types_for_cyclic_peptides(df_test)
df_test.to_csv('Atomic_features/Test_Atomic_comp_atomic_degree_bonds_type.csv', index=False)
df_test

Unnamed: 0,ID,SMILES,Permeability,Br,C,Cl,F,N,O,S,...,Degree_C,Degree_F,Degree_S,Degree_O,Single,Double,Triple,Aromatic,Conjugated,No-bond
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,0.0,0.746032,0.007937,0.0,0.119048,0.126984,0.0,...,3,0,0,1,97,15,0,18,0,0
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,0.0,0.739837,0.000000,0.0,0.121951,0.138211,0.0,...,4,0,0,2,99,15,0,12,0,0
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,0.0,0.745902,0.000000,0.0,0.122951,0.131148,0.0,...,3,0,0,1,93,15,0,18,0,0
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,0.0,0.754098,0.000000,0.0,0.131148,0.114754,0.0,...,3,0,0,1,85,14,0,29,0,0
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,0.0,0.731092,0.000000,0.0,0.126050,0.142857,0.0,...,3,0,0,1,95,15,0,12,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,0.0,0.774194,0.000000,0.0,0.129032,0.096774,0.0,...,3,0,0,1,23,3,0,6,0,0
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,0.0,0.774194,0.000000,0.0,0.129032,0.096774,0.0,...,3,0,0,1,23,3,0,6,0,0
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,0.0,0.741935,0.000000,0.0,0.129032,0.129032,0.0,...,3,0,0,1,22,4,0,6,0,0
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,0.0,0.741935,0.000000,0.0,0.129032,0.129032,0.0,...,3,0,0,1,24,3,0,6,0,0


In [61]:
#Overall Formal charge

In [None]:
df_train = pd.read_csv('data/Train.csv')
df_train = df_train[['ID','SMILES','Permeability']]
df_train['Overall_Formal_Charge'] = df_train['SMILES'].apply(calculate_overall_formal_charge)
df_train.to_csv('Atomic_features/Train_formal_charge.csv', index=False)
df_train

Unnamed: 0,ID,SMILES,Permeability,Overall_Formal_Charge
0,915,CC[C@H](C)[C@H](NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2...,-7.0,163
1,888,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,-7.0,167
2,593,C/N=C(\NC)NCCC[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C...,-7.0,156
3,916,CC[C@H](C)[C@H](NC(=O)[C@@H](NC(=O)[C@@H]1CC(=...,-7.0,155
4,900,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-7.0,153
...,...,...,...,...
5563,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.7,35
5564,2467,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-5.6,35
5565,2512,CC(C)C[C@H]1NC(=O)[C@@H](C)NCCCCCCNC(=O)[C@H](...,-5.5,33
5566,2511,CC(C)[C@@H]1NC(=O)[C@@H](CO)NC(=O)[C@@H](C)NCC...,-6.3,33


In [None]:
df_test = pd.read_csv('data/Test.csv')
df_test = df_test[['ID','SMILES','Permeability']]
df_test['Overall_Formal_Charge'] = df_test['SMILES'].apply(calculate_overall_formal_charge)
df_test.to_csv('Atomic_features/Test_formal_charge.csv', index=False)
df_test

Unnamed: 0,ID,SMILES,Permeability,Overall_Formal_Charge
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,166
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,157
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,159
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,157
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,156
...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,35
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,35
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,40
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,39


In [64]:
#Aromatic and ring

In [None]:
df_train = pd.read_csv('data/Train.csv')
df_train = df_train[['ID','SMILES','Permeability']]
df_train[['Is_Aromatic', 'Is_In_Ring']] = df_train['SMILES'].apply(check_aromatic_and_ring).apply(pd.Series)
df_train.to_csv('Atomic_features/Train_is_ring_is_aromatic.csv', index=False)
df_train

Unnamed: 0,ID,SMILES,Permeability,Is_Aromatic,Is_In_Ring
0,915,CC[C@H](C)[C@H](NC(=O)[C@@H]1CC(=O)N[C@@H](Cc2...,-7.0,1,1
1,888,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1cccc(Cl)c1)N(C)...,-7.0,1,1
2,593,C/N=C(\NC)NCCC[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C...,-7.0,1,1
3,916,CC[C@H](C)[C@H](NC(=O)[C@@H](NC(=O)[C@@H]1CC(=...,-7.0,1,1
4,900,CC[C@H](C)[C@H](NC(=O)[C@H](Cc1ccccc1)N(C)C(=O...,-7.0,1,1
...,...,...,...,...,...
5563,2469,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-4.7,1,1
5564,2467,CC(C)C[C@@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2cccc...,-5.6,1,1
5565,2512,CC(C)C[C@H]1NC(=O)[C@@H](C)NCCCCCCNC(=O)[C@H](...,-5.5,0,1
5566,2511,CC(C)[C@@H]1NC(=O)[C@@H](CO)NC(=O)[C@@H](C)NCC...,-6.3,0,1


In [None]:
df_test = pd.read_csv('data/Test.csv')
df_test = df_test[['ID','SMILES','Permeability']]
df_test[['Is_Aromatic', 'Is_In_Ring']] = df_test['SMILES'].apply(check_aromatic_and_ring).apply(pd.Series)
df_test.to_csv('Atomic_features/Test_is_ring_is_aromatic.csv', index=False)
df_test

Unnamed: 0,ID,SMILES,Permeability,Is_Aromatic,Is_In_Ring
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,1,1
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,1,1
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,1,1
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,1,1
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,1,1
...,...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,1,1
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,1,1
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,1,1
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,1,1


In [None]:
df1 = pd.read_csv("Atomic_features/Train_Atomic_comp_atomic_degree_bonds_type.csv")
df2 = pd.read_csv("Atomic_features/Train_formal_charge.csv")
df3 = pd.read_csv("Atomic_features/Train_is_ring_is_aromatic.csv")

merged_df = pd.merge(df1, df2, on=['ID', 'SMILES', 'Permeability'], how='inner')

# Merge the result with df3
df_train = pd.merge(merged_df, df3, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_train.to_csv('Atomic_features/Train_all_atomic_desc.csv', index=False)

In [None]:
df1 = pd.read_csv("Atomic_features/Test_Atomic_comp_atomic_degree_bonds_type.csv")
df2 = pd.read_csv("Atomic_features/Test_formal_charge.csv")
df3 = pd.read_csv("Atomic_features/Test_is_ring_is_aromatic.csv")

merged_df = pd.merge(df1, df2, on=['ID', 'SMILES', 'Permeability'], how='inner')

# Merge the result with df3
df_test = pd.merge(merged_df, df3, on=['ID', 'SMILES', 'Permeability'], how='inner')
df_test.to_csv('Atomic_features/Test_all_atomic_desc.csv', index=False)
df_test

Unnamed: 0,ID,SMILES,Permeability,Br,C,Cl,F,N,O,S,...,Degree_O,Single,Double,Triple,Aromatic,Conjugated,No-bond,Overall_Formal_Charge,Is_Aromatic,Is_In_Ring
0,908,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-7.00,0.0,0.746032,0.007937,0.0,0.119048,0.126984,0.0,...,1,97,15,0,18,0,0,166,1,1
1,923,CC[C@H](C)[C@@H]1NC(=O)[C@H](C)N(C)C(=O)[C@H](...,-7.00,0.0,0.739837,0.000000,0.0,0.121951,0.138211,0.0,...,2,99,15,0,12,0,0,157,1,1
2,897,CC[C@H](C)[C@@H]1NC(=O)[C@H](Cc2ccccc2)N(C)C(=...,-7.00,0.0,0.745902,0.000000,0.0,0.122951,0.131148,0.0,...,1,93,15,0,18,0,0,159,1,1
3,587,CC(C)C[C@@H]1NC(=O)[C@H](Cc2c[nH]cn2)NC(=O)[C@...,-6.74,0.0,0.754098,0.000000,0.0,0.131148,0.114754,0.0,...,1,85,14,0,29,0,0,157,1,1
4,921,CC[C@H](C)[C@@H]1NC(=O)[C@H](CC(C)C)N(C)C(=O)[...,-5.54,0.0,0.731092,0.000000,0.0,0.126050,0.142857,0.0,...,1,95,15,0,12,0,0,156,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1387,2481,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc...,-4.50,0.0,0.774194,0.000000,0.0,0.129032,0.096774,0.0,...,1,23,3,0,6,0,0,35,1,1
1388,2485,CC(C)C[C@H]1NC(=O)[C@H](C)NC(=O)[C@@H](Cc2cccc...,-4.80,0.0,0.774194,0.000000,0.0,0.129032,0.096774,0.0,...,1,23,3,0,6,0,0,35,1,1
1389,5604,CC(C)CN1CC(=O)N[C@@H](Cc2ccccc2)C(=O)NCCCCC(=O...,-6.38,0.0,0.741935,0.000000,0.0,0.129032,0.129032,0.0,...,1,22,4,0,6,0,0,40,1,1
1390,2513,C[C@H]1NCCCCCCNC(=O)[C@H](Cc2ccc(O)cc2)NC(=O)[...,-7.80,0.0,0.741935,0.000000,0.0,0.129032,0.129032,0.0,...,1,24,3,0,6,0,0,39,1,1
