In [1]:
import pandas as pd

In [61]:
df = pd.read_csv('bioactive_peptides_all_pages.csv')
df.shape

(5047, 10)

In [62]:
df.columns

Index(['activity', 'ID', 'Name', 'Sequence', 'Chem. mass', 'Monois. mass',
       'Unnamed: 6', 'SMILES', 'InChI', 'InChIKey'],
      dtype='object')

In [63]:
df.loc[df['activity'].str.startswith('Peptide'), 'activity'] = ''
df.shape

(5047, 10)

In [64]:
df['function'] = df['activity'] + ' | ' + df['Name']

In [65]:
# df['activity'].unique().tolist()
# df['function'].unique().tolist()

df['function'].isnull().sum()

320

In [66]:
df.drop_duplicates()
df.shape

(5047, 11)

In [67]:
df.columns

Index(['activity', 'ID', 'Name', 'Sequence', 'Chem. mass', 'Monois. mass',
       'Unnamed: 6', 'SMILES', 'InChI', 'InChIKey', 'function'],
      dtype='object')

In [68]:
df.drop(['Name','activity','ID', 'InChIKey'],axis=1, inplace=True)
df.shape

(5047, 7)

In [69]:
df.columns

Index(['Sequence', 'Chem. mass', 'Monois. mass', 'Unnamed: 6', 'SMILES',
       'InChI', 'function'],
      dtype='object')

In [70]:
df = df.rename(columns={'Unnamed: 6': 'ic50_ec50'})


In [71]:
df.columns

Index(['Sequence', 'Chem. mass', 'Monois. mass', 'ic50_ec50', 'SMILES',
       'InChI', 'function'],
      dtype='object')

In [73]:
df = df.rename(columns={'Unnamed: 6': 'ic50_ec50', 'Chem. mass': 'Cmass', 'Monois. mass': 'Mmass', 'InChI': 'inchi', 'SMILES': 'smiles'})


In [74]:
df.columns

Index(['Sequence', 'Cmass', 'Mmass', 'ic50_ec50', 'smiles', 'inchi',
       'function'],
      dtype='object')

In [75]:
df = df[~df['Sequence'].astype(str).apply(lambda x: x.startswith('<'))]
df.shape

(4993, 7)

In [76]:
print("Sequence = ",len(df['Sequence'].unique()))
print("inchi = ",len(df['inchi'].unique()))
print("EC50 = ",len(df['ic50_ec50'].unique()))
print("smiles = ",len(df['smiles'].unique()))
print("functions = ",len(df['function'].unique()))

Sequence =  4077
inchi =  2753
EC50 =  1196
smiles =  2570
functions =  1188


In [82]:
df['Sequence'].isnull().sum()


1

In [83]:
df = df.dropna(subset=['Sequence'])
df.shape

(4992, 7)

In [95]:
def convert_value(value):
      """Converts a string value like "0.00 EC50" or "0.00 IC50" to a float, negating if it contains "IC50"."""
      try:
            # Extract the numeric value
            split_value = value.split()
            numeric_value = float(split_value[0])
          
            # Negate if the value contains "IC50"
            if "IC50" in value:
                numeric_value = -numeric_value
            
            return numeric_value
      except ValueError:
            # Handle cases where the value cannot be converted to a float
            return None
      
df['ic50_ec50'] = df['ic50_ec50'].apply(convert_value)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ic50_ec50'] = df['ic50_ec50'].apply(convert_value)


In [96]:
df['ic50_ec50'].max(), df['ic50_ec50'].min()

(40000.0, -25800.0)

In [97]:
df.head()

Unnamed: 0,Sequence,Cmass,Mmass,ic50_ec50,smiles,inchi,function
0,NYKKPKL,890.078,889.5369,0.0,N[C@@]([H])(CC(=O)N)C(=O)N[C@@]([H])(Cc1ccc(O)...,1S/C42H71N11O10/c1-25(2)22-33(42(62)63)52-38(5...,regulating | re | regulating cell-permeability...
1,NYKKPKLAAAPALLALLVAPLLAVAA,2601.2117,2599.6143,0.0,N[C@@]([H])(CC(=O)N)C(=O)N[C@@]([H])(Cc1ccc(O)...,1S/C125H214N30O29/c1-62(2)53-86(146-108(166)84...,regulating | re | regulating cell-permeability...
2,AAVALLPAVLLALLAPAAANYKKPKL,2601.2117,2599.6143,0.0,N[C@@]([H])(C)C(=O)N[C@@]([H])(C)C(=O)N[C@@]([...,1S/C125H214N30O29/c1-62(2)53-85(145-113(171)89...,regulating | re | regulating cell-permeability...
3,NYKKPKLAAAAAVALLPAVLLALLAP,2601.2117,2599.6143,0.0,N[C@@]([H])(CC(=O)N)C(=O)N[C@@]([H])(Cc1ccc(O)...,1S/C125H214N30O29/c1-62(2)53-86(146-110(168)84...,regulating | re | regulating cell-permeability...
4,VVYPWTQRF,1195.3651,1194.6167,0.0,,1S/C59H82N14O13/c1-31(2)47(61)54(81)71-48(32(3...,opioid | op | VV-hemorphin-7


In [98]:
df.to_csv('cleaned.csv', index=False) # to be used for testing