## Process and upload PubChem Assay data
This notebook contains the following steps
* read in all selected PubChem assays  
* filter for assays with >10k cids  
* calculate rscores of the activities for better comparison between assays and to call active/inactive compounds


In [None]:
import sqlite3 
import pandas as pd
from scipy import stats
import glob
import os

In [None]:
# Lower bound for the MAD used in the rscore calculation, so that an unrealistically low MAD cannot inflate the rscores.
# With a bound of 10, an activity score at least 30 above the assay median is required to reach an rscore of 3.
mad_lower_bound = 10 

### Assay data folder
This notebook assumes that all PubChem bioassay data was downloaded into one folder as .csv files. All files are read in and processed.


In [None]:
# Folder that the upload loop of this notebook searches for '*.csv' assay files.
folder_with_assay_data = '../data_cell_assays'

### Prepare DB

In [None]:
# Connection to the SQLite database file that holds the assays table.
conn = sqlite3.connect('../pubchem_gcm.db')

In [None]:
# Start from a clean slate: remove any assays table left over from a previous run.
conn.execute('''DROP TABLE IF EXISTS assays;''')

# Create the table: one row per (assay, compound) pair, enforced by the composite primary key.
conn.execute('''
CREATE TABLE assays(
         aid INT,
         cid INT,
         pubchem_activity_outcome TEXT,
         pubchem_activity_score   INT,
         rscore REAL,
         PRIMARY KEY(aid, cid)
         );
         ''')

### Prepare and upload assay data to DB
* filter for assays with > 10k compounds measured
* calculate rscores of activities for better comparability between assays and definition of active compounds

In [None]:
def prepare_and_upload_df(filename, min_compounds=10000):
    """Load one PubChem assay export, compute rscores and append it to the ``assays`` table.

    The file is skipped unless more than ``min_compounds`` rows carry a CID
    (counted before rows without an activity score and duplicates are removed).
    For every CID only the row with the highest PUBCHEM_ACTIVITY_SCORE is kept.
    The rscore is ``(score - median) / MAD`` over the kept rows, where the
    normal-scaled MAD is floored at the module-level ``mad_lower_bound``.

    Parameters
    ----------
    filename : str
        Path to the assay CSV. The AID is the bare file name with its
        ``.concise.csv`` (or plain ``.csv``) suffix removed.
    min_compounds : int, optional
        Number of rows with a CID that an assay must exceed to be uploaded.

    Returns
    -------
    None
        Rows are appended to the ``assays`` table through the module-level
        ``conn``; the number of uploaded rows is printed.
    """
    df = pd.read_csv(filename, low_memory=False, on_bad_lines='skip')
    # some rows in the beginning often hold metadata and have no CID
    df = df[df.PUBCHEM_CID.notna()].copy()
    if df.shape[0] <= min_compounds:
        return

    # Take the AID from the bare file name so the result does not depend on the
    # path separator or on how the folder prefix was spelled.
    aid = os.path.basename(filename)
    for suffix in ('.concise.csv', '.csv'):
        if aid.endswith(suffix):
            aid = aid[:-len(suffix)]
            break
    df['AID'] = aid

    df = (
        df[['AID', 'PUBCHEM_CID', 'PUBCHEM_ACTIVITY_OUTCOME', 'PUBCHEM_ACTIVITY_SCORE']]
        .dropna(subset=['PUBCHEM_CID', 'PUBCHEM_ACTIVITY_SCORE'])
        .drop_duplicates()
        .astype({'PUBCHEM_CID': int})
        .rename(columns={'PUBCHEM_CID': 'cid'})
    )
    if df.empty:
        # Nothing left to upload; also avoids computing a NaN median/MAD.
        return

    # Keep, per CID, the complete row with the highest activity score. Selecting
    # whole rows (instead of groupby().first(), which takes the first non-null
    # value of each column separately) keeps outcome and score from the same
    # measurement; ties resolve to the first row in file order.
    best_rows = df.groupby('cid')['PUBCHEM_ACTIVITY_SCORE'].idxmax()
    df = df.loc[best_rows].reset_index(drop=True)

    scores = df['PUBCHEM_ACTIVITY_SCORE']
    act_median = scores.median()
    act_mad = max(stats.median_abs_deviation(scores, scale='normal', nan_policy='omit'), mad_lower_bound)
    df['rscore'] = (scores - act_median) / act_mad
    df.to_sql('assays', conn, if_exists='append', index=False)
    # Report the rows actually written, i.e. after filtering and de-duplication.
    print("%d compounds added" % df.shape[0])

In [None]:
# glob returns files in arbitrary order; sorting makes the upload order (and thus
# the row order of the table) reproducible between runs.
for filename in sorted(glob.glob(os.path.join(folder_with_assay_data, '*.csv'))):
    prepare_and_upload_df(filename)

In [None]:
# Index the cid column of assays; IF NOT EXISTS lets this cell be re-run without an "index already exists" error.
conn.execute('''CREATE INDEX IF NOT EXISTS assay_cid_index ON assays (cid);''')

### DB stat

In [None]:
# Preview five rows of the uploaded table.
pd.read_sql('select * from assays limit 5', conn)

In [None]:
# Number of distinct assays (AIDs) in the table.
pd.read_sql('select count (distinct a.AID) from assays a', conn)

In [None]:
# Total number of rows in the table.
pd.read_sql('select count (*) from assays ', conn)

### Get distinct CIDS from the uploaded assays for clustering

In [None]:
# All distinct compound IDs that occur in the uploaded assays.
df_cids = pd.read_sql('select distinct a.cid from assays a', conn)

In [None]:
# (number of distinct CIDs, number of columns)
df_cids.shape

In [None]:
# Export the CID list to the current working directory.
# NOTE(review): with the to_csv defaults the DataFrame index is written as an extra unnamed first column — confirm the downstream steps expect it.
df_cids.to_csv('pubchem_cids.csv')

In [None]:
# Close the database connection; conn cannot be used for further queries afterwards.
conn.close()

### Steps outside of this repo
* Added smiles and inchi_keys to cids
* Clustering with chemfp rdkit morgan2 similarity (tanimoto cutoff 0.5) and mcl clustering with perplexity = 1.8 (https://www.micans.org/mcl/)