## Calculate compound level GCM scores
The score quantifies how much each compound of a GCM cluster matches the cluster assay enrichment profile.
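The score has two contributions:
* rscore mean in the enriched assays in the enriched direction - how much activity is observed in the desired profile
* mean absolute rscore over all qualified assays - counter measure to deprioritize broadly unselective compounds

The compound profile score is the ratio of the first contribution to the second. In addition, a compound is only considered active if at least one enriched assay has an rscore in the enriched direction (rscore × activity direction) > 3; otherwise it is considered inactive. Within each cluster, active compounds are ranked ahead of inactive ones.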


In [None]:
import sqlite3 
import pandas as pd
import glob
import os

In [None]:
# Open the SQLite database that holds the assay and GCM cluster tables.
conn = sqlite3.connect(database='../pubchem_gcm.db')

### Calculate compound scores

In [None]:
# Score every compound against the assay-enrichment profile of its GCM cluster.
#
# All intermediate results are keyed by (cid, gcm_cluster) -- the same key the
# gcm_cpds_scores table uses -- so that a compound listed in more than one
# cluster is scored against each cluster's own profile instead of receiving a
# single value blended across clusters.
#
#   rscore_mean_all      mean |rscore| over the cluster's profile-qualified assays
#   rscore_mean_enriched mean and max of rscore * act_dir over the qualified
#                        assays with adj_p_val < 0.1
#   profile_scores       cpd_profile_score = enriched mean / overall mean
#                        (enriched mean counts as 0 when the compound has no
#                        enriched-assay data); rscore_active = 1.0 when the max
#                        directional rscore exceeds 3, else 0.0
#
# The final SELECT ranks compounds inside each cluster: active compounds first,
# then by descending profile score, with cid as a tie-breaker so that the rank
# is reproducible between runs.
df = pd.read_sql("""
    WITH rscore_mean_all AS (
        SELECT g.cid, g.gcm_cluster, AVG(ABS(a.rscore)) AS rscore_mean_all
        FROM assays a
        JOIN gcm_clusters g ON (a.cid=g.cid)
        JOIN gcm_cluster_assay_stat s ON (g.gcm_cluster=s.gcm_cluster AND a.AID = s.AID)
        WHERE s.assay_qualified_for_profile = 'yes'
        --AND g.gcm_cluster IN (523, 26610)
        GROUP BY g.cid, g.gcm_cluster
    ),
    rscore_mean_enriched AS (
        SELECT g.cid, g.gcm_cluster, AVG(a.rscore * s.act_dir) AS rscore_mean_enriched,
            MAX(a.rscore * s.act_dir) AS rscore_x_dir_max_enriched
        FROM assays a
        JOIN gcm_clusters g ON (a.cid=g.cid)
        JOIN gcm_cluster_assay_stat s ON (g.gcm_cluster=s.gcm_cluster AND a.AID = s.AID)
        WHERE s.assay_qualified_for_profile = 'yes'
        AND s.adj_p_val < 0.1
        --AND g.gcm_cluster IN (523, 26610)
        GROUP BY g.cid, g.gcm_cluster
    ),
    profile_scores AS (
        SELECT a.cid, a.gcm_cluster, a.rscore_mean_all,
           COALESCE(e.rscore_mean_enriched,0) AS rscore_mean_enriched,
           COALESCE(e.rscore_mean_enriched,0) / a.rscore_mean_all AS cpd_profile_score,
           e.rscore_x_dir_max_enriched,
           CASE WHEN e.rscore_x_dir_max_enriched > 3 THEN 1.0 ELSE 0.0 END AS rscore_active
        FROM rscore_mean_all a
        LEFT JOIN rscore_mean_enriched e ON (a.cid=e.cid AND a.gcm_cluster=e.gcm_cluster)
    )
    SELECT g.*, p.rscore_mean_all, p.rscore_mean_enriched, p.cpd_profile_score,
           ROW_NUMBER() OVER (
               PARTITION BY g.gcm_cluster
               ORDER BY p.rscore_active DESC, p.cpd_profile_score DESC, g.cid
           ) AS cpd_profile_score_rank,
           p.rscore_x_dir_max_enriched, p.rscore_active
    FROM gcm_clusters g
    JOIN profile_scores p ON (g.cid=p.cid AND g.gcm_cluster=p.gcm_cluster)
    --WHERE g.gcm_cluster IN (523, 26610)

;""", conn)

In [None]:
# Save the query result as a CSV file, leaving out the DataFrame index column.
df.to_csv(path_or_buf='gcm_cpd_profile_scores.csv', index=False)

In [None]:
# Load the compound scores from the CSV file into `df`.
df = pd.read_csv(filepath_or_buffer='gcm_cpd_profile_scores.csv')

In [None]:
# Display the (rows, columns) dimensions of the score table.
df.shape

In [None]:
# Recreate the gcm_cpds_scores table from scratch: remove any earlier version,
# then define the schema explicitly, keyed by (cid, gcm_cluster).
_drop_scores_sql = '''DROP TABLE IF EXISTS gcm_cpds_scores;'''
_create_scores_sql = '''
CREATE TABLE gcm_cpds_scores(
        inchi_key TEXT,
        gcm_cluster INT,
        cluster_size INT,
        cid INT,
        smiles TEXT,
        rscore_mean_all REAL,
        rscore_mean_enriched REAL,
        cpd_profile_score REAL,
        cpd_profile_score_rank INT,
        rscore_x_dir_max_enriched REAL,
        rscore_active REAL,
        PRIMARY KEY(cid, gcm_cluster)
         );
         '''

conn.execute(_drop_scores_sql)

conn.execute(_create_scores_sql)

In [None]:
# Append the scored rows to the existing gcm_cpds_scores table; the DataFrame
# index is not written as a column.
df.to_sql(name='gcm_cpds_scores', con=conn, if_exists='append', index=False)

In [None]:
# Index the cluster column so that per-cluster lookups on gcm_cpds_scores can
# use it. IF NOT EXISTS keeps this cell safe to re-run on its own; without it a
# second execution fails because the index already exists.
conn.execute('''CREATE INDEX IF NOT EXISTS gcm_cpds_scores_gcm_cluster_index ON gcm_cpds_scores (gcm_cluster);''')

In [None]:
# Close the SQLite database connection now that all tables are written.
conn.close()