## Calculate GCM cluster summaries. 
* Summarize the selectivity profile (enriched vs. non-enriched assays) for each chemical cluster
* Decide whether each cluster's assay profile qualifies for GCM

In [None]:
import sqlite3 
import pandas as pd
import glob
import os

In [None]:
# Open the SQLite database file; the resulting `conn` is reused by every
# later cell (summary query, table re-creation, upload) and closed at the end.
db_path = '../pubchem_gcm.db'
conn = sqlite3.connect(db_path)

### GCM cluster summaries 

In [None]:
# Build the per-cluster summary from gcm_cluster_assay_stat.
#
# Only rows with assay_qualified_for_profile = 'yes' are considered, so a
# cluster without any qualified assay does not appear in the result.
# Columns returned, in order:
#   gcm_cluster, cluster_size, max_cpds_tested, num_assays,
#   num_assays_enriched, fract_assays_enriched,
#   num_assays_enriched_up, num_assays_enriched_down, gcm_conclusion
#
# NOTE(review): the base CTE groups by (gcm_cluster, cluster_size,
# max_cpds_tested); this yields one row per cluster only if cluster_size and
# max_cpds_tested are constant within a cluster -- confirm against the
# upstream table.
df = pd.read_sql("""

    WITH gcm_cluster_num_assays AS (
        -- distinct qualified assays per cluster
        SELECT a.gcm_cluster, a.cluster_size, a.max_cpds_tested,
               COUNT(DISTINCT a.aid) AS num_assays
        FROM gcm_cluster_assay_stat a
        WHERE a.assay_qualified_for_profile = 'yes'
        GROUP BY a.gcm_cluster, a.cluster_size, a.max_cpds_tested
    ),
    gcm_cluster_num_assays_enriched AS (
        -- qualified assays counted as enriched (adj_p_val < 0.1), any direction
        SELECT a.gcm_cluster, COUNT(DISTINCT a.aid) AS num_assays_enriched
        FROM gcm_cluster_assay_stat a
        WHERE a.assay_qualified_for_profile = 'yes'
        AND a.adj_p_val < 0.1
        GROUP BY a.gcm_cluster
    ),
    gcm_cluster_num_assays_enriched_up AS (
        -- enriched assays with act_dir = 1
        SELECT a.gcm_cluster, COUNT(DISTINCT a.aid) AS num_assays_enriched_up
        FROM gcm_cluster_assay_stat a
        WHERE a.assay_qualified_for_profile = 'yes'
        AND a.adj_p_val < 0.1
        AND a.act_dir = 1
        GROUP BY a.gcm_cluster
    ),
    gcm_cluster_num_assays_enriched_down AS (
        -- enriched assays with act_dir = -1
        SELECT a.gcm_cluster, COUNT(DISTINCT a.aid) AS num_assays_enriched_down
        FROM gcm_cluster_assay_stat a
        WHERE a.assay_qualified_for_profile = 'yes'
        AND a.adj_p_val < 0.1
        AND a.act_dir = -1
        GROUP BY a.gcm_cluster
    ),
    gcm_cluster_summary AS (
        -- Each joined CTE gets its own alias (e / u / d) so every column
        -- reference is unambiguous. The enriched CTEs only contain clusters
        -- with at least one hit, hence LEFT JOIN + COALESCE(..., 0).
        SELECT t.*,
               COALESCE(e.num_assays_enriched, 0) AS num_assays_enriched,
               -- multiply by 1.0 outside COALESCE so the ratio is always REAL
               1.0 * COALESCE(e.num_assays_enriched, 0) / t.num_assays AS fract_assays_enriched,
               COALESCE(u.num_assays_enriched_up, 0) AS num_assays_enriched_up,
               COALESCE(d.num_assays_enriched_down, 0) AS num_assays_enriched_down
        FROM gcm_cluster_num_assays t
        LEFT JOIN gcm_cluster_num_assays_enriched e USING (gcm_cluster)
        LEFT JOIN gcm_cluster_num_assays_enriched_up u USING (gcm_cluster)
        LEFT JOIN gcm_cluster_num_assays_enriched_down d USING (gcm_cluster)
    )
    SELECT g.*,
            -- conclude if assay profile qualifies for GCM or not:
            -- at least 10 qualified assays, between 1 and 5 of them enriched,
            -- enriched fraction below 20 %, and fewer than 200 compounds tested
            (CASE WHEN
                    (
                    g.num_assays >= 10
                    AND g.num_assays_enriched > 0
                    AND g.fract_assays_enriched < 0.2
                    AND g.num_assays_enriched <= 5
                    AND g.max_cpds_tested < 200
                    )
                    THEN 'yes'
                    ELSE 'no' END) AS gcm_conclusion
    FROM gcm_cluster_summary g


;""", conn)

In [None]:
# Quick sanity check: display (n_rows, n_columns) of the summary query result.
df.shape

In [None]:
# Save the summary next to the notebook as CSV, without the DataFrame index,
# so it can be reloaded later without re-running the query.
df.to_csv(path_or_buf='gcm_cluster_summary.csv', index=False)

In [None]:
# Reload the summary from the CSV file; this replaces whatever `df` currently
# holds, so the upload below works from the on-disk copy.
df = pd.read_csv(filepath_or_buffer='gcm_cluster_summary.csv', low_memory=False)

### Upload cluster summaries

In [None]:
# Recreate the destination table from scratch: drop any previous version, then
# define the schema explicitly (one row per gcm_cluster, enforced by the
# primary key). Column names mirror the columns of the summary DataFrame.
drop_stmt = '''DROP TABLE IF EXISTS gcm_cluster_summary;'''

create_stmt = '''
CREATE TABLE gcm_cluster_summary(
        gcm_cluster INT,
        cluster_size INT,
        max_cpds_tested INT,
        num_assays INT,
        num_assays_enriched INT,
        fract_assays_enriched REAL,
        num_assays_enriched_up INT,
        num_assays_enriched_down INT,
        gcm_conclusion TEXT,
        PRIMARY KEY(gcm_cluster)
         );
         '''

conn.execute(drop_stmt)

conn.execute(create_stmt)

In [None]:
# Insert the summary rows into the existing gcm_cluster_summary table;
# 'append' adds to the table as defined rather than replacing it, and the
# DataFrame index is not written as a column.
df.to_sql(name='gcm_cluster_summary', con=conn, index=False, if_exists='append')

In [None]:
# Release the database connection once the summary table has been uploaded.
conn.close()