# Data retrieval and cleaning

First, we import the libraries used throughout this notebook.

In [6]:
import pandas as pd
import numpy as np
from google.cloud import storage
import os
from pathlib import Path
from tqdm.std import tqdm
import sqlite3

## Retrieval and Storage of Basic files from Google Cloud Storage

In [7]:
# Name of the Google Cloud Storage bucket the plate files are downloaded from.
BUCKET_NAME = "raw_profiles"

# Identifier of the plate processed in this notebook.
PLATE_NUMBER = "24277"

# Local cache directory: a hidden folder inside the current user's home.
LOCAL_DATA_PATH = os.path.join(
    os.path.expanduser("~"),
    ".morpho_minds_data",
)

In [8]:
def create_folder_structure(plate_number):
    """
    Make sure the local folder structure for a plate exists.

    Creates ``LOCAL_DATA_PATH/<plate_number>/raw`` and
    ``LOCAL_DATA_PATH/<plate_number>/processed``, including any missing
    parent folders. Folders that already exist are left untouched.

    :param plate_number: Plate identifier used as the folder name.
    """
    plate_dir = Path(LOCAL_DATA_PATH).joinpath(plate_number)
    # exist_ok=True replaces the check-then-create sequence: it creates the
    # missing parents in one call and cannot fail if the folder appears
    # between the check and the creation.
    for subfolder in ('raw', 'processed'):
        os.makedirs(plate_dir.joinpath(subfolder), exist_ok=True)

In [9]:
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """
    Download a file (blob) from GCS to a local path, showing a progress bar.

    The function is generic; in this notebook it retrieves the CSV files and
    the SQLite DB of a plate.

    The data is first written to ``<destination_file_name>.part`` and only
    moved to ``destination_file_name`` once the download has finished, so an
    interrupted or failed download never leaves a truncated file behind that
    a later "does the file exist locally?" check would mistake for valid data.

    :param bucket_name: The name of the bucket
    :param source_blob_name: The name of the blob
    :param destination_file_name: The name of the file to save the blob to
    """
    # Initialize a client
    storage_client = storage.Client()
    # Get the bucket
    bucket = storage_client.bucket(bucket_name)
    # Get the blob. get_blob() fetches the blob metadata, which is what
    # populates blob.size; bucket.blob() only builds a local handle whose
    # size is None, leaving the progress bar without a total.
    blob = bucket.get_blob(source_blob_name)
    if blob is None:
        # Blob not found: fall back to a bare handle so the download call
        # below raises the same error it raised before this lookup was added.
        blob = bucket.blob(source_blob_name)

    partial_file_name = f'{destination_file_name}.part'
    try:
        # Download the blob to a temporary file next to the destination
        with open(partial_file_name, 'wb') as f:
            with tqdm.wrapattr(f, "write", total=blob.size) as file_obj:
                storage_client.download_blob_to_file(blob, file_obj)
        # Publish the finished file under its final name.
        os.replace(partial_file_name, destination_file_name)
    except BaseException:
        # Remove the incomplete file, then let the original error propagate.
        if os.path.exists(partial_file_name):
            os.remove(partial_file_name)
        raise


## Merge of Dataframes into Useful Data

In [25]:
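# Channel suffix of the `Image_URL_Orig*` columns in the SQLite `Image` table
# == column name used for that channel in `images_df`
# AGP == Ph-golgi
# DNA == Hoechst
# ER == ERSyto
# Mito == Mito
# RNA == ERSytoBleed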

In [176]:
class Plate:
    """
    Data of a single plate: chemical annotations, image table and well profiles.

    Typical use is ``Plate('24277').load()`` followed by ``merge_data()``,
    after which ``merged_df`` holds one row per image set with the well,
    photo number and compound information attached.
    """

    def __init__(self, plate_number=None, chem_df=None, images_df=None, well_df=None, plate_df=None):
        """
        :param plate_number: Plate identifier; used for local folders and remote blob names.
        :param chem_df: Chemical annotations dataframe (filled by ``load``).
        :param images_df: Image URLs/paths and cell counts dataframe (filled by ``load``).
        :param well_df: Mean well profiles dataframe (filled by ``load``).
        :param plate_df: Stored as given; not used by the methods of this class.
        """
        self.plate_number = plate_number
        self.chem_df = chem_df
        self.images_df = images_df
        self.well_df = well_df
        self.plate_df = plate_df
        # Filled by merge_data(); defined here so the attribute always exists.
        self.merged_df = None

    def _fetch(self, remote_name, local_parts, local_msg, remote_msg):
        """
        Return the local path of a plate file, downloading it first if missing.

        :param remote_name: File name inside the ``<plate_number>/`` prefix of the bucket.
        :param local_parts: Path components below ``LOCAL_DATA_PATH/<plate_number>``.
        :param local_msg: Message printed when the local copy is used.
        :param remote_msg: Message printed before downloading from the bucket.
        """
        local_path = Path(LOCAL_DATA_PATH).joinpath(self.plate_number, *local_parts)
        if local_path.is_file():
            print(local_msg)
        else:
            print(remote_msg)
            download_blob(BUCKET_NAME, f'{self.plate_number}/{remote_name}', local_path)
        return local_path

    @staticmethod
    def _file_name(location):
        """Return the last path component of a URL string or of a ``Path``."""
        return os.path.basename(str(location))

    def load(self):
        """
        Load all the plate data into different dataframes.

        Each source file is read from the local cache when present and
        downloaded from the bucket otherwise. Sets ``chem_df``, ``images_df``
        and ``well_df``.

        :return: ``self``, so calls can be chained.
        """
        # Check for folder structure and create it when needed.
        create_folder_structure(self.plate_number)

        ## Check that chemical_annotations.csv exists locally. If not, download it.
        chem_path = self._fetch(
            'chemical_annotations.csv',
            ('chemical_annotations.csv',),
            'Loading Chemical Annotations from local CSV...',
            'Loading Chemical Annotations from remote server...',
        )
        chem_df = pd.read_csv(chem_path)

        ## Check that sqlite db exists locally. If not, download it.
        db_path = self._fetch(
            f'{self.plate_number}.sqlite',
            ('raw', f'{self.plate_number}.sqlite'),
            'Loading SQLite DB from local DB...',
            'Loading SQLite DB from remote DB...',
        )
        query = """
                SELECT Image_URL_OrigAGP, Image_URL_OrigDNA, Image_URL_OrigER, Image_URL_OrigMito, Image_URL_OrigRNA, Image_Count_Cells
                FROM Image
                """
        conn = sqlite3.connect(db_path)
        try:
            data = conn.execute(query).fetchall()
        finally:
            # Close the connection even when the query fails.
            conn.close()
        # Column order matches the SELECT above (AGP, DNA, ER, Mito, RNA, cell count).
        images_df = pd.DataFrame(data, columns=['Ph-golgi', 'Hoechst', 'ERSyto', 'Mito', 'ERSytoBleed', 'CellCount'])

        ## Check that mean_well_profiles.csv exists. If not, download it.
        well_path = self._fetch(
            'mean_well_profiles.csv',
            ('raw', 'mean_well_profiles.csv'),
            'Loading Well Profiles from local CSV...',
            'Loading Well Profiles from remote server...',
        )
        well_df = pd.read_csv(well_path)

        self.chem_df = chem_df
        self.images_df = images_df
        self.well_df = well_df

        print('✅ Data loaded successfully.')

        return self

    def merge_data(self):
        """
        Merge images, wells and chemical annotations into ``merged_df``.

        Steps:
        - extract the well and the photo number from the picture file names
          (second and third ``_``-separated fields of the file name);
        - replace the picture locations in ``images_df`` (in place) with local
          paths under ``LOCAL_DATA_PATH/<plate_number>/Raw_pictures``;
        - attach the well metadata and the compound annotations.

        The method can be called again on the same instance: file names are
        taken from the last path component whether the cells hold the original
        URL strings or the already converted ``Path`` objects.
        """
        # File name of every picture, one column per channel.
        file_names = self.images_df.drop(columns=['CellCount']).apply(
            lambda col: col.map(self._file_name)
        )

        print('Extracting well from picture file name...')
        wells_df = file_names.apply(lambda col: col.map(lambda name: name.split('_')[1]))
        # All channels of a row should agree on the well; 0 marks a disagreement.
        wells_df['well'] = wells_df.apply(lambda row: row.unique()[0] if row.nunique() == 1 else 0, axis=1)

        print('Extracting photo id from picture file name...')
        photo_number_df = file_names.apply(lambda col: col.map(lambda name: name.split('_')[2]))
        # Drop the one-character prefix and keep everything after it, so that
        # numbers with more than one digit are not truncated.
        photo_number_df['photo_number'] = photo_number_df.apply(
            lambda row: row.unique()[0][1:] if row.nunique() == 1 else float('NaN'), axis=1
        )

        print('Converting photo path for training...')
        # images_df column -> suffix of the per-channel picture folder.
        channel_folders = {
            'Ph-golgi': 'Ph_golgi',
            'Hoechst': 'Hoechst',
            'ERSyto': 'ERSyto',
            'Mito': 'Mito',
            'ERSytoBleed': 'ERSytoBleed',
        }
        # Use this instance's plate number, not the notebook-level constant,
        # so the paths are right for whichever plate the object represents.
        pictures_dir = Path(LOCAL_DATA_PATH).joinpath(self.plate_number, 'Raw_pictures')
        for column, folder in channel_folders.items():
            channel_dir = pictures_dir.joinpath(f'{self.plate_number}-{folder}')
            self.images_df[column] = file_names[column].apply(
                lambda name, channel_dir=channel_dir: channel_dir.joinpath(name)
            )

        print('Concatenating...')
        self.merged_df = pd.concat(
            [
                self.images_df,
                wells_df['well'],
                photo_number_df['photo_number'],
            ],
            axis=1,
        )

        print('Identifying drugs used per well...')

        chem_cols = ['BROAD_ID', 'CPD_NAME', 'CPD_NAME_TYPE', 'SOURCE_NAME', 'CPD_SMILES']
        well_cols = ['Metadata_Well', 'Metadata_ASSAY_WELL_ROLE', 'Metadata_broad_sample', 'Metadata_mmoles_per_liter']

        # Rename the key columns so the tables share 'Drug_id' and 'well'.
        chem_df = self.chem_df[chem_cols].rename(columns={'BROAD_ID': 'Drug_id'})
        well_df = self.well_df[well_cols].rename(columns={'Metadata_broad_sample': 'Drug_id', 'Metadata_Well': 'well'})

        # Join keys are spelled out; they are the only columns the tables share.
        well_with_chem = well_df.merge(chem_df, how='left', on='Drug_id')
        self.merged_df = self.merged_df.merge(well_with_chem, how='left', on='well')

        print('✅ Data Merged')

    def save(self):
        """Placeholder: saving is not implemented yet."""
        pass
            
# Build the plate object, load its source tables, then merge them.
plate = Plate('24277')
plate.load()
plate.merge_data()

# Expose the individual tables as notebook-level variables for inspection.
images_df, chem_df, well_df = plate.images_df, plate.chem_df, plate.well_df
merge_df = plate.merged_df

Loading Chemical Annotations from local CSV...
Loading SQLite DB from local DB...
Loading Well Profiles from local CSV...
✅ Data loaded successfully.
Extracting well from picture file name...
Extracting photo id from picture file name...
Converting photo path for training...
Concatenating...
Identifying drugs used per well...
✅ Data Merged


In [177]:
# Display the merged table built by plate.merge_data().
plate.merged_df

Unnamed: 0,Ph-golgi,Hoechst,ERSyto,Mito,ERSytoBleed,CellCount,well,photo_number,Metadata_ASSAY_WELL_ROLE,Drug_id,Metadata_mmoles_per_liter,CPD_NAME,CPD_NAME_TYPE,SOURCE_NAME,CPD_SMILES
0,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,91.0,a01,1,treated,BRD-K18250272-003-03-7,3.022516,propoxycaine,INN,Prestwick Chemical Inc.,CCCOc1cc(N)ccc1C(=O)OCCN(CC)CC
1,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,89.0,a01,2,treated,BRD-K18250272-003-03-7,3.022516,propoxycaine,INN,Prestwick Chemical Inc.,CCCOc1cc(N)ccc1C(=O)OCCN(CC)CC
2,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,59.0,a01,3,treated,BRD-K18250272-003-03-7,3.022516,propoxycaine,INN,Prestwick Chemical Inc.,CCCOc1cc(N)ccc1C(=O)OCCN(CC)CC
3,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,74.0,a01,4,treated,BRD-K18250272-003-03-7,3.022516,propoxycaine,INN,Prestwick Chemical Inc.,CCCOc1cc(N)ccc1C(=O)OCCN(CC)CC
4,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,87.0,a01,5,treated,BRD-K18250272-003-03-7,3.022516,propoxycaine,INN,Prestwick Chemical Inc.,CCCOc1cc(N)ccc1C(=O)OCCN(CC)CC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2292,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,56.0,p24,2,treated,BRD-K40742111-001-02-6,5.000000,baeomycesic acid,common,MicroSource Discovery Systems Inc.,COc1cc(C)c(C(=O)Oc2cc(C)c(O)c(C(O)=O)c2C)c(O)c...
2293,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,121.0,p24,3,treated,BRD-K40742111-001-02-6,5.000000,baeomycesic acid,common,MicroSource Discovery Systems Inc.,COc1cc(C)c(C(=O)Oc2cc(C)c(O)c(C(O)=O)c2C)c(O)c...
2294,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,59.0,p24,4,treated,BRD-K40742111-001-02-6,5.000000,baeomycesic acid,common,MicroSource Discovery Systems Inc.,COc1cc(C)c(C(=O)Oc2cc(C)c(O)c(C(O)=O)c2C)c(O)c...
2295,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,/Users/pepe/.morpho_minds_data/24277/Raw_pictu...,68.0,p24,5,treated,BRD-K40742111-001-02-6,5.000000,baeomycesic acid,common,MicroSource Discovery Systems Inc.,COc1cc(C)c(C(=O)Oc2cc(C)c(O)c(C(O)=O)c2C)c(O)c...


In [161]:
# Columns kept from the chemical annotations and from the well profiles.
chem_cols = ['BROAD_ID', 'CPD_NAME', 'CPD_NAME_TYPE', 'SOURCE_NAME', 'CPD_SMILES']
well_cols = ['Metadata_Well', 'Metadata_ASSAY_WELL_ROLE', 'Metadata_broad_sample', 'Metadata_mmoles_per_liter']

# Rename the key columns so the tables share 'Drug_id' and 'well'.
# NOTE: chem_df and well_df are rebound here, so this cell cannot be re-run
# without first restoring the original tables.
chem_df = chem_df.loc[:, chem_cols].rename(columns={'BROAD_ID': 'Drug_id'})
well_df = well_df.loc[:, well_cols].rename(
    columns={'Metadata_broad_sample': 'Drug_id', 'Metadata_Well': 'well'}
)

# Left joins on whatever columns the tables have in common.
well_with_chem = well_df.merge(chem_df, how='left')
merge_df.merge(well_with_chem, how='left')

In [139]:
# Display the chemical annotations table.
chem_df

Unnamed: 0,BROAD_ID,CPD_NAME,CPD_NAME_TYPE,SOURCE_NAME,CPD_SMILES
0,BRD-A56675431-001-04-0,altizide,INN,Prestwick Chemical Inc.,NS(=O)(=O)c1cc2c(NC(CSCC=C)NS2(=O)=O)cc1Cl
1,BRD-A51829654-001-01-4,"BRL-15,572",common,Biomol International Inc.,OC(CN1CCN(CC1)c1cccc(Cl)c1)C(c1ccccc1)c1ccccc1
2,BRD-K04046242-001-03-6,equilin,primary-common,Prestwick Chemical Inc.,C[C@]12CC[C@H]3C(=CCc4cc(O)ccc34)[C@@H]1CCC2=O
3,BRD-K16508793-001-01-8,diazepam,INN,MicroSource Discovery Systems Inc.,CN1c2ccc(Cl)cc2C(=NCC1=O)c1ccccc1
4,BRD-K09397065-001-01-6,SR 57227A,to-be-curated,Biomol International Inc.,NC1CCN(CC1)c1cccc(Cl)n1
...,...,...,...,...,...
30611,BRD-K47092271-001-01-7,BRD-K47092271,BROAD_CPD_ID,Broad Institute of MIT and Harvard,COCC(=O)Nc1ccc2O[C@@H]3[C@@H](C[C@H](CC(=O)NCC...
30612,BRD-K30358777-001-01-0,BRD-K30358777,BROAD_CPD_ID,Broad Institute of MIT and Harvard,COc1ccc(CNC(=O)C[C@@H]2C[C@H]3[C@H](Oc4ccc(NC(...
30613,BRD-K32423836-001-01-9,BRD-K32423836,BROAD_CPD_ID,Broad Institute of MIT and Harvard,COCCNC(=O)C[C@H]1C[C@@H]2[C@@H](Oc3ccc(NC(=O)C...
30614,BRD-K28250273-001-01-2,BRD-K28250273,BROAD_CPD_ID,Broad Institute of MIT and Harvard,OC[C@H]1[C@H]([C@H](C#N)N1C(=O)Nc1cccc(F)c1)c1...
