# Data retrieval and cleaning

First, we import the libraries used throughout this notebook.

In [90]:
import pandas as pd
import numpy as np
from google.cloud import storage
import os
from pathlib import Path
from tqdm.std import tqdm
import sqlite3

## Retrieval and Storage of Basic Files from Google Cloud Storage

In [67]:
# GCS bucket that the download helpers in this notebook read from.
BUCKET_NAME = 'raw_profiles'

# Identifier of the plate; used both as the remote folder name and as the
# local sub-folder name.
PLATE_NUMBER = '24277'

# Local cache root: a hidden folder directly under the user's home directory.
_HOME_DIR = os.path.expanduser('~')
LOCAL_DATA_PATH = os.path.join(_HOME_DIR, ".morpho_minds_data")

In [78]:
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """
    Download one object ("blob") from Google Cloud Storage to a local file.

    The helper is generic; in this notebook it fetches the plate CSV files
    and the SQLite DB.

    Args:
        bucket_name: Name of the GCS bucket to read from.
        source_blob_name: Path of the object inside the bucket.
        destination_file_name: Local path (str or Path) to write the object to.

    Errors raised by the storage client propagate to the caller. In that case
    destination_file_name is neither created nor modified, so a failed
    download can never be mistaken for a valid cached file later on.
    """
    # Initialize a client
    storage_client = storage.Client()

    # Get the bucket
    bucket = storage_client.bucket(bucket_name)

    # Get the blob
    blob = bucket.blob(source_blob_name)

    # bucket.blob() only builds a local reference: its metadata (including
    # size) is empty until it is loaded from the server. Without this call the
    # progress bar below would get total=None.
    blob.reload()

    # Download into a sibling temporary file and move it into place only after
    # the transfer finished, so an interrupted download leaves no partial file
    # at the destination.
    partial_path = os.fspath(destination_file_name) + '.part'
    try:
        with open(partial_path, 'wb') as f:
            with tqdm.wrapattr(f, "write", total=blob.size) as file_obj:
                storage_client.download_blob_to_file(blob, file_obj)
        os.replace(partial_path, destination_file_name)
    finally:
        # After a successful os.replace the partial file no longer exists;
        # after a failure it is removed here.
        if os.path.exists(partial_path):
            os.remove(partial_path)

In [98]:
def create_folder_structure(local_data_path=None, plate_number=None):
    """
    Create the local data folders for a plate if they do not exist yet.

    The resulting layout is:

        <local_data_path>/<plate_number>/raw
        <local_data_path>/<plate_number>/processed

    Calling the function again when the folders already exist is a no-op.

    Args:
        local_data_path: Root folder of the local data cache. Defaults to the
            module-level LOCAL_DATA_PATH.
        plate_number: Plate identifier used as sub-folder name. Defaults to
            the module-level PLATE_NUMBER.
    """
    # Resolve defaults at call time so later changes to the module constants
    # are picked up.
    root = Path(LOCAL_DATA_PATH if local_data_path is None else local_data_path)
    plate = PLATE_NUMBER if plate_number is None else plate_number

    # parents=True creates the root and plate folders as needed; exist_ok=True
    # replaces the separate "exists?" checks, which were redundant and could
    # race with another process creating the same folder.
    for subfolder in ('raw', 'processed'):
        root.joinpath(plate, subfolder).mkdir(parents=True, exist_ok=True)

In [101]:
def _ensure_local_copy(cache_path, blob_name, bucket_name, label, local_kind, remote_kind):
    """
    Make sure cache_path exists locally, downloading blob_name when it does not.

    Prints which source (local or remote) is being used and returns cache_path.
    """
    if cache_path.is_file():
        print(f'Loading {label} from local {local_kind}...')
    else:
        print(f'Loading {label} from remote {remote_kind}...')
        download_blob(bucket_name, blob_name, cache_path)
    return cache_path


def _read_image_urls(sqlite_path):
    """
    Read the five Image_URL_Orig* columns of the Image table into a dataframe.
    """
    query = """
            SELECT Image_URL_OrigAGP, Image_URL_OrigDNA, Image_URL_OrigER, Image_URL_OrigMito, Image_URL_OrigRNA
            FROM Image
            """
    conn = sqlite3.connect(sqlite_path)
    try:
        cursor = conn.execute(query)
        rows = cursor.fetchall()
        column_names = [column[0] for column in cursor.description]
    finally:
        # Using the connection as a context manager would only commit or roll
        # back, not close it, so close explicitly to release the file handle.
        conn.close()
    return pd.DataFrame(rows, columns=column_names)


def load_dataframes(local_data_path=None, plate_number=None, bucket_name=None):
    """
    Load all the raw data needed to work into dataframes.

    Each file is looked up in the local cache first and downloaded from the
    bucket only when it is missing. The cache folders must already exist
    (see create_folder_structure).

    Args:
        local_data_path: Root folder of the local data cache. Defaults to the
            module-level LOCAL_DATA_PATH.
        plate_number: Plate identifier. Defaults to the module-level
            PLATE_NUMBER.
        bucket_name: GCS bucket to download from. Defaults to the module-level
            BUCKET_NAME.

    Returns:
        Tuple (chem_df, images_df, well_df):
            chem_df   - contents of chemical_annotations.csv
            images_df - image URL columns read from the plate's SQLite DB
            well_df   - contents of mean_well_profiles.csv
    """
    root = Path(LOCAL_DATA_PATH if local_data_path is None else local_data_path)
    plate = PLATE_NUMBER if plate_number is None else plate_number
    bucket = BUCKET_NAME if bucket_name is None else bucket_name

    ## Check that chemical_annotations.csv exists locally. If not, download it.
    chem_path = _ensure_local_copy(
        root.joinpath(plate, 'chemical_annotations.csv'),
        f'{plate}/chemical_annotations.csv',
        bucket,
        'Chemical Annotations', 'CSV', 'server',
    )
    chem_df = pd.read_csv(chem_path)

    ## Check that the SQLite DB exists locally. If not, download it.
    sqlite_path = _ensure_local_copy(
        root.joinpath(plate, 'raw', f'{plate}.sqlite'),
        f'{plate}/{plate}.sqlite',
        bucket,
        'SQLite DB', 'DB', 'DB',
    )
    images_df = _read_image_urls(sqlite_path)

    ## Check that mean_well_profiles.csv exists locally. If not, download it.
    well_path = _ensure_local_copy(
        root.joinpath(plate, 'mean_well_profiles.csv'),
        f'{plate}/mean_well_profiles.csv',
        bucket,
        'Well Profiles', 'CSV', 'server',
    )
    well_df = pd.read_csv(well_path)

    return chem_df, images_df, well_df

In [102]:
# Make sure the local cache folders exist before anything is read or downloaded.
create_folder_structure()

# load_dataframes returns (chemical annotations, image URLs, well profiles).
loaded_frames = load_dataframes()
chem_df, images_df, well_df = loaded_frames

print('Datasets Loaded correctly ✅')

Loading Chemical Annotations from local CSV...
Loading SQLite DB from local DB...
Loading Well Profiles from local CSV...
Datasets Loaded correctly ✅


## Merge of Dataframes into Useful Data

In [103]:
# Preview the first rows of the chemical annotations table.
chem_df.head()

Unnamed: 0,BROAD_ID,CPD_NAME,CPD_NAME_TYPE,CPD_SAMPLE_ID,DOS_LIBRARY,SOURCE_NAME,CHEMIST_NAME,VENDOR_CATALOG_ID,CPD_SMILES,USERCOMMENT
0,BRD-A56675431-001-04-0,altizide,INN,SA82748,,Prestwick Chemical Inc.,,Prestw-721,NS(=O)(=O)c1cc2c(NC(CSCC=C)NS2(=O)=O)cc1Cl,
1,BRD-A51829654-001-01-4,"BRL-15,572",common,SA82481,,Biomol International Inc.,,AC-536,OC(CN1CCN(CC1)c1cccc(Cl)c1)C(c1ccccc1)c1ccccc1,
2,BRD-K04046242-001-03-6,equilin,primary-common,SA82922,,Prestwick Chemical Inc.,,Prestw-850,C[C@]12CC[C@H]3C(=CCc4cc(O)ccc34)[C@@H]1CCC2=O,
3,BRD-K16508793-001-01-8,diazepam,INN,SA59660,,MicroSource Discovery Systems Inc.,,1900003,CN1c2ccc(Cl)cc2C(=NCC1=O)c1ccccc1,
4,BRD-K09397065-001-01-6,SR 57227A,to-be-curated,SA82504,,Biomol International Inc.,,AC-561,NC1CCN(CC1)c1cccc(Cl)n1,


In [104]:
# List the column names available in the well profiles table.
well_df.columns

Index(['Metadata_Plate', 'Metadata_Well', 'Metadata_Assay_Plate_Barcode',
       'Metadata_Plate_Map_Name', 'Metadata_well_position',
       'Metadata_ASSAY_WELL_ROLE', 'Metadata_broad_sample',
       'Metadata_mmoles_per_liter', 'Metadata_solvent', 'Metadata_pert_id',
       ...
       'Nuclei_Texture_Variance_DNA_5_0', 'Nuclei_Texture_Variance_ER_10_0',
       'Nuclei_Texture_Variance_ER_3_0', 'Nuclei_Texture_Variance_ER_5_0',
       'Nuclei_Texture_Variance_Mito_10_0', 'Nuclei_Texture_Variance_Mito_3_0',
       'Nuclei_Texture_Variance_Mito_5_0', 'Nuclei_Texture_Variance_RNA_10_0',
       'Nuclei_Texture_Variance_RNA_3_0', 'Nuclei_Texture_Variance_RNA_5_0'],
      dtype='object', length=1800)

In [109]:
# List the column names selected from the Image table of the SQLite DB.
images_df.columns

Index(['Image_URL_OrigAGP', 'Image_URL_OrigDNA', 'Image_URL_OrigER',
       'Image_URL_OrigMito', 'Image_URL_OrigRNA'],
      dtype='object')

In [114]:
# From the first row's AGP image URL, keep the file name (text after the last
# '/') and show its second '_'-separated token.
# NOTE(review): presumably this token is the well identifier - confirm.
first_agp_url = images_df['Image_URL_OrigAGP'].iloc[0]
first_agp_url.rsplit('/', 1)[-1].split('_')[1]

'a01'