# Data retrieval and cleaning

We import basic libraries

In [1]:
import pandas as pd
import numpy as np
from google.cloud import storage
import os
from pathlib import Path
from tqdm.std import tqdm
import sqlite3

## Retrieval and Storage of Basic files from Google Cloud Storage

In [2]:
BUCKET_NAME = 'raw_profiles'
PLATE_NUMBER = '24277'
LOCAL_DATA_PATH = os.path.join(os.path.expanduser('~'), ".morpho_minds_data")

In [3]:
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """
    Download a file from GCS. Is called blob so is generic but will retrieve the SQLite DB.
    """
    # Initialize a client
    storage_client = storage.Client()

    # Get the bucket
    bucket = storage_client.bucket(bucket_name)

    # Get the blob
    blob = bucket.blob(source_blob_name)

    # Download the blob to a destination file
    with open(destination_file_name, 'wb') as f:
        with tqdm.wrapattr(f, "write", total=blob.size) as file_obj:
            storage_client.download_blob_to_file(blob, file_obj)

In [4]:
def create_folder_structure():
    """
    Check for folder structure and create it when needed. 
    """
    
    ## Check if data folders exists. If not, create it.
    if not os.path.exists(LOCAL_DATA_PATH):
        os.makedirs(LOCAL_DATA_PATH)
        os.makedirs(Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'raw'))
        os.makedirs(Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'processed'))

    if not os.path.exists(Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER)):
        os.makedirs(Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER))
        os.makedirs(Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'raw'))
        os.makedirs(Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'processed'))

    if not os.path.exists(Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'raw')):
        os.makedirs(Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'raw'))   

    if not os.path.exists(Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'processed')):
        os.makedirs(Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'processed'))   

In [5]:
def load_dataframes():
    """
    Load all the raw data into dataframes needed to work.
    """
    
    ## Check that file chemical_compounds.csv exists locally. If not, download it.
    data_query_cache_path = Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'chemical_annotations.csv')
    data_query_cached_exists = data_query_cache_path.is_file()

    if data_query_cached_exists:
        print('Loading Chemical Annotations from local CSV...')
        chem_df = pd.read_csv(data_query_cache_path)
    else:
        print('Loading Chemical Annotations from remote server...')
        download_blob(BUCKET_NAME,
                      f'{PLATE_NUMBER}/chemical_annotations.csv',
                      Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'chemical_annotations.csv')
                     )
        chem_df = pd.read_csv(data_query_cache_path)

    ## Check that sqlite db exists locally. If not, download it.
    data_query_cache_path = Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'raw', f'{PLATE_NUMBER}.sqlite')
    data_query_cached_exists = data_query_cache_path.is_file()

    if data_query_cached_exists:
        print('Loading SQLite DB from local DB...')
    else:
        print('Loading SQLite DB from remote DB...')
        download_blob(BUCKET_NAME,
                      f'{PLATE_NUMBER}/{PLATE_NUMBER}.sqlite',
                      Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'raw', f'{PLATE_NUMBER}.sqlite')
                     )

    conn = sqlite3.connect(data_query_cache_path)
    query = """
            SELECT Image_URL_OrigAGP, Image_URL_OrigDNA, Image_URL_OrigER, Image_URL_OrigMito, Image_URL_OrigRNA
            FROM Image
            """
    cursor = conn.execute(query)
    data = cursor.fetchall()
    images_df = pd.DataFrame(data, columns=[column[0] for column in cursor.description])

    ## Check that mean_well_profile.csv exists. If not, download it.
    data_query_cache_path = Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'mean_well_profiles.csv')
    data_query_cached_exists = data_query_cache_path.is_file()

    if data_query_cached_exists:
        print('Loading Well Profiles from local CSV...')
        well_df = pd.read_csv(data_query_cache_path)
    else:
        print('Loading Well Profiles from remote server...')
        download_blob(BUCKET_NAME,
                      f'{PLATE_NUMBER}/mean_well_profiles.csv',
                      Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'mean_well_profiles.csv')
                     )
        well_df = pd.read_csv(data_query_cache_path)
    return chem_df, images_df, well_df

In [6]:
create_folder_structure()

chem_df, images_df, well_df = load_dataframes()

print('Datasets Loaded correctly ✅')

Loading Chemical Annotations from local CSV...
Loading SQLite DB from local DB...
Loading Well Profiles from local CSV...
Datasets Loaded correctly ✅


## Merge of Dataframes into Useful Data

In [None]:
# OrigAGP = Ph_Golgi
# OrigDNA = Hoechst
# OrigER = ERSyto
# OrigMito = Mito
# OrigRNA = ERSytoBleed

In [54]:
images_df_copy = images_df.copy()

In [46]:
images_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2297 entries, 0 to 2296
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Image_URL_OrigAGP   2297 non-null   object
 1   Image_URL_OrigDNA   2297 non-null   object
 2   Image_URL_OrigER    2297 non-null   object
 3   Image_URL_OrigMito  2297 non-null   object
 4   Image_URL_OrigRNA   2297 non-null   object
dtypes: object(5)
memory usage: 89.9+ KB


In [100]:
wells_df = images_df.applymap(lambda x: x.split('/')[-1].split('_')[1])
wells_df.rename(columns={
    'Image_URL_OrigAGP': 'WellAGP', 
    'Image_URL_OrigDNA': 'WellDNA', 
    'Image_URL_OrigER': 'WellER', 
    'Image_URL_OrigMito': 'WellMito', 
    'Image_URL_OrigRNA': 'WellRNA'})

Unnamed: 0,WellAGP,WellDNA,WellER,WellMito,WellRNA
0,a01,a01,a01,a01,a01
1,a01,a01,a01,a01,a01
2,a01,a01,a01,a01,a01
3,a01,a01,a01,a01,a01
4,a01,a01,a01,a01,a01
...,...,...,...,...,...
2292,p24,p24,p24,p24,p24
2293,p24,p24,p24,p24,p24
2294,p24,p24,p24,p24,p24
2295,p24,p24,p24,p24,p24


In [101]:
photo_number_df = images_df.applymap(lambda x: x.split('/')[-1].split('_')[2])
photo_number_df.rename(columns={
    'Image_URL_OrigAGP': 'PhotoNAGP', 
    'Image_URL_OrigDNA': 'PhotoNDNA', 
    'Image_URL_OrigER': 'PhotoNER', 
    'Image_URL_OrigMito': 'PhotoNMito', 
    'Image_URL_OrigRNA': 'PhotoNRNA'})

Unnamed: 0,PhotoNAGP,PhotoNDNA,PhotoNER,PhotoNMito,PhotoNRNA
0,s1,s1,s1,s1,s1
1,s2,s2,s2,s2,s2
2,s3,s3,s3,s3,s3
3,s4,s4,s4,s4,s4
4,s5,s5,s5,s5,s5
...,...,...,...,...,...
2292,s2,s2,s2,s2,s2
2293,s3,s3,s3,s3,s3
2294,s4,s4,s4,s4,s4
2295,s5,s5,s5,s5,s5


In [87]:
pd.concat(
    [
      
    axis = 1)

Unnamed: 0,Image_URL_OrigAGP,Image_URL_OrigDNA,Image_URL_OrigER,Image_URL_OrigMito,Image_URL_OrigRNA,Image_URL_OrigAGP.1,Image_URL_OrigDNA.1,Image_URL_OrigER.1,Image_URL_OrigMito.1,Image_URL_OrigRNA.1,Image_URL_OrigAGP.2,Image_URL_OrigDNA.2,Image_URL_OrigER.2,Image_URL_OrigMito.2,Image_URL_OrigRNA.2
0,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,a01,a01,a01,a01,a01,s1,s1,s1,s1,s1
1,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,a01,a01,a01,a01,a01,s2,s2,s2,s2,s2
2,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,a01,a01,a01,a01,a01,s3,s3,s3,s3,s3
3,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,a01,a01,a01,a01,a01,s4,s4,s4,s4,s4
4,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,a01,a01,a01,a01,a01,s5,s5,s5,s5,s5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2292,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,p24,p24,p24,p24,p24,s2,s2,s2,s2,s2
2293,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,p24,p24,p24,p24,p24,s3,s3,s3,s3,s3
2294,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,p24,p24,p24,p24,p24,s4,s4,s4,s4,s4
2295,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,file:/home/ubuntu/bucket/projects/2015_Bray_Gi...,p24,p24,p24,p24,p24,s5,s5,s5,s5,s5


In [8]:
#well_df.columns

In [9]:
#images_df.columns

In [10]:
#images_df.iloc[0,:]['Image_URL_OrigAGP'].split('/')[-1].split('_')[1]

In [107]:
images_df.iloc[0,:][0]

'file:/home/ubuntu/bucket/projects/2015_Bray_GigaScience/CDRP/images/24277/cdp2bioactives_a01_s1_w4f0fa59df-2025-4b28-b6f1-a41391e7d227.tif'

In [108]:
images_df.iloc[0,:][1]

'file:/home/ubuntu/bucket/projects/2015_Bray_GigaScience/CDRP/images/24277/cdp2bioactives_a01_s1_w127df6541-a9a1-46dc-bcdb-b64b187d422b.tif'