# Data retrieval and cleaning

We import basic libraries

In [6]:
import pandas as pd
import numpy as np
from google.cloud import storage
import os
from pathlib import Path
from tqdm.std import tqdm
import sqlite3

## Retrieval and Storage of Basic files from Google Cloud Storage

In [7]:
BUCKET_NAME = 'raw_profiles'
PLATE_NUMBER = '24277'
LOCAL_DATA_PATH = os.path.join(os.path.expanduser('~'), ".morpho_minds_data")

In [8]:
def create_folder_structure(plate_number):
        """
        Check for folder structure and create it when needed.
        """
        ## Check if data folders exists. If not, create it.
        if not os.path.exists(LOCAL_DATA_PATH):
            os.makedirs(LOCAL_DATA_PATH)
            os.makedirs(Path(LOCAL_DATA_PATH).joinpath(plate_number, 'raw'))
            os.makedirs(Path(LOCAL_DATA_PATH).joinpath(plate_number, 'processed'))

        if not os.path.exists(Path(LOCAL_DATA_PATH).joinpath(plate_number)):
            os.makedirs(Path(LOCAL_DATA_PATH).joinpath(plate_number))
            os.makedirs(Path(LOCAL_DATA_PATH).joinpath(plate_number, 'raw'))
            os.makedirs(Path(LOCAL_DATA_PATH).joinpath(plate_number, 'processed'))

        if not os.path.exists(Path(LOCAL_DATA_PATH).joinpath(plate_number, 'raw')):
            os.makedirs(Path(LOCAL_DATA_PATH).joinpath(plate_number, 'raw'))

        if not os.path.exists(Path(LOCAL_DATA_PATH).joinpath(plate_number, 'processed')):
            os.makedirs(Path(LOCAL_DATA_PATH).joinpath(plate_number, 'processed'))

In [9]:
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """
    Download a file from GCS. Is called blob so is generic but will retrieve the SQLite DB.

    :param bucket_name: The name of the bucket
    :param source_blob_name: The name of the blob
    :param destination_file_name: The name of the file to save the blob to
    """
    # Initialize a client
    storage_client = storage.Client()
    # Get the bucket
    bucket = storage_client.bucket(bucket_name)
    # Get the blob
    blob = bucket.blob(source_blob_name)
    # Download the blob to a destination file
    with open(destination_file_name, 'wb') as f:
        with tqdm.wrapattr(f, "write", total=blob.size) as file_obj:
            storage_client.download_blob_to_file(blob, file_obj)


## Merge of Dataframes into Useful Data

In [93]:
class Plate:
    def __init__(self, plate_number=None, chem_df=None, images_df=None, well_df=None, plate_df=None):
        self.plate_number = plate_number
        self.chem_df = chem_df
        self.images_df = images_df
        self.well_df = well_df
        self.plate_df = plate_df

    def load(self):
        """
        Load the all the plate data into different dataframes.
        """
        # Check for folder structure and create it when needed.
        create_folder_structure(self.plate_number)

        ## Check that file chemical_compounds.csv exists locally. If not, download it.
        data_query_cache_path = Path(LOCAL_DATA_PATH).joinpath(self.plate_number, 'chemical_annotations.csv')
        data_query_cached_exists = data_query_cache_path.is_file()

        if data_query_cached_exists:
            print('Loading Chemical Annotations from local CSV...')
            chem_df = pd.read_csv(data_query_cache_path)
        else:
            print('Loading Chemical Annotations from remote server...')
            download_blob(BUCKET_NAME,
                        f'{self.plate_number}/chemical_annotations.csv',
                        Path(LOCAL_DATA_PATH).joinpath(self.plate_number, 'chemical_annotations.csv')
                        )
            chem_df = pd.read_csv(data_query_cache_path)

        ## Check that sqlite db exists locally. If not, download it.
        data_query_cache_path = Path(LOCAL_DATA_PATH).joinpath(self.plate_number, 'raw', f'{self.plate_number}.sqlite')
        data_query_cached_exists = data_query_cache_path.is_file()

        if data_query_cached_exists:
            print('Loading SQLite DB from local DB...')
        else:
            print('Loading SQLite DB from remote DB...')
            download_blob(BUCKET_NAME,
                        f'{self.plate_number}/{self.plate_number}.sqlite',
                        Path(LOCAL_DATA_PATH).joinpath(self.plate_number, 'raw', f'{self.plate_number}.sqlite')
                        )

        conn = sqlite3.connect(data_query_cache_path)
        query = """
                SELECT Image_URL_OrigAGP, Image_URL_OrigDNA, Image_URL_OrigER, Image_URL_OrigMito, Image_URL_OrigRNA, Image_Count_Cells
                FROM Image
                """
        cursor = conn.execute(query)
        data = cursor.fetchall()
        images_df = pd.DataFrame(data, columns=[column[0] for column in cursor.description])
        
        conn.close()
        
        ## Check that mean_well_profile.csv exists. If not, download it.
        data_query_cache_path = Path(LOCAL_DATA_PATH).joinpath(self.plate_number, 'raw', 'mean_well_profiles.csv')
        data_query_cached_exists = data_query_cache_path.is_file()

        if data_query_cached_exists:
            print('Loading Well Profiles from local CSV...')
            well_df = pd.read_csv(data_query_cache_path)
        else:
            print('Loading Well Profiles from remote server...')
            download_blob(BUCKET_NAME,
                        f'{self.plate_number}/mean_well_profiles.csv',
                        Path(LOCAL_DATA_PATH).joinpath(self.plate_number, 'raw', 'mean_well_profiles.csv')
                        )
            well_df = pd.read_csv(data_query_cache_path)
        self.chem_df = chem_df
        self.images_df = images_df
        self.well_df = well_df
        
        print('✅ Data loaded successfully.')

        return self

    def merge_data(self):
        """
        Clean the data.
        """
        wells_df = self.images_df.applymap(lambda x: x.split('/')[-1].split('_')[1])
        wells_df['well'] = wells_df.apply(lambda row: row.unique()[0] if row.nunique()==1 else 0, axis=1)

        photo_number_df = self.images_df.applymap(lambda x: x.split('/')[-1].split('_')[2])
        photo_number_df['photo_number'] = photo_number_df.apply(lambda row: row.unique()[0] if row.nunique()==1 else float('NaN'), axis=1)
        
        self.images_df['Image_URL_OrigAGP'] = self.images_df['Image_URL_OrigAGP'].apply(lambda x: Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'Raw_pictures', f'{PLATE_NUMBER}-Ph_golgi', x.split('/')[-1]))
        self.images_df['Image_URL_OrigDNA'] = self.images_df['Image_URL_OrigDNA'].apply(lambda x: Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'Raw_pictures', f'{PLATE_NUMBER}-Hoechst', x.split('/')[-1]))
        self.images_df['Image_URL_OrigER'] = self.images_df['Image_URL_OrigER'].apply(lambda x: Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'Raw_pictures', f'{PLATE_NUMBER}-ERSyto', x.split('/')[-1]))
        self.images_df['Image_URL_OrigMito'] = self.images_df['Image_URL_OrigMito'].apply(lambda x: Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'Raw_pictures', f'{PLATE_NUMBER}-Mito', x.split('/')[-1]))
        self.images_df['Image_URL_OrigRNA'] = self.images_df['Image_URL_OrigRNA'].apply(lambda x: Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'Raw_pictures', f'{PLATE_NUMBER}-ERSytoBleed', x.split('/')[-1]))
            
        self.merged_df = pd.concat([
            self.images_df,
            wells_df['well'],
            photo_number_df['photo_number'],
        ],
        axis = 1)
        
        
            
            

In [94]:
plate = Plate('24277').load();

Loading Chemical Annotations from local CSV...
Loading SQLite DB from local DB...
Loading Well Profiles from local CSV...
✅ Data loaded successfully.


In [95]:
plate.merge_data()

In [96]:
plate.__dict__.keys()

dict_keys(['plate_number', 'chem_df', 'images_df', 'well_df', 'plate_df', 'merged_df'])

In [13]:
images_df = plate.images_df

In [None]:
# BROAD_ID in CHEM_DF == Metadata_broad_sample in WELL_DF

In [63]:
df.apply(lambda row: row.nunique() ==1, axis=1)

0     True
1    False
2    False
dtype: bool

In [71]:
try_df = wells_df.apply(lambda row: row.unique()[0] if row.nunique()==1 else float('NaN'), axis=1)

In [31]:
images_df['Image_URL_OrigAGP'].apply(lambda x: Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'Pictures', f'{PLATE_NUMBER}-Ph_golgi', x.split('/')[-1]))
images_df['Image_URL_OrigDNA'].apply(lambda x: Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'Pictures', f'{PLATE_NUMBER}-Hoechst', x.split('/')[-1]))
images_df['Image_URL_OrigER'].apply(lambda x: Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'Pictures', f'{PLATE_NUMBER}-ERSyto', x.split('/')[-1]))
images_df['Image_URL_OrigMito'].apply(lambda x: Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'Pictures', f'{PLATE_NUMBER}-Mito', x.split('/')[-1]))
images_df['Image_URL_OrigRNA'].apply(lambda x: Path(LOCAL_DATA_PATH).joinpath(PLATE_NUMBER, 'Pictures', f'{PLATE_NUMBER}-ERSytoBleed', x.split('/')[-1]))

0       /Users/pepe/.morpho_minds_data/24277/photos/24...
1       /Users/pepe/.morpho_minds_data/24277/photos/24...
2       /Users/pepe/.morpho_minds_data/24277/photos/24...
3       /Users/pepe/.morpho_minds_data/24277/photos/24...
4       /Users/pepe/.morpho_minds_data/24277/photos/24...
                              ...                        
2292    /Users/pepe/.morpho_minds_data/24277/photos/24...
2293    /Users/pepe/.morpho_minds_data/24277/photos/24...
2294    /Users/pepe/.morpho_minds_data/24277/photos/24...
2295    /Users/pepe/.morpho_minds_data/24277/photos/24...
2296    /Users/pepe/.morpho_minds_data/24277/photos/24...
Name: Image_URL_OrigRNA, Length: 2297, dtype: object

In [25]:
# AGP == Ph-golgi
# DNA == Hoechst
# ER == ERSyto
# Mito == Mito
# RNA == ERSytoBleed

In [27]:
columns

['Image_URL_OrigAGP',
 'Image_URL_OrigDNA',
 'Image_URL_OrigER',
 'Image_URL_OrigMito',
 'Image_URL_OrigRNA']