# Get 2D UMAP embeddings for NF1 DeepProfiler data

## Import libraries

In [1]:
import pathlib
import pandas as pd
import umap

import UMAPutils as utils

## Nuclei DeepProfiler Project

In [2]:
norm_data_nuc_path = pathlib.Path("../../../4_processing_features/data/nf1_sc_norm_fs_deepprofiler_nuc.csv.gz")
norm_data_nuc = pd.read_csv(norm_data_nuc_path, compression="gzip")

print(norm_data_nuc.shape)
norm_data_nuc.head()

(257, 3837)


Unnamed: 0,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_ER,Metadata_Actin,Metadata_Genotype,...,efficientnet_3830,efficientnet_3831,efficientnet_3832,efficientnet_3833,efficientnet_3834,efficientnet_3835,efficientnet_3836,efficientnet_3837,efficientnet_3838,efficientnet_3839
0,652.868421,760.552632,1,D6,3,1_D6_3,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.514201,0.478099,0.484105,-0.212163,-0.572581,-0.23907,0.907958,0.219261,0.311132,-0.288884
1,1015.898477,209.162437,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.250138,-0.740111,-0.190505,-0.690312,-0.862426,4.039268,-1.532861,-1.382833,-0.336643,0.894669
2,387.20283,238.853774,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.35873,-0.183352,1.252175,-0.824103,-0.18572,-0.380602,0.464281,0.019876,0.793334,1.428218
3,259.502304,250.400922,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,0.540821,-0.724848,1.529958,-0.9405,-0.649775,-0.212045,0.009084,0.655653,-0.164469,0.250813
4,351.255708,592.429224,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.408348,-0.629579,-0.423012,-1.421697,0.519623,-0.639995,0.571886,-1.03172,-0.108736,0.103133


In [3]:
metadata_dataframe, feature_data = utils.split_data(norm_data_nuc)

print(metadata_dataframe.shape)
metadata_dataframe.head()

(257, 10)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_ER,Metadata_Actin,Metadata_Genotype,Metadata_Genotype_Replicate,Metadata_Model
0,1,D6,3,1_D6_3,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet
1,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet
2,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet
3,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet
4,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet


In [4]:
x_locations = norm_data_nuc["Location_Center_X"]
metadata_dataframe.insert(0, "Location_Center_X", x_locations)
y_locations = norm_data_nuc["Location_Center_Y"]
metadata_dataframe.insert(1, "Location_Center_Y", y_locations)

print(metadata_dataframe.shape)
metadata_dataframe.head()

(257, 12)


Unnamed: 0,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_ER,Metadata_Actin,Metadata_Genotype,Metadata_Genotype_Replicate,Metadata_Model
0,652.868421,760.552632,1,D6,3,1_D6_3,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet
1,1015.898477,209.162437,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet
2,387.20283,238.853774,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet
3,259.502304,250.400922,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet
4,351.255708,592.429224,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet


In [5]:
grouped_data = metadata_dataframe.groupby(["Metadata_Well"])['Metadata_Well'].count().reset_index(name='Metadata_number_of_singlecells')
grouped_data

Unnamed: 0,Metadata_Well,Metadata_number_of_singlecells
0,C6,26
1,C7,23
2,D6,7
3,D7,32
4,E6,17
5,E7,60
6,F6,24
7,F7,68


In [6]:
metadata_dataframe = metadata_dataframe.merge(grouped_data, on="Metadata_Well")
new_metadata_column = metadata_dataframe.pop("Metadata_number_of_singlecells")
metadata_dataframe.insert(2, "Metadata_number_of_singlecells", new_metadata_column)

In [7]:
feature_data = feature_data.drop(['Location_Center_X', 'Location_Center_Y'], axis=1)

print(feature_data.shape)
feature_data.head()

(257, 3825)


Unnamed: 0,efficientnet_0,efficientnet_1,efficientnet_10,efficientnet_100,efficientnet_1000,efficientnet_1001,efficientnet_1002,efficientnet_1003,efficientnet_1004,efficientnet_1005,...,efficientnet_990,efficientnet_991,efficientnet_992,efficientnet_993,efficientnet_994,efficientnet_995,efficientnet_996,efficientnet_997,efficientnet_998,efficientnet_999
0,0.235069,-2.524998,-0.645525,-0.321902,0.327108,-0.425506,0.109151,-0.522269,-0.248527,-1.001633,...,-0.74527,-0.594816,-0.45005,-0.059045,-0.647262,-0.209944,0.866904,0.236938,-0.026535,-0.276103
1,-0.623746,1.0377,1.009706,-0.41876,0.288138,-0.374713,-0.480356,0.14285,-0.198695,0.062926,...,2.162966,-0.37387,0.273352,-0.867027,-0.576743,0.107865,-0.154136,-0.675416,0.090121,2.199593
2,-0.514111,0.838314,0.741559,0.00739,-0.343973,0.162181,-0.167019,-0.035345,0.142035,-0.430217,...,-0.247819,-0.051758,-0.282056,0.498236,-0.575197,-0.652407,-0.175582,0.287247,-0.310507,-0.001377
3,-0.590767,-0.004433,-1.088701,1.448685,0.253028,0.257632,-0.265532,3.350086,0.145468,0.271872,...,-0.060401,0.82096,0.148889,-1.753011,0.850738,1.141865,1.014065,-0.894101,1.714785,3.437282
4,-0.630595,-0.344079,1.002623,-0.005197,-1.205885,0.062935,-0.146158,-0.390066,0.045541,-0.298416,...,0.459659,0.901046,-1.393525,0.293504,-0.724374,-0.392195,-0.502709,-0.853243,1.176767,-0.377427


In [8]:
fit = umap.UMAP(random_state=0, n_components=2)

embeddings = pd.DataFrame(
        fit.fit_transform(feature_data), columns=["UMAP1", "UMAP2"]
    )
embeddings

Unnamed: 0,UMAP1,UMAP2
0,4.264595,6.184330
1,4.122251,3.966355
2,5.334163,4.959183
3,5.821635,3.165360
4,4.885064,4.497552
...,...,...
252,5.298770,1.950539
253,5.954596,3.117260
254,5.501772,1.967708
255,5.394285,3.641819


In [9]:
save_path = pathlib.Path('../../data/norm_fs_embeddings_dp_nuc.csv.gz')

norm_fs_embeddings_data_dp_nuc = utils.merge_metadata_embeddings(metadata_dataframe, embeddings, save_path)

print(norm_fs_embeddings_data_dp_nuc.shape)
norm_fs_embeddings_data_dp_nuc.head()

(257, 15)


Unnamed: 0,Location_Center_X,Location_Center_Y,Metadata_number_of_singlecells,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_ER,Metadata_Actin,Metadata_Genotype,Metadata_Genotype_Replicate,Metadata_Model,UMAP1,UMAP2
0,652.868421,760.552632,7,1,D6,3,1_D6_3,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet,4.264595,6.18433
1,451.956522,477.682609,7,1,D6,2,1_D6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet,4.122251,3.966355
2,528.925743,563.628713,7,1,D6,2,1_D6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet,5.334163,4.959183
3,417.522321,702.272321,7,1,D6,2,1_D6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet,5.821635,3.16536
4,166.436019,337.886256,7,1,D6,4,1_D6_4,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet,4.885064,4.497552


## Cytoplasm DeepProfiler Project

In [10]:
norm_data_cyto_path = pathlib.Path("../../../4_processing_features/data/nf1_sc_norm_fs_deepprofiler_cyto.csv.gz")
norm_data_cyto = pd.read_csv(norm_data_cyto_path, compression="gzip")

print(norm_data_cyto.shape)
norm_data_cyto.head()

(256, 3844)


Unnamed: 0,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_ER,Metadata_Actin,Metadata_Genotype,...,efficientnet_3830,efficientnet_3831,efficientnet_3832,efficientnet_3833,efficientnet_3834,efficientnet_3835,efficientnet_3836,efficientnet_3837,efficientnet_3838,efficientnet_3839
0,650.422472,736.706742,1,D6,3,1_D6_3,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.034014,0.859518,-0.025713,-0.729571,0.267231,-0.847366,-1.090082,0.566786,-0.024363,0.230463
1,949.182667,256.734667,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.021109,0.185511,-0.612227,2.925806,-0.927761,-0.30435,-0.031065,0.626545,-0.40806,-1.211616
2,454.460081,265.797023,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.158382,-0.810211,-0.720877,0.127869,1.199453,1.977439,-0.844405,-0.299137,-0.775166,-0.292143
3,211.165254,274.745763,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.831178,-0.614643,-0.168719,-0.343051,-0.219525,0.493543,-0.751189,0.684554,0.159646,0.196
4,375.12513,566.646507,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,...,-0.118977,0.065025,-0.43171,-0.547652,-0.538548,0.022739,0.639954,0.393346,0.235406,0.932857


In [11]:
metadata_dataframe, feature_data = utils.split_data(norm_data_cyto)

print(metadata_dataframe.shape)
metadata_dataframe.head()

(256, 10)


Unnamed: 0,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_ER,Metadata_Actin,Metadata_Genotype,Metadata_Genotype_Replicate,Metadata_Model
0,1,D6,3,1_D6_3,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet
1,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet
2,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet
3,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet
4,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet


In [12]:
x_locations = norm_data_cyto["Location_Center_X"]
metadata_dataframe.insert(0, "Location_Center_X", x_locations)
y_locations = norm_data_cyto["Location_Center_Y"]
metadata_dataframe.insert(1, "Location_Center_Y", y_locations)

print(metadata_dataframe.shape)
metadata_dataframe.head()

(256, 12)


Unnamed: 0,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_ER,Metadata_Actin,Metadata_Genotype,Metadata_Genotype_Replicate,Metadata_Model
0,650.422472,736.706742,1,D6,3,1_D6_3,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet
1,949.182667,256.734667,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet
2,454.460081,265.797023,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet
3,211.165254,274.745763,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet
4,375.12513,566.646507,1,F6,2,1_F6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet


In [13]:
grouped_data = metadata_dataframe.groupby(["Metadata_Well"])['Metadata_Well'].count().reset_index(name='Metadata_number_of_singlecells')
grouped_data

Unnamed: 0,Metadata_Well,Metadata_number_of_singlecells
0,C6,26
1,C7,23
2,D6,7
3,D7,32
4,E6,17
5,E7,59
6,F6,24
7,F7,68


In [14]:
metadata_dataframe = metadata_dataframe.merge(grouped_data, on="Metadata_Well")
new_metadata_column = metadata_dataframe.pop("Metadata_number_of_singlecells")
metadata_dataframe.insert(2, "Metadata_number_of_singlecells", new_metadata_column)

In [15]:
feature_data = feature_data.drop(['Location_Center_X', 'Location_Center_Y'], axis=1)

print(feature_data.shape)
feature_data.head()

(256, 3832)


Unnamed: 0,efficientnet_0,efficientnet_1,efficientnet_10,efficientnet_100,efficientnet_1000,efficientnet_1001,efficientnet_1002,efficientnet_1003,efficientnet_1004,efficientnet_1005,...,efficientnet_990,efficientnet_991,efficientnet_992,efficientnet_993,efficientnet_994,efficientnet_995,efficientnet_996,efficientnet_997,efficientnet_998,efficientnet_999
0,0.518928,-1.030638,-0.17667,-0.658651,-0.231146,-0.410912,1.258987,0.528172,-0.462594,-0.798068,...,-0.749512,-0.732841,0.767652,-0.881471,-0.512916,-0.123882,-0.047746,-0.188216,0.222457,0.343864
1,-0.532346,0.038914,-0.481328,-0.494546,-0.460594,-0.361398,-0.503176,-0.468075,0.42903,-0.015511,...,0.111416,-0.817825,-1.079264,-1.538474,0.100065,-0.407861,-0.273762,-1.445162,0.608907,-0.422729
2,0.717823,1.356706,0.209188,-0.497367,-0.710925,-0.286272,-0.258021,-0.265605,-0.434462,-0.365862,...,-0.585437,-0.748314,0.5629,-0.810733,-0.278916,0.4189,-0.477898,0.500113,-0.110842,0.531526
3,-0.083235,-0.367833,-0.04748,1.570235,-0.601118,0.391697,0.271105,2.680643,-0.773746,0.150493,...,1.544803,0.140156,0.679623,0.881273,1.707555,-0.41993,-0.16985,-0.467632,-0.659259,-0.136917
4,-0.698319,-0.016846,-0.11346,0.370509,-0.797704,-0.59463,-0.81606,-0.001589,-0.834368,-0.70913,...,-0.37462,0.715716,-0.446599,-0.323712,-0.663026,-0.708899,-0.434248,-0.835424,0.168417,0.139753


In [16]:
fit = umap.UMAP(random_state=0, n_components=2)

embeddings = pd.DataFrame(
        fit.fit_transform(feature_data), columns=["UMAP1", "UMAP2"]
    )
embeddings

Unnamed: 0,UMAP1,UMAP2
0,13.848951,2.563878
1,10.949956,2.749159
2,12.673684,2.965985
3,11.994288,2.616522
4,12.952534,3.528729
...,...,...
251,9.114523,5.192198
252,8.850242,5.101776
253,9.071429,5.187535
254,9.066143,5.200025


In [17]:
save_path = pathlib.Path('../../data/norm_fs_embeddings_dp_cyto.csv.gz')

norm_fs_embeddings_data_dp_cyto = utils.merge_metadata_embeddings(metadata_dataframe, embeddings, save_path)

print(norm_fs_embeddings_data_dp_cyto.shape)
norm_fs_embeddings_data_dp_cyto.head()

(256, 15)


Unnamed: 0,Location_Center_X,Location_Center_Y,Metadata_number_of_singlecells,Metadata_Plate,Metadata_Well,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_ER,Metadata_Actin,Metadata_Genotype,Metadata_Genotype_Replicate,Metadata_Model,UMAP1,UMAP2
0,650.422472,736.706742,7,1,D6,3,1_D6_3,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet,13.848951,2.563878
1,443.544415,442.5708,7,1,D6,2,1_D6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet,10.949956,2.749159
2,576.896764,565.42527,7,1,D6,2,1_D6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet,12.673684,2.965985
3,413.186421,700.486766,7,1,D6,2,1_D6_2,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet,11.994288,2.616522
4,201.482682,373.559777,7,1,D6,4,1_D6_4,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,../../../../1_preprocessing_data/Corrected_Ima...,WT,1,efficientnet,12.952534,3.528729
