## Apply tsne to each feature dataset

**Input:** Data representations (the CP, DP, and combined CP_DP feature sets)

**Output:** t-SNE embeddings for plotting

In [1]:
import pathlib
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

import sys

sys.path.append("../utils")
from split_utils import get_features_data

In [2]:
# Seed NumPy's global random number generator so that the stochastic steps
# later in this notebook give the same result on every Restart & Run All.
np.random.seed(seed=1234)

In [3]:
# Destination of the combined embedding table written by the last cell
# (path is relative to the notebook's working directory).
output_file = pathlib.Path("evaluations") / "tsne_embeddings.csv.gz"

In [4]:
# Location of the labeled single-cell dataset (metadata + feature columns in one table)
labeled_data_path = pathlib.Path("../0.download_data/data/labeled_data__ic.csv.gz")

# get_features_data is a project helper from ../utils/split_utils
# (presumably it reads the file into a DataFrame -- confirm against its source).
labeled_data = get_features_data(labeled_data_path)

# Use a clean 0..n-1 index: the t-SNE cell concatenates this frame column-wise
# with freshly created embedding frames, which aligns rows on the index.
labeled_data = labeled_data.reset_index(drop=True)

print(labeled_data.shape)
labeled_data.head(3)

(2862, 1450)


Unnamed: 0,Mitocheck_Phenotypic_Class,Cell_UUID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,Large,21da27ab-873a-41f4-ab98-49170cae9a2d,397,618,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,1.526493,-0.388909,-0.715202,-0.939279,-0.077689,1.965509,18.685819,0.061676,2.641369,-0.086854
1,Large,82f7949b-4ea2-45c8-8dd9-7854caf49077,359,584,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.482883,-1.354858,-0.85668,-0.934949,0.725091,2.25545,-0.565433,1.628086,-0.605625,-0.748135
2,Large,cec7234f-fe35-4411-aded-f8112bb31219,383,685,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,0.888706,1.350431,-0.648841,0.264205,0.131341,0.678315,0.171044,0.342206,-0.581597,0.505556


In [5]:
# Non-feature columns: excluded from the t-SNE input for the combined feature
# set and re-attached to every embedding so points can be annotated in plots.
metadata_columns = [
    # phenotype label and per-cell identifiers / coordinates
    "Mitocheck_Phenotypic_Class",
    "Cell_UUID",
    "Location_Center_X",
    "Location_Center_Y",
    # acquisition metadata
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_Frame",
    "Metadata_Site",
    "Metadata_Plate_Map_Name",
    "Metadata_DNA",
    # perturbation and segmentation metadata
    "Metadata_Gene",
    "Metadata_Gene_Replicate",
    "Metadata_Object_Outline",
]

# Feature sets to embed. "CP" and "DP" are matched as column-name prefixes;
# "CP_DP" means every column that is not listed in metadata_columns.
feature_groups = ["CP", "DP", "CP_DP"]

## Apply tSNE

We test different perplexities ranging from 2 to 300.

From scikit-learn (https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html):

> The perplexity is related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. Different values can result in significantly different results. The perplexity must be less than the number of samples.

We do not know which perplexity value is appropriate for our dataset, so we test several values.

In [6]:
# Perplexity values to sweep (each is fitted once per feature group)
list_of_perplexities = [2, 10, 15, 30, 40, 60, 80, 100, 150, 300]

# The metadata block is the same for every run, so slice it once
metadata_df = labeled_data.loc[:, metadata_columns]

# One annotated embedding frame per (perplexity, feature group) combination
embedding_frames = []

# NOTE: TSNE is created without random_state, so scikit-learn draws from
# NumPy's global RNG (seeded near the top of the notebook). The embeddings
# therefore depend on the order of these loops -- keep perplexity outermost.
for perplexity in list_of_perplexities:
    for feature_group in feature_groups:
        # Select the feature columns that go into this embedding
        if feature_group != "CP_DP":
            # Single feature set: keep the columns carrying its name prefix
            has_group_prefix = labeled_data.columns.str.startswith(feature_group)
            input_data_to_tsne = labeled_data.loc[:, has_group_prefix]
        else:
            # Combined set: everything that is not metadata
            input_data_to_tsne = labeled_data.drop(columns=metadata_columns)

        tsne_model = TSNE(
            n_components=2,
            perplexity=perplexity,
            init="random",
            learning_rate="auto",
        )

        # fit_transform returns one 2D coordinate per input row
        tsne_embedding = pd.DataFrame(
            tsne_model.fit_transform(input_data_to_tsne),
            columns=["tsne_x", "tsne_y"],
        )

        # Both frames share a 0..n-1 index, so the column-wise concat lines
        # each cell's metadata up with its coordinates
        annotated_embedding = pd.concat([metadata_df, tsne_embedding], axis=1)
        embedding_frames.append(
            annotated_embedding.assign(
                feature_group=feature_group,
                perplexity=perplexity,
            )
        )

# Stack all runs into one long table with a fresh index
tsne_embedding_df = pd.concat(embedding_frames, axis=0).reset_index(drop=True)

print(tsne_embedding_df.shape)
tsne_embedding_df.head()

(85860, 17)


Unnamed: 0,Mitocheck_Phenotypic_Class,Cell_UUID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_Gene,Metadata_Gene_Replicate,Metadata_Object_Outline,tsne_x,tsne_y,feature_group,perplexity
0,Large,21da27ab-873a-41f4-ab98-49170cae9a2d,397,618,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,RAB21,1,[[396 595]\n [395 596]\n [394 596]\n [393 596]...,15.230359,-30.346323,CP,2
1,Large,82f7949b-4ea2-45c8-8dd9-7854caf49077,359,584,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,RAB21,1,[[361 563]\n [360 564]\n [359 564]\n [358 564]...,39.679607,-47.5816,CP,2
2,Large,cec7234f-fe35-4411-aded-f8112bb31219,383,685,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,RAB21,1,[[379 662]\n [378 663]\n [377 663]\n [376 663]...,14.805232,-30.469143,CP,2
3,Large,43d9e7c9-c9ec-45ce-8820-048bfb896989,932,532,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,KIF14,1,[[923 515]\n [922 516]\n [921 516]\n [920 516]...,29.561859,-54.327892,CP,2
4,Large,63ce6652-338e-4afd-9c77-dbc0e903bf92,477,130,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,KIF14,1,[[465 108]\n [464 109]\n [464 110]\n [463 111]...,21.115387,-55.362625,CP,2


In [7]:
# Output file
# Create the destination folder first: to_csv does not create missing
# directories, so a fresh checkout without it would fail here after the
# whole (slow) t-SNE sweep has already run.
output_file.parent.mkdir(parents=True, exist_ok=True)

# NOTE: the table is written tab-separated even though the file name ends in
# .csv.gz, so readers must pass sep="\t". The separator is left unchanged
# because existing consumers of this file may rely on it.
# pandas infers gzip compression from the ".gz" suffix.
tsne_embedding_df.to_csv(output_file, sep="\t", index=False)