## Explore data

- Calculate pairwise Pearson correlations between single cells, separately for each feature space (CP, DP, and CP_and_DP)

In [1]:
import pathlib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import sys
sys.path.append("../utils")
from split_utils import get_features_data
from train_utils import get_X_y_data

In [2]:
def create_tidy_corr_matrix(data_array, labels) -> pd.DataFrame:
    """Return the pairwise row correlations of ``data_array`` in tidy (long) form.

    Each row of ``data_array`` is treated as one observation and correlated
    (Pearson, via ``np.corrcoef``) against every other row. Only the strict
    upper triangle is reported, so every unordered pair appears exactly once
    and self-correlations are omitted.

    Parameters
    ----------
    data_array : array-like
        2-D numeric data, one row per observation. Passed to ``np.corrcoef``
        with ``rowvar=True``.
    labels : array-like
        One label per row of ``data_array``. Labels are looked up by row
        *position*, so a pandas Series with a non-default index is handled
        correctly.

    Returns
    -------
    pandas.DataFrame
        Columns ``Row_ID``, ``Pairwise_Row_ID``, ``Correlation``,
        ``Row_Label`` and ``Pairwise_Row_Label``, ordered by ``Row_ID`` then
        ``Pairwise_Row_ID``, with a fresh ``RangeIndex``. Pairs whose
        correlation is NaN (e.g. a constant row) are kept as NaN. The table is
        empty when fewer than two rows are supplied.

    Raises
    ------
    IndexError
        If ``labels`` has fewer entries than ``data_array`` has rows.
    """
    # np.corrcoef collapses a single-row input to a 0-d scalar; lift it back
    # to 2-D so that case yields an empty table instead of an error.
    correlation_matrix = np.atleast_2d(np.corrcoef(data_array, rowvar=True))

    # Address only the strict upper triangle (k=1 skips the diagonal) rather
    # than melting the full n x n matrix and discarding half of it afterwards.
    # np.triu_indices yields the pairs in row-major order.
    row_ids, pairwise_row_ids = np.triu_indices(correlation_matrix.shape[0], k=1)

    # Positional lookup: Row_ID / Pairwise_Row_ID are row positions, not
    # index labels, so convert to a plain array before fancy-indexing.
    label_array = np.asarray(labels)

    return pd.DataFrame(
        {
            "Row_ID": row_ids,
            "Pairwise_Row_ID": pairwise_row_ids,
            "Correlation": correlation_matrix[row_ids, pairwise_row_ids],
            "Row_Label": label_array[row_ids],
            "Pairwise_Row_Label": label_array[pairwise_row_ids],
        }
    )

In [3]:
# Feature spaces to process
feature_spaces = ["CP", "DP", "CP_and_DP"]

# Output folder and the shared filename stem for the correlation tables
output_dir = "data"
output_basename = pathlib.Path(output_dir) / "pairwise_correlations"

In [4]:
# Load the labeled single-cell table (one dataframe holding metadata columns
# alongside feature columns such as DP__efficientnet_*).
# NOTE(review): get_features_data is imported from ../utils/split_utils and its
# implementation is not visible here — confirm what parsing/filtering it applies.
labeled_data_path = pathlib.Path("../0.download_data/data/labeled_data__ic.csv.gz")
labeled_data = get_features_data(labeled_data_path)

# Report the table dimensions and preview the first three rows (notebook display)
print(labeled_data.shape)
labeled_data.head(3)

(2862, 1450)


Unnamed: 0,Mitocheck_Phenotypic_Class,Cell_UUID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,Large,21da27ab-873a-41f4-ab98-49170cae9a2d,397,618,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,1.526493,-0.388909,-0.715202,-0.939279,-0.077689,1.965509,18.685819,0.061676,2.641369,-0.086854
1,Large,82f7949b-4ea2-45c8-8dd9-7854caf49077,359,584,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.482883,-1.354858,-0.85668,-0.934949,0.725091,2.25545,-0.565433,1.628086,-0.605625,-0.748135
2,Large,cec7234f-fe35-4411-aded-f8112bb31219,383,685,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,0.888706,1.350431,-0.648841,0.264205,0.131341,0.678315,0.171044,0.342206,-0.581597,0.505556


In [5]:
# Make sure the output folder exists before writing; to_csv does not create
# missing parent directories and would otherwise fail on a fresh checkout.
pathlib.Path(output_basename).parent.mkdir(parents=True, exist_ok=True)

for feature_space in feature_spaces:
    # Extract the feature matrix and phenotype labels for this feature space.
    # (The cp_ prefix is historical: these names are reused for every feature
    # space, not only CP.)
    cp_feature_df, cp_label_df = get_X_y_data(labeled_data, dataset=feature_space)

    # Calculate pairwise correlations between single cells
    cp_tidy_corr_df = create_tidy_corr_matrix(cp_feature_df, cp_label_df)

    # Write one gzip-compressed, tab-separated file per feature space
    # (compression is inferred by pandas from the .gz suffix)
    output_file = f"{output_basename}_{feature_space}.tsv.gz"
    cp_tidy_corr_df.to_csv(output_file, sep="\t", index=False)