# Perform Two-Sample KS-Test on NF1 Genotype Samples

## Import libraries

In [15]:
import numpy as np
from scipy.stats import ks_2samp
import pathlib
import pandas as pd

## Set seed

In [16]:
# seed NumPy's global random number generator so any NumPy-based randomness in this notebook is repeatable
np.random.seed(0)

## Load in NF1 data

In [17]:
# path to the gzip-compressed csv of features
# NOTE(review): the file name suggests normalized single-cell CellProfiler output -- confirm
norm_fs_data = pathlib.Path("../../../4_processing_features/data/nf1_sc_norm_cellprofiler.csv.gz")

# the first column of the csv is used as the dataframe index
data = pd.read_csv(norm_fs_data, compression="gzip", index_col=0)

# show the dataframe dimensions and preview the first rows
print(data.shape)
data.head()

(149, 1055)


Unnamed: 0,Metadata_WellRow,Metadata_WellCol,Metadata_number_of_singlecells,Metadata_gene_name,Metadata_genotype,Metadata_ImageNumber,Metadata_Plate,Metadata_Well,Metadata_Cytoplasm_Parent_Cells,Metadata_Cytoplasm_Parent_OrigNuclei,...,Nuclei_Texture_SumVariance_RFP_3_02_256,Nuclei_Texture_SumVariance_RFP_3_03_256,Nuclei_Texture_Variance_GFP_3_00_256,Nuclei_Texture_Variance_GFP_3_01_256,Nuclei_Texture_Variance_GFP_3_02_256,Nuclei_Texture_Variance_GFP_3_03_256,Nuclei_Texture_Variance_RFP_3_00_256,Nuclei_Texture_Variance_RFP_3_01_256,Nuclei_Texture_Variance_RFP_3_02_256,Nuclei_Texture_Variance_RFP_3_03_256
0,C,6,12,NF1,WT,1,1,C6,1,4,...,3.14154,3.202273,-0.097356,-0.096165,-0.094202,-0.106456,3.337969,3.350528,3.278168,3.310371
1,C,6,12,NF1,WT,1,1,C6,2,5,...,0.315924,0.258633,-0.087971,-0.069493,-0.065539,-0.095377,0.314776,0.31392,0.34842,0.318693
2,C,6,12,NF1,WT,1,1,C6,3,7,...,0.295233,0.383161,0.065251,0.00555,-0.015212,-0.029087,0.348492,0.33394,0.341312,0.347999
3,C,6,12,NF1,WT,1,1,C6,4,8,...,1.151725,1.159965,0.023403,0.051931,0.026268,-0.002094,1.184695,1.243519,1.263751,1.167156
4,C,6,12,NF1,WT,4,1,C6,1,3,...,0.699723,0.628294,-0.428904,-0.416992,-0.429383,-0.420997,0.690298,0.662006,0.685883,0.701466


## Helper functions to perform KS-test and create final `csv` file with results

In [18]:
def nf1_ks_test_two_sample(normalized_data: pd.DataFrame):
    """Run a two-sample KS-test between WT and Null cells for every feature.

    Rows are split on the "Metadata_genotype" column ("WT" vs "Null") and every
    column whose name does not contain "Metadata" is tested independently with
    `scipy.stats.ks_2samp(wt_values, null_values)`.

    Parameters
    ----------
    normalized_data : pd.DataFrame
        pycytominer output after normalization; must contain a
        "Metadata_genotype" column

    Returns
    -------
    pd.DataFrame
        one row per feature (in column order of `normalized_data`), one column
        per field of the ks-test result. The columns are taken from the result
        object itself, so the frame holds whatever fields the installed SciPy
        reports. If there are no feature columns, an empty dataframe with the
        columns "statistic", "pvalue", "statistic_location" and
        "statistic_sign" is returned.
    """
    # column layout used only when there is nothing to test
    default_columns = ["statistic", "pvalue", "statistic_location", "statistic_sign"]

    # divide the NF1 data based on genotype
    genotype = normalized_data["Metadata_genotype"]
    null_features = normalized_data[genotype == "Null"]
    wt_features = normalized_data[genotype == "WT"]

    feature_results = []

    # iterate through the columns in the data (both of the genotype dataframes will have the same columns)
    for column in normalized_data:
        # do not include metadata columns
        if "Metadata" in column:
            continue

        # run two-sample ks-test for this feature (WT is the first sample)
        results = ks_2samp(
            wt_features[column].to_numpy(),
            null_features[column].to_numpy(),
        )

        # keep every field of the result (even the ones scipy hides from the repr)
        # keyed by its own name, so column labels never depend on a hard-coded list
        feature_results.append(results._asdict())

    if not feature_results:
        return pd.DataFrame([], columns=default_columns)

    return pd.DataFrame(feature_results)

def merge_features_kstest(
    feature_results: pd.DataFrame,
    feature_names: list,
    save_path: pathlib.Path = None,
):
    """
    Merge feature names with ks-test results into one dataframe.

    Parameters
    ----------
    feature_results : pd.DataFrame
        ks-test results, one row per feature
    feature_names : pd.DataFrame, pd.Series or list
        feature names from the columns of the NF1 data. A dataframe/series is
        concatenated as-is (rows are matched on the index); a plain sequence of
        names is first wrapped in a one-column dataframe named "Features" that
        shares the index of `feature_results`, so it must have one name per
        result row (pandas raises ValueError otherwise)
    save_path : pathlib.Path, optional
        path for the new csv file; nothing is written when None. Missing parent
        directories are created for path-like values.

    Returns
    -------
    pd.DataFrame
        merged dataframe with the feature names first, then the ks-test results
    """
    # pd.concat cannot take a bare list of names, so wrap it in a dataframe
    # aligned to the results before concatenating
    if not isinstance(feature_names, (pd.DataFrame, pd.Series)):
        feature_names = pd.DataFrame(
            {"Features": list(feature_names)}, index=feature_results.index
        )

    # merge dataframes together side by side (names on the left)
    merged_dataframe = pd.concat([feature_names, feature_results], axis=1)

    # save csv file if you would like
    if save_path is not None:
        # only real filesystem paths get their directory created; anything else
        # (e.g. an open buffer) is handed to pandas untouched
        if isinstance(save_path, (str, pathlib.PurePath)):
            pathlib.Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        merged_dataframe.to_csv(save_path, index=False)

    return merged_dataframe

## Perform two sample KS-test

In [19]:
# run the WT vs Null two-sample ks-test on every non-metadata column of the loaded data
feature_results = nf1_ks_test_two_sample(data)
# display the per-feature results
feature_results

Unnamed: 0,statistic,pvalue,statistic_location,statistic_sign
0,0.525862,4.029958e-07,-0.254826,1
1,0.476750,7.982931e-06,0.593271,-1
2,0.339342,3.820458e-03,-0.138475,-1
3,0.123824,7.710387e-01,-0.451752,-1
4,0.098485,9.375309e-01,-0.828083,-1
...,...,...,...,...
1038,0.142111,6.134193e-01,0.086883,1
1039,0.256531,5.602030e-02,-0.323494,1
1040,0.264890,4.448887e-02,-0.337497,1
1041,0.273772,3.439218e-02,-0.325302,1


## Take feature column names from data and create a dataframe

In [20]:
# find feature names in the columns from the data
# every column whose name does not mention "Metadata" is a feature; keep the
# names in their original column order, one per row
feature_names = pd.DataFrame(
    [col_name for col_name in data.columns if "Metadata" not in col_name]
)
# label the single column of names
feature_names.columns = ["Features"]

feature_names

Unnamed: 0,Features
0,Cytoplasm_Number_Object_Number
1,Cytoplasm_AreaShape_Area
2,Cytoplasm_AreaShape_BoundingBoxArea
3,Cytoplasm_AreaShape_BoundingBoxMaximum_X
4,Cytoplasm_AreaShape_BoundingBoxMaximum_Y
...,...
1038,Nuclei_Texture_Variance_GFP_3_03_256
1039,Nuclei_Texture_Variance_RFP_3_00_256
1040,Nuclei_Texture_Variance_RFP_3_01_256
1041,Nuclei_Texture_Variance_RFP_3_02_256


## Save the final `csv` file with merged features and results

In [21]:
# relative output path for the merged csv (resolved against the current working directory)
save_path = pathlib.Path("data/nf1_kstest_two_sample_results.csv")

# join the feature names with their ks-test results, write the csv, and display the merged dataframe
merge_features_kstest(feature_results, feature_names, save_path)

Unnamed: 0,Features,statistic,pvalue,statistic_location,statistic_sign
0,Cytoplasm_Number_Object_Number,0.525862,4.029958e-07,-0.254826,1
1,Cytoplasm_AreaShape_Area,0.476750,7.982931e-06,0.593271,-1
2,Cytoplasm_AreaShape_BoundingBoxArea,0.339342,3.820458e-03,-0.138475,-1
3,Cytoplasm_AreaShape_BoundingBoxMaximum_X,0.123824,7.710387e-01,-0.451752,-1
4,Cytoplasm_AreaShape_BoundingBoxMaximum_Y,0.098485,9.375309e-01,-0.828083,-1
...,...,...,...,...,...
1038,Nuclei_Texture_Variance_GFP_3_03_256,0.142111,6.134193e-01,0.086883,1
1039,Nuclei_Texture_Variance_RFP_3_00_256,0.256531,5.602030e-02,-0.323494,1
1040,Nuclei_Texture_Variance_RFP_3_01_256,0.264890,4.448887e-02,-0.337497,1
1041,Nuclei_Texture_Variance_RFP_3_02_256,0.273772,3.439218e-02,-0.325302,1
