# Perform Two-Sample KS-Test on NF1 Genotype Samples

## Import libraries

In [1]:
import numpy as np
from scipy.stats import ks_2samp
import pathlib
import pandas as pd

## Set seed

In [2]:
# Seed NumPy's global random number generator so any NumPy-based randomness is repeatable
np.random.seed(0)

## Load in NF1 data

In [3]:
# Location of the gzip-compressed CSV; the relative path is resolved against the current working directory
norm_fs_data = pathlib.Path("../../../4_processing_features/data/nf1_sc_norm_cellprofiler.csv.gz")

# Read the table, using the first CSV column as the dataframe index
data = pd.read_csv(norm_fs_data, compression="gzip", index_col=0)

## Helper functions to perform KS-test and create final `csv` file with results

In [4]:
def nf1_ks_test_two_sample(data: pd.DataFrame):
    """Run a two-sample KS-test (WT vs. Null) on every non-metadata column.

    Rows are split by the value of ``Metadata_genotype``; every column whose
    name does not contain ``"Metadata"`` is then compared between the two
    groups with ``scipy.stats.ks_2samp``.

    Parameters
    ----------
    data : pd.DataFrame
        table holding a ``Metadata_genotype`` column plus the feature columns
        to test (pycytominer output after normalization and feature selection)

    Returns
    -------
    pd.DataFrame
        one row per tested column, in column order, with the columns
        ``statistic``, ``pvalue``, ``statistic_location`` and ``statistic_sign``
    """
    result_columns = ["statistic", "pvalue", "statistic_location", "statistic_sign"]

    genotype = data["Metadata_genotype"]
    null_cells = data[genotype == "Null"]
    wt_cells = data[genotype == "WT"]

    # every column without "Metadata" in its name is treated as a feature
    tested_columns = [name for name in data if "Metadata" not in name]

    rows = []
    for name in tested_columns:
        outcome = ks_2samp(wt_cells[name].to_numpy(), null_cells[name].to_numpy())
        # read the fields by attribute: unpacking the scipy result only yields
        # statistic and pvalue, hiding the location and sign
        rows.append(
            (
                outcome.statistic,
                outcome.pvalue,
                outcome.statistic_location,
                outcome.statistic_sign,
            )
        )

    return pd.DataFrame(rows, columns=result_columns)


def merge_features_kstest(
    feature_results: pd.DataFrame,
    column_names: list,
    save_path: pathlib.Path = None,
):
    """
    merge features with ks-test results into one dataframe

    Parameters
    ----------
    feature_results : pd.Dataframe
        ks-test results
    column_names : list
        feature names from the columns of the NF1 data
    save_path : pathlib.Path
        path for the new dataframe

    Returns
    -------
    pd.Dataframe
        merged dataframe with features and ks-test results
    """
    # put dataframes into list of where the columns should go
    dataframes = [column_names, feature_results]

    # merge dataframes together
    merged_dataframe = pd.concat(dataframes, axis=1)

    # save csv file if you would like
    if save_path is not None:
        merged_dataframe.to_csv(save_path, index=False)

    return merged_dataframe

## Perform two sample KS-test

In [5]:
# Run the WT-vs-Null KS-test on every non-metadata column of the loaded data
feature_results = nf1_ks_test_two_sample(data)
# Bare expression as the last line of the cell so the notebook displays the table
feature_results

Unnamed: 0,statistic,pvalue,statistic_location,statistic_sign
0,0.525862,4.029958e-07,-0.254826,1
1,0.476750,7.982931e-06,0.593271,-1
2,0.339342,3.820458e-03,-0.138475,-1
3,0.123824,7.710387e-01,-0.451752,-1
4,0.098485,9.375309e-01,-0.828083,-1
...,...,...,...,...
1038,0.142111,6.134193e-01,0.086883,1
1039,0.256531,5.602030e-02,-0.323494,1
1040,0.264890,4.448887e-02,-0.337497,1
1041,0.273772,3.439218e-02,-0.325302,1


## Take the feature column names from the data and store them in a one-column dataframe

In [6]:
# Select the feature columns with the same rule nf1_ks_test_two_sample uses
# (any column whose name does not contain "Metadata") instead of a hard-coded
# column offset, so row i of the names always describes row i of the results
features = data.loc[:, ["Metadata" not in column for column in data.columns]]

# One-column dataframe of feature names, ready to be concatenated with the results
column_names = pd.DataFrame({"Features": features.columns.tolist()})

column_names

Unnamed: 0,Features
0,Cytoplasm_Number_Object_Number
1,Cytoplasm_AreaShape_Area
2,Cytoplasm_AreaShape_BoundingBoxArea
3,Cytoplasm_AreaShape_BoundingBoxMaximum_X
4,Cytoplasm_AreaShape_BoundingBoxMaximum_Y
...,...
1038,Nuclei_Texture_Variance_GFP_3_03_256
1039,Nuclei_Texture_Variance_RFP_3_00_256
1040,Nuclei_Texture_Variance_RFP_3_01_256
1041,Nuclei_Texture_Variance_RFP_3_02_256


## Save the final `csv` file with merged features and results

In [7]:
# Destination CSV (relative path); NOTE(review): the "data" directory presumably has to exist already, since nothing here creates it
save_path = pathlib.Path("data/nf1_kstest_two_sample_results.csv")

# Merge the feature names with the results, write the CSV, and display the merged table
merge_features_kstest(feature_results, column_names, save_path)

Unnamed: 0,Features,statistic,pvalue,statistic_location,statistic_sign
0,Cytoplasm_Number_Object_Number,0.525862,4.029958e-07,-0.254826,1
1,Cytoplasm_AreaShape_Area,0.476750,7.982931e-06,0.593271,-1
2,Cytoplasm_AreaShape_BoundingBoxArea,0.339342,3.820458e-03,-0.138475,-1
3,Cytoplasm_AreaShape_BoundingBoxMaximum_X,0.123824,7.710387e-01,-0.451752,-1
4,Cytoplasm_AreaShape_BoundingBoxMaximum_Y,0.098485,9.375309e-01,-0.828083,-1
...,...,...,...,...,...
1038,Nuclei_Texture_Variance_GFP_3_03_256,0.142111,6.134193e-01,0.086883,1
1039,Nuclei_Texture_Variance_RFP_3_00_256,0.256531,5.602030e-02,-0.323494,1
1040,Nuclei_Texture_Variance_RFP_3_01_256,0.264890,4.448887e-02,-0.337497,1
1041,Nuclei_Texture_Variance_RFP_3_02_256,0.273772,3.439218e-02,-0.325302,1
