In [1]:
# Import necessary packages
import os
import glob
import pandas as pd
import numpy as np
from scipy import stats
import scikit_posthocs

# Import plotting packages
import iqplot
import bokeh.io
from bokeh.io import output_file, show
from bokeh.layouts import column, row
bokeh.io.output_notebook()

# Import data from directory of measurement tables, collected from Fiji

In [4]:
# Define path to directory with measurements
path = os.path.abspath('raw_data_csvs/')
list_summary = []

# Collect the measurement tables. glob.glob returns matches in arbitrary order,
# so sort them to keep the row order of the combined table stable between runs.
puncta_files = sorted(glob.glob(os.path.join(path, '*_Puncta.csv')))
if not puncta_files:
    # Fail here with a clear message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError('No *_Puncta.csv measurement tables found in ' + path)

# For loop to bring in files and concatenate them into a single dataframe
for file_ in puncta_files:
    df = pd.read_csv(file_)

    # Determine Image name from file name, then parse experiment details from Image name.
    # The name must have six '_'-separated fields:
    #   Date_Embryo_Treatment_Stains_FOV_<suffix>   (the trailing suffix is discarded)
    image_name = os.path.splitext(os.path.basename(file_))[0]
    name_fields = image_name.split('_')
    if len(name_fields) != 6:
        raise ValueError(
            'Expected 6 "_"-separated fields in file name, got '
            + str(len(name_fields)) + ': ' + image_name
        )
    date, embryo, treatment, stains, fov, _suffix = name_fields

    # The details are constant within a file, so assign them as scalars
    # (this also works for a table that has a header but no rows).
    df = df.assign(
        Image=image_name,
        Date=date,
        Embryo=embryo,
        Treatment=treatment,
        Stains=stains,
        FOV=fov,
    )

    # Compile data
    list_summary.append(df)

# Combine all tables; the row index is left as read, so it restarts for each source file
df_summary = pd.concat(list_summary, sort=False)
# Drop measurement columns that are not used downstream
df_summary = df_summary.drop(['Total Area', 'Average Size', '%Area', 'Mean', 'IntDen'], axis=1)

# Assign import to full_results df and save the combined source data
full_results = df_summary.copy()
full_results.to_csv('Combined_Source_Data.csv')

# Preview dataframe to confirm import successful
full_results.head()

Unnamed: 0,Slice,Count,Image,Date,Embryo,Treatment,Stains,FOV
0,1,1,20210807_Emb2_ControlMO_H2BRFP;PostPermFLAG;Pr...,20210807,Emb2,ControlMO,H2BRFP;PostPermFLAG;PrePermFLAG;TPMT,FOV12
1,2,1,20210807_Emb2_ControlMO_H2BRFP;PostPermFLAG;Pr...,20210807,Emb2,ControlMO,H2BRFP;PostPermFLAG;PrePermFLAG;TPMT,FOV12
2,3,0,20210807_Emb2_ControlMO_H2BRFP;PostPermFLAG;Pr...,20210807,Emb2,ControlMO,H2BRFP;PostPermFLAG;PrePermFLAG;TPMT,FOV12
3,4,1,20210807_Emb2_ControlMO_H2BRFP;PostPermFLAG;Pr...,20210807,Emb2,ControlMO,H2BRFP;PostPermFLAG;PrePermFLAG;TPMT,FOV12
0,1,2,20210807_Emb3_SMPD3MO_H2BRFP;PostPermFLAG;PreP...,20210807,Emb3,SMPD3MO,H2BRFP;PostPermFLAG;PrePermFLAG;TPMT,FOV6


## Analyze the effect of nSMase2 MO on LRP6 internalization (puncta count)

Generate ECDF plot to display distribution frequencies

Run 2-sample Kolmogorov-Smirnov Test to determine statistical significance

In [9]:
# Treatments to include, listed in the order they should appear in the plot
treatment_list = ['ControlMO', 'SMPD3MO']

# Keep only the rows that belong to the selected treatments
df_subset = full_results[full_results['Treatment'].isin(treatment_list)]

# Gather the ECDF settings in one place, then build the figure with iqplot
ecdf_options = dict(
    data=df_subset,
    q='Count',
    cats='Treatment',
    q_axis='x',
    style='staircase',
    order=treatment_list,
    line_kwargs={'line_width': 3},
    # Bootstrapped confidence band: the 2.5/97.5 percentiles give 95% CIs
    # (ptiles=[16, 84] would equate to SEM instead)
    conf_int=True,
    n_bs_reps=1000,
    ptiles=[2.5, 97.5],
    # Figure size, axis labels and legend
    # (optional extras: palette=[...] for custom colors, x_range=(-1.5, 35) to fix the x axis)
    frame_height=350,
    frame_width=450,
    x_axis_label='FLAG-LRP6+ Puncta Count',
    y_axis_label='Cumulative Distribution Frequency',
    show_legend=True,
)
data_ecdf = iqplot.ecdf(**ecdf_options)

# Apply the same text styling to both axes
axis_text_style = {
    'axis_label_text_font_size': '20px',
    'axis_label_text_font_style': 'normal',
    'major_label_text_font_size': '18px',
}
for property_name, property_value in axis_text_style.items():
    setattr(data_ecdf.axis, property_name, property_value)

# To export as vector graphics, set data_ecdf.output_backend = "svg" before showing
show(row(data_ecdf))

### Kolmogorov-Smirnov test - NO MULTIPLE COMPARISONS
# Define samples to compare
category = 'Treatment'   # column that holds the group labels
sample1 = 'ControlMO'
sample2 = 'SMPD3MO'
metric = 'Count'         # column that holds the measured values

# Extract each sample once so the test runs on clearly named inputs
values1 = df_subset.loc[df_subset[category] == sample1, metric]
values2 = df_subset.loc[df_subset[category] == sample2, metric]

# A label missing from the data would otherwise surface as a generic error inside scipy
for sample_label, sample_values in ((sample1, values1), (sample2, values2)):
    if sample_values.empty:
        raise ValueError(
            'No rows with ' + category + ' == ' + repr(sample_label)
            + ' in df_subset; cannot run the Kolmogorov-Smirnov test.'
        )

# Run 2-sample Kolmogorov-Smirnov Test
# NOTE(review): the KS test assumes continuous distributions; if `metric` holds integer
# counts with many ties, the reported p-value may be conservative - confirm this is acceptable.
ks_result = stats.ks_2samp(values1, values2)

# Display results of Kolmogorov-Smirnov test
print('Two-sample Kolmogorov-Smirnov test results for {} vs {}: '
      '\n\t\t\t\t statistic={}'
      '\n\t\t\t\t p-value={}'.format(sample1, sample2,
                                     str(ks_result.statistic), str(ks_result.pvalue)))

# Get number of cells within this test: one line per group of `category` present in
# df_subset, in order of first appearance (same column the test itself is grouped by)
for group_label, n_rows in df_subset.groupby(category, sort=False).size().items():
    print('n = ' + str(n_rows) + ' cells in the ' + str(group_label) + ' dataset.')

Two-sample Kolmogorov-Smirnov test results for ControlMO vs SMPD3MO: 
				 statistic=0.18094809143114515
				 p-value=0.003684573459250773
n = 214 cells in the ControlMO dataset.
n = 166 cells in the SMPD3MO dataset.
