In [1]:
# Import necessary packages
import os
import glob
import pandas as pd
import numpy as np
from scipy import stats
import scikit_posthocs

# Import plotting packages
import iqplot
import bokeh.io
from bokeh.io import output_file, show
from bokeh.layouts import column, row
bokeh.io.output_notebook()

# Import data from directory of measurement tables, collected from Fiji

In [2]:
# Define path to directory with measurements
path = os.path.abspath('raw_data_csvs/')
df_summary = pd.DataFrame()
list_summary = []

# For loop to bring in files and concatenate them into a single dataframe
for file_ in glob.glob(path + "/*_Puncta.csv"):
    df = pd.read_csv(file_)
    # Determine Image name from file name, then parse experiment details from Image name
    df['Image'] = os.path.splitext(os.path.basename(file_))[0]
    (df['Date'], df['Embryo'], df['Treatment'], df['Stains'], 
             df['FOV'], df['del1']) = zip(*df['Image'].map(lambda x:x.split('_')))

    # Drop unnecessary columns for tidiness
    df = df.drop(['Slice', 'Total Area', 'Average Size', '%Area', 'Mean', 'IntDen', 'del1'], axis = 1)
    
    # Compile data
    list_summary.append(df)
df_summary = pd.concat(list_summary, sort=False)

# Preview dataframe to confirm import successful
df_summary.head()

# Assign import to full_results df
full_results = df_summary.copy()
full_results.head()

Unnamed: 0,Count,Image,Date,Embryo,Treatment,Stains,FOV
0,1,20210807_Emb6_H2BRFP_Tf633;H2BRFP;TPMT_FOV2_Pu...,20210807,Emb6,H2BRFP,Tf633;H2BRFP;TPMT,FOV2
1,2,20210807_Emb6_H2BRFP_Tf633;H2BRFP;TPMT_FOV2_Pu...,20210807,Emb6,H2BRFP,Tf633;H2BRFP;TPMT,FOV2
2,2,20210807_Emb6_H2BRFP_Tf633;H2BRFP;TPMT_FOV2_Pu...,20210807,Emb6,H2BRFP,Tf633;H2BRFP;TPMT,FOV2
0,7,20210807_Emb6_SMPD3FLAG_Tf633;H2BRFP;TPMT_FOV2...,20210807,Emb6,SMPD3FLAG,Tf633;H2BRFP;TPMT,FOV2
1,12,20210807_Emb6_SMPD3FLAG_Tf633;H2BRFP;TPMT_FOV2...,20210807,Emb6,SMPD3FLAG,Tf633;H2BRFP;TPMT,FOV2


## Plot and Analyze nSMase2 LOF vs Rescue

In [3]:
# Parse out desired treatments
treatment_list = [
    'ControlMO',
    'SMPD3MO',
    'SMPD3FLAG',
    'SMPD3MO;SMPD3FLAG',
]

df_subset = full_results
df_subset = df_subset.loc[df_subset['Treatment'].isin(treatment_list)]

# Make ECDF plot using iqplot
data_ecdf = iqplot.box(
    data=df_subset, q='Count', cats='Treatment', q_axis='x'
    ,order=treatment_list
#     ,palette=['#1f77b4', '#ff7f0e','#2ca02c', '#d62728']    # Color scheme for Control vs nSMase2 MO vs Rescue
#     ,palette=['#9467bd', '#d62728']              # Color scheme for H2BRFP vs nSMase2-FLAG

    # Other customization parameters
    ,frame_height = 150, frame_width = 450
    ,x_axis_label='Transferrin-633 Puncta Count'


#     ,jitter=True, jitter_kwargs=dict(width=0.3)
#     ,marker_kwargs=dict(alpha=0.8, size=1)
    ,box_kwargs=dict(line_color='black', line_width=1.5)
    ,whisker_kwargs=dict(line_color='black', line_width=1.5)
    ,median_kwargs=dict(line_color='black', line_width=2)
        ,whisker_caps=True

)

# Other customization parameters
data_ecdf.axis.axis_label_text_font_size = '20px'
data_ecdf.axis.axis_label_text_font_style = 'normal'
data_ecdf.axis.major_label_text_font_size = '18px'
# data_ecdf.xaxis.major_label_orientation = 7

# data_ecdf.output_backend = "svg"
show(row(data_ecdf))

In [11]:
# Parse out desired treatments
treatment_list = [
    'ControlMO',
    'SMPD3MO',
    'SMPD3FLAG',
#     'SMPD3MO;SMPD3FLAG',
]

df_subset = full_results
df_subset = df_subset.loc[df_subset['Treatment'].isin(treatment_list)]

# Make ECDF plot using iqplot
data_ecdf = iqplot.ecdf(
    data=df_subset, q='Count', cats='Treatment', q_axis='x'
    ,style='staircase'
    ,order=treatment_list
    ,palette=['#1f77b4', '#ff7f0e','#2ca02c', '#d62728']    # Color scheme for Control vs nSMase2 MO vs Rescue
#     ,palette=['#9467bd', '#d62728']              # Color scheme for H2BRFP vs nSMase2-FLAG
    ,line_kwargs=dict(line_width=3)
    ,conf_int=True, n_bs_reps=1000, ptiles=[2.5, 97.5]   # ptiles values equate to 95% Confidence Intervals

    # Other customization parameters
#     ,x_range=(-1.5,25)
    ,frame_height = 350, frame_width = 450
    ,x_axis_label='Transferrin-633 Puncta Count', y_axis_label='Cumulative Distribution Frequency'
    ,show_legend=False
#     ,x_axis_type='log'
    ,x_range=(-1, 20)
)

# Other customization parameters
data_ecdf.axis.axis_label_text_font_size = '20px'
data_ecdf.axis.axis_label_text_font_style = 'normal'
data_ecdf.axis.major_label_text_font_size = '18px'

# data_ecdf.output_backend = "svg"
data_ecdf.background_fill_color = None
data_ecdf.border_fill_color = None
# data_ecdf.legend_fill_color = None
show(row(data_ecdf))

### Kruskal-Wallis Test with Dunn's Multiple Comparisons Correction
Useful for comparing multiple datasets

In [42]:
### Kruskal-Wallis test
# Define samples to compare
sample1 = 'ControlMO'
sample2 = 'SMPD3MO'
sample3 = 'SMPD3MO;SMPD3FLAG'


# Run Kruskal-Wallis test
kw_result = stats.kruskal(
                df_subset.loc[df_subset['Treatment']==sample1]['Count']
               ,df_subset.loc[df_subset['Treatment']==sample2]['Count']
               ,df_subset.loc[df_subset['Treatment']==sample3]['Count']

    )

# Dunn's Posthoc for Multiple Comparisons
mult_compar = scikit_posthocs.posthoc_dunn(df_subset, val_col='Count', group_col='Treatment'
                                           , sort=False, p_adjust='bonferroni').round(10)

# Display test results
print('Kruskal-Wallis test results: \n\t\t\t\t statistic=' + str(kw_result[0]) + 
    '\n\t\t\t\t p-value=' + str(kw_result[1]))
print("\nDunn's posthoc multiple comparison result: \n" + str(mult_compar) +'\n')
# mult_compar.to_csv("Results of Dunn's Posthoc.csv")

# Get number of cells within this test
for treatment in df_subset['Treatment'].unique().tolist():
    temp_df = df_subset.loc[df_subset['Treatment'] == treatment]
    print('n = ' + str(len(temp_df)) + ' cells in the ' + str(treatment) + ' dataset.')

Kruskal-Wallis test results: 
				 statistic=nan
				 p-value=nan

Dunn's posthoc multiple comparison result: 
              SMPD3FLAG  SMPD3MO     ControlMO
SMPD3FLAG  1.000000e+00      0.0  4.790000e-08
SMPD3MO    0.000000e+00      1.0  0.000000e+00
ControlMO  4.790000e-08      0.0  1.000000e+00

n = 194 cells in the SMPD3FLAG dataset.
n = 250 cells in the SMPD3MO dataset.
n = 300 cells in the ControlMO dataset.


## Plot and Analyze H2B-RFP vs nSMase2-FLAG

In [7]:
# Parse out desired treatments
treatment_list = [
    'H2BRFP',
    'SMPD3FLAG',
]

df_subset = full_results
df_subset = df_subset.loc[df_subset['Treatment'].isin(treatment_list)]

# Make ECDF plot using iqplot
data_ecdf = iqplot.ecdf(
    data=df_subset, q='Count', cats='Treatment', q_axis='x'
    ,style='staircase'
    ,order=treatment_list
#     ,palette=['#1f77b4', '#ff7f0e','#2ca02c']    # Color scheme for Control vs nSMase2 MO vs Rescue
    ,palette=['#9467bd', '#d62728']              # Color scheme for H2BRFP vs nSMase2-FLAG
    ,line_kwargs=dict(line_width=3)
    ,conf_int=True, n_bs_reps=1000, ptiles=[2.5, 97.5]   # ptiles values equate to 95% Confidence Intervals

    # Other customization parameters
#     ,x_range=(-1.5,25)
    ,frame_height = 350, frame_width = 450
    ,x_axis_label='Transferrin-633 Puncta Count', y_axis_label='Cumulative Distribution Frequency'
    ,show_legend=True
)

# Other customization parameters
data_ecdf.axis.axis_label_text_font_size = '20px'
data_ecdf.axis.axis_label_text_font_style = 'normal'
data_ecdf.axis.major_label_text_font_size = '18px'

# data_ecdf.output_backend = "svg"
show(row(data_ecdf))

### Kolmogorov-Smirnov Test
Useful for comparing two datasets, not multiple comparisons

In [14]:
### Kolmogorov-Smirnov test - NO MULTIPLE COMPARISONS
# Define samples to compare
sample1 = 'H2BRFP'
sample2 = 'SMPD3FLAG'
metric = 'Count'

# Run 2-sample Kolmogorov-Smirnov Test
ks_result = stats.ks_2samp(df_subset.loc[df_subset['Treatment']==sample1][metric]
                           ,df_subset.loc[df_subset['Treatment']==sample2][metric]
                          )
                              
# Display results of Kolmogorov-Smirnov test
print('Two-sample Kolmogorov-Smirnov test results for ' + sample1 + ' vs ' + sample2 + ': \n\t\t\t\t statistic=' + str(ks_result[0]) + 
    '\n\t\t\t\t p-value=' + str(ks_result[1]))

# Get number of cells within this test
for treatment in df_subset['Treatment'].unique().tolist():
    temp_df = df_subset.loc[df_subset['Treatment'] == treatment]
    print('n = ' + str(len(temp_df)) + ' cells in the ' + str(treatment) + ' dataset.')

Two-sample Kolmogorov-Smirnov test results for H2BRFP vs SMPD3FLAG: 
				 statistic=0.2654639175257732
				 p-value=1.2496611226042731e-05
n = 144 cells in the H2BRFP dataset.
n = 194 cells in the SMPD3FLAG dataset.
