In [21]:
# Import necessary packages
import os
import glob
import pandas as pd
import numpy as np
from scipy import stats
import scikit_posthocs
import iqplot
import bokeh.io
bokeh.io.output_notebook()

# Import data from directory of measurement tables, collected from Fiji

In [22]:
# Define path to directory with measurements
path = os.path.abspath('2_Puncta/')
df_summary = pd.DataFrame()
list_summary = []

# For loop to bring in files and concatenate them into a single dataframe
for file_ in glob.glob(path + "/*Summary.csv"):
    df = pd.read_csv(file_)
    # Determine Image name from file name, then parse experiment details from Image name
    df['Image'] = os.path.splitext(os.path.basename(file_))[0]
    
#     (df['Date'], df['CellLine'], df['Dose'], df['Time'], df['Treatment'], 
#      df['Channels'], df['Mag'], df['FOV'], df['Measurement']) = zip(*df['Image'].map(lambda x:x.split('_')))

    (df['Date'], df['CellLine'], df['Treatment'], df['Rep'], df['Channels'], 
     df['FOV'], df['Measurement']) = zip(*df['Image'].map(lambda x:x.split('_')))

#     (df['Date'], df['Treatment'], df['FOV'], df['Measurement']) = zip(*df['Image'].map(lambda x:x.split('_')))

    # Drop unnecessary columns for tidiness
    df = df.drop(['Total Area', 'Average Size', '%Area', 'Mean', 'IntDen', 'Image', 'Channels'], axis = 1)
    
    # Compile data
    list_summary.append(df)
df_summary = pd.concat(list_summary, sort=False)


# Preview dataframe to confirm import successful
df_summary.head()

Unnamed: 0,Slice,Count,Date,CellLine,Treatment,Rep,FOV,Measurement
0,1,3,20200820,DF1,SMPD3gRNA,Rep1,FOV6,PunctaSummary
1,1,3,20200820,DF1,SMPD3gRNA,Rep1,FOV6,PunctaSummary
2,2,7,20200820,DF1,SMPD3gRNA,Rep1,FOV6,PunctaSummary
3,2,10,20200820,DF1,SMPD3gRNA,Rep1,FOV6,PunctaSummary
4,3,1,20200820,DF1,SMPD3gRNA,Rep1,FOV6,PunctaSummary


# Parse dataframe by desired 'cell line' and 'treatment' combinations, then plot results

In [23]:
# Generate and display list of cell lines and treatments present in this dataset
cell_list = df_summary['CellLine'].unique().tolist()
treatment_list = df_summary['Treatment'].unique().tolist()

print('Cells lines: ' + str(cell_list))
print('Treatments: ' + str(treatment_list))

Cells lines: ['DF1', 'U2OS']
Treatments: ['SMPD3gRNA', 'SMPD3', 'SMPD3N130A', 'Dyn1K44A', 'SGMS2', '2aRFP', 'ControlgRNA']


In [28]:
# Prepare for parsing data:
# To populate this "comment on/off" code block, copy the results of th cell lists above
cells = [
    'U2OS', 
#     'DF1',
]
treatments = [
        '2aRFP',
        'Dyn1K44A',
        'SMPD3',
        'SMPD3N130A', 
#         'SGMS2',
#         'ControlgRNA',
#         'SMPD3gRNA',
        ]


from bokeh.palettes import RdBu4

# Copy dataset to not disrupt raw data
df_subset = df_summary

# Pull out only cells and treaments of interest
df_subset = df_subset.loc[df_subset['CellLine'].isin(cells)]
df_subset = df_subset.loc[df_subset['Treatment'].isin(treatments)]
# df_subset = df_subset['Count'].dropna()

# Make ECDF plot using iqplot
data_ecdf = iqplot.ecdf(
    data=df_subset, q='Count', cats='Treatment'
#     ,title=str(cells) + ' cells treated with ' + str(treatments)
    ,style='staircase'
#     ,conf_int=True, n_bs_reps=1000, ptiles=[2.5, 97.5]   # ptiles values equate to 95% CIs
    ,conf_int=True, n_bs_reps=1000, ptiles=[16, 84]   # ptiles values equate to 68% CIs (SEM)
        ,line_kwargs=dict(line_width=3)
    ,show_legend=False

    # Other customization parameters
    ,frame_height = 350, frame_width = 450
    ,order = treatments
    ,palette = ['#1f77b4', '#d62728', '#2ca02c', '#ff7f0e']
#     ,palette = ['#019DE1', '#0070A1', '#00A6ED', '#008BC7']
#     ,palette = ['#E8754F', '#2066AC', '#68ADCC', '#666666']
    ,x_axis_label='Transferrin-633 Puncta Count', y_axis_label='Cumulative Distribution Frequency'
    ,x_range=(0,50)
#     ,y_axis_type='log'
        
)

# Other customization parameters
data_ecdf.axis.axis_label_text_font_size = '20px'
data_ecdf.axis.axis_label_text_font_style = 'normal'
data_ecdf.axis.major_label_text_font_size = '18px'


# data_ecdf.output_backend = "svg"
bokeh.io.show(data_ecdf)

In [10]:
# Prepare for parsing data:
# To populate this "comment on/off" code block, copy the results of th cell lists above
cells = [
    'U2OS', 
#     'DF1',
]
treatments = [
        '2aRFP',
        'Dyn1K44A',
        'SMPD3',
        'SMPD3N130A', 
#         'SGMS2',
#         'ControlgRNA',
#         'SMPD3gRNA',
        ]


from bokeh.palettes import RdBu4

# Copy dataset to not disrupt raw data
df_subset = df_summary

# Pull out only cells and treaments of interest
df_subset = df_subset.loc[df_subset['CellLine'].isin(cells)]
df_subset = df_subset.loc[df_subset['Treatment'].isin(treatments)]
# df_subset = df_subset['Count'].dropna()

# Make ECDF plot using iqplot
data_ecdf = iqplot.stripbox(
    data=df_subset, q='Count', cats='Treatment', q_axis='y'
#     ,title=str(cells) + ' cells treated with ' + str(treatments)
#     ,style='staircase'
#     ,conf_int=True, n_bs_reps=1000, ptiles=[2.5, 97.5]   # ptiles values equate to 95% CIs
#     ,conf_int=True, n_bs_reps=1000, ptiles=[16, 84]   # ptiles values equate to 68% CIs (SEM)
#     ,show_legend=False

    # Other customization parameters
    ,frame_height = 300, frame_width = 300
    ,order = treatments
    ,palette = ['#666666', '#666666', '#666666', '#666666']
#     ,palette = ['#1f77b4', '#d62728', '#2ca02c', '#ff7f0e']
    ,jitter=True
#     ,palette = ['#019DE1', '#0070A1', '#00A6ED', '#008BC7']
#     ,palette = ['#E8754F', '#2066AC', '#68ADCC', '#666666']
    ,x_axis_label='Treatment', y_axis_label='Transferrin-633 Puncta'
#     ,x_range=(0,50)
#     ,y_axis_type='log'
        
)

# Other customization parameters
data_ecdf.title.text_font_size = '18px'
data_ecdf.legend.title='Treatment'
# data_ecdf.legend.title_text_font_style = 'bold'
data_ecdf.legend.title_text_font_size = '16px'
data_ecdf.legend.label_text_font_size = '14px'

data_ecdf.axis.axis_label_text_font_size = '16px'
data_ecdf.axis.axis_label_text_font_style = 'bold'
data_ecdf.axis.major_label_text_font_size = '14px'

# data_ecdf.output_backend = "svg"
bokeh.io.show(data_ecdf)

You are attempting to set `plot.legend.title` on a plot that has zero legends added, this will have no effect.

Before legend properties can be set, you must add a Legend explicitly, or call a glyph method with a legend parameter set.

You are attempting to set `plot.legend.title_text_font_size` on a plot that has zero legends added, this will have no effect.

Before legend properties can be set, you must add a Legend explicitly, or call a glyph method with a legend parameter set.

You are attempting to set `plot.legend.label_text_font_size` on a plot that has zero legends added, this will have no effect.

Before legend properties can be set, you must add a Legend explicitly, or call a glyph method with a legend parameter set.



# Kruskal-Wallis Test with Dunn's Multiple Comparisons Correction
Useful for comparing multiple datasets

In [50]:
# Reminder of treatments to compare
treatment_list

['SMPD3gRNA',
 'SMPD3',
 'SMPD3N130A',
 'Dyn1K44A',
 'SGMS2',
 '2aRFP',
 'ControlgRNA']

In [64]:
### Kruskal-Wallis test
# Define samples to compare
sample1 = '2aRFP'
sample2 = 'Dyn1K44A'
sample3 = 'SMPD3'
sample4 = 'SMPD3N130A'
# sample5 = 'SGMS2'
# sample6 = 'nSMase2gRNA1'

# Run Kruskal-Wallis test
kw_result = stats.kruskal(
                df_subset.loc[df_subset['Treatment']==sample1]['Count']
               ,df_subset.loc[df_subset['Treatment']==sample2]['Count']
               ,df_subset.loc[df_subset['Treatment']==sample3]['Count']
               ,df_subset.loc[df_subset['Treatment']==sample4]['Count']
#                ,df_subset.loc[df_subset['Treatment']==sample5]['Count']
#                ,df_subset.loc[df_subset['Treatment']==sample6]['Count']
    )

# Dunn's Posthoc for Multiple Comparisons
mult_compar = scikit_posthocs.posthoc_dunn(df_subset, val_col='Count', group_col='Treatment'
                                           , sort=False, p_adjust='bonferroni').round(6)

# Display test results
print('Kruskal-Wallis test results: \n\t\t\t\t statistic=' + str(kw_result[0]) + 
    '\n\t\t\t\t p-value=' + str(kw_result[1]))
print("\nDunn's posthoc multiple comparison result: \n" + str(mult_compar) +'\n')
# mult_compar.to_csv("Results of Dunn's Posthoc.csv")

# Get number of cells within this test
for treatment in df_subset['Treatment'].unique().tolist():
    temp_df = df_subset.loc[df_subset['Treatment'] == treatment]
    print('n = ' + str(len(temp_df)) + ' cells in the ' + str(treatment) + ' dataset.')

Kruskal-Wallis test results: 
				 statistic=42.81823141572586
				 p-value=2.689693437699311e-09

Dunn's posthoc multiple comparison result: 
               SMPD3  SMPD3N130A  Dyn1K44A     2aRFP
SMPD3       1.000000    0.034371  0.000000  0.047485
SMPD3N130A  0.034371    1.000000  0.000700  1.000000
Dyn1K44A    0.000000    0.000700  1.000000  0.000048
2aRFP       0.047485    1.000000  0.000048  1.000000

n = 29 cells in the SMPD3 dataset.
n = 36 cells in the SMPD3N130A dataset.
n = 35 cells in the Dyn1K44A dataset.
n = 50 cells in the 2aRFP dataset.


# Kolmogorov-Smirnov Test
Useful for comparing two datasets, not multiple comparisons

In [182]:
### Kolmogorov-Smirnov test - NO MULTIPLE COMPARISONS
# Define samples to compare
sample1 = '2aRFP'
sample2 = 'SMPD3'

# Run 2-sample Kolmogorov-Smirnov Test
ks_result = stats.ks_2samp(df_subset.loc[df_subset['Treatment']==sample1]['Count']
                           ,df_subset.loc[df_subset['Treatment']==sample2]['Count']
                          )
                              
# Display results of Kolmogorov-Smirnov test
print('Two-sample Kolmogorov-Smirnov test results for ' + sample1 + ' vs ' + sample2 + ': \n\t\t\t\t statistic=' + str(ks_result[0]) + 
    '\n\t\t\t\t p-value=' + str(ks_result[1]))

Two-sample Kolmogorov-Smirnov test results for 2aRFP vs SMPD3: 
				 statistic=0.3082758620689655
				 p-value=0.046374994877002695


## Examine data using other plots from iqplot

In [213]:
# Make the variety of plots
data_box =iqplot.box(data=df_subset, q='Count', cats='Treatment', q_axis='y',
#                      order=['RFP', 'Dyn1KA'], 
                     whisker_caps=True, frame_height = 300, frame_width = 200)

data_strip =iqplot.strip(data=df_subset, q='Count', cats='Treatment', q_axis='y', 
#                          order=['RFP', 'Dyn1KA'], 
                         jitter=True, frame_height=300, frame_width=200)

data_histo = iqplot.histogram(data=df_subset, q='Count', cats='Treatment', density=True, frame_width=550)

# Display plots
bokeh.io.show(bokeh.layouts.gridplot([data_box, data_strip, data_histo], ncols=3))

In [53]:
data_strip =iqplot.strip(data=df_subset, q='Count', cats='Treatment', q_axis='y', 
#                          order=['RFP', 'Dyn1KA'], 
                         jitter=True, frame_height=300, frame_width=200)
bokeh.io.show(data_strip)

In [38]:
# Define path to directory with measurements
path = os.path.abspath('2_Puncta/')
df_summary = pd.DataFrame()
list_summary = []
df_details = pd.DataFrame()
list_details = []

# For loop to bring in files and concatenate them into a single dataframe
for file_ in glob.glob(path + "/*Summary.csv"):
    df = pd.read_csv(file_)
    # Determine Image name from file name, then parse experiment details from Image name
    df['Image'] = os.path.splitext(os.path.basename(file_))[0]
    (df['Date'], df['CellLine'], df['Treatment'], df['Rep'], df['Channels'], 
     df['FOV'], df['Measurement']) = zip(*df['Image'].map(lambda x:x.split('_')))

    # Drop unnecessary columns for tidiness
    df = df.drop(['Total Area', 'Average Size', '%Area', 'Mean', 'IntDen', 'Image', 'Channels'], axis = 1)
    
    # Compile data
    list_summary.append(df)
df_summary = pd.concat(list_summary, sort=False)


# For loop to bring in files and concatenate them into a single dataframe
for file_ in glob.glob(path + "/*Details.csv"):
    df = pd.read_csv(file_)
    # Determine Image name from file name, then parse experiment details from Image name
    df['Image'] = os.path.splitext(os.path.basename(file_))[0]
    (df['Date'], df['CellLine'], df['Treatment'], df['Rep'], df['Channels'], 
     df['FOV'], df['Measurement']) = zip(*df['Image'].map(lambda x:x.split('_')))
    (df['ROI'], df['?unknown?']) = zip(*df['Label'].map(lambda x:x.split(':')))
    df = df.rename({" ": "?Vesicle?"}, axis=1)

    # Drop unnecessary columns for tidiness
    df = df.drop(['Mean', 'IntDen', 'Image', 'Channels', 'Mean', 'IntDen', 'RawIntDen'], axis = 1)
    
    # Compile data
    list_details.append(df)
df_details = pd.concat(list_details, sort=False)

full_df = pd.merge(df_summary, df_details, on=['Date', 'CellLine', 'Treatment', 'Rep', 'FOV'])

# full_df = full_df.dropna()

# Preview dataframe to confirm import successful
# full_df