In [27]:
# Import necessary packages
import os
import glob
import pandas as pd
import numpy as np
from scipy import stats
import scikit_posthocs

import iqplot
import bokeh.io
from bokeh.io import output_file, show
from bokeh.layouts import column, row
bokeh.io.output_notebook()

# Import data from directory of measurement tables, collected from Fiji

In [2]:
# Define path to directory with measurements
path = os.path.abspath('raw_data_csvs/')
df_summary = pd.DataFrame()
list_summary = []

# For loop to bring in files and concatenate them into a single dataframe
for file_ in glob.glob(path + "/*Summary.csv"):
    df = pd.read_csv(file_)
    # Determine Image name from file name, then parse experiment details from Image name
    df['Image'] = os.path.splitext(os.path.basename(file_))[0]
    
#     (df['Date'], df['CellLine'], df['Dose'], df['Time'], df['Treatment'], 
#      df['Channels'], df['Mag'], df['FOV'], df['Measurement']) = zip(*df['Image'].map(lambda x:x.split('_')))

    (df['Date'], df['CellLine'], df['Treatment'], df['Rep'], df['Channels'], 
     df['FOV'], df['Measurement']) = zip(*df['Image'].map(lambda x:x.split('_')))

#     (df['Date'], df['Treatment'], df['FOV'], df['Measurement']) = zip(*df['Image'].map(lambda x:x.split('_')))

    # Drop unnecessary columns for tidiness
    df = df.drop(['Total Area', 'Average Size', '%Area', 'Mean', 'IntDen', 'Image', 'Channels'], axis = 1)
    
    # Compile data
    list_summary.append(df)
df_summary = pd.concat(list_summary, sort=False)


# Preview dataframe to confirm import successful
df_summary.head()

Unnamed: 0,Slice,Count,Date,CellLine,Treatment,Rep,FOV,Measurement
0,1,19,20200820,U2OS,SMPD3,Rep2,FOV3,PunctaSummary
1,2,19,20200820,U2OS,SMPD3,Rep2,FOV3,PunctaSummary
0,1,5,20200820,U2OS,SMPD3N130A,Rep2,FOV7,PunctaSummary
1,2,8,20200820,U2OS,SMPD3N130A,Rep2,FOV7,PunctaSummary
0,1,18,20200820,U2OS,Dyn1K44A,Rep2,FOV1,PunctaSummary


# Parse dataframe by desired 'cell line' and 'treatment' combinations, then plot results

In [3]:
# Generate and display list of cell lines and treatments present in this dataset
cell_list = df_summary['CellLine'].unique().tolist()
treatment_list = df_summary['Treatment'].unique().tolist()

print('Cells lines: ' + str(cell_list))
print('Treatments: ' + str(treatment_list))

Cells lines: ['U2OS']
Treatments: ['SMPD3', 'SMPD3N130A', 'Dyn1K44A', '2aRFP']


In [48]:
# Prepare for parsing data:
# To populate this "comment on/off" code block, copy the results of th cell lists above
cells = [
    'U2OS', 
]
treatments = [
        '2aRFP',
#         'Dyn1K44A',
        'SMPD3',
#         'SMPD3N130A', 
        ]

# Copy dataset to not disrupt raw data
df_subset = df_summary

# Pull out only cells and treaments of interest
df_subset = df_subset.loc[df_subset['CellLine'].isin(cells)]
df_subset = df_subset.loc[df_subset['Treatment'].isin(treatments)]
# df_subset = df_subset['Count'].dropna()

# Make ECDF plot using iqplot
data_ecdf = iqplot.ecdf(
    data=df_subset, q='Count', cats='Treatment'
#     ,title=str(cells) + ' cells treated with ' + str(treatments)
    ,style='staircase'
    ,conf_int=True, n_bs_reps=1000, ptiles=[16, 84]   # ptiles values equate to 68% CIs (SEM)
#         ,conf_int=True, n_bs_reps=1000, ptiles=[2.5, 97.5]   # ptiles values equate to 68% CIs (SEM)
    ,line_kwargs=dict(line_width=3)
    ,show_legend=False

    # Other customization parameters
    ,frame_height = 350, frame_width = 450
    ,order = treatments
#     ,palette = ['#1f77b4', '#d62728', '#2ca02c', '#ff7f0e']
    ,x_axis_label='Transferrin-633 Puncta Count', y_axis_label='Cumulative Distribution Frequency'
    ,x_range=(0,50)
)

# Other customization parameters
data_ecdf.axis.axis_label_text_font_size = '20px'
data_ecdf.axis.axis_label_text_font_style = 'normal'
data_ecdf.axis.major_label_text_font_size = '18px'


# data_ecdf.output_backend = "svg"
bokeh.io.show(data_ecdf)

In [41]:
# Prepare for parsing data:
# To populate this "comment on/off" code block, copy the results of th cell lists above
cells = [
    'U2OS', 
]
treatments = [
        '2aRFP',
#         'Dyn1K44A',
        'SMPD3',
        'SMPD3N130A', 
        ]

# Copy dataset to not disrupt raw data
df_subset = df_summary

# Pull out only cells and treaments of interest
df_subset = df_subset.loc[df_subset['CellLine'].isin(cells)]
df_subset = df_subset.loc[df_subset['Treatment'].isin(treatments)]
# df_subset = df_subset['Count'].dropna()

# Make ECDF plot using iqplot
data_ecdf = iqplot.box(
    data=df_subset, q='Count', cats='Treatment', q_axis='y'
    ,order=treatments
#     ,palette=['#1f77b4', '#ff7f0e','#2ca02c', '#d62728']    # Color scheme for Control vs nSMase2 MO vs Rescue
#     ,palette=['#9467bd', '#d62728']              # Color scheme for H2BRFP vs nSMase2-FLAG

    # Other customization parameters
    ,frame_height = 350, frame_width = 150
    ,y_axis_label='Transferrin-633 Puncta Count'


#     ,jitter=True, jitter_kwargs=dict(width=0.3)
#     ,marker_kwargs=dict(alpha=0.8, size=1)
    ,whisker_caps=True
    ,box_kwargs=dict(line_color='black', line_width=1.5)
    ,whisker_kwargs=dict(line_color='black', line_width=1.5)
    ,median_kwargs=dict(line_color='black', line_width=2)

)

# Other customization parameters
data_ecdf.axis.axis_label_text_font_size = '20px'
data_ecdf.axis.axis_label_text_font_style = 'normal'
data_ecdf.axis.major_label_text_font_size = '18px'
data_ecdf.xaxis.major_label_orientation = 7

# data_ecdf.output_backend = "svg"
show(row(data_ecdf))

# Kruskal-Wallis Test with Dunn's Multiple Comparisons Correction
Useful for comparing multiple datasets

In [13]:
# Reminder of treatments to compare
treatment_list

['SMPD3', 'SMPD3N130A', 'Dyn1K44A', '2aRFP']

In [15]:
### Kruskal-Wallis test
# Define samples to compare
sample1 = '2aRFP'
sample2 = 'Dyn1K44A'
sample3 = 'SMPD3'
sample4 = 'SMPD3N130A'

# Run Kruskal-Wallis test
kw_result = stats.kruskal(
                df_subset.loc[df_subset['Treatment']==sample1]['Count']
               ,df_subset.loc[df_subset['Treatment']==sample2]['Count']
               ,df_subset.loc[df_subset['Treatment']==sample3]['Count']
               ,df_subset.loc[df_subset['Treatment']==sample4]['Count']
    )

# Dunn's Posthoc for Multiple Comparisons
mult_compar = scikit_posthocs.posthoc_dunn(df_subset, val_col='Count', group_col='Treatment'
                                           , sort=False, p_adjust='bonferroni').round(10)

# Display test results
print('Kruskal-Wallis test results: \n\t\t\t\t statistic=' + str(kw_result[0]) + 
    '\n\t\t\t\t p-value=' + str(kw_result[1]))
print("\nDunn's posthoc multiple comparison result: \n" + str(mult_compar) +'\n')
# mult_compar.to_csv("Results of Dunn's Posthoc.csv")

# Get number of cells within this test
for treatment in df_subset['Treatment'].unique().tolist():
    temp_df = df_subset.loc[df_subset['Treatment'] == treatment]
    print('n = ' + str(len(temp_df)) + ' cells in the ' + str(treatment) + ' dataset.')

Kruskal-Wallis test results: 
				 statistic=42.81823141572586
				 p-value=2.689693437699311e-09

Dunn's posthoc multiple comparison result: 
                   SMPD3  SMPD3N130A      Dyn1K44A     2aRFP
SMPD3       1.000000e+00    0.034371  1.000000e-09  0.047485
SMPD3N130A  3.437108e-02    1.000000  6.998710e-04  1.000000
Dyn1K44A    1.000000e-09    0.000700  1.000000e+00  0.000048
2aRFP       4.748527e-02    1.000000  4.791080e-05  1.000000

n = 29 cells in the SMPD3 dataset.
n = 36 cells in the SMPD3N130A dataset.
n = 35 cells in the Dyn1K44A dataset.
n = 50 cells in the 2aRFP dataset.
