In [1]:
# Import necessary packages
import os
import glob
import pandas as pd
import numpy as np
from scipy import stats
import scikit_posthocs

# Import plotting packages
import iqplot
import bokeh.io
from bokeh.io import output_file, show
from bokeh.layouts import column, row
bokeh.io.output_notebook()

# Import data from directory of measurement tables, collected from Fiji

In [2]:
# Define path to directory with measurements
path = os.path.abspath('raw_data_csvs/')
df_summary = pd.DataFrame()
list_summary = []

# For loop to bring in files and concatenate them into a single dataframe
for file_ in glob.glob(path + "/*_Intensity.csv"):
    df = pd.read_csv(file_)
    # Determine Image name from file name, then parse experiment details from Image name
    df['Image'] = os.path.splitext(os.path.basename(file_))[0]
    (df['Date'], df['Embryo'], df['Treatment'], df['Stains'], 
             df['FOV'], df['del1']) = zip(*df['Image'].map(lambda x:x.split('_')))
    (df['del2'], df['ROI']) = zip(*df['Label'].map(lambda x:x.split(':')))

    # Drop unnecessary columns for tidiness
    df = df.drop(['Label', 'RawIntDen', 'del1', 'del2'], axis = 1)
    
    # Compile data
    list_summary.append(df)
df_summary = pd.concat(list_summary, sort=False)

# Preview dataframe to confirm import successful
df_summary.head()

### Process intensities to generate CTCF
# Get a list of target genes measured
image_list = df_summary.Image.unique().tolist()

# Initialize for final dataframe collection
full_results = pd.DataFrame()
full_results_list = []

# Loop through image dataset:
for image in image_list:
    df_image = df_summary.loc[df_summary['Image'] == image][['Date', 'Embryo','Treatment',
                                                             'FOV','ROI','Area','Mean','IntDen']]
    background_mean = df_image.loc[df_image['ROI'] == 'background']['Mean']
    df_image_processed = df_image.loc[df_image['ROI'] != 'background']
    df_image_processed['CTCF'] = (df_image_processed['IntDen'] 
                                  - (df_image_processed['Area'] * float(background_mean)))
    full_results_list.append(df_image_processed)
    
# Assemble and view the final results
full_results = pd.concat(full_results_list,sort=False).reset_index().drop('index', axis=1)
full_results.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Date,Embryo,Treatment,FOV,ROI,Area,Mean,IntDen,CTCF
0,20210804,Emb1,DMSO;1percent,FOV3,0002-2872-2754,1579.685,1004.524,1586831.054,1293999.0
1,20210804,Emb1,DMSO;1percent,FOV3,0002-3014-1854,2612.541,426.274,1113657.189,629360.0
2,20210804,Emb1,DMSO;1percent,FOV3,0002-3268-0928,850.733,670.829,570696.681,412992.9
3,20210804,Emb1,DMSO;1percent,FOV3,0002-3218-0542,824.096,1079.102,889283.133,736517.2
4,20210804,Emb1,DMSO;1percent,FOV3,0002-2260-1270,713.246,418.297,298348.903,166131.6


In [8]:
value = 'CTCF'

treatment_list = [
    'DMSO;1percent', 
    'GW4869;25uM', 
]

embryo_list = [
    'Emb1',
    'Emb2',
    'Emb3',
    'Emb4',
]

df_subset = full_results
df_subset = df_subset.loc[df_subset['Treatment'].isin(treatment_list)]
df_subset = df_subset.loc[df_subset['Embryo'].isin(embryo_list)]
df_subset.to_csv('source_data_NC_Tf633_GW4869.csv')

# Make ECDF plot using iqplot
data_ecdf = iqplot.ecdf(
    data=df_subset, q=value, cats='Treatment', q_axis='x'
    ,style='staircase'
    ,conf_int=True, n_bs_reps=1000, ptiles=[2.5, 97.5]   # ptiles values equate to 95% CIs

    # Other customization parameters
    ,frame_height = 300, frame_width = 400
    ,x_axis_label='Transferrin-633 Intensity', y_axis_label='ECDF'
    ,x_axis_type='log'
)

# Other customization parameters
data_ecdf.title.text_font_size = '18px'
data_ecdf.legend.title='Treatment'
data_ecdf.legend.title_text_font_size = '16px'
data_ecdf.legend.label_text_font_size = '14px'
data_ecdf.axis.axis_label_text_font_size = '16px'
data_ecdf.axis.axis_label_text_font_style = 'bold'
data_ecdf.axis.major_label_text_font_size = '14px'


# Make stripbox plot using iqplot
data_stripbox = iqplot.stripbox(
    data=df_subset, q=value, cats='Treatment', q_axis='y'
    ,jitter=True

    # Other customization parameters
    ,marker_kwargs=dict(alpha=0.4, size=3
#                         , color='darkgray'
                       )
    ,box_kwargs=dict(line_color='black', line_width=1.5)
    ,whisker_kwargs=dict(line_color='black', line_width=1.5)
    ,median_kwargs=dict(line_color='maroon', line_width=4)
    ,top_level='box'
    ,frame_width=150, frame_height=300
    ,x_axis_label='Treatment', y_axis_label='Transferrin-633 Intensity (A.U.)'
    ,order=treatment_list
    ,y_axis_type='log'
    ,y_range=(2000,5000000)
    ,color_column='Treatment'
    ,palette=('#1F77B4', #blue
              '#FF7E0E', #orange
              '#2CA02C', #green
#               '#AC8BCC', #purple
             )
)

# Other customization parameters
data_stripbox.title.text_font_size = '18px'
data_stripbox.axis.axis_label_text_font_size = '16px'
data_stripbox.axis.axis_label_text_font_style = 'bold'
data_stripbox.axis.major_label_text_font_size = '14px'
data_stripbox.xaxis.major_label_orientation = 7
data_stripbox.background_fill_color = None
data_stripbox.border_fill_color = None

# data_stripbox.output_backend = "svg"
show(row(data_stripbox))

### Kolmogorov-Smirnov test - NO MULTIPLE COMPARISONS
# Define samples to compare
sample1 = 'DMSO;1percent'
sample2 = 'GW4869;25uM'
metric = value

# Run 2-sample Kolmogorov-Smirnov Test
ks_result = stats.ks_2samp(df_subset.loc[df_subset['Treatment']==sample1][metric]
                           ,df_subset.loc[df_subset['Treatment']==sample2][metric])
                              
# Display results of Kolmogorov-Smirnov test
print('Two-sample Kolmogorov-Smirnov test results for ' + sample1 + ' vs ' + sample2 + ': \n\t\t\t\t statistic=' + str(ks_result[0]) + 
    '\n\t\t\t\t p-value=' + str(ks_result[1]))

# Get number of cells within this test
for treatment in df_subset['Treatment'].unique().tolist():
    temp_df = df_subset.loc[df_subset['Treatment'] == treatment]
    print('n = ' + str(len(temp_df)) + ' cells in the ' + str(treatment) + ' dataset.')
print('n = '+ str(len(embryo_list)) + ' embryos in the dataset.')

Two-sample Kolmogorov-Smirnov test results for DMSO;1percent vs GW4869;25uM: 
				 statistic=0.903126841612646
				 p-value=9.694233860159707e-172
n = 317 cells in the DMSO;1percent dataset.
n = 364 cells in the GW4869;25uM dataset.
n = 4 embryos in the dataset.
