In [1]:
# Import necessary packages
import os
import glob
import pandas as pd
import numpy as np
from scipy import stats
import scikit_posthocs

# Import plotting packages
import iqplot
import bokeh.io
from bokeh.io import output_file, show
from bokeh.layouts import column, row
bokeh.io.output_notebook()

# Import data from directory of measurement tables, collected from Fiji

In [8]:
# Define path to directory with measurements
path = os.path.abspath('raw_data_csvs/')
list_summary = []

# Bring in every "*_Intensity.csv" file and concatenate them into a single dataframe.
# glob returns files in an OS-dependent order, so sort for a reproducible row order.
for file_ in sorted(glob.glob(os.path.join(path, "*_Intensity.csv"))):
    df = pd.read_csv(file_)

    # Determine Image name from file name, then parse experiment details from Image name.
    # The name is expected to have exactly six '_'-separated fields:
    # Date_Embryo_Treatment_Stains_FOV_<suffix>; the trailing suffix is discarded.
    image_name = os.path.splitext(os.path.basename(file_))[0]
    name_parts = image_name.split('_')
    if len(name_parts) != 6:
        raise ValueError(
            f"Cannot parse '{image_name}': expected 6 '_'-separated fields, "
            f"found {len(name_parts)}."
        )
    date, embryo, treatment, stains, fov, _suffix = name_parts
    df['Image'] = image_name
    df['Date'] = date
    df['Embryo'] = embryo
    df['Treatment'] = treatment
    df['Stains'] = stains
    df['FOV'] = fov

    # The Label column is expected to be "<prefix>:<ROI>"; keep only the ROI part
    label_parts = df['Label'].str.split(':')
    if not (label_parts.str.len() == 2).all():
        raise ValueError(
            f"Unexpected 'Label' format in '{file_}': expected '<prefix>:<ROI>'."
        )
    df['ROI'] = label_parts.str[1]

    # Drop unnecessary columns for tidiness
    df = df.drop(['Label', 'RawIntDen'], axis=1)

    # Compile data
    list_summary.append(df)

# Fail with an actionable message instead of pandas' "No objects to concatenate"
if not list_summary:
    raise FileNotFoundError(f"No '*_Intensity.csv' files found in '{path}'.")
df_summary = pd.concat(list_summary, sort=False)

# Preview dataframe to confirm import successful
df_summary.head()

### Process intensities to generate CTCF
# Get a list of the images in the dataset (one entry per imported measurement file)
image_list = df_summary.Image.unique().tolist()

# Columns carried through to the final results table
ctcf_columns = ['Date', 'Embryo', 'Treatment', 'FOV', 'ROI', 'Area', 'Mean', 'IntDen']

# Initialize for final dataframe collection
full_results_list = []

# Loop through image dataset:
for image in image_list:
    df_image = df_summary.loc[df_summary['Image'] == image, ctcf_columns]
    is_background = df_image['ROI'] == 'background'

    # Exactly one background ROI per image is required to get a scalar background level
    background_rows = df_image.loc[is_background, 'Mean']
    if len(background_rows) != 1:
        raise ValueError(
            f"Image '{image}' has {len(background_rows)} 'background' ROIs; "
            "exactly one is required to compute CTCF."
        )
    background_mean = float(background_rows.iloc[0])

    # Take an explicit copy so adding the CTCF column does not write to a slice
    # of df_image (this is what raised SettingWithCopyWarning)
    df_image_processed = df_image.loc[~is_background].copy()

    # CTCF = integrated density - (ROI area * mean background intensity)
    df_image_processed['CTCF'] = (df_image_processed['IntDen']
                                  - df_image_processed['Area'] * background_mean)
    full_results_list.append(df_image_processed)

# Assemble and view the final results
full_results = pd.concat(full_results_list, sort=False).reset_index(drop=True)
full_results.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Date,Embryo,Treatment,FOV,ROI,Area,Mean,IntDen,CTCF
0,20210807,Emb12,SMPD3MO;SMPD3FLAG,FOV2,0003-0718-2038,500.043,933.722,466900.874,430171.215521
1,20210807,Emb12,SMPD3MO;SMPD3FLAG,FOV2,0003-0481-1986,911.574,561.04,511430.142,444472.296978
2,20210807,Emb12,SMPD3MO;SMPD3FLAG,FOV2,0003-1167-1996,1215.433,500.452,608265.576,518988.375851
3,20210807,Emb12,SMPD3MO;SMPD3FLAG,FOV2,0003-1159-1630,846.166,444.937,376490.417,314336.985802
4,20210807,Emb12,SMPD3MO;SMPD3FLAG,FOV2,0003-1210-0490,1066.548,480.795,512790.726,434449.575756


In [9]:
# List the distinct treatment labels present in the processed results
treatments_present = full_results['Treatment'].unique()
treatments_present.tolist()

['SMPD3MO;SMPD3FLAG', 'ControlMO', 'SMPD3FLAG', 'SMPD3MO']

In [11]:
# Column of full_results to plot and test
value = 'CTCF'

# Treatments to include, in the order they should appear on the x axis
treatment_list = [
    'ControlMO',
    'SMPD3MO',
    'SMPD3FLAG',
    'SMPD3MO;SMPD3FLAG'
]

# NOTE(review): `embryo_list` was referenced here but never defined anywhere in the
# notebook, so this cell failed with NameError on a fresh kernel. It now defaults to
# every embryo present in the data; replace it with an explicit list of embryo IDs
# if only a subset of embryos was meant to be analysed.
embryo_list = full_results['Embryo'].unique().tolist()

# Keep only the requested treatments and embryos, then save the plotted source data
df_subset = full_results.loc[
    full_results['Treatment'].isin(treatment_list)
    & full_results['Embryo'].isin(embryo_list)
]
df_subset.to_csv('source_data_NC_Tf633_SMPD3Perturbs.csv')

# Make stripbox plot using iqplot
data_stripbox = iqplot.stripbox(
    data=df_subset, q=value, cats='Treatment', q_axis='y',
    jitter=True,

    # Other customization parameters
    marker_kwargs=dict(alpha=0.4, size=3),
    box_kwargs=dict(line_color='black', line_width=1.5),
    whisker_kwargs=dict(line_color='black', line_width=1.5),
    median_kwargs=dict(line_color='maroon', line_width=4),
    top_level='box',
    frame_width=250, frame_height=300,
    x_axis_label='Treatment', y_axis_label='Transferrin-633 Intensity (A.U.)',
    order=treatment_list,
    y_axis_type='log',
    y_range=(8000, 5000000),
    color_column='Treatment',
    palette=('#1F77B4',  # blue
             '#2CA02C',  # green
             '#FF7E0E',  # orange
             '#AC8BCC',  # purple
             ),
    tooltips=[("Embryo", "@Embryo"), ("FOV", "@FOV")],
)

# Figure-level styling
data_stripbox.title.text_font_size = '18px'
data_stripbox.axis.axis_label_text_font_size = '16px'
data_stripbox.axis.axis_label_text_font_style = 'bold'
data_stripbox.axis.major_label_text_font_size = '14px'
# NOTE(review): Bokeh reads a numeric orientation as an angle in radians; 7 is kept
# from the original -- confirm this is the intended tick-label tilt.
data_stripbox.xaxis.major_label_orientation = 7
data_stripbox.background_fill_color = None
data_stripbox.border_fill_color = None

# data_stripbox.output_backend = "svg"
show(row(data_stripbox))

### Kruskal-Wallis test
metric = value

# Run Kruskal-Wallis test on one sample per treatment, in treatment_list order.
# Building the samples from treatment_list keeps the test in sync with the plot
# when treatments are added or removed.
treatment_samples = [
    df_subset.loc[df_subset['Treatment'] == treatment, metric]
    for treatment in treatment_list
]
kw_result = stats.kruskal(*treatment_samples)

# Dunn's Posthoc for Multiple Comparisons (Bonferroni-adjusted p-values)
mult_compar = scikit_posthocs.posthoc_dunn(
    df_subset, val_col=metric, group_col='Treatment',
    sort=False, p_adjust='bonferroni',
).round(6)

# Display test results
print('Kruskal-Wallis test results: \n\t\t\t\t statistic=' + str(kw_result[0]) +
      '\n\t\t\t\t p-value=' + str(kw_result[1]))
print("\nDunn's posthoc multiple comparison result: \n" + str(mult_compar) + '\n')
# mult_compar.to_csv("Results of Dunn's Posthoc.csv")

# Get number of cells within this test
for treatment in df_subset['Treatment'].unique().tolist():
    temp_df = df_subset.loc[df_subset['Treatment'] == treatment]
    print('n = ' + str(len(temp_df)) + ' cells in the ' + str(treatment) + ' dataset.')

# Count the embryos actually present in the tested data rather than relying on the
# length of a separately maintained list.
# NOTE(review): this counts distinct 'Embryo' labels; if labels are reused across
# dates, count distinct (Date, Embryo) pairs instead -- confirm.
print('n = ' + str(df_subset['Embryo'].nunique()) + ' embryos in the dataset.')

Kruskal-Wallis test results: 
				 statistic=87.43786018059654
				 p-value=7.776498506929175e-19

Dunn's posthoc multiple comparison result: 
                   SMPD3MO;SMPD3FLAG  ControlMO  SMPD3FLAG   SMPD3MO
SMPD3MO;SMPD3FLAG           1.000000   0.173993   0.027113  0.000019
ControlMO                   0.173993   1.000000   0.000000  0.016574
SMPD3FLAG                   0.027113   0.000000   1.000000  0.000000
SMPD3MO                     0.000019   0.016574   0.000000  1.000000

n = 74 cells in the SMPD3MO;SMPD3FLAG dataset.
n = 135 cells in the ControlMO dataset.
n = 163 cells in the SMPD3FLAG dataset.
n = 112 cells in the SMPD3MO dataset.
n = 9 embryos in the dataset.
