In [1]:
# Import necessary packages
import os
import glob
import pandas as pd
import numpy as np
from scipy import stats
import scikit_posthocs

# Import plotting packages
import iqplot
import bokeh.io
from bokeh.transform import jitter

from bokeh.io import output_file, show
from bokeh.layouts import column, row
bokeh.io.output_notebook()

# Import data from directory of measurement tables, collected from Fiji

In [2]:
# Define path to directory with measurements
path = os.path.abspath('raw_data_csvs/')
df_summary = pd.DataFrame()
list_summary = []

# For loop to bring in files and concatenate them into a single dataframe
for file_ in glob.glob(path + "/*_InternalLRP6.csv"):
    df = pd.read_csv(file_)
    # Determine Image name from file name, then parse experiment details from Image name
    df['Image'] = os.path.splitext(os.path.basename(file_))[0]
    (df['Date'], df['Embryo'], df['Treatment'], df['Stains'], 
             df['FOV'], df['Channel']) = zip(*df['Image'].map(lambda x:x.split('_')))
    (df['del1'], df['ROI']) = zip(*df['Label'].map(lambda x:x.split(':')))

    # Drop unnecessary columns for tidiness
    df = df.drop(['Label', 'RawIntDen', 'del1'], axis = 1)
    
    # Compile data
    list_summary.append(df)
df_summary = pd.concat(list_summary, sort=False)

### Process intensities to generate CTCF
# Get a list of target genes measured
image_list = df_summary.Image.unique().tolist()

# Initialize for final dataframe collection
full_results = pd.DataFrame()
full_results_list = []

# Loop through image dataset:
for image in image_list:
    df_image = df_summary.loc[df_summary['Image'] == image][['Date', 'Embryo','Treatment',
                                                             'FOV','ROI','Area','Mean','IntDen']]
    background_mean = df_image.loc[df_image['ROI'] == 'background']['Mean']
    df_image_processed = df_image.loc[df_image['ROI'] != 'background']
    df_image_processed['CTCF'] = (df_image_processed['IntDen'] 
                                  - (df_image_processed['Area'] * float(background_mean)))
    full_results_list.append(df_image_processed)
    
# Assemble and view the final results
full_results = pd.concat(full_results_list,sort=False).reset_index().drop('index', axis=1)
internal_lrp6 = full_results
internal_lrp6 = internal_lrp6.rename({'CTCF': 'Internal LRP6 CTCF', 'Mean': 'Internal LRP6 Mean', 'IntDen': 'Internal LRP6 IntDen'}, axis=1)
internal_lrp6.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Date,Embryo,Treatment,FOV,ROI,Area,Internal LRP6 Mean,Internal LRP6 IntDen,Internal LRP6 CTCF
0,20210807,Emb8,SMPD3MO,FOV3,0004-0532-0516,228.1,451.249,102929.906,28168.0781
1,20210807,Emb1,ControlMO,FOV2,0004-0290-1029,334.529,846.268,283101.573,183439.696907
2,20210807,Emb1,ControlMO,FOV2,0004-0576-0675,383.243,494.507,189516.174,75341.569169
3,20210807,Emb1,ControlMO,FOV2,0004-0941-0515,384.532,591.308,227376.522,112817.902156
4,20210807,Emb1,ControlMO,FOV2,0004-1104-0483,363.102,448.142,162721.112,54546.853466


In [3]:
# Define path to directory with measurements
path = os.path.abspath('raw_data_csvs/')
df_summary = pd.DataFrame()
list_summary = []

# For loop to bring in files and concatenate them into a single dataframe
for file_ in glob.glob(path + "/*_SurfaceLRP6.csv"):
    df = pd.read_csv(file_)
    # Determine Image name from file name, then parse experiment details from Image name
    df['Image'] = os.path.splitext(os.path.basename(file_))[0]
    (df['Date'], df['Embryo'], df['Treatment'], df['Stains'], 
             df['FOV'], df['Channel']) = zip(*df['Image'].map(lambda x:x.split('_')))
    (df['del1'], df['ROI']) = zip(*df['Label'].map(lambda x:x.split(':')))

    # Drop unnecessary columns for tidiness
    df = df.drop(['Label', 'RawIntDen', 'del1'], axis = 1)
    
    # Compile data
    list_summary.append(df)
df_summary = pd.concat(list_summary, sort=False)

### Process intensities to generate CTCF
# Get a list of target genes measured
image_list = df_summary.Image.unique().tolist()

# Initialize for final dataframe collection
full_results = pd.DataFrame()
full_results_list = []

# Loop through image dataset:
for image in image_list:
    df_image = df_summary.loc[df_summary['Image'] == image][['Date', 'Embryo','Treatment',
                                                             'FOV','ROI','Area','Mean','IntDen']]
    background_mean = df_image.loc[df_image['ROI'] == 'background']['Mean']
    df_image_processed = df_image.loc[df_image['ROI'] != 'background']
    df_image_processed['CTCF'] = (df_image_processed['IntDen'] 
                                  - (df_image_processed['Area'] * float(background_mean)))
    full_results_list.append(df_image_processed)
    
# Assemble and view the final results
full_results = pd.concat(full_results_list,sort=False).reset_index().drop('index', axis=1)
surface_lrp6 = full_results
surface_lrp6 = surface_lrp6.rename({'CTCF': 'Surface LRP6 CTCF', 'Mean': 'Surface LRP6 Mean', 'IntDen': 'Surface LRP6 IntDen'}, axis=1)
surface_lrp6.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Date,Embryo,Treatment,FOV,ROI,Area,Surface LRP6 Mean,Surface LRP6 IntDen,Surface LRP6 CTCF
0,20210807,Emb3,SMPD3MO,FOV6,0004-0498-0447,768.542,264.778,203492.918,143692.66498
1,20210807,Emb3,SMPD3MO,FOV6,0004-0884-0356,837.425,199.288,166888.747,101728.70775
2,20210807,Emb3,SMPD3MO,FOV6,0004-0152-0941,691.216,173.809,120139.285,66355.76804
3,20210807,Emb1,SMPD3MO,FOV2,0004-0384-0938,569.044,505.708,287770.363,247014.293632
4,20210807,Emb1,SMPD3MO,FOV2,0004-0479-0562,314.182,606.157,190443.735,167941.391796


In [4]:
# Define path to directory with measurements
path = os.path.abspath('raw_data_csvs/')
df_summary = pd.DataFrame()
list_summary = []

# For loop to bring in files and concatenate them into a single dataframe
for file_ in glob.glob(path + "/*_H2BRFP.csv"):
    df = pd.read_csv(file_)
    # Determine Image name from file name, then parse experiment details from Image name
    df['Image'] = os.path.splitext(os.path.basename(file_))[0]
    (df['Date'], df['Embryo'], df['Treatment'], df['Stains'], 
             df['FOV'], df['Channel']) = zip(*df['Image'].map(lambda x:x.split('_')))
    (df['del1'], df['ROI']) = zip(*df['Label'].map(lambda x:x.split(':')))

    # Drop unnecessary columns for tidiness
    df = df.drop(['Label', 'RawIntDen', 'del1'], axis = 1)
    
    # Compile data
    list_summary.append(df)
df_summary = pd.concat(list_summary, sort=False)

### Process intensities to generate CTCF
# Get a list of target genes measured
image_list = df_summary.Image.unique().tolist()

# Initialize for final dataframe collection
full_results = pd.DataFrame()
full_results_list = []

# Loop through image dataset:
for image in image_list:
    df_image = df_summary.loc[df_summary['Image'] == image][['Date', 'Embryo','Treatment',
                                                             'FOV','ROI','Area','Mean','IntDen']]
    background_mean = df_image.loc[df_image['ROI'] == 'background']['Mean']
    df_image_processed = df_image.loc[df_image['ROI'] != 'background']
    df_image_processed['CTCF'] = (df_image_processed['IntDen'] 
                                  - (df_image_processed['Area'] * float(background_mean)))
    full_results_list.append(df_image_processed)
    
# Assemble and view the final results
full_results = pd.concat(full_results_list,sort=False).reset_index().drop('index', axis=1)
h2brfp = full_results
h2brfp = h2brfp.rename({'CTCF': 'H2BRFP CTCF', 'Mean': 'H2BRFP Mean', 'IntDen': 'H2BRFP LRP6 IntDen'}, axis=1)
h2brfp.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Date,Embryo,Treatment,FOV,ROI,Area,H2BRFP Mean,H2BRFP LRP6 IntDen,H2BRFP CTCF
0,20210807,Emb8,SMPD3MO,FOV5,0004-0414-0767,335.248,1493.123,500565.833,407192.895288
1,20210807,Emb4,ControlMO,FOV4,0004-0348-0333,245.85,2259.733,555554.33,484692.24695
2,20210807,Emb4,ControlMO,FOV4,0004-0418-0276,193.201,905.623,174966.814,119279.910167
3,20210807,Emb4,ControlMO,FOV4,0004-0396-0111,233.718,1164.925,272263.777,204898.536706
4,20210807,Emb4,ControlMO,FOV4,0004-0508-0533,363.929,979.071,356311.745,251415.397543


In [7]:
merged = pd.merge(h2brfp, surface_lrp6, on=['Date', 'Embryo', 'Treatment', 'FOV', 'ROI', 'Area'])
merged = pd.merge(merged, internal_lrp6, on=['Date', 'Embryo', 'Treatment', 'FOV', 'ROI', 'Area'])
merged['Total LRP6 CTCF'] = merged['Internal LRP6 CTCF'] + merged['Surface LRP6 CTCF']
merged['Internal / Surface LRP6 CTCF'] = merged['Internal LRP6 CTCF'] / merged['Surface LRP6 CTCF']
merged['Internalized Fraction CTCF'] = merged['Internal LRP6 CTCF'] / merged['Total LRP6 CTCF']
merged['Total LRP6 Mean'] = merged['Internal LRP6 Mean'] + merged['Surface LRP6 Mean']
merged['Internalized Fraction Mean'] = merged['Internal LRP6 Mean'] / merged['Total LRP6 Mean']
merged['Total LRP6 IntDen'] = merged['Internal LRP6 IntDen'] + merged['Surface LRP6 IntDen']
merged['Internalized Fraction IntDen'] = merged['Internal LRP6 IntDen'] / merged['Total LRP6 IntDen']
merged['Internal / RFP'] = merged['Internal LRP6 Mean'] / merged['H2BRFP Mean']
merged['Norm'] = merged['Internalized Fraction Mean'] / merged['H2BRFP Mean']

merged.head()

Unnamed: 0,Date,Embryo,Treatment,FOV,ROI,Area,H2BRFP Mean,H2BRFP LRP6 IntDen,H2BRFP CTCF,Surface LRP6 Mean,...,Internal LRP6 CTCF,Total LRP6 CTCF,Internal / Surface LRP6 CTCF,Internalized Fraction CTCF,Total LRP6 Mean,Internalized Fraction Mean,Total LRP6 IntDen,Internalized Fraction IntDen,Internal / RFP,Norm
0,20210807,Emb8,SMPD3MO,FOV5,0004-0414-0767,335.248,1493.123,500565.833,407192.895288,343.962,...,40381.144064,130913.106648,0.446043,0.308458,779.196,0.558568,261223.669,0.558568,0.291492,0.000374
1,20210807,Emb4,ControlMO,FOV4,0004-0348-0333,245.85,2259.733,555554.33,484692.24695,300.02,...,43897.08285,98004.04055,0.811302,0.447911,787.172,0.618863,193525.862,0.618864,0.215579,0.000274
2,20210807,Emb4,ControlMO,FOV4,0004-0418-0276,193.201,905.623,174966.814,119279.910167,230.337,...,34464.288601,63521.385063,1.186089,0.542562,717.323,0.678894,138587.122,0.678894,0.537736,0.00075
3,20210807,Emb4,ControlMO,FOV4,0004-0396-0111,233.718,1164.925,272263.777,204898.536706,253.471,...,29702.785918,70260.663434,0.732356,0.422751,689.158,0.632202,161068.754,0.632202,0.374004,0.000543
4,20210807,Emb4,ControlMO,FOV4,0004-0508-0533,363.929,979.071,356311.745,251415.397543,152.672,...,13251.312529,39721.426127,0.500614,0.333606,497.683,0.693234,181121.308,0.693234,0.352386,0.000708


In [9]:
value = 'Internal / Surface LRP6 CTCF'

treatment_list = [
    'ControlMO',
    'SMPD3MO',
]


df_subset = merged
df_subset = df_subset.loc[df_subset['Treatment'].isin(treatment_list)]
df_subset.to_csv('source_data_NC_InternalLRP6_CTCF.csv')


# Make stripbox plot using iqplot
data_stripbox = iqplot.stripbox(
    data=df_subset, q=value, cats='Treatment', q_axis='y'
    ,jitter=True

    # Other customization parameters
    ,marker_kwargs=dict(alpha=0.4, size=3
#                         , color='darkgray'
                       )
    ,box_kwargs=dict(line_color='black', line_width=1.5)
    ,whisker_kwargs=dict(line_color='black', line_width=1.5)
    ,median_kwargs=dict(line_color='maroon', line_width=4)
    ,top_level='box'
    ,frame_width=150, frame_height=300
    ,x_axis_label='Treatment', y_axis_label='Internalized LRP6-FLAG Intensity (A.U.)'
    ,order=treatment_list
    ,y_axis_type='log'
#     ,y_range=(1000,2000000)
    ,color_column='Treatment'
    ,palette=('#1F77B4', #blue
              '#FF7E0E', #orange
#               '#2CA02C', #green
#               '#AC8BCC', #purple
             )
)

# Other customization parameters
data_stripbox.title.text_font_size = '18px'
data_stripbox.axis.axis_label_text_font_size = '16px'
data_stripbox.axis.axis_label_text_font_style = 'bold'
data_stripbox.axis.major_label_text_font_size = '14px'
data_stripbox.xaxis.major_label_orientation = 7
data_stripbox.background_fill_color = None
data_stripbox.border_fill_color = None

# data_stripbox.output_backend = "svg"
show(row(data_stripbox))

### Kolmogorov-Smirnov test - NO MULTIPLE COMPARISONS
# Define samples to compare
sample1 = 'ControlMO'
sample2 = 'SMPD3MO'
metric = value

# Run 2-sample Kolmogorov-Smirnov Test
ks_result = stats.ks_2samp(df_subset.loc[df_subset['Treatment']==sample1][metric]
                           ,df_subset.loc[df_subset['Treatment']==sample2][metric])
                              
# Display results of Kolmogorov-Smirnov test
print('Two-sample Kolmogorov-Smirnov test results for ' + sample1 + ' vs ' + sample2 + ': \n\t\t\t\t statistic=' + str(ks_result[0]) + 
    '\n\t\t\t\t p-value=' + str(ks_result[1]))

# Get number of cells within this test
for treatment in df_subset['Treatment'].unique().tolist():
    temp_df = df_subset.loc[df_subset['Treatment'] == treatment]
    print('n = ' + str(len(temp_df)) + ' cells in the ' + str(treatment) + ' dataset.')
# print('n = '+ str(len(embryo_list)) + ' embryos in the dataset.')



Two-sample Kolmogorov-Smirnov test results for ControlMO vs SMPD3MO: 
				 statistic=0.23335621139327384
				 p-value=0.004706448751999437
n = 94 cells in the SMPD3MO dataset.
n = 124 cells in the ControlMO dataset.
