In [1]:
import pandas as pd
import scipy
from scipy import stats

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
# Data import

root = "/Volumes/TOB_WD2/Image_Analysis/Mitosis/Dataframes/SpinningDisc_2i_vs_FBS" + "/"
df_Path = root + "MasterDataFrame_Filtered.csv"

df = pd.read_csv(df_Path)

df = df[df["Incubation_bin"] == 48]

keep_datasets = [20220517, 20220602, 20220603, 20220617, 20220623, 20220624, 20220720, 20220721, 20220722, 20220927, 20221124]
df = df[df['Date'].isin(keep_datasets)]

print("The shape of the df after filtering in visualisation notebook: " + str(df.shape))

The shape of the df after filtering in visualisation notebook: (745, 46)


In [3]:
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Chromatin_Dilation',
       'Chromatin_Volume_um3', 'DNA_Threshold', 'MetaphasePlate_Length_um',
       'MetaphasePlate_Width_um', 'Path_InputImage', 'Path_OutputImage',
       'Spindle_Angle_Degrees', 'Spindle_Aspect_Ratio',
       'Spindle_Center_To_MetaphasePlate_Center_Distance_um',
       'Spindle_Length_um', 'Spindle_SNR', 'Spindle_Volume_um3',
       'Spindle_Width_Avg_um', 'Spindle_Width_Max_um', 'Spindle_Width_Min_um',
       'Tubulin_Cellular_Avg_Intensity', 'Tubulin_Spindle_Average_Intensity',
       'Tubulin_Spindle_Intensity_Threshold',
       'Tubulin_Spindle_Intensity_Variation', 'Version', 'Cell_ID',
       'Experiment', ' ', 'Cell_Volume_um3', 'Cell_Surface_Area_um2',
       'Cell_Sphericity', 'Tubulin_Cell_Average', 'Tubulin_Cell_Minimum',
       'Tubulin_Cell_Maximum', 'Tubulin_Cell_IntegratedDensity', 'Date',
       'Medium', 'Incubation', 'Incubation_bin', 'Fraction_SpindleVol_in_Cell',
       'SpindleVolume_ChromatinVolume_Rat

In [4]:
df["Medium"].value_counts()

2i+LIF          258
FBS+LIF         202
N2B27from2i     146
N2B27fromFBS    139
Name: Medium, dtype: int64

In [5]:
df[df.Fraction_SpindleVol_in_Cell > 0].Medium.value_counts()

2i+LIF          215
FBS+LIF         181
N2B27fromFBS    119
N2B27from2i      98
Name: Medium, dtype: int64

In [6]:
# t-test

def ttest(medium1, medium2, measurement, incubation):
    statistic, pvalue = scipy.stats.ttest_ind(
        df[(df["Medium"] == medium1) & (df["Incubation_bin"] == incubation)][measurement], 
        df[(df["Medium"] == medium2) & (df["Incubation_bin"] == incubation)][measurement], 
        axis = 0, 
        equal_var = False, 
        nan_policy = 'omit'
    )
    print ("The p-value for {} is: ".format(measurement) + str(pvalue))

In [7]:
#ttest("Fraction_SpindleVol_in_Cell", 5)
#ttest("Fraction_SpindleVol_in_Cell", 24)
ttest(medium1 = "2i+LIF", medium2 = "N2B27from2i", measurement = "Fraction_SpindleVol_in_Cell", incubation = 48)
ttest(medium1 = "FBS+LIF", medium2 = "N2B27fromFBS", measurement = "Fraction_SpindleVol_in_Cell", incubation = 48)

The p-value for Fraction_SpindleVol_in_Cell is: 2.1902342567260555e-28
The p-value for Fraction_SpindleVol_in_Cell is: 2.2847013483555973e-06


In [8]:
ttest(medium1 = "2i+LIF", medium2 = "N2B27from2i", measurement = "Cell_Volume_um3", incubation = 48)
ttest(medium1 = "FBS+LIF", medium2 = "N2B27fromFBS", measurement = "Cell_Volume_um3", incubation = 48)

The p-value for Cell_Volume_um3 is: 1.1500074180945582e-41
The p-value for Cell_Volume_um3 is: 5.430628237224075e-13


In [9]:
ttest(medium1 = "2i+LIF", medium2 = "N2B27from2i", measurement = "Chromatin_Volume_um3", incubation = 48)
ttest(medium1 = "FBS+LIF", medium2 = "N2B27fromFBS", measurement = "Chromatin_Volume_um3", incubation = 48)

The p-value for Chromatin_Volume_um3 is: 0.02018780884736471
The p-value for Chromatin_Volume_um3 is: 0.0047504072168199005


In [10]:
df = df[df["Incubation_bin"] == 48]

# ANOVA Testing
# ANOVA as generalized linear model (GLM):
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
import pingouin as pg
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

measurement = 'Fraction_SpindleVol_in_Cell'
group_variable = 'Medium'
nan_elements = df[measurement].isnull()
data = df[~nan_elements]

# Bartlett's test for equal variances (One-way ANOVA requires equal variances!)

BartlettResult = stats.bartlett(data[data.Medium == '2i+LIF'][measurement], data[data.Medium == 'FBS+LIF'][measurement], data[data.Medium == 'N2B27RAfrom2i'], data[data.Medium == 'N2B27RAfromFBS'][measurement])

print("The Bartlett test for equal variances of {}: ".format(measurement) + str(BartlettResult))


#results = ols('Aspect-Ratio~ C('+group_variable+')', data=data).fit()
results = ols(measurement + '~ C('+group_variable+')', data = data).fit()
print(results.summary())

aov_table = sm.stats.anova_lm(results, typ = 2)

def anova_table(aov):
    aov['mean_sq'] = aov[:]['sum_sq'] / aov[:]['df']
    
    aov['eta_sq'] = aov[:-1]['sum_sq'] / sum(aov['sum_sq'])
    
    aov['omega_sq'] = (aov[:-1]['sum_sq']-(aov[:-1]['df'] * aov['mean_sq'][-1])) / (sum(aov['sum_sq']) + aov['mean_sq'][-1])
    
    cols = ['sum_sq', 'df', 'mean_sq', 'F', 'PR(>F)', 'eta_sq', 'omega_sq']
    aov = aov[cols]
    return aov

aov_table = anova_table(aov_table)
print("\n ANOVA TABLE: ")
print(aov_table)

# Post-hoc testing
mc = MultiComparison(data[measurement], data[group_variable])
mc_results = mc.tukeyhsd()
print("\n\n POST-HOC testing for {}: \n".format(measurement))
print(mc_results)
print("If \"reject\" = True, then H0 should be rejected")

# Welch's ANOVA when variances are unequal
aov_table_WELCH = pg.welch_anova(dv = measurement, between = group_variable, data = data)
print("\n Welch's ANOVA table") 
print(aov_table_WELCH)

# Post-hoc testing using Games-Howell post-hoc test
mc_results_GamesHowell = pg.pairwise_gameshowell(dv = measurement, between = group_variable, data = data)
print("\n Games-Howell post-hoc test table") 
print(mc_results_GamesHowell)

The Bartlett test for equal variances of Fraction_SpindleVol_in_Cell: BartlettResult(statistic=nan, pvalue=nan)
                                 OLS Regression Results                                
Dep. Variable:     Fraction_SpindleVol_in_Cell   R-squared:                       0.189
Model:                                     OLS   Adj. R-squared:                  0.185
Method:                          Least Squares   F-statistic:                     47.38
Date:                         Wed, 08 Nov 2023   Prob (F-statistic):           1.57e-27
Time:                                 11:48:36   Log-Likelihood:                -1386.4
No. Observations:                          613   AIC:                             2781.
Df Residuals:                              609   BIC:                             2799.
Df Model:                                    3                                         
Covariance Type:                     nonrobust                                         
        

  return warn(
  return warn(


In [11]:
df_FBS = df[df["Medium"] == "FBS+LIF"]
df_N2B27 = df[df["Medium"] == "N2B27fromFBS"]
print(df_FBS.shape)
print(df_N2B27.shape)

(202, 46)
(139, 46)


In [12]:
# correlation
def correlation(dataframe_list, independent_column, dependent_column):
    for dataframe in dataframe_list:
        dataframe_name = dataframe.Medium.head(1)
        spearman = stats.spearmanr(
            dataframe[independent_column], 
            dataframe[dependent_column], 
            nan_policy = 'omit')
        print(
            "Spearman correlation for x = {} vs y = {} in {}: {}.".format(
                dependent_column, 
                independent_column,
                dataframe_name, spearman
            )
        )


In [13]:
correlation([df_FBS, df_N2B27], "Population_Density_per_mm2", "Cell_Volume_um3")

Spearman correlation for x = Cell_Volume_um3 vs y = Population_Density_per_mm2 in 57    FBS+LIF
Name: Medium, dtype: object: SpearmanrResult(correlation=0.1849099637880859, pvalue=0.07439095716151867).
Spearman correlation for x = Cell_Volume_um3 vs y = Population_Density_per_mm2 in 256    N2B27fromFBS
Name: Medium, dtype: object: SpearmanrResult(correlation=0.10904119087710568, pvalue=0.21152496820541258).


In [14]:
correlation([df_FBS, df_N2B27], "Population_Density_per_mm2", "Spindle_Volume_um3")

Spearman correlation for x = Spindle_Volume_um3 vs y = Population_Density_per_mm2 in 57    FBS+LIF
Name: Medium, dtype: object: SpearmanrResult(correlation=0.24030463174992533, pvalue=0.020331130147221946).
Spearman correlation for x = Spindle_Volume_um3 vs y = Population_Density_per_mm2 in 256    N2B27fromFBS
Name: Medium, dtype: object: SpearmanrResult(correlation=-0.08203222616267054, pvalue=0.3855719937651838).


In [15]:
# Linear Regression

from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

def linear_fit(dataframe, independent_column, dependent_column): 
    dataframe = dataframe[dataframe[[independent_column, dependent_column]].notnull().all(1)]
    length = dataframe.shape[0]

    X = dataframe[independent_column].values.reshape(length, 1)
    y = dataframe[dependent_column].values.reshape(length, 1)

    regr = linear_model.LinearRegression()
    regr.fit(X, y)
    y_predicted = regr.predict(X)

    # model evaluation
    rmse = mean_squared_error(y, y_predicted)
    R2 = r2_score(y, y_predicted)
    slope = regr.coef_
    interc = regr.intercept_
    
    return rmse, R2, slope, interc

In [16]:
print("Cell Volume vs Pop density, FBS+LIF")
rmse, R2, slope, interc = linear_fit(df_FBS, "Population_Density_per_mm2", "Cell_Volume_um3")  
print('Slope: ', slope)
print('Intercept: ', interc)
print('Root mean squared error: ', rmse)
print('R2 score: ', R2)

Cell Volume vs Pop density, FBS+LIF
Slope:  [[0.08074075]]
Intercept:  [2449.12286467]
Root mean squared error:  128241.83420533144
R2 score:  0.044674357072580384


In [17]:
print("Cell Volume vs Pop density, N2B27")
rmse, R2, slope, interc = linear_fit(df_N2B27, "Population_Density_per_mm2", "Cell_Volume_um3")  
print('Slope: ', slope)
print('Intercept: ', interc)
print('Root mean squared error: ', rmse)
print('R2 score: ', R2)

Cell Volume vs Pop density, N2B27
Slope:  [[0.03171068]]
Intercept:  [2287.71705084]
Root mean squared error:  119075.51046660525
R2 score:  0.007314946355016505


In [18]:
print("Spindle Volume vs Pop density, FBS+LIF")
rmse, R2, slope, interc = linear_fit(df_FBS, "Population_Density_per_mm2", "Spindle_Volume_um3")  
print('Slope: ', slope)
print('Intercept: ', interc)
print('Root mean squared error: ', rmse)
print('R2 score: ', R2)

Spindle Volume vs Pop density, FBS+LIF
Slope:  [[0.02054809]]
Intercept:  [284.92855847]
Root mean squared error:  5166.260696366979
R2 score:  0.07062241191508267


In [19]:
print("Spindle Volume vs Pop density, N2B27")
rmse, R2, slope, interc = linear_fit(df_N2B27, "Population_Density_per_mm2", "Spindle_Volume_um3")  
print('Slope: ', slope)
print('Intercept: ', interc)
print('Root mean squared error: ', rmse)
print('R2 score: ', R2)

Spindle Volume vs Pop density, N2B27
Slope:  [[-0.00430082]]
Intercept:  [293.51210588]
Root mean squared error:  2360.5415167254537
R2 score:  0.006370304587267728


In [20]:
print("Spindle Occupancy vs Pop density, FBS+LIF")
rmse, R2, slope, interc = linear_fit(df_FBS, "Population_Density_per_mm2", "Fraction_SpindleVol_in_Cell")  
print('Slope: ', slope)
print('Intercept: ', interc)
print('Root mean squared error: ', rmse)
print('R2 score: ', R2)

Spindle Occupancy vs Pop density, FBS+LIF
Slope:  [[0.00036255]]
Intercept:  [11.83295702]
Root mean squared error:  5.268279997276588
R2 score:  0.022672577171331754


In [21]:
print("Spindle Occupancy vs Pop density, N2B27")
rmse, R2, slope, interc = linear_fit(df_N2B27, "Population_Density_per_mm2", "Fraction_SpindleVol_in_Cell")  
print('Slope: ', slope)
print('Intercept: ', interc)
print('Root mean squared error: ', rmse)
print('R2 score: ', R2)

Spindle Occupancy vs Pop density, N2B27
Slope:  [[-0.00030684]]
Intercept:  [12.78857207]
Root mean squared error:  5.588599175604028
R2 score:  0.013596581727408297
