In [7]:
import os 
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import sklearn.linear_model

import scipy
from scipy import stats
from scipy.optimize import curve_fit

# Import from the public IPython.display module: importing `display` from the
# internal IPython.core.display path is deprecated (since IPython 7.14).
from IPython.display import display, HTML

# Inject CSS that sets elements with class "container" to 80% of the page width.
display(HTML("<style>.container { width:80% !important; }</style>"))

In [8]:
# Folder holding the exported dataframes; the trailing "/" lets file names be
# appended by plain string concatenation.
root_dir = "/Volumes/TOB_WD2/Image_Analysis/CCB02/Dataframes" + "/"

# Read the master table that the statistics below operate on.
master_csv_path = root_dir + "MasterDataFrame_CCB02_Live_for_stats.csv"
df = pd.read_csv(master_csv_path)

In [11]:
# Per-condition mean of the Cell_Volume_um3 column.
df.groupby("Condition")["Cell_Volume_um3"].mean()

Condition
1_DMSO        2776.499772
5_10µM        2926.465597
6_N2B27+RA    2582.392529
Name: Cell_Volume_um3, dtype: float64

In [12]:
# Per-condition standard deviation of the Cell_Volume_um3 column.
df.groupby("Condition")["Cell_Volume_um3"].std()

Condition
1_DMSO        412.036349
5_10µM        482.929864
6_N2B27+RA    463.374036
Name: Cell_Volume_um3, dtype: float64

In [17]:
# Per-condition mean of the Spindle_Volume_um3 column.
df.groupby("Condition")["Spindle_Volume_um3"].mean()

Condition
1_DMSO        385.589499
5_10µM        328.996061
6_N2B27+RA    284.661075
Name: Spindle_Volume_um3, dtype: float64

In [18]:
# Per-condition standard deviation of the Spindle_Volume_um3 column.
df.groupby("Condition")["Spindle_Volume_um3"].std()

Condition
1_DMSO        62.178657
5_10µM        67.454413
6_N2B27+RA    63.179689
Name: Spindle_Volume_um3, dtype: float64

In [14]:
def ttest(medium1, medium2, measurement, data=None):
    """Compare one measurement between two conditions with Welch's t-test.

    The two samples are the rows whose "Condition" equals ``medium1`` and
    ``medium2`` respectively. The test is run with ``equal_var=False``
    (unequal variances) and ``nan_policy='omit'`` (NaN values are ignored).
    The p-value is printed, and both test results are returned so they can
    be reused programmatically.

    Parameters
    ----------
    medium1, medium2 : str
        Labels in the "Condition" column that select the two groups.
    measurement : str
        Name of the column holding the values to compare.
    data : pandas.DataFrame, optional
        Table to analyse. When omitted, the notebook-level ``df`` is used,
        which keeps existing three-argument calls working unchanged.

    Returns
    -------
    tuple
        ``(statistic, pvalue)`` as produced by ``scipy.stats.ttest_ind``.

    Raises
    ------
    ValueError
        If ``medium1`` or ``medium2`` matches no rows, e.g. because the
        label is misspelled. Without this check the test would silently
        report NaN.
    """
    frame = df if data is None else data

    samples = []
    for label in (medium1, medium2):
        values = frame.loc[frame["Condition"] == label, measurement]
        if values.empty:
            raise ValueError(
                "No rows with Condition == {!r}; cannot run the t-test.".format(label)
            )
        samples.append(values)

    statistic, pvalue = scipy.stats.ttest_ind(
        samples[0],
        samples[1],
        axis=0,
        equal_var=False,
        nan_policy='omit',
    )
    print("The p-value for {} is: ".format(measurement) + str(pvalue))
    return statistic, pvalue

In [15]:
# Welch's t-test of Spindle_Occupancy between the "1_DMSO" and "5_10µM" conditions.
ttest(
    medium1="1_DMSO",
    medium2="5_10µM",
    measurement="Spindle_Occupancy",
)

The p-value for Spindle_Occupancy is: 3.8046894926789685e-20


In [16]:
# ANOVA testing
# One-way ANOVA fitted as an ordinary-least-squares (general) linear model:
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
import pingouin as pg
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

# Column to analyse and the grouping factor used for every test in this cell.
measurement = 'Spindle_Occupancy'
group_variable = 'Condition'

# Keep only the rows that have a value for the chosen measurement.
nan_elements = df[measurement].isnull()
data = df[~nan_elements]

# Number of usable observations per group.
print(data[group_variable].value_counts())

# Bartlett's test for equal variances (One-way ANOVA requires equal variances!)
# The samples are taken from every group actually present in `data`, via
# `group_variable`, rather than from hard-coded condition labels. Labels that
# do not occur in the table would otherwise hand empty samples to the test.
group_samples = [
    values for _, values in data.groupby(group_variable)[measurement]
]
BartlettResult = stats.bartlett(*group_samples)

print("The Bartlett test for equal variances of {}: ".format(measurement) + str(BartlettResult))

# Fit the one-way model `measurement ~ C(group_variable)` by ordinary least squares.
results = ols(measurement + '~ C('+group_variable+')', data = data).fit()
print(results.summary())

# Type-2 ANOVA table derived from the fitted model.
aov_table = sm.stats.anova_lm(results, typ = 2)

def anova_table(aov):
    """Return an ANOVA table extended with mean squares and effect sizes.

    Parameters
    ----------
    aov : pandas.DataFrame
        ANOVA table with the columns 'sum_sq', 'df', 'F' and 'PR(>F)'.
        The last row is treated as the residual (error) term; every row
        before it is treated as an effect.

    Returns
    -------
    pandas.DataFrame
        A new table with the columns 'sum_sq', 'df', 'mean_sq', 'F',
        'PR(>F)', 'eta_sq' and 'omega_sq'. 'eta_sq' and 'omega_sq' are only
        computed for the effect rows and are NaN on the last (residual) row.
        The input table is left unmodified.
    """
    # Work on a copy so the caller's table does not gain extra columns.
    aov = aov.copy()

    aov['mean_sq'] = aov['sum_sq'] / aov['df']

    # Use positional access: the table is indexed by term labels, so `[-1]`
    # on the Series would be a (deprecated) label lookup fallback.
    residual_mean_sq = aov['mean_sq'].iloc[-1]
    total_sum_sq = sum(aov['sum_sq'])
    effects = aov.iloc[:-1]

    # Assignment aligns on the index, leaving the residual row as NaN.
    aov['eta_sq'] = effects['sum_sq'] / total_sum_sq
    aov['omega_sq'] = (
        (effects['sum_sq'] - effects['df'] * residual_mean_sq)
        / (total_sum_sq + residual_mean_sq)
    )

    cols = ['sum_sq', 'df', 'mean_sq', 'F', 'PR(>F)', 'eta_sq', 'omega_sq']
    return aov[cols]

# Extend the ANOVA table with mean squares / effect sizes and show it.
aov_table = anova_table(aov_table)
print("\n ANOVA TABLE: ", aov_table, sep="\n")

# Tukey HSD post-hoc test on the measurement, grouped by the grouping factor.
mc = MultiComparison(data[measurement], data[group_variable])
mc_results = mc.tukeyhsd()
print(
    "\n\n POST-HOC testing for {}: \n".format(measurement),
    mc_results,
    "If \"reject\" = True, then H0 should be rejected",
    sep="\n",
)

# Both pingouin tests below take the same dependent variable, factor and table.
pingouin_design = dict(dv=measurement, between=group_variable, data=data)

# Welch's ANOVA, for the case where the group variances are unequal.
aov_table_WELCH = pg.welch_anova(**pingouin_design)
print("\n Welch's ANOVA table", aov_table_WELCH, sep="\n")

# Games-Howell post-hoc test to accompany Welch's ANOVA.
mc_results_GamesHowell = pg.pairwise_gameshowell(**pingouin_design)
print("\n Games-Howell post-hoc test table", mc_results_GamesHowell, sep="\n")

ModuleNotFoundError: No module named 'statsmodels'