In [None]:
import matplotlib.gridspec as gs
import matplotlib.pyplot as pp
import pandas as pd
import scipy.stats as ss
import seaborn as sb
import statsmodels.api as sm

from pandas.api.types import CategoricalDtype

In [None]:
def format_p_val(p):
    """Format a p-value as a short string for reporting.

    Values below 0.001 are reported as "p < 0.001". Everything else is shown
    to exactly three decimal places, so trailing zeros are kept and every
    reported value has the same precision.

    Args:
        p: The p-value (a float or float-like scalar).

    Returns:
        The formatted string.

    Examples:
        format_p_val(0.43242323) -> p = 0.432
        format_p_val(0.5) -> p = 0.500
        format_p_val(0.000001) -> p < 0.001
    """
    if p < 0.001:
        return "p < 0.001"
    # Fixed-point formatting keeps trailing zeros, which round() would drop
    return "p = {:.3f}".format(p)

First let's load the dataset and set the columns to the appropriate type.

The key point is to convert the categorical/ordinal columns using the `astype` method and the `CategoricalDtype` class.

### Loading the data

In [None]:
# Location of the CSV file
file_name = "../data/mouse_linear_regression.csv"
# Levels (and whether they are ordered) for each categorical/ordinal column
genotype_dtype = CategoricalDtype(categories=["WT", "-/-"], ordered=True)
sex_dtype = CategoricalDtype(categories=["F", "M"], ordered=False)
treatment_dtype = CategoricalDtype(categories=["Control", "TX", "TX2"], ordered=True)
# Read the file and convert the three columns in a single astype call
df = pd.read_csv(file_name).astype(
    {
        "Genotype": genotype_dtype,
        "Sex": sex_dtype,
        "Treatment group": treatment_dtype,
    }
)
# Take a look at the data
df.head()

### Exploratory data analysis

Now let's take a quick look at the data.
Here we use the seaborn [`catplot`](https://seaborn.pydata.org/generated/seaborn.catplot.html) function which allows us to create panelled plots for categorical data.
There are various types of plot supported through the `kind` argument.

In [None]:
# Swarm plot of weight against treatment group, with one panel per
# genotype (rows) and sex (columns)
sb.catplot(
    data=df,
    x="Treatment group",
    y="Weight",
    row="Genotype",
    col="Sex",
    kind="swarm",
    color="k",
    height=4,  # Height
    aspect=1,  # Width to height ratio
)

The `catplot` function is useful for quickly getting a look at the data, but lacks flexibility if we want to combine different types of plots.
Previously we saw how we could use the `pointplot` and `swarmplot` functions to create data plots with error bars.
One way to do this is to use the [plotnine](https://plotnine.org) package which attempts to mimic R's ggplot library.
I prefer to use [matplotlib](https://matplotlib.org) along with seaborn.
It is a steeper learning curve, but ultimately gives a lot more control.

The key idea below is to use the `GridSpec` class which allows us to create a grid of `Axes` objects.
`GridSpec` really only lays out the positions of the boxes for the `Axes`, to create them we first create a `Figure` object and then create `Axes` in the relevant location using `add_subplot`.

> Note: The `tight_layout` method is really useful for getting the `Axes` to be nicely spaced.

In [None]:
# Columns that drive the figure: `row` and `col` pick the panel,
# while `x` and `y` are what gets plotted inside each panel.
# NOTE: later cells rebind `x` and `y` to the data being tested, so re-run
# this cell before re-running the plotting cell.
row, col = "Genotype", "Sex"
x, y = "Treatment group", "Weight"

In [None]:
# The grid needs one row per distinct value in the `row` column and one
# column per distinct value in the `col` column of our dataset.
nrows, ncols = (df[name].nunique() for name in (row, col))

In [None]:
# Create the grid: one Axes position per (row value, column value) pair
grid = gs.GridSpec(nrows=nrows, ncols=ncols)
# Create a figure to plot to
fig = pp.figure(figsize=(8, 8))
# Levels actually present in the data, with missing values dropped so the
# count matches nunique() (which sized the grid), and sorted so that
# categorical columns follow their declared category order rather than the
# order in which values happen to appear in the file.
row_levels = df[row].dropna().drop_duplicates().sort_values()
col_levels = df[col].dropna().drop_duplicates().sort_values()
# Now we loop through the different row and column values to create our plots
for i, row_val in enumerate(row_levels):
    for j, col_val in enumerate(col_levels):
        # Create an Axes object in grid cell (i, j)
        ax = fig.add_subplot(grid[i, j])
        # Subset our DataFrame to the entries belonging to this panel
        plot_df = df[(df[row] == row_val) & (df[col] == col_val)]
        # Individual observations
        sb.swarmplot(
            plot_df,
            ax=ax,
            x=x,
            y=y,
            color="k",
        )
        # Median with an error bar spanning the central 50% of the data
        sb.pointplot(
            plot_df,
            ax=ax,
            x=x,
            y=y,
            capsize=0.4,
            color="k",
            estimator="median",
            errorbar=("pi", 50),
            linestyle="none",
        )
        ax.set_xlabel(x)
        ax.set_ylabel(y)
        # Build the title from the column names so it stays correct if
        # `row` or `col` is changed
        ax.set_title("{0} = {1} | {2} = {3}".format(row, row_val, col, col_val))
# Usually a good idea to do tight_layout
grid.tight_layout(fig)

### Testing difference of means

Our EDA has revealed that genotype -/- seems to have an interaction with the treatment, whereas the WT genotype doesn't seem to be impacted by treatment.
To start with let's split the data by genotype and test the difference of means.

In [None]:
# Keep only the WT animals
df_geno = df.loc[df["Genotype"] == "WT"]
# Weights of the control group
is_control = df_geno["Treatment group"] == "Control"
x = df_geno.loc[is_control, "Weight"]
# Weights of the single treatment group
is_treated = df_geno["Treatment group"] == "TX"
y = df_geno.loc[is_treated, "Weight"]
# Run the t test with scipy; it hands back a results object
res = ss.ttest_ind(x, y)
# Pull the p-value and the confidence interval out of the results object
p = res.pvalue
ci = res.confidence_interval()
# Report them, with the interval bounds rounded to three decimals
ci_low, ci_high = round(ci.low, 3), round(ci.high, 3)
"{0}, ci = [{1}, {2}]".format(format_p_val(p), ci_low, ci_high)

As expected, no effect for the WT genotype.
Let's try for the knockout genotype.

In [None]:
# Keep only the knockout (-/-) animals
df_geno = df.loc[df["Genotype"] == "-/-"]
# Weights of the control group
is_control = df_geno["Treatment group"] == "Control"
x = df_geno.loc[is_control, "Weight"]
# Weights of the single treatment group
is_treated = df_geno["Treatment group"] == "TX"
y = df_geno.loc[is_treated, "Weight"]
# Run the t test with scipy; it hands back a results object
res = ss.ttest_ind(x, y)
# Pull the p-value and the confidence interval out of the results object
p = res.pvalue
ci = res.confidence_interval()
# Report them, with the interval bounds rounded to three decimals
ci_low, ci_high = round(ci.low, 3), round(ci.high, 3)
"{0}, ci = [{1}, {2}]".format(format_p_val(p), ci_low, ci_high)

Now we see an effect.
We should check the assumptions of the t test on the data.
Here we will use the Shapiro-Wilk test for normality and Levene's test for equal variance of the populations.

In [None]:
# Shapiro-Wilk normality test on each group; shows the p-values as (x, y)
tuple(ss.shapiro(group).pvalue for group in (x, y))

In [None]:
# Levene's test comparing the two groups; show its p-value
levene_res = ss.levene(x, y)
levene_res.pvalue

Normality and equal variance hold.
So far this looks like a lot more work than JASP.
The benefit comes if we want to do a few different comparisons.
We can reuse the previous code but put it inside a loop.

In [None]:
# Compare every treatment against control, separately for each genotype
genos = ["-/-", "WT"]
txs = ["TX", "TX2"]
for g in genos:
    # The genotype subset, its control group and the control group's
    # normality test do not depend on the treatment, so compute them once
    # per genotype instead of once per comparison.
    df_geno = df[df["Genotype"] == g]
    # Create one group for control
    x = df_geno.loc[df_geno["Treatment group"] == "Control", "Weight"]
    shapiro_control = ss.shapiro(x).pvalue
    for t in txs:
        # Create another group for the current treatment
        y = df_geno.loc[df_geno["Treatment group"] == t, "Weight"]
        # Use scipy to run a t test noting it returns a results object
        res = ss.ttest_ind(x, y)
        # Get the p-value
        p = res.pvalue
        # Get the confidence interval
        ci = res.confidence_interval()
        # Finally report them, followed by the assumption checks
        result_str = "{0}, ci = [{1}, {2}]".format(
            format_p_val(p), round(ci.low, 3), round(ci.high, 3)
        )
        print("#" * 80)
        print(g, t)
        print(result_str)
        print("Shapiro-Wilks", shapiro_control, ss.shapiro(y).pvalue)
        print("Levene", ss.levene(x, y).pvalue)
        print()

Now it would be nice to save the results in a table instead of printing.

In [None]:
# One dictionary per comparison; turned into a table once the loop is done
records = []
# Compare every treatment against control, separately for each genotype
genos = ["-/-", "WT"]
txs = ["TX", "TX2"]
for g in genos:
    # The genotype subset, its control group and the control group's
    # normality test do not depend on the treatment, so compute them once
    # per genotype instead of once per comparison.
    df_geno = df[df["Genotype"] == g]
    # Create one group for control
    x = df_geno.loc[df_geno["Treatment group"] == "Control", "Weight"]
    shapiro_control = ss.shapiro(x).pvalue
    for t in txs:
        # Create another group for the current treatment
        y = df_geno.loc[df_geno["Treatment group"] == t, "Weight"]
        # Use scipy to run a t test noting it returns a results object
        res = ss.ttest_ind(x, y)
        # Get the p-value
        p = res.pvalue
        # Get the confidence interval
        ci = res.confidence_interval(0.95)
        # Add a record to our list. It is not stored in a variable called
        # `row`, because that name holds the plotting row column.
        records.append(
            {
                "genotype": g,
                "treatment": t,
                "p": p,
                "CI Lower": ci.low,
                "CI Upper": ci.high,
                "Shapiro-Wilks (control)": shapiro_control,
                "Shapiro-Wilks (treatment)": ss.shapiro(y).pvalue,
                "Levene": ss.levene(x, y).pvalue,
            }
        )
# Build the table in one go from the collected records
results = pd.DataFrame(records)
results

Now we should also be doing multiple test correction on the p-values.
The `statsmodels` (imported as `sm`) package provides support for this.
Below I will use the Bonferroni correction to adjust the p-values and the Benjamini-Hochberg procedure to get false discovery rate (FDR) adjusted p-values.

In [None]:
# Adjust the raw p-values with each correction method and store the result
# in its own column. The element at index 1 of what multipletests returns is
# kept as the adjusted values.
for column, method in (("p_bon", "bonferroni"), ("fdr", "fdr_bh")):
    results[column] = sm.stats.multipletests(results["p"], method=method)[1]
results

If the normality tests had failed we could also perform Mann Whitney.

> Caveat: scipy does not seem to support confidence intervals for Mann Whitney.

In [None]:
# Keep only the knockout (-/-) animals
df_geno = df.loc[df["Genotype"] == "-/-"]
# Weights of the control group
is_control = df_geno["Treatment group"] == "Control"
x = df_geno.loc[is_control, "Weight"]
# Weights of the single treatment group
is_treated = df_geno["Treatment group"] == "TX"
y = df_geno.loc[is_treated, "Weight"]
# Mann Whitney test on the two groups; the result object is the cell output
ss.mannwhitneyu(x, y)

If the equal variance test fails we can adjust the t test.

In [None]:
# Keep only the knockout (-/-) animals
df_geno = df.loc[df["Genotype"] == "-/-"]
# Weights of the control group
is_control = df_geno["Treatment group"] == "Control"
x = df_geno.loc[is_control, "Weight"]
# Weights of the single treatment group
is_treated = df_geno["Treatment group"] == "TX"
y = df_geno.loc[is_treated, "Weight"]
# t test without the equal-variance assumption (equal_var=False)
ss.ttest_ind(x, y, equal_var=False)