## Generate the interquartile range for a large sample of cell sizes.

Cells outside this range can then be labelled as outliers.

To correctly label small cells as outliers we need to use the log10 of all the cell sizes to find the IQR.
I have set the lower and upper percentiles to 35 and 85 (rather than the conventional 25 and 75 of a true interquartile range) because the smaller cells are more likely to be outliers.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
%matplotlib inline

In [None]:
def getIQR(inData, lowerPct=35, upperPct=85, whisker=1.5):
    """
    Get outlier thresholds for a 1D input data set inData.

    The spread is measured between the lowerPct and upperPct percentiles
    (an "interquartile-style" range; the defaults of 35 and 85 are not
    true quartiles) and the thresholds are placed `whisker` times that
    spread beyond each percentile.

    Parameters
    ----------
    inData : array_like
        1D collection of values.
    lowerPct, upperPct : float, optional
        Percentiles (0-100) delimiting the central range.
    whisker : float, optional
        Multiple of the central range added beyond each percentile.

    Returns
    -------
    (lowerBound, upperBound) : tuple of float
        Values below lowerBound or above upperBound can be treated
        as outliers.

    Raises
    ------
    ValueError
        If lowerPct is greater than upperPct.
    """
    if lowerPct > upperPct:
        raise ValueError("lowerPct must not exceed upperPct")
    lowerVal, upperVal = np.percentile(inData, [lowerPct, upperPct])
    # Distance between the two percentiles, scaled to give the margin
    # that is allowed on either side before a value counts as an outlier.
    margin = (upperVal - lowerVal) * whisker
    lowerBound = lowerVal - margin
    upperBound = upperVal + margin
    return lowerBound, upperBound

In [None]:
# Import large dataset of cell information
cellData = np.genfromtxt("./logs/vstackedCellData.csv", delimiter=",", dtype=int)
# Get the log10 of sizes (column 2) so that small cells can also fall
# outside the lower threshold.
# NOTE(review): a size of 0 would give -inf here - confirm that all
# sizes in the CSV are strictly positive.
cellsLogged = np.log10(cellData[:,2])
loBound, hiBound = getIQR(cellsLogged)
# True where a cell lies outside the thresholds, i.e. is an outlier
mask = (cellsLogged < loBound) | (cellsLogged > hiBound)
numOutliers = int(np.count_nonzero(mask))
numInliers = int(mask.size - numOutliers)

# Thresholds are converted back from log10 space for display
print("Low, high thresholds of IQR:", int(10**loBound), int(10**hiBound))
print("Number of outliers, inliers:", numOutliers, numInliers)

In [None]:
fig, ax = plt.subplots(1,2, figsize=(12,6), sharex=True)

# Get heatmap scatter plot data: colour each point by the estimated
# density of points around it.
x = cellsLogged
y = cellData[:,3]
xy = np.vstack([x,y])
z = gaussian_kde(xy)(xy)
# Sort by density so the densest points are drawn last (on top)
idx = z.argsort()
x, y, z = x[idx], y[idx], z[idx]

# Left panel: density-coloured scatter with the outlier thresholds.
# 'none' disables marker edges; an empty string is not a valid colour.
ax[0].scatter(x, y, c=z, s=10, edgecolor='none')
ax[0].axvline(x=loBound, ls=":", c="r")
ax[0].axvline(x=hiBound, ls=":", c="r")
ax[0].set_title("Density heatmap visualisation of cell data")
ax[0].set_ylabel("Detected vacuoles")
ax[0].set_xlabel(r"Size of cell ($log_{10}(pixels^2)$)")

# Right panel: histograms of outliers and inliers with the thresholds
ax[1].hist(cellsLogged[mask], bins=200, label="Outliers")
ax[1].hist(cellsLogged[~mask], bins=50, label="Inliers", color="r")
ax[1].axvline(x=loBound, ls=":", c="r")
ax[1].axvline(x=hiBound, ls=":", c="r")
ax[1].set_title("Histogram of logarithmic cell sizes")
ax[1].set_ylabel("Population")
ax[1].set_xlabel(r"Size of cell ($log_{10}(pixels^2)$)")
ax[1].legend(loc=2)

plt.tight_layout()
plt.savefig("./figures/interquartRangeLS.pdf")