In [None]:
import numpy as np

np.random.seed(42)

n = 10_000

# 1) mayoría: distribución normal
expression = np.random.normal(loc=10, scale=2, size=n)

# 2) introducir ceros (por ejemplo 5%)
# la funcion choice espera un array, lista o entero. Si es entero lo transforma en array de 0 al valor pasado.
zero_idx = np.random.choice(n, size=int(0.05*n), replace=False)
expression[zero_idx] = 0

# 3) introducir outliers altos (por ejemplo 1%)
remaining_idx = np.setdiff1d(np.arange(n), zero_idx)
outliers_idx = np.random.choice(remaining_idx, size=int(0.01*n), replace=False)
expression[outliers_idx] += 30

#Otra forma de hacerlo usando where:
# mask_zero = np.random.rand(n) < 0.05
# expression = np.where(mask_zero, 0, expression)


In [13]:
mean_expression = np.mean(expression)
std_expression = np.std(expression)
p1_expression, p50_expression, p99_expression = np.percentile(expression, [1, 50, 99])

mean_expression, std_expression, p1_expression, p50_expression, p99_expression

(np.float64(9.795728165513445),
 np.float64(4.209778703312631),
 np.float64(0.0),
 np.float64(9.890926773143363),
 np.float64(17.87634560974353))

In [25]:
# Filtrado de outliers segun percentile 1 y 99
expression_filtered_mask = (expression >= p1_expression) & (expression <= p99_expression)
expression_filtered = expression[expression_filtered_mask]

low, high = np.percentile(expression, [1, 99])
expression_clipped = np.clip(expression, low, high)

In [19]:
# Operación incorrecta a propósito: x/x (cuando x=0 -> 0/0 = NaN)
bad = expression_filtered / expression_filtered

# Esto detecta exactamente las posiciones donde expression era 0
zero_mask = np.isnan(bad)

# Comprobación rápida (opcional pero útil)
print("Ceros detectados:", zero_mask.sum())
print("Ceros reales:", (expression_filtered == 0).sum())

eps = 1e-8  # pequeño y > 0

log_expression = np.log(expression_filtered + eps)

print("Ejemplo (ceros ->):", log_expression[zero_mask][:5])
print("Ejemplo (otros valores ->):", log_expression[~zero_mask][:5])

Ceros detectados: 500
Ceros reales: 500
Ejemplo (ceros ->): [-18.42068074 -18.42068074 -18.42068074 -18.42068074 -18.42068074]
Ejemplo (otros valores ->): [2.39729767 2.2745427  2.56848615 2.25462238 2.25462582]


  bad = expression_filtered / expression_filtered


In [16]:
def summarize(x):
    return {
        "mean": np.mean(x),
        "std": np.std(x),
        "p1": np.percentile(x, 1),
        "p50": np.percentile(x, 50),
        "p99": np.percentile(x, 99),
    }


In [27]:

summary_raw = summarize(expression)
summary_clean = summarize(expression_filtered)
summary_clipped = summarize(expression_clipped)
summary_log = summarize(log_expression)

keys = np.array(["mean","std","p1","p50","p99"])
raw_vals = np.array([summary_raw[k] for k in keys])
clean_vals = np.array([summary_clean[k] for k in keys])
clipped_vals = np.array([summary_clipped[k] for k in keys])
log_vals = np.array([summary_log[k] for k in keys])
np.column_stack([keys, raw_vals, clean_vals, clipped_vals, log_vals])


array([['mean', '9.795728165513445', '9.49149248630511',
        '9.575341017539493', '1.235145565446647'],
       ['std', '4.209778703312631', '2.933424553349326',
        '3.035614741187846', '4.538021811028718'],
       ['p1', '0.0', '0.0', '0.0', '-18.420680743952367'],
       ['p50', '9.890926773143363', '9.872044403019936',
        '9.890926773143363', '2.2897069658355917'],
       ['p99', '17.87634560974353', '14.635307432373914',
        '17.70717180759382', '2.6834369269951486']], dtype='<U32')

While percentile filtering reduced dispersion by removing extreme values, the logarithmic transformation increased the standard deviation due to the expansion of low values close to zero. This is an expected effect of log transformations and highlights the importance of careful zero handling.

Outliers were handled using percentile-based filtering. Alternatively, clipping could be used to preserve array size while limiting extreme values, which may be preferable when index correspondence must be maintained, for example when working with gene-by-sample matrices.

The choice between filtering and clipping depends on whether removing observations or preserving structural alignment is more critical for downstream analysis.