In [None]:
%load_ext watermark
%watermark -a Filippo_Valle -p pandas,numpy,scipy,matplotlib -m -v -g

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Single Tissue

In [None]:
# Dataset selector: the part of `tissue` before the first "_" picks the input
# table, and the full string is reused in the names of the saved figures.
tissue = "Muscle_"
tissues = tissue.split("_")
# Alternative inputs, currently disabled:
#df = pd.read_csv(f"../Smartseq3.HEK.fwdprimer.UMIcounts.txt", sep="\t", index_col=0, header=0)
#ortho = pd.read_csv("orthologues.txt").set_index("Gene stable ID")
#df = pd.read_csv(f"../Smartseq3.Fibroblasts.NovaSeq.UMIcounts.txt", sep="\t", index_col=0, header=0).join(ortho["Human gene stable ID"], how="inner").set_index("Human gene stable ID")

# The analysis runs on a random subset of 1000 columns of the table.  The draw
# is seeded so that Restart & Run All selects the same columns every time and
# the downstream figures are reproducible.
SAMPLE_SEED = 42
df = pd.read_csv(f"mca/mainTable_{tissues[0]}.csv", sep=",", index_col=0, header=0).sample(1000, axis=1, replace=False, random_state=SAMPLE_SEED)
#.join(
#    pd.read_csv(f"mca/mainTable_{tissues[1]}.csv", sep=",", index_col=0, header=0).sample(1000, axis=1, replace=False), how="inner"
#)

In [None]:
# Total counts of every column of df, then ordered from the largest total down.
M = df.sum(axis=0)
M = M.sort_values(ascending=False)

In [None]:
# Histogram (50 bins) of the per-column totals stored in M.
M.hist(bins=50)

## Core prediction

In [None]:
# Bin edges in total counts per cell: the 0%, 10%, ..., 90% quantiles of M.
# The trailing [:-1] removes the 100% quantile, so cells above the 90%
# quantile do not fall in any bin.
quantiles = np.quantile(M.values, q=np.linspace(0,1,11))[:-1]
#idxs = np.linspace(500,M.max(),num=20)

# For every occurrence threshold we collect, bin by bin, the fraction of rows
# of df whose occurrence exceeds the threshold: measured on the data (sizes)
# and predicted from sampling (sizes_pred).  Keys are the thresholds as "%.2f".
thetas_c = [0.6,0.65,0.7,0.75,0.8]
sizes = {"%.2f"%theta: [] for theta in thetas_c}
sizes_pred = {"%.2f"%theta: [] for theta in thetas_c}

# Average, over the columns, of each row's share of the column total.
P = df.divide(df.sum(0),1).mean(1)

#for (m_, m) in zip(M[idxs].index[:-1], M[idxs].index[1:]):
for (m_,m) in zip(quantiles[:-1], quantiles[1:]):
    print(f"[{m_},{m})")
    # Cells whose total lies in [m_, m).  Selected once and reused below.
    M_in_bin = M[(m_<=M)&(M<m)]
    r = len(M_in_bin)
    if r == 0:
        # Empty bin (possible when consecutive quantile edges coincide): there
        # is nothing to measure, so store NaN to keep every list aligned with
        # the bins instead of dividing by zero.
        for theta in thetas_c:
            sizes["%.2f"%theta].append(np.nan)
            sizes_pred["%.2f"%theta].append(np.nan)
        continue

    # Measured occurrence: fraction of the bin's cells in which a row is > 0.
    counts_in_bin = df.reindex(columns=M_in_bin.index).dropna(how="any", axis=1)
    O = (counts_in_bin > 0).mean(axis=1)

    # Predicted occurrence: 1 - (1/r) * sum over the bin's cells of exp(-P_i * M_cell),
    # evaluated for all rows at once through a (rows x cells) array.
    exponent = -P.values[:, None] * M_in_bin.values[None, :]
    O_pred = 1 - np.exp(exponent).sum(axis=1)/r

    for theta in thetas_c:
        sizes["%.2f"%theta].append((O>theta).mean())
        sizes_pred["%.2f"%theta].append((O_pred>theta).mean())
    del O

In [None]:
# x coordinate of each bin: the midpoint between consecutive quantile edges.
q_means = 0.5 * (quantiles[:-1] + quantiles[1:])
#q_means = (M[idxs].index[1:]+M[idxs].index[:-1])/2.

M_bin = q_means

# Number of rows of df.
N = len(df.index)

def fit_teo(X_data, gamma, thetac, N=None):
    """
    Predicted core size as a function of the total counts.

    Evaluates

        c = X_data**(1/gamma) / (alpha**(1/gamma) * N) * (-log(1 - thetac))**(-1/gamma)

    with alpha = sum_{i=1..N} i**(-gamma).

    - X_data: total counts at which the prediction is evaluated
      (scalar or numpy array)
    - gamma: Zipf exponent
    - thetac: occurrence threshold (the logarithm requires thetac < 1)
    - N: number of ranks entering the normalisation alpha.  When omitted it is
      taken from the module-level `df` at call time, so it follows the table
      that is currently loaded rather than the one that existed when this
      cell was executed.

    Returns c, with the same shape as X_data.
    """
    if N is None:
        # NOTE(review): dropna(axis=1) removes columns, so this is the row
        # count of df; the expression is kept as originally written.
        N = df.dropna(how="all", axis=1).shape[0]

    M_bin = X_data

    # Ranks are built as floats: an integer array raised to a negative integer
    # power raises ValueError in NumPy, which made an integer gamma unusable.
    i = np.arange(1, N+1, step=1, dtype=float)
    alpha_i = i**(-gamma)
    alpha = np.sum(alpha_i, axis=0)

    # predicted core size
    k = M_bin**(1/gamma)/(alpha**(1/gamma)*N)
    c = k*(-np.log(1-thetac))**(-1/gamma)
    return c

In [None]:
from scipy.optimize import curve_fit
fig,ax = plt.subplots(figsize=(18,15))

# One base colour per threshold: the measured curve uses the base colour, the
# sampling prediction uses its "dark" variant with a dashed line.
palette = ["blue", "gray", "red", "orange", "green"]
for threshold, color in zip(sizes, palette):
    popt = [0.8, float(threshold)]
    ax.plot(q_means, sizes[threshold], lw=10, marker="o", ms=45, c=color, label="data")
    ax.plot(
        q_means,
        sizes_pred[threshold],
        c="dark"+color,
        lw=15,
        ls="--",
        label="sampling ($\\theta_c$=%.2f)"%(popt[1]),
        alpha=0.7,
    )
    #ax.plot(q_means, fit_teo(q_means, *popt), c="dark"+color, lw=15, ls="--", label="predicted ($\\theta_c$=%.2f)"%(popt[1]), alpha=0.7)

# Parameters of the disabled single-threshold comparison below.
threshold = 0.9
popt = [0.8, 0.9]
#ax.plot(q_means, sizes["0.9"], lw=15, marker="o", ms=30, c="gray", label=f"data with thr={threshold}")
#popt, pcov = curve_fit(fit_teo, q_means, sizes["0.9"], p0=[0.8, 0.95])
#ax.plot(q_means, fit_teo(q_means, *popt), lw=15, ls="--", label="predicted (gamma=%.2f; thetac=%.2f)"%(popt[0],popt[1]), alpha=0.8, c="orange")

ax.set_xlabel("Total reads per cell, $M$", fontsize=45)
ax.set_ylabel("Core size", fontsize=45)

# Leave headroom above the largest measured value (room for the legend).
largest_measured = max(max(values) for values in sizes.values())
ax.set_ylim(0., largest_measured*1.8)

ax.tick_params(labelsize=35, width=8, length=20)
ax.tick_params(which="minor", labelsize=35, width=5, length=15)

ax.legend(ncol=2,fontsize=30)
plt.show()
fig.savefig(f"U_core_prediction_{tissue}_allthr.pdf")

In [None]:
# Arrange the columns of df in the order of M's index.
df = df.loc[:, M.index]
# Row totals.
A = df.sum(axis=1)
# Each column is divided by its own total; f is the row-wise mean of those
# shares, ordered from the largest down.
f = df.divide(df.sum(axis=0), axis=1).mean(axis=1)
f = f.sort_values(ascending=False)
#df=df[df.index.isin(f[49:4500].index)]
#save_model(df,tissue="bonemarrow_M100",name="data", n_bins=15)
#mazzolini(np.repeat(M.mean(), len(M)), A/A.sum(), "bonemarrow_M_avg", n_bins=15)

In [None]:
fig,ax = plt.subplots(figsize=(18,15))
ax.set_ylabel("$Frequency, f_i$", fontsize=35)
ax.set_xlabel("$RANK, i$", fontsize=35)
ax.set_yscale('log')
ax.set_xscale('log')
#plt.ylim(1e-7,1)

# Power-law guide line over every rank 1..len(f).
x = np.arange(1, len(f)+1)
ax.plot(x, np.power(x,-0.8)*1e-1, 'g--', lw=10, label='$k*i^{-0.8}$')

# Frequencies in decreasing order, normalised by their total.  They are
# plotted against explicit ranks starting at 1: with the implicit x = 0..n-1
# the first point has x = 0, which cannot be drawn on a log axis, and every
# other point sits one rank to the left of the guide line.
f_ranked = np.sort(f.dropna())[::-1]/f.sum()
ax.plot(np.arange(1, len(f_ranked)+1), f_ranked, c='blue', lw=15, label='Genes')

# Optional overlay, drawn only when a variable named f_null exists.
if "f_null" in vars():
    f_null_ranked = np.sort(f_null/f_null.sum())[::-1]
    ax.plot(np.arange(1, len(f_null_ranked)+1), f_null_ranked, c='orange', ls='--', lw=15, label='null_model')

ax.tick_params(labelsize=35, width=8, length=20)
ax.tick_params(which="minor", labelsize=35, width=5, length=15)

ax.legend(fontsize=35)
plt.show()
#fig.savefig("zipf.pdf")

## Predict O

In [None]:
# Row-wise mean of the per-column shares, rescaled so that P sums to one.
P = df.divide(df.sum(axis=0), axis=1).mean(axis=1)
P = P.div(P.sum())
# Column totals, in the current column order of df.
M = df.sum(axis=0)
# Occurrence: fraction of the columns in which each row is strictly positive.
O = (df > 0).sum(axis=1) / df.shape[1]
# Number of columns.
R = len(M)

In [None]:
# Accumulators, filled chunk by chunk in the row order of O / P.
O_real = np.array([])
O_pred = np.array([])
O_pred_ds = np.array([])


step = 1500
for start in range(0, len(O), step):
    print(start)
    stop = start + step

    # Measured occurrences of the rows in this chunk.
    O_real = np.concatenate([O_real, O.iloc[start:stop]])

    # Predicted occurrence of each row: 1 - (1/R) * sum over columns of exp(-p * M).
    chunk_pred = [1-1/R*np.sum(np.exp(-p_row*M)) for p_row in P.iloc[start:stop]]
    O_pred = np.concatenate([O_pred, chunk_pred])

# Rows observed in fewer than 80% of the predicted fraction of columns.
genes = df.index[O_real < 0.8*O_pred].values

In [None]:
f = P

# Rows with a defined, strictly positive frequency.
pos_mask = f.notna() & (f > 0)

# Rows whose prediction overshoots the measurement by more than 0.2 ...
is_high = O_pred > O_real + 0.2
g_high = f.index[is_high].values
# ... and rows whose prediction lies below 0.6 * measurement - 0.03.
is_low = O_pred < O_real * 0.6 - 0.03
g_low = f.index[is_low].values

#mask_high = (f.index.isin(g_high)) & (pos_mask)
#mask_low = (f.index.isin(g_low)) & (pos_mask)
#mask_nc = (f.index.isin(nc)) & (pos_mask)

# Positive frequencies in decreasing order, with flags marking which of them
# belong to the two selections above.
f_sorted = f.loc[f > 0].sort_values(ascending=False)
mask_sorted_high = f_sorted.index.isin(g_high)
mask_sorted_low = f_sorted.index.isin(g_low)

In [None]:
def p(o, M, gamma, N):
    """
    Closed-form density evaluated at occurrence `o`.

    Computes

        p = (1-o)**(1/M - 1)
            / (gamma * M * N * alpha**(1/gamma) * (1 - (1-o)**(1/M))**(1 + 1/gamma))

    with alpha = sum_{i=1..N} i**(-gamma).

    - o: occurrence at which to evaluate; the expression is finite for 0 < o < 1
    - M: total counts (the notebook passes the mean over cells)
    - gamma: exponent of the rank power law
    - N: number of ranks entering alpha

    Returns the value of the expression above.
    """
    # Float ranks: an integer array raised to a negative integer power raises
    # ValueError in NumPy, so an integer gamma (e.g. gamma=1) used to fail.
    i = np.arange(1, N+1, step=1, dtype=float)
    alpha_i = i**(-gamma)
    alpha = np.sum(alpha_i, axis=0)
    p_num = (1-o)**(1/M-1)
    K = (1-(1-o)**(1/M))**(1+1/gamma)
    p_den = gamma*M*N*(alpha**(1/gamma))*K
    return p_num/p_den

In [None]:
fig, ax = plt.subplots(figsize=(18,15))

# Histogram edges: nbins points from 1e-4 up to half a bin width above 1.
nbins = 25
half_width = 0.5/nbins
rang = (0-half_width, 1+half_width)

#bins = np.logspace(np.log10(1e-2),np.log10(rang[1]),num=nbins)
bins = np.linspace(1e-4, rang[1], num=nbins)

# Measured occurrences as a filled, normalised histogram.
ax.hist(O, bins=bins, color = "gray", label="data", density=True)

# Predicted occurrences: same edges, drawn as a line through the bin centres.
O_pred_hist, bin_edges = np.histogram(O_pred, bins=bins, density=True)
bin_centres = (bin_edges[:-1]+bin_edges[1:])/2
ax.plot(bin_centres, O_pred_hist, ls='--', lw=10, c="red", label="O_predicted")
#ax.hist(load_tissue("Bone-Marrow_c-kit",name="mazzolini")["O"], histtype="step", ls='--', bins=bins, density=True, color="orange", lw=10, label="sampling")

# Closed-form curve p(o) evaluated at the bin edges, using the mean of M.
mean_M = M.mean()
n_rows = len(O)
teo_curve = [p(edge, mean_M, 0.8, n_rows) for edge in bins]
ax.plot(bins, teo_curve, lw=15, ls="--", color="darkred", label="teo_prediction <M>")

ax.set_xlabel("Occurrence, $O_i$", fontsize=35)
ax.set_ylabel("pdf", fontsize=35)

ax.set_yscale('log')
#ax.set_xscale('log')
ax.set_ylim(5e-4,1e2)

ax.tick_params(labelsize=35, width=8, length=20)
ax.tick_params(which="minor", width=5, length=10)

ax.legend(fontsize=35)

plt.tight_layout()
plt.show()
fig.savefig(f"U_{tissue}_sampling_pred.pdf")