In [None]:
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import logging, sys, gc
import pandas as pd
from scipy.stats import binned_statistic
from scipy.optimize import curve_fit
from frontiers_analysis import load_tissue

In [None]:
# Notebook configuration: the two tissues being compared and the plot colour
# used for each (same order in both lists).
tissues = [
    "Placenta",
    "Lung",
]
colors = [
    "orange",
    "blue",
]

# Convenience names for the first and second tissue.
tissue = tissues[0]
other_tissue = tissues[1]

In [None]:
# Count table of the first tissue; the first CSV column becomes the row index.
# NOTE(review): rows are presumably genes and columns cells — confirm against the data.
df = pd.read_csv(f"mca/mainTable_{tissue}.csv", index_col=0)

# M: total count of every column.
M = df.sum(axis=0)

# f: for every row, its share of each column total, averaged over all columns.
f = df.divide(M, axis=1).mean(axis=1)

# O: for every row, the number of columns holding a non-zero count.
# Vectorised comparison instead of a Python-level apply over every row.
O = (df > 0).sum(axis=1)

In [None]:
# Count table of the second tissue; the first CSV column becomes the row index.
# NOTE(review): rows are presumably genes and columns cells — confirm against the data.
other_df = pd.read_csv(f"mca/mainTable_{other_tissue}.csv", index_col=0)

# other_M: total count of every column.
other_M = other_df.sum(axis=0)

# other_f: for every row, its share of each column total, averaged over all columns.
other_f = other_df.divide(other_M, axis=1).mean(axis=1)

# other_O: for every row, the number of columns holding a non-zero count.
# Vectorised comparison instead of a Python-level apply over every row.
other_O = (other_df > 0).sum(axis=1)

In [None]:
# Side-by-side union of the two tissue tables: the outer join keeps every row
# label found in either table, and rows missing from one table get zero counts.
merged_df = df.join(other_df, how="outer").fillna(0)
print(merged_df.shape)

# merged_M: total count of every column of the merged table.
merged_M = merged_df.sum(axis=0)

# merged_f: for every row, its share of each column total, averaged over all columns.
merged_f = merged_df.divide(merged_M, axis=1).mean(axis=1)

# merged_O: for every row, the number of columns holding a non-zero count.
# Vectorised comparison instead of a Python-level apply over every row.
merged_O = (merged_df > 0).sum(axis=1)

In [None]:
from methods import mazzolini as sampling

## Create models

In [None]:
# One sampling model per tissue, built from that tissue's column totals and
# mean row frequencies. Construction order: first tissue, then second tissue.
models = [
    sampling(M=totals, f=freqs)
    for totals, freqs in ((M, f), (other_M, other_f))
]
model, other_model = models

# Print each model, then run it.
for method in models:
    print(method)
    method.run()

# Same model built on the merged table; run after the per-tissue models.
merged_model = sampling(M=merged_M, f=merged_f)
merged_model.run()

## Zipf's law: rank–frequency curves

In [None]:
# Empirical curves: frequencies normalised to sum to one and sorted in
# decreasing order, one thick translucent line per tissue.
for freqs, colour, name in zip((f, other_f), colors, tissues):
    ranked = np.sort(freqs/freqs.sum())[::-1]
    plt.plot(ranked, lw=10, c=colour, alpha=0.5, label=name)

# Model curves: dotted lines in the "dark" variant of each tissue colour.
for model, c in zip(models, colors):
    plt.plot(model.get_f(), lw=10, alpha=0.5, ls=":", c="dark"+c)

# Log-log axes.
plt.xscale("log")
plt.yscale("log")

plt.xlabel("i")
plt.ylabel("f")

plt.legend()

## Heaps' law: number of non-zero components h vs. total counts M

In [None]:
def _binned_mean_var(x, y, bins, min_count):
    """Per-bin mean and variance of ``y`` when ``x`` is histogrammed into ``bins``.

    Only bins holding more than ``min_count`` points are kept in the returned
    means, variances and edges; the returned counts are unfiltered (one entry
    per bin).

    Returns
    -------
    (means, variances, left_edges, right_edges, counts)
    """
    mean, edges, _ = binned_statistic(x, y, statistic="mean", bins=bins)
    std, _, _ = binned_statistic(x, y, statistic="std", bins=bins)
    count, _, _ = binned_statistic(x, y, statistic="count", bins=bins)
    keep = count > min_count
    return mean[keep], (std*std)[keep], edges[:-1][keep], edges[1:][keep], count


# Logarithmically spaced bin edges spanning the observed column totals.
bins = np.logspace(np.log10(merged_M.min()), np.log10(merged_M.max()), 35)

# h: number of rows with a non-zero count in each column (vectorised).
h = (merged_df > 0).sum(axis=0)
plt.scatter(merged_M, h, c="gray", alpha=0.8, label="data")

# Binned mean of the data; bins with 10 points or fewer are dropped.
means, var, l_edges, r_edges, cnt = _binned_mean_var(merged_M, h, bins, min_count=10)
plt.hlines(means, l_edges, r_edges, lw=5, color="black", ls="--")

# The curve below belongs to the merged model, so its name and colour are read
# from `merged_model` rather than from whatever object a previous cell's loop
# happened to leave bound to `model`.
print(merged_model.name_)

# Evaluate the model prediction once so the statistics and the scatter use the
# same values.
model_h = merged_model.get_h()

# Binned mean/variance of the model; bins with 100 points or fewer are dropped.
means, var, l_edges, r_edges, cnt = _binned_mean_var(merged_M, model_h, bins, min_count=100)

# Store the binned summary on the model (cnt is the unfiltered per-bin count).
merged_model.hmean = means
merged_model.hvar = var
merged_model.cnt = cnt

plt.hlines(means, l_edges, r_edges, lw=5, color="dark"+merged_model.color_, ls="--")

plt.scatter(merged_M, model_h, alpha=0.2, c=merged_model.color_, label=merged_model.name_)

plt.xlabel("M")
plt.ylabel("h")

plt.legend()

### Fluctuations

In [None]:
# Variance of h against its mean, both computed per bin of column total
# (`bins` comes from the Heaps cell above).

# h: number of rows with a non-zero count in each column (vectorised).
h = (merged_df > 0).sum(axis=0)

means, edges, _ = binned_statistic(merged_M, h, bins=bins)
std, _, _ = binned_statistic(merged_M, h, statistic="std", bins=bins)
# Per-bin counts are computed here instead of reusing a `cnt` left over from an
# earlier cell, so this cell does not depend on hidden kernel state. The values
# are the same: counts depend only on merged_M and bins.
cnt, _, _ = binned_statistic(merged_M, h, statistic="count", bins=bins)
var = std*std

# Keep only well-populated bins (more than 100 columns).
mask = cnt > 100
means = means[mask]
var = var[mask]

plt.scatter(means, var, c="gray", alpha=0.8, label="data")

# One-parameter fit var = C * <h>.
popt_linear, _ = curve_fit(lambda x, C: C*x, means, var)
plt.plot(means, popt_linear[0]*means, lw=5, ls="--", c="cyan", alpha=0.8, label="C*<h>")

# One-parameter fit var = C * <h>^2.
popt_quadratic, _ = curve_fit(lambda x, C: C*x*x, means, var)
plt.plot(means, popt_quadratic[0]*means**2, lw=5, ls="--", c="purple", alpha=0.8, label ="C*<h>^2")

plt.xlabel("<h>")
plt.ylabel("var(h)")

plt.xscale("log")
plt.yscale("log")

plt.legend()

## Predicted occurrences

In [None]:
# Number of independent model runs averaged for every dataset.
N_SAMPLING_RUNS = 5


def average_sampled_occurrences(M, f, n_runs=N_SAMPLING_RUNS):
    """Average the occurrences returned by repeated runs of the sampling model.

    Parameters
    ----------
    M : column totals passed to ``sampling``.
    f : row frequencies passed to ``sampling``.
    n_runs : number of fresh models to build, run and average.

    Returns
    -------
    numpy.ndarray
        Element-wise average over the runs of each model's ``get_O()``.
    """
    occurrences = []
    for i in range(n_runs):
        method = sampling(M=M, f=f)
        method.run()
        print(i, method)
        occurrences.append(method.get_O())
    return np.average(occurrences, axis=0)


# Same procedure for the first tissue, the second tissue and the merged table.
O_sampling = average_sampled_occurrences(M, f)
O_other_sampling = average_sampled_occurrences(other_M, other_f)
O_merged_sampling = average_sampled_occurrences(merged_M, merged_f)

In [None]:
# Flag the rows of the merged table whose predicted occurrence exceeds the
# empirical occurrence fraction (merged_O divided by the number of columns)
# by more than 0.2. One-sided: under-predicted rows are not flagged.
predicted_minus_empirical = O_merged_sampling - merged_O/merged_df.shape[1]
mask = predicted_minus_empirical > 0.2

In [None]:
# Predicted vs. empirical occurrence, one panel per tissue plus one for the
# merged table. Empirical occurrence is the non-zero count of each row divided
# by the number of columns of the corresponding table.
fig, axs = plt.subplots(1, 1+len(models), figsize=(30, 15))

empirical_merged = merged_O/merged_df.shape[1]
panels = [
    (O/df.shape[1], O_sampling, "Sampling model {}".format(tissue)),
    (other_O/other_df.shape[1], O_other_sampling, "Sampling model {}".format(other_tissue)),
    (empirical_merged, O_merged_sampling, "Merged"),
]

# Titles are set once here; the model objects are not modified by this cell,
# so re-running earlier plotting cells still gives the same colours.
for ax, (empirical, predicted, title) in zip(axs, panels):
    ax.scatter(empirical, predicted, alpha=0.5, s=350, color="gray", marker="o")
    ax.set_title(title, fontsize=55)

# Highlight in red the rows selected by `mask` (computed in the previous cell).
axs[2].scatter(empirical_merged[mask], O_merged_sampling[mask], alpha=0.5, s=350, color="red", marker="o")

for ax in axs:
    # Dashed diagonal: perfect agreement between prediction and data.
    ax.plot([0,1], [0,1], lw=20, alpha=0.7,ls="--", c="black")
    ax.tick_params(labelsize=40, width=5, size=10)
    ax.set_xlabel("$o_i$, empirical", fontsize=65)
    ax.set_ylabel("$o_i$, predicted", fontsize=65)

plt.tight_layout()
# Save before showing so the file never depends on what the backend does to
# the figure while it is displayed.
fig.savefig(f"Oreal_Opred_poissonModel_{tissue}.pdf")
plt.show()

In [None]:
# List, one per line, the row labels selected by `mask`.
flagged_labels = merged_O[mask].index
for g in flagged_labels:
    print(g)