In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from validphys.loader import FallbackLoader as Loader
from validphys.api import API
from collections import defaultdict
from scipy.stats import norm
from validphys.plotutils import kde_plot

In [None]:
# Names of the fits to analyse. The numeric suffix runs 01140, 01150, ..., 01250
# (presumably encoding the alpha_s value of each fit -- confirm against the fit runcards).
first_label, last_label, label_step = 1140, 1250, 10
fit_names = []
for n in range(first_label, last_label + 1, label_step):
    fit_names.append(f"240526-rs-mhou-alphas_0{n}")


In [None]:
# Resolve every fit name to a fit object through the loader; the objects are reused
# below for their runcard (`as_input()`), `name` and `path`.
l = Loader()
fits = list(map(l.check_fit, fit_names))

# Correlated Replica Method

In [None]:
# Group the fits by the "alphas" entry of the theory each of them was run with:
# as_fits maps alphas value -> list of fits sharing that value.
as_fits = {}
for fit in fits:
    fit_theoryid = fit.as_input()["theory"]["theoryid"]
    theory_info = API.theory_info_table(theory_db_id=fit_theoryid)
    fit_alphas = theory_info.loc["alphas"].iloc[0]
    as_fits.setdefault(fit_alphas, []).append(fit)

In [None]:
# Per-fit lookups keyed by the fit object: the fitted replica indexes and the
# per-replica fit data returned by the validphys API.
indexes = {}
for fit in fits:
    indexes[fit] = API.fitted_replica_indexes(pdf=fit.name)

replica_data = {}
for fit in fits:
    replica_data[fit] = API.replica_data(fit=fit.name)

In [None]:
def measure(replica_data):
    """Return the figure of merit of a single replica.

    The value is the weighted sum ``3 * training + 1 * validation`` of the
    ``training`` and ``validation`` attributes of ``replica_data``.
    """
    training_weight, validation_weight = 3, 1
    weighted_training = replica_data.training * training_weight
    weighted_validation = replica_data.validation * validation_weight
    return weighted_training + weighted_validation
    # Alternative figure of merit: return replica_data.chi2

In [None]:
# For every alphas value, evaluate `measure` on each replica of each fit and keep,
# replica by replica, the smallest value found among the fits sharing that alphas.
min_values = {}
for alphas, fits_at_alphas in as_fits.items():
    measure_per_fit = [
        pd.Series([measure(rep) for rep in replica_data[fit]], index=indexes[fit])
        for fit in fits_at_alphas
    ]
    # Rows are fits and columns are replica indexes, so the column-wise minimum
    # selects the best fit for each replica.
    min_values[alphas] = pd.DataFrame(measure_per_fit).min()

# Rows: replica index, columns: alphas value
data = pd.DataFrame(min_values)

In [None]:
# Fit a parabola in alphas to the profile of every replica and store the position
# of its vertex, -b / (2a).
parabola_vertices = {}
for replica, profile in data.iterrows():
    quadratic, linear, _ = np.polyfit(data.columns, profile, 2)
    if np.isnan(linear):
        # NaN if not all replicas passed postfit
        continue
    parabola_vertices[replica] = -linear / 2 / quadratic
    # Alternative (grid minimum instead of the parabola vertex):
    # parabola_vertices[replica] = data.columns[np.where(profile==profile.min())][0]

mins = pd.Series(parabola_vertices)

In [None]:
# Summary statistics of the per-replica minima.
# `describe` labels the requested percentiles "16%" and "84%"; they are selected by
# label because integer keys on a string-labelled Series are treated as positions
# only through a deprecated fallback that is removed in newer pandas versions.
summary = mins.describe(percentiles=[0.16, 0.84])
lower_68, upper_68 = summary["16%"], summary["84%"]

print(summary)
print("")
print(f"cv±std = {mins.mean():.5f} ± {mins.std():.5f} ")
print(f"1std interval:  {mins.mean()-mins.std():.5f} to {mins.mean()+mins.std():.5f} ")
print(f"68% c.i:        {lower_68:.5f} to {upper_68:.5f} ")

In [None]:
# Kernel density estimate of the per-replica minima, titled with the centre and
# half-width of the central 68% interval.
fig, ax = plt.subplots()
kde_plot(mins,ax=ax)
# Select the percentiles by label: integer keys on the string-labelled `describe`
# output rely on a deprecated positional fallback.
summary = mins.describe(percentiles=[0.16, 0.84])
lower_68, upper_68 = summary["16%"], summary["84%"]
central = (upper_68 + lower_68)/2
unc = (upper_68 - lower_68)/2
ax.set_title(f"68% c.i: {central:.5f}  ± {unc:.5f}  -- MHOU")
ax.set_xlim(0.118,0.13)
ax.set_xlabel(r"$\alpha_s$")

In [None]:
# Histogram of the per-replica minima with one bin centred on every alphas grid value,
# overlaid with a Gaussian of the same mean and standard deviation.
# N bin centres need N + 1 edges: without the explicit upper edge of the last bin the
# largest grid value has no bin and minima above the last edge are silently dropped.
# NOTE(review): the half-width assumes the grid is ascending with spacing 0.001 -- confirm.
half_bin_width = 0.0005
bin_centres = np.asarray(data.columns, dtype=float)
bin_edges = np.append(bin_centres - half_bin_width, bin_centres[-1] + half_bin_width)
plt.hist(mins,bins=bin_edges,edgecolor='black',density=True)
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
# p = np.exp(-((x-mins.mean())/mins.std())**2/2)*mins.size/np.sqrt(2*np.pi)
p = norm.pdf(x, mins.mean(), mins.std())
plt.plot(x,p,'k',label=f"{mins.mean():.5f} +/- {mins.std():.5f}")
plt.yticks([])
plt.legend()

In [None]:
# Plot parabola per replica: one figure for each row of `data`, showing the profile
# points and the fitted second-order polynomial.

xgrid = np.linspace(min(data.columns),max(data.columns))
for row_number in range(data.shape[0]):
    replica_profile = data.iloc[row_number]
    a, b, c = np.polyfit(data.columns, replica_profile, 2)
    fitted_curve = a*xgrid**2 + b*xgrid + c
    plt.figure()
    plt.plot(data.columns, replica_profile, '.')
    plt.plot(xgrid, fitted_curve, color="black", linestyle="--")


In [None]:
# Overlay the profiles of all replicas: one line per replica against the alphas grid
profiles_by_alphas = data.T.to_numpy()
plt.plot(data.columns, profiles_by_alphas)

# Experimental/naive method

In [None]:
# Inputs shared by the validphys API calls of this section. Apart from "fit", the
# entries are `from_` references (presumably resolved from the fit's runcard);
# "fit" is reassigned for every fit name in the chi2 loop further down.
naive_dict = {
    "fit": fit_names[0],
    "dataset_inputs": {"from_": "fit"},
    "pdf": {"from_": "fit"},
    "use_cuts": "fromfit",
    "theory": {"from_": "fit"},
    "theoryid": {"from_": "theory"},
}

# Experimental covariance matrix
# C = API.groups_covmat(
#     use_t0 = False,
#     **naive_dict
# )

# t0 covariance matrix (the correct one, see bottom of page 15 of https://arxiv.org/pdf/1802.03398)
# It is built once, from the first fit, with the t0 theory id stored in that fit's runcard.
t0_theoryid = API.fit(fit=fit_names[0]).as_input()["theory"]["t0theoryid"]
C = API.groups_covmat(
    fit=fit_names[0],
    use_t0=True,
    use_cuts="fromfit",
    datacuts={"from_": "fit"},
    t0pdfset={"from_": "datacuts"},
    dataset_inputs={"from_": "fit"},
    theoryid=t0_theoryid,
)




In [None]:
# Inverse covariance matrix used for the chi2 below: the t0 covmat `C` plus the theory
# covmat stored with the first fit when that table exists, otherwise `C` alone.
theory_covmat_path = (
    fits[0].path / "tables/datacuts_theory_theorycovmatconfig_theory_covmat_custom.csv"
)
try:
    stored_covmat = pd.read_csv(
        theory_covmat_path,
        index_col=[0, 1, 2],
        header=[0, 1, 2],
        sep="\t|,",
        engine="python",
    ).fillna(0)
except FileNotFoundError:
    # Only a missing table means "this fit has no stored theory covmat". Any other
    # failure (parsing, index mismatch, singular matrix) is a real problem and must not
    # be silently replaced by a covmat without the theory contribution.
    print(f"No stored theory covmat found at {theory_covmat_path}: using the t0 covmat only")
    invcov = np.linalg.inv(C)
else:
    storedcovmat_index = pd.MultiIndex.from_tuples(
        [(aa, bb, np.int64(cc)) for aa, bb, cc in stored_covmat.index],
        names=["group", "dataset", "id"],
    ).droplevel(0)  # make sure "id" is an integer, same as in C, and drop the group since that may differ
    stored_covmat = pd.DataFrame(
        stored_covmat.values, index=storedcovmat_index, columns=storedcovmat_index
    )
    # Bring rows and columns of the stored covmat into the same (dataset, id) order as C
    stored_covmat = stored_covmat.reindex(C.index.droplevel(0)).T.reindex(C.index.droplevel(0))
    t0covmat = pd.DataFrame(
        C.values, index=C.index.droplevel(0), columns=C.index.droplevel(0)
    )
    invcov = np.linalg.inv(t0covmat+stored_covmat)

In [None]:
# For every fit: look up the alphas value of its theory and compute the chi2 per data
# point of the central predictions with respect to the central data, using `invcov`.
chi2_values, alphas_values = [], []
for fitname in fit_names:
    naive_dict["fit"] = fitname
    central_results = API.group_result_central_table_no_table(**naive_dict)

    fit_theoryid = API.fit(fit=fitname).as_input()["theory"]["theoryid"]
    theory_info = API.theory_info_table(theory_db_id=fit_theoryid)
    alphas_values.append(theory_info.loc["alphas"].iloc[0])

    # chi2 normalised by the number of data points
    residual = central_results.theory_central - central_results.data_central
    chi2_values.append(residual @ invcov @ residual / residual.size)


In [None]:
# Parabolic fit of the chi2 per data point as a function of alphas.
a, b, c = np.polyfit(alphas_values, chi2_values, 2)

# Vertex of the parabola
central = -b / 2 / a
# The fitted chi2 is per data point, so the curvature is rescaled by the number of
# data points before taking the uncertainty as sqrt(1 / (a * ndata)).
ndata = C.shape[0]
unc = np.sqrt(1/a/ndata)

xgrid = np.linspace(min(alphas_values),max(alphas_values))
fitted_parabola = a*xgrid*xgrid + b*xgrid + c
plt.scatter(alphas_values, chi2_values, color="blue" )
plt.plot(xgrid, fitted_parabola, color="black", linestyle="--")
plt.title(rf"$\alpha_s$={central:.4f}$\pm${unc:.4f}")
print(f"{central:.4f} ± {unc:.4f}")