In [1]:
import numpy as np
import pandas as pd
from scipy import interpolate as scint
from validphys.api import API
from validphys.loader import FallbackLoader
from validphys.theorycovariance.construction import compute_normalisation_by_experiment
from matplotlib import pyplot as plt
%matplotlib inline

l = FallbackLoader()

In [2]:
fitname = "240921_01_ht_preds"
fitname_ref = "240807-midcuts"

In [3]:
fit = API.fit(fit=fitname)
fit_ref = API.fit(fit=fitname_ref)

settings_dict = dict(
    dataset_inputs={"from_": "fit"},
    fit=fit.name,
    use_cuts="fromfit",
    metadata_group="nnpdf31_process",
    theory={"from_": "fit"},
    theoryid={"from_": "theory"},
    pdf={"from_": "fit"},
    datacuts={"from_": "fit"},
    t0pdfset={"from_": "datacuts"},
)

settings_dict_ref = dict(
    dataset_inputs={"from_": "fit"},
    fit=fit_ref.name,
    use_cuts="fromfit",
    metadata_group="nnpdf31_process",
    theory={"from_": "fit"},
    theoryid={"from_": "theory"},
    pdf={"from_": "fit"},
    datacuts={"from_": "fit"},
    t0pdfset={"from_": "datacuts"},
)

In [4]:
theorypreds_all = API.group_result_table_no_table(**settings_dict)
kinematics_all = API.group_kin_table_no_table(**settings_dict)

theorypreds_all_ref = API.group_result_table_no_table(**settings_dict_ref)

# Sanity check
try:
  pd.testing.assert_index_equal(kinematics_all.index, theorypreds_all.index)
except AssertionError as e:
  print("Different index")
  print(e)

LHAPDF 6.5.4 loading all 501 PDFs in set 240921_01_ht_preds
240921_01_ht_preds, version 1; 501 PDF members
LHAPDF 6.5.4 loading all 501 PDFs in set 240807-midcuts
240807-midcuts, version 1; 501 PDF members


In [5]:
theorypred_reps = theorypreds_all.iloc[:, 2:]
theorypdres_mean = theorypred_reps.mean(axis=1)
theorypred_rep0 = theorypreds_all.iloc[:, 1]

theorypred_reps_ref = theorypreds_all_ref.iloc[:, 2:]
theorypdres_mean_ref = theorypred_reps_ref.mean(axis=1)
theorypred_rep0_ref = theorypreds_all_ref.iloc[:, 1]

In [6]:
pseudodata = API.read_pdf_pseudodata(**settings_dict)
data_reps = pd.concat(
    [i.pseudodata.reindex(theorypreds_all.index) for i in pseudodata], axis=1
)
data_reps_mean = data_reps.mean(axis=1)
data_exp = theorypreds_all["data_central"]


data_exp_ref = theorypreds_all_ref["data_central"]

In [7]:
Ct0 = API.groups_covmat(
    use_t0=True,
    **settings_dict
)
Cexp = API.groups_covmat(
    use_t0=False,
    **settings_dict
)

Ct0_ref = API.groups_covmat(
    use_t0=True,
    **settings_dict_ref
)
Cexp_ref = API.groups_covmat(
    use_t0=False,
    **settings_dict_ref
)

LHAPDF 6.5.4 loading /opt/homebrew/Caskroom/miniconda/base/envs/nnpdf/share/LHAPDF/210715-n3fit-1000-001/210715-n3fit-1000-001_0000.dat
210715-n3fit-1000-001 PDF set, member #0, version 1
LHAPDF 6.5.4 loading /opt/homebrew/Caskroom/miniconda/base/envs/nnpdf/share/LHAPDF/210619-n3fit-001/210619-n3fit-001_0000.dat
210619-n3fit-001 PDF set, member #0, version 1


In [8]:
Cinvt0 = np.linalg.inv(Ct0)
Cinvexp = np.linalg.inv(Cexp)

Cinvt0_ref = np.linalg.inv(Ct0_ref)
Cinvexp_ref = np.linalg.inv(Cexp_ref)

In [9]:
ht_theory = pd.Series(data=np.zeros_like(theorypred_rep0.to_numpy()), index=theorypred_rep0.index )
ht_theory = pd.DataFrame(ht_theory, columns=['ht'])

In [10]:
# Concatenate kinematics to HT dataframe
try:
  pd.testing.assert_index_equal(kinematics_all.index, ht_theory.index)
  ht_theory = pd.concat([ht_theory, kinematics_all],axis=1)
except AssertionError as e:
  print("Different index")
  print(e)

In [11]:
# Posteriors from 240812-01-ABMP-large-prior-7k
x_knots = [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1]
y_h2_p = [-0.00441, 0.11169, -0.01632, 0.00000, -0.08742, -0.07279, 0.00000]
y_hl_p = [0.00000, -0.06241, -0.08655, -0.03306, 0.00000, -0.05987, 0.0000]
y_h2_d = [-0.04117, 0.00000, 0.03124, -0.01059, 0.04763, 0.00000, 0.00000]
y_hl_d = [0.00316, 0.00469, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]

H_2p = scint.CubicSpline(x_knots, y_h2_p)
H_lp = scint.CubicSpline(x_knots, y_hl_p)
H_2d = scint.CubicSpline(x_knots, y_h2_d)
H_ld = scint.CubicSpline(x_knots, y_hl_d)

H_2p = np.vectorize(H_2p)
H_lp = np.vectorize(H_lp)
H_2d = np.vectorize(H_2d)
H_ld = np.vectorize(H_ld)

## Compute the HT

In [12]:
# Deactivate performance warning when using df.loc
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

included_proc = ['DIS NC']
excluded_exp = {"DIS NC" : []}
for process_name, process_group in ht_theory.groupby(level='group'):
  for exp_name, exp_group in process_group.groupby(level='dataset'):
    if process_name in included_proc and exp_name not in excluded_exp[process_name]:
      x = exp_group.kin_1.to_numpy()
      q2 = exp_group.kin_2.to_numpy()
      y = exp_group.kin_3.to_numpy()
      N2, NL = compute_normalisation_by_experiment(exp_name, x, y, q2)
      if "_P_" in exp_name or "HERA" in exp_name:
        PC_2 = N2 * H_2p(x) / q2
        PC_L = NL * H_lp(x) / q2
      elif "_D_" in exp_name:
        PC_2 = N2 * H_2d(x) / q2
        PC_L = NL * H_ld(x) / q2
      else:
        # TODO
        # Need to implement this
        PC_2 = 0 / q2 #N2 * H_2d(x) / Q2
        PC_L = 0 / q2 #NL * H_ld(x) / Q2

      ht_theory.loc[(process_name, exp_name), 'ht'] = PC_2 + PC_L

## Collect process and experiment information

In [13]:
# Collecting name of nnpdf31_processes and experiments
process_list = []
exp_list = []
process_name = ''
exp_name = ''
for proc in theorypred_rep0.index.to_numpy():
  if proc[0] != process_name:
    process_name = proc[0]
    process_list.append(process_name)

  if proc[1] != exp_name:
    exp_name = proc[1]
    exp_list.append(exp_name)

# Compute global $\chi^2$

In [14]:
# With HT
ndat = theorypred_rep0.size
chi2t0 = (theorypred_rep0 +  ht_theory['ht']- data_exp) @ Cinvt0 @ (theorypred_rep0 +  ht_theory['ht'] - data_exp) / ndat
chi2t0_average = np.mean([(theorypred_reps[r] +  ht_theory['ht'] - data_exp.to_numpy()) @ Cinvt0 @ (theorypred_reps[r] +  ht_theory['ht'] - data_exp.to_numpy()) for r in theorypred_reps]) / ndat
chi2t0_meanT = (theorypred_reps.mean(axis=1) +  ht_theory['ht'] - data_exp) @ Cinvt0 @ (theorypred_reps.mean(axis=1) +  ht_theory['ht'] - data_exp) / ndat
chi2exp = (theorypred_rep0 +  ht_theory['ht'] - data_exp) @ Cinvexp @ (theorypred_rep0 +  ht_theory['ht'] - data_exp) / ndat

# Without HT
chi2t0_no_ht = (theorypred_rep0 - data_exp) @ Cinvt0 @ (theorypred_rep0 - data_exp) / ndat
chi2t0_average_no_ht = np.mean([(theorypred_reps[r]  - data_exp.to_numpy()) @ \
                                Cinvt0 @ \
                                (theorypred_reps[r]  - data_exp.to_numpy()) for r in theorypred_reps]) / ndat
chi2t0_meanT_no_ht = (theorypred_reps.mean(axis=1)  - data_exp) @ Cinvt0 @ (theorypred_reps.mean(axis=1) - data_exp) / ndat
chi2exp_no_ht = (theorypred_rep0  - data_exp) @ Cinvexp @ (theorypred_rep0  - data_exp) / ndat

# Reference fit
ndat_ref = theorypred_rep0_ref.size
chi2t0_ref = (theorypred_rep0_ref - data_exp_ref) @ Cinvt0_ref @ (theorypred_rep0_ref  - data_exp_ref) / ndat_ref
chi2t0_average_ref = np.mean([(theorypred_reps_ref[r] - data_exp_ref.to_numpy()) @ Cinvt0_ref @ (theorypred_reps_ref[r] - data_exp_ref.to_numpy()) for r in theorypred_reps_ref]) / ndat_ref
chi2t0_meanT_ref = (theorypred_reps_ref.mean(axis=1) - data_exp_ref) @ Cinvt0_ref @ (theorypred_reps_ref.mean(axis=1) - data_exp_ref) / ndat_ref
chi2exp_ref = (theorypred_rep0_ref - data_exp_ref) @ Cinvexp_ref @ (theorypred_rep0_ref - data_exp_ref) / ndat_ref


if True:
  print(f"{"name":>15}{"chi2 w/o HT":>20}{"chi2 w/ HT":>15}")
  print(f"{"chi2t0":>15}{chi2t0:>15.4f}{chi2t0_no_ht:>15.4f}")
  print(f"{"chi2t0_average":>15}{chi2t0_average:>15.4f}{chi2t0_average_no_ht:>15.4f}")
  print(f"{"chi2t0_meanT":>15}{chi2t0_meanT:>15.4f}{chi2t0_meanT_no_ht:>15.4f}")
  print(f"{"chi2exp":>15}{chi2exp:>15.4f}{chi2exp_no_ht:>15.4f}")

if True:
    print(f"{"name":>15}{"Baseline no HT":>20}{"With HT":>10}")
    print(f"{"chi2t0":>15}{chi2t0_ref:>15.4f}{chi2t0:>15.4f}")
    print(f"{"chi2t0_average":>15}{chi2t0_average_ref:>15.4f}{chi2t0_average:>15.4f}")
    print(f"{"chi2t0_meanT":>15}{chi2t0_meanT_ref:>15.4f}{chi2t0_meanT:>15.4f}")
    print(f"{"chi2exp":>15}{chi2exp_ref:>15.4f}{chi2exp:>15.4f}")

           name         chi2 w/o HT     chi2 w/ HT
         chi2t0         1.3504         1.2869
 chi2t0_average         1.3757         1.3122
   chi2t0_meanT         1.3502         1.2867
        chi2exp         1.2809         1.2088
           name      Baseline no HT   With HT
         chi2t0         1.2648         1.3504
 chi2t0_average         1.2883         1.3757
   chi2t0_meanT         1.2646         1.3502
        chi2exp         1.1936         1.2809


## Compute $\chi^2$ per process

In [15]:
chi2_dict = {}
collector = 0
for name in process_list:
  theorypred_rep0_PROC = theorypred_rep0.loc[[name]].to_numpy()
  data_exp_PROC = data_exp.loc[[name]].to_numpy()

  # For reference fit
  theorypred_rep0_PROC_ref = theorypred_rep0_ref.loc[[name]].to_numpy()
  data_exp_PROC_ref = data_exp_ref.loc[[name]].to_numpy()

  ht_PROC = ht_theory.loc[[name], 'ht'].to_numpy()

  try:
    assert(theorypred_rep0_PROC.shape == data_exp_PROC.shape == ht_PROC.shape)
  except AssertionError as e:
    print("Problem with the shape")
    print(e)

  # For reference fit
  try:
    assert(theorypred_rep0_PROC_ref.shape == data_exp_PROC_ref.shape)
  except AssertionError as e:
    print("Problem with the shape")
    print(e)
    
  Ndata_PROC = theorypred_rep0_PROC.shape[0]
  Cinvexp_PROC = Cinvexp[collector:collector+Ndata_PROC, collector:collector+Ndata_PROC]

  # For reference fit
  Ndata_PROC_ref = theorypred_rep0_PROC_ref.shape[0]
  try:
    assert(Ndata_PROC_ref == Ndata_PROC)
  except AssertionError as e:
    print("Fit and reference fit do not have the same number of points")
    print(e)
    exit
  Cinvexp_PROC_ref = Cinvexp_ref[collector:collector + Ndata_PROC_ref, collector:collector + Ndata_PROC_ref]

  collector += Ndata_PROC

  chi2exp_PROC = (theorypred_rep0_PROC +  ht_PROC - data_exp_PROC) @ Cinvexp_PROC @ (theorypred_rep0_PROC +  ht_PROC - data_exp_PROC) / Ndata_PROC
  chi2exp_no_ht_PROC = (theorypred_rep0_PROC - data_exp_PROC) @ Cinvexp_PROC @ (theorypred_rep0_PROC - data_exp_PROC) / Ndata_PROC

  # For reference fit
  chi2exp_PROC_ref = (theorypred_rep0_PROC_ref - data_exp_PROC_ref) @ Cinvexp_PROC_ref @ (theorypred_rep0_PROC_ref - data_exp_PROC_ref) / Ndata_PROC_ref

  chi2_dict[name] = {"HT": chi2exp_PROC, "NO_HT": chi2exp_no_ht_PROC, "ref": chi2exp_PROC_ref}

if True:
  print(f"{"name":>10}{"chi2 w/o HT":>20}{"chi2 w/ HT":>15}")
  for name in chi2_dict.keys():
    print(f"{name:>10}{chi2_dict[name]["NO_HT"]:>15.4f}{chi2_dict[name]["HT"]:>15.4f}")

if True:
  print(f"{"name":>15}{"Baseline no HT":>20}{"With HT":>10}")
  for name in chi2_dict.keys():
    print(f"{name:>14}{chi2_dict[name]["ref"]:>15.4f}{chi2_dict[name]["HT"]:>15.4f}")


      name         chi2 w/o HT     chi2 w/ HT
    DIS NC         1.2466         1.4022
    DIS CC         0.9485         0.9485
     DY NC         1.2913         1.2913
     DY CC         1.6754         1.6754
       TOP         1.1698         1.1698
      JETS         1.1660         1.1660
     DIJET         2.2998         2.2998
    PHOTON         0.8291         0.8291
 SINGLETOP         0.3477         0.3477
           name      Baseline no HT   With HT
        DIS NC         1.2492         1.4022
        DIS CC         0.9516         0.9485
         DY NC         1.2533         1.2913
         DY CC         1.6982         1.6754
           TOP         1.2836         1.1698
          JETS         1.0420         1.1660
         DIJET         2.2082         2.2998
        PHOTON         0.7839         0.8291
     SINGLETOP         0.3566         0.3477


## Compute $\chi^2$ per experiment

In [16]:
chi2_dict_exp = {}
collector = 0
for name in exp_list:
  theorypred_rep0_exp = theorypred_rep0.xs(name, level="dataset").to_numpy()
  data_exp_exp = data_exp.xs(name, level="dataset").to_numpy()
  ht_exp = ht_theory.xs(name, level='dataset')['ht'].to_numpy()
  try:
    assert(theorypred_rep0_exp.shape == ht_exp.shape == data_exp_exp.shape)
  except AssertionError as e:
    print("Problem with the shape")
    print(e)

  # For reference fit
  theorypred_rep0_exp_ref = theorypred_rep0_ref.xs(name, level="dataset").to_numpy()
  data_exp_exp_ref = data_exp_ref.xs(name, level="dataset").to_numpy()
    
  ndata_exp = theorypred_rep0_exp.shape[0]
  Cinvexp_exp = Cinvexp[collector:collector + ndata_exp, collector:collector + ndata_exp]

  # For reference fit
  ndata_exp_ref = theorypred_rep0_exp_ref.shape[0]
  Cinvexp_exp_ref = Cinvexp_ref[collector:collector + ndata_exp_ref, collector:collector + ndata_exp_ref]

  collector += ndata_exp

  chi2exp_exp = (theorypred_rep0_exp +  ht_exp - data_exp_exp) @ Cinvexp_exp @ (theorypred_rep0_exp +  ht_exp - data_exp_exp) / ndata_exp
  chi2exp_no_ht_exp = (theorypred_rep0_exp - data_exp_exp) @ Cinvexp_exp @ (theorypred_rep0_exp - data_exp_exp) / ndata_exp

  chi2exp_no_ht_exp_ref = (theorypred_rep0_exp_ref - data_exp_exp_ref) @ Cinvexp_exp_ref @ (theorypred_rep0_exp_ref - data_exp_exp_ref) / ndata_exp_ref

  chi2_dict_exp[name] = {"HT": chi2exp_exp, 
                         "NO_HT": chi2exp_no_ht_exp, 
                         "ref": chi2exp_no_ht_exp_ref,
                         "show": True if ht_theory.xs(name, level='dataset')['ht'].index[0][0] == 'DIS NC' else False}

if True:
  print(f"{"name":>30}{"chi2 w/o HT":>30}{"chi2 w/ HT":>15}")
  for name in chi2_dict_exp.keys():
    if chi2_dict_exp[name]['show']:
      print(f"{name:>40}{chi2_dict_exp[name]["NO_HT"]:>15.4f}{chi2_dict_exp[name]["HT"]:>15.4f}")

if True:
  print(f"{"name":>40}{"Baseline no HT":>20}{"With HT":>10}")
  for name in chi2_dict_exp.keys():
    if chi2_dict_exp[name]['show']:
      print(f"{name:>40}{chi2_dict_exp[name]["ref"]:>15.4f}{chi2_dict_exp[name]["HT"]:>15.4f}")

                          name                   chi2 w/o HT     chi2 w/ HT
                NMC_NC_NOTFIXED_DW_EM-F2         0.8828         0.8828
           NMC_NC_NOTFIXED_P_EM-SIGMARED         1.6665         1.8899
             SLAC_NC_NOTFIXED_P_DW_EM-F2         0.9829         2.8635
             SLAC_NC_NOTFIXED_D_DW_EM-F2         0.7522         1.1665
            BCDMS_NC_NOTFIXED_P_DW_EM-F2         1.5464         1.6274
            BCDMS_NC_NOTFIXED_D_DW_EM-F2         1.3437         1.4423
              HERA_NC_318GEV_EM-SIGMARED         1.5484         1.5462
              HERA_NC_225GEV_EP-SIGMARED         1.4122         1.4118
              HERA_NC_251GEV_EP-SIGMARED         1.2017         1.2022
              HERA_NC_300GEV_EP-SIGMARED         1.6596         1.6726
              HERA_NC_318GEV_EP-SIGMARED         1.4737         1.4570
      HERA_NC_318GEV_EAVG_CHARM-SIGMARED         1.9972         1.9932
     HERA_NC_318GEV_EAVG_BOTTOM-SIGMARED         1.4602         1.6043
 

# Compute pseudodata

In [17]:
chi2t0_pseudodata = (theorypred_rep0 - data_reps_mean) @ Cinvt0 @ (theorypred_rep0 - data_reps_mean) / ndat
chi2t0_average_pseudodata = np.mean([(theorypred_reps[r] - data_reps_mean.to_numpy()) @ Cinvt0 @ (theorypred_reps[r] - data_reps_mean.to_numpy()) for r in theorypred_reps]) / ndat
chi2t0_meanT_pseudodata = (theorypred_reps.mean(axis=1) - data_reps_mean) @ Cinvt0 @ (theorypred_reps.mean(axis=1) - data_reps_mean) / ndat
chi2exp_pseudodata = (theorypred_rep0 - data_reps_mean) @ Cinvexp @ (theorypred_rep0 - data_reps_mean) / ndat