In [None]:
from glob import glob
from pathlib import Path

import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from cmdstanpy import stanfit

# `utils` is the package module that later cells call as utils.load_config /
# utils.DataHandler; the notebook-local helper module is aliased to
# `notebook_utils` below so the two do not collide.
# NOTE(review): assumes load_config and DataHandler live in nteprsm.utils — confirm.
from nteprsm import constants, utils
from settings import ROOT_DIR, DATA_DIR

import utils as notebook_utils

# Reload the notebook helper module *before* using it, so that edits made to
# it while the kernel is running are reflected in the template applied below
# (reloading after the call would only take effect on a second run of the cell).
from importlib import reload
reload(notebook_utils)

# Use the customized plotly template
notebook_utils.set_custom_template()


In [None]:
# Read the raw CSV that carries the entry code / entry name columns
csv_file_path = DATA_DIR / 'raw/quality_nj2.csv'
csv_data = pd.read_csv(csv_file_path)

# Entry mapping: one row per distinct (ENTRY_CODE, ENTRY_NAME) pair,
# re-indexed from zero
entry_pairs = csv_data.loc[:, ['ENTRY_CODE', 'ENTRY_NAME']]
entry_mapping = entry_pairs.drop_duplicates().reset_index(drop=True)

print("Entry Mapping (first few rows):")
print(entry_mapping.head())


In [None]:
# Read the saved trace (PyMC model output) from its NetCDF file
netcdf_file_path = DATA_DIR / 'model_output/trace_20240715_230759.netcdf'
trace = az.from_netcdf(netcdf_file_path)

# Full diagnostic table for the trace; none of the cells visible in this
# notebook read it afterwards.
summary = az.summary(trace)

# Raw posterior draws of the `entry` variable as a NumPy array
entry_samples_pymc = trace.posterior['entry'].values

# Summary statistics are taken over the two leading axes
# (presumably chain and draw, leaving one value per entry — TODO confirm).
sample_axes = (0, 1)
entry_means_pymc = entry_samples_pymc.mean(axis=sample_axes)
entry_stds_pymc = entry_samples_pymc.std(axis=sample_axes)
# Both bounds of the central 95% interval in a single percentile call
entry_2_5_percentile_pymc, entry_97_5_percentile_pymc = np.percentile(
    entry_samples_pymc, [2.5, 97.5], axis=sample_axes
)

print("Shape of entry_samples_pymc:", entry_samples_pymc.shape)
print("Number of entries:", len(entry_means_pymc))


In [None]:
# Tabulate the PyMC posterior summaries, one row per entry.
# Entry codes are assigned positionally, starting at 1.
n_entries = len(entry_means_pymc)
pymc_summary = pd.DataFrame({
    'ENTRY_CODE': np.arange(1, n_entries + 1),
    'EFF_RSM_PyMC': entry_means_pymc,
    'STD_RSM_PyMC': entry_stds_pymc,
    '2.5RSM_PyMC': entry_2_5_percentile_pymc,
    '97.5RSM_PyMC': entry_97_5_percentile_pymc
})

# Attach entry names; a left join keeps every model row even when no name matches
pymc_model_data = pymc_summary.merge(entry_mapping, on='ENTRY_CODE', how='left')

print("PyMC Model Data (first few rows):")
print(pymc_model_data.head())


In [None]:
# Load model configuration.
# `utils` is the module imported from the nteprsm package in the import cell
# (the notebook-local helper is `notebook_utils`).
config_file = ROOT_DIR / "config/nteprsm_njkbg07.yml"
config = utils.load_config(config_file)

# Load posterior samples from the CSV files written by the Stan run
stan_csv_pattern = ROOT_DIR / config["sampling"]["output_dir"] / "no_consistent_rater_model_dist_matrix-20240626_*.csv"
# glob() returns matches in arbitrary, filesystem-dependent order; sort them so
# the files (and therefore the rows of the stacked draws) load in a stable order.
files = sorted(glob(str(stan_csv_pattern)))
if not files:
    # Fail here with the offending pattern instead of deep inside cmdstanpy.
    raise FileNotFoundError(f"No Stan CSV files match {stan_csv_pattern}")
fit = stanfit.from_csv(files)


In [None]:
# Process data
# `utils` must be the nteprsm package module here (the notebook-local helper is
# aliased to `notebook_utils`) — TODO confirm DataHandler is defined there.
# The handler is pointed at the data file named by the config's "data_path",
# resolved relative to ROOT_DIR.
datahandler = utils.DataHandler(filepath=ROOT_DIR / config["data_path"])
# The three stages below are invoked in this fixed order; presumably each one
# depends on the state left by the previous call — verify against DataHandler.
datahandler.load_data()
datahandler.preprocess_data()
# Extra keyword arguments come from the config's "stan_additional_data"
# section; the return value (if any) is not kept.
datahandler.generate_stan_data(**config["stan_additional_data"])

In [None]:
def extract(variable, results, n_burnin=400):
    """Average the draws of one parameter into a per-element table.

    Parameters
    ----------
    variable : str
        Parameter name as it appears at the start of the draw column names,
        e.g. ``"entry"`` for columns ``entry[1]``, ``entry[2]``, ...
    results : pandas.DataFrame
        One row per draw, one column per parameter element.
    n_burnin : int, default 400
        Rows whose index label is ``<= n_burnin`` are discarded before
        averaging. Note that with a 0-based index this removes
        ``n_burnin + 1`` rows.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``<VARIABLE>_EFF`` (mean of the retained draws) and
        ``<VARIABLE>_CODE`` (1-based position of the element among the
        selected columns), with a fresh 0-based index.
    """
    prefix = variable.upper()
    col_name = prefix + "_EFF"
    var_name = prefix + "_CODE"

    results_data = results[results.index > n_burnin]
    columns = results_data.columns

    # Columns containing "free" are never part of the result.
    not_free = ~columns.str.contains("free")

    # Prefer the indexed element columns ("entry[1]", "entry[2]", ...). A bare
    # prefix match would also pull in sibling parameters that merely share the
    # prefix (e.g. "entry_sigma"), adding bogus rows and shifting the codes.
    selected = columns.str.startswith(variable + "[") & not_free
    if not selected.any():
        # No indexed columns (e.g. a scalar parameter): keep the original
        # prefix-based selection so existing callers still get their rows.
        selected = columns.str.startswith(variable) & not_free

    variable_data = results_data.loc[:, selected].mean().to_frame(name=col_name)
    variable_data = variable_data.reset_index(drop=True)

    # Codes are positional: the k-th selected column receives code k.
    variable_data[var_name] = np.arange(1, len(variable_data) + 1)
    return variable_data

# Extract the posterior-mean entry effects from the Stan draws.
# NOTE(review): draws_pd() is called with its defaults, and extract() then
# discards a further block of leading rows as burn-in — confirm this extra
# trimming is intended if the draws already exclude warmup.
entry_data = extract('entry', fit.draws_pd())

# Fall back to the raw CSV when the handler does not expose a `data` attribute
if not hasattr(datahandler, 'data'):
    datahandler.data = csv_data

# Merge with entry names (left join keeps every Stan parameter row)
entry_mapping = datahandler.data[['ENTRY_CODE', 'ENTRY_NAME']].drop_duplicates().reset_index(drop=True)
stan_entry_data = entry_data.merge(entry_mapping, on='ENTRY_CODE', how='left')

# Show only a preview, matching the label, instead of dumping the whole frame
print("Stan Entry Data (first few rows):")
print(stan_entry_data.head())

# Print some summary information
print(f"\nNumber of entry parameters: {len(stan_entry_data)}")

# Check for any missing matches: rows without a name had no ENTRY_CODE
# counterpart in the mapping
missing_matches = stan_entry_data[stan_entry_data['ENTRY_NAME'].isnull()]
if not missing_matches.empty:
    print("\nWarning: Some entries didn't match with names:")
    print(missing_matches)


In [None]:
# Keep only the rows that were matched to an entry name in each result set
pymc_model_data = pymc_model_data[pymc_model_data['ENTRY_NAME'].notna()]
stan_entry_data = stan_entry_data[stan_entry_data['ENTRY_NAME'].notna()]

# Inner-join the two result sets on the entry name; columns present in both
# frames are disambiguated with the _PyMC / _Stan suffixes
merged_data = pymc_model_data.merge(
    stan_entry_data,
    on='ENTRY_NAME',
    suffixes=('_PyMC', '_Stan'),
)

print("Merged Data (first few rows):")
print(merged_data.head())



In [None]:
# Entries whose Stan and PyMC posterior means differ by more than this
# amount are labelled with their name on the plot.
LABEL_THRESHOLD = 0.5

stan_eff = merged_data['ENTRY_EFF']
pymc_eff = merged_data['EFF_RSM_PyMC']

fig, ax = plt.subplots(figsize=(12, 8))

# One point per entry, coloured by the PyMC posterior standard deviation
scatter = ax.scatter(stan_eff, pymc_eff,
                     alpha=0.7, edgecolors='k', c=merged_data['STD_RSM_PyMC'], cmap='viridis')

ax.set_xlabel('EFF_RSM from Stan Model', fontsize=12)
ax.set_ylabel('EFF_RSM from PyMC Model', fontsize=12)
ax.set_title('Comparison of EFF_RSM between Stan and PyMC Models', fontsize=14)

# y = x reference line spanning the combined range of both estimates
min_value = min(stan_eff.min(), pymc_eff.min())
max_value = max(stan_eff.max(), pymc_eff.max())
ax.plot([min_value, max_value], [min_value, max_value], color='red', linestyle='--', label='Line of Equality')

# Select the disagreeing entries with a vectorised filter, then label them
outliers = merged_data[(pymc_eff - stan_eff).abs() > LABEL_THRESHOLD]
for name, x, y in zip(outliers['ENTRY_NAME'], outliers['ENTRY_EFF'], outliers['EFF_RSM_PyMC']):
    ax.annotate(name, (x, y), xytext=(5, 5), textcoords='offset points', fontsize=8)

ax.grid(True, linestyle=':', alpha=0.7)
ax.legend()
# Add the colorbar before tight_layout so the layout pass accounts for the
# extra axes it creates.
fig.colorbar(scatter, ax=ax, label='STD_RSM_PyMC')
fig.tight_layout()
plt.show()

# Agreement metrics between the two sets of posterior means
correlation = pymc_eff.corr(stan_eff)
print(f"Correlation coefficient between EFF_RSM_PyMC and ENTRY_EFF (Stan): {correlation:.4f}")

mean_abs_diff = (pymc_eff - stan_eff).abs().mean()
print(f"Mean absolute difference between EFF_RSM_PyMC and ENTRY_EFF (Stan): {mean_abs_diff:.4f}")
