In [1]:
from collections import OrderedDict
import os
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from tqdm.notebook import tqdm

from eval import get_run_metrics, read_run_dir, get_model_from_run
from plot_utils import basic_plot, collect_results, relevant_model_names
from samplers import get_data_sampler
from tasks import get_task_sampler

%matplotlib inline
%load_ext autoreload
%autoreload 2

sns.set_theme('notebook', 'darkgrid')
palette = sns.color_palette('colorblind')

In [2]:
# --- Select which trained run to inspect ---------------------------------
run_dir = "../models"
task = "polynomial_regression_1dim"
run_id = "pretrained"  # or any run_id you have trained yourself

run_path = os.path.join(run_dir, task, run_id)
df = read_run_dir(run_dir)

# Called for its side effects only; the return value is discarded.
# NOTE(review): the saved output of this cell shows a RuntimeError
# (tensor size 101 vs 0 at dimension 1) with the progress bar stuck at 0/1,
# so evaluation appears to fail somewhere inside project code — TODO investigate.
get_run_metrics(run_path)


def _is_selected_run(r):
    """Return True only for the row that matches the chosen task and run_id."""
    return r.task == task and r.run_id == run_id


metrics = collect_results(run_dir, df, valid_row=_is_selected_run)

# Only the run configuration is needed here, so the first element is ignored.
_, conf = get_model_from_run(run_path, only_conf=True)

# Model names to plot for this task family.
models = relevant_model_names["polynomial_regression"]

print("Available models in metrics['standard']:", metrics["standard"].keys())

# --- Standard evaluation setting ------------------------------------------
basic_plot(metrics["standard"], models=models)
plot_title = f"PolynomialRegression - {run_id}"
plt.title(plot_title)
plt.show()

  0%|          | 0/1 [00:01<?, ?it/s]


RuntimeError: The size of tensor a (101) must match the size of tensor b (0) at non-singleton dimension 1

In [None]:
# Input dimensionality from the run config. It is not read in this cell but is
# kept as a notebook-level variable.
n_dims = conf.model.n_dims

# Draw one figure for every evaluation setting except "standard".
for name, metric in metrics.items():
    if name != "standard":
        # Settings whose name contains "scale" end in "=<factor>"; the plot
        # limits and the trivial baseline are multiplied by factor squared
        # (presumably because squared error grows quadratically — TODO confirm).
        scale = float(name.split("=")[-1])**2 if "scale" in name else 1.0

        trivial_level = 1.0 * scale
        y_bottom, y_top = -.1 * scale, 1.5 * scale

        fig, ax = basic_plot(metric, models=models, trivial=trivial_level)
        ax.set_title(name)
        ax.set_ylim(y_bottom, y_top)
        plt.show()

### Evaluate

In [None]:
from baselines_1dim import (
    zero_estimator,
    mean_estimator,
    linear_regression_estimator,
    polynomial_fit_estimator,
)

# Compare the trained transformer with simple baselines on one freshly sampled
# batch of prompts. This cell builds everything it needs (model, inputs, task
# object) so it works after Restart Kernel -> Run All; previously `model`, `xs`
# and `ys` were never defined, and `task` is the task-name *string*, which has
# no `get_metric()` method.

# Fixed seed so the sampled batch (and therefore the plot) is reproducible.
torch.manual_seed(0)

# Load the trained weights together with the run configuration.
model, conf = get_model_from_run(run_path)

# NOTE(review): the sampler calls below assume the usual interfaces of
# `get_data_sampler(name, n_dims)`, `get_task_sampler(name, n_dims, batch_size,
# **kwargs)`, `sample_xs(b_size=..., n_points=...)` and `task.evaluate(xs)` —
# confirm against samplers.py / tasks.py in this project.
batch_size = conf.training.batch_size
n_points = conf.training.curriculum.points.end

data_sampler = get_data_sampler(conf.training.data, conf.model.n_dims)
task_sampler = get_task_sampler(
    conf.training.task,
    conf.model.n_dims,
    batch_size,
    **conf.training.task_kwargs,
)

# Keep the task object under its own name so the `task` string is not clobbered.
eval_task = task_sampler()
xs = data_sampler.sample_xs(b_size=batch_size, n_points=n_points)
ys = eval_task.evaluate(xs)

# Model predictions (no gradients needed for evaluation).
with torch.no_grad():
    transformer_pred = model(xs, ys)

# Baseline predictions.
zero_pred = zero_estimator(xs, ys, xs)
mean_pred = mean_estimator(xs, ys, xs)
linear_pred = linear_regression_estimator(xs, ys, xs)
oracle_pred = polynomial_fit_estimator(xs, ys, xs, degree=conf.training.task_kwargs["degree"])

# Score every predictor with the task's own metric.
metric = eval_task.get_metric()
loss_transformer = metric(transformer_pred, ys).numpy()
loss_zero = metric(zero_pred, ys).numpy()
loss_mean = metric(mean_pred, ys).numpy()
loss_linear = metric(linear_pred, ys).numpy()
loss_oracle = metric(oracle_pred, ys).numpy()

# Plot the batch-averaged loss of each predictor against the number of
# in-context examples.
losses_by_label = {
    "Transformer": loss_transformer,
    "Zero Estimator": loss_zero,
    "Mean Estimator": loss_mean,
    "Linear Regression": loss_linear,
    "Oracle Poly Fit": loss_oracle,
}

fig, ax = plt.subplots()
for label, loss in losses_by_label.items():
    ax.plot(loss.mean(axis=0), label=label)
ax.set_xlabel("# in-context examples")
ax.set_ylabel("squared error")
ax.set_title("Baselines vs Transformer")
ax.legend()
plt.show()
