In [22]:
# Cell 1: imports & basic setup

import os
import numpy as np
import pandas as pd

from datasets import load_dataset
from scipy.stats import pearsonr

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.pipeline import Pipeline

# Single seed shared by the notebook: it seeds NumPy's global RNG here and is
# reused as `random_state` by the tree-based estimators further down.
RANDOM_STATE = 42
np.random.seed(seed=RANDOM_STATE)


In [23]:
# Cell 2: load embeddings from disk and labels from STS-B

base_dir = "/scratch/06782/ysu707/PRLE/stsb/llama_3.1_8B_Instr"

X_train = np.load(os.path.join(base_dir, "train_embeds.npy"))
X_val   = np.load(os.path.join(base_dir, "validation_embeds.npy"))
X_test  = np.load(os.path.join(base_dir, "test_embeds.npy"))

# Load STS-B dataset (sentence-transformers/stsb has train/validation/test splits)
dataset = load_dataset("sentence-transformers/stsb")

y_train = np.array(dataset["train"]["score"], dtype=float)
y_val   = np.array(dataset["validation"]["score"], dtype=float)
y_test  = np.array(dataset["test"]["score"], dtype=float)

# Embeddings (from .npy files) and labels (from the hub) are matched purely by
# row position, so fail fast if a cached embedding file does not have the same
# number of rows as its dataset split.
assert len(X_train) == len(y_train), (
    f"train: {len(X_train)} embeddings vs {len(y_train)} labels"
)
assert len(X_val) == len(y_val), (
    f"validation: {len(X_val)} embeddings vs {len(y_val)} labels"
)
assert len(X_test) == len(y_test), (
    f"test: {len(X_test)} embeddings vs {len(y_test)} labels"
)

print("Shapes:")
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_val:  ", X_val.shape,   "y_val:  ", y_val.shape)
print("X_test: ", X_test.shape,  "y_test: ", y_test.shape)


Shapes:
X_train: (5749, 4096) y_train: (5749,)
X_val:   (1500, 4096) y_val:   (1500,)
X_test:  (1379, 4096) y_test:  (1379,)


In [24]:
# Cell 3: stack train on top of validation; the untuned baseline is fit on this
# combined set.

X_train_full = np.concatenate((X_train, X_val))
y_train_full = np.concatenate((y_train, y_val))

print(f"X_train_full: {X_train_full.shape} y_train_full: {y_train_full.shape}")

X_train_full: (7249, 4096) y_train_full: (7249,)


In [25]:
# Cell 4: helpers for evaluation and pretty printing

def eval_on_test(estimator, X_test, y_test):
    """
    Score an already-fitted model on held-out data.

    `estimator` only needs a `.predict(X)` method, so a bare regressor and a
    Pipeline(scaler -> regressor) are handled the same way.

    Returns a tuple (mse, pearson_r, y_pred):
    - mse:       mean squared error between y_test and the predictions
    - pearson_r: Pearson correlation between y_test and the predictions
    - y_pred:    the raw predictions for X_test
    """
    predictions = estimator.predict(X_test)
    test_mse = mean_squared_error(y_test, predictions)
    # pearsonr returns (statistic, p-value); only the statistic is reported
    correlation = pearsonr(y_test, predictions)[0]
    return test_mse, correlation, predictions


def make_predefined_split(X_train, X_val):
    """
    Stack train and validation features and describe that layout to GridSearchCV.

    Returns (X_all, predefined) where:
    - X_all is X_train followed by X_val along axis 0
    - predefined is a PredefinedSplit whose test_fold is -1 for every train
      row (never used for scoring) and 0 for every validation row (the single
      scoring fold)

    Labels are not handled here; the caller must concatenate them in the same
    train-then-validation order.
    """
    # one fold id per row: -1 repeated for the train rows, then 0 for the val rows
    fold_ids = np.repeat([-1, 0], [len(X_train), len(X_val)])
    stacked = np.concatenate((X_train, X_val), axis=0)
    return stacked, PredefinedSplit(test_fold=fold_ids)


In [26]:
# Cell 5: generalized tuner using GridSearchCV with PredefinedSplit

def tune_with_validation_cv(
    X_train, y_train,
    X_val, y_val,
    build_base_estimator_fn,
    param_grid,
    scale=False,
    scoring="neg_mean_squared_error",
    verbose=2,
):
    """
    Grid-search hyperparameters against a fixed train/validation split.

    Parameters
    ----------
    X_train, y_train : features / targets every candidate is fit on.
    X_val, y_val     : features / targets every candidate is scored on.
    build_base_estimator_fn : zero-argument callable returning a new,
        unfitted estimator.
    param_grid : passed to GridSearchCV unchanged.
        - scale=True: the estimator is wrapped in a Pipeline, so keys must be
          written as 'regressor__<paramname>'.
        - scale=False: keys are the estimator's own parameter names.
    scale : if True, tune
        Pipeline([("scaler", StandardScaler()), ("regressor", base_estimator)])
        instead of the bare estimator.
    scoring : scorer name or callable used to SELECT the best candidate.
    verbose : forwarded to GridSearchCV.

    Returns
    -------
    (best_estimator, best_params, best_val_mse)
        - best_estimator: refit by GridSearchCV on ALL rows (train+val) using
          the best parameters.
        - best_params: the winning parameter combination.
        - best_val_mse: validation-fold mean squared error of the winning
          candidate. This is always an MSE, whatever `scoring` was used for
          selection.
    """
    mse_scorer = "neg_mean_squared_error"

    # concat labels the same way make_predefined_split concats features
    y_all = np.concatenate([y_train, y_val], axis=0)
    X_all, predefined_cv = make_predefined_split(X_train, X_val)

    estimator = build_base_estimator_fn()
    if scale:
        estimator = Pipeline([
            ("scaler", StandardScaler()),
            ("regressor", estimator),
        ])

    if isinstance(scoring, str) and scoring == mse_scorer:
        # Selection metric already is neg-MSE: plain single-metric search.
        search_scoring = scoring
        refit = True
        mse_key = "mean_test_score"
    else:
        # Select with the caller's metric but track neg-MSE alongside it, so
        # the returned "validation MSE" is not just a sign-flipped copy of
        # some other score (e.g. R^2).
        search_scoring = {"selection": scoring, "neg_mse": mse_scorer}
        refit = "selection"
        mse_key = "mean_test_neg_mse"

    gscv = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        scoring=search_scoring,
        cv=predefined_cv,     # fit on train rows, score on val rows
        refit=refit,          # after tuning, refit best on ALL (train+val)
        n_jobs=-1,            # parallelize across param combos
        verbose=verbose,
        return_train_score=False,
    )

    gscv.fit(X_all, y_all)

    # best_estimator_ is already refit on train+val
    best_estimator = gscv.best_estimator_
    best_params = gscv.best_params_

    # There is a single validation fold, so the "mean" test score of the
    # winning candidate is exactly its neg-MSE on that fold; flip the sign.
    best_val_mse = -gscv.cv_results_[mse_key][gscv.best_index_]

    print("===== GridSearch Summary =====")
    print("Best params:", best_params)
    if refit is not True:
        # Only shown when selection used something other than neg-MSE.
        print("Best validation selection score:", gscv.best_score_)
    print("Best validation MSE:", best_val_mse)
    print("==============================")

    return best_estimator, best_params, best_val_mse


In [27]:
# Cell 6: accumulator for the final table; every model cell appends one dict
# with the keys "Model", "Test MSE", "Pearson r" and "Chosen Hyperparams".

results = list()


In [28]:
# Cell 7: ordinary least squares baseline. Nothing to tune and no scaling, so
# it is fit directly on train+val and scored once on test.

linreg = LinearRegression().fit(X_train_full, y_train_full)

linreg_test_mse, linreg_test_r, _ = eval_on_test(linreg, X_test, y_test)

print("\n".join([
    "=== Linear Regression ===",
    f"Test MSE:       {linreg_test_mse:.4f}",
    f"Test Pearson r: {linreg_test_r:.4f}",
    "=========================",
]))

# no hyperparameters were searched, hence the literal empty-dict string
results.append({
    "Model": "Linear Regression",
    "Test MSE": linreg_test_mse,
    "Pearson r": linreg_test_r,
    "Chosen Hyperparams": "{}",
})


=== Linear Regression ===
Test MSE:       0.0816
Test Pearson r: 0.5851


In [29]:
# Cell 8: RBF-kernel SVR tuned on standardized features, then scored on test

def build_svr_base():
    """Return a new SVR with default settings (the grid supplies the rest)."""
    return SVR()

# keys carry the 'regressor__' prefix because the search runs on a Pipeline
svr_param_grid = dict(
    regressor__kernel=["rbf"],
    regressor__C=[0.1, 1.0, 10.0, 100.0],
    regressor__epsilon=[0.05, 0.1, 0.2],
    regressor__gamma=["scale", "auto"],
)

svr_best_estimator, svr_best_params, svr_best_val_mse = tune_with_validation_cv(
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    build_base_estimator_fn=build_svr_base,
    param_grid=svr_param_grid,
    scale=True,                          # SVR benefits from scaling
    scoring="neg_mean_squared_error",
    verbose=1,
)

svr_test_mse, svr_test_r, _ = eval_on_test(svr_best_estimator, X_test, y_test)

print("\n".join([
    "=== SVR (RBF) ===",
    f"Best params: {svr_best_params}",
    f"Best Val MSE:   {svr_best_val_mse:.4f}",
    f"Test MSE:       {svr_test_mse:.4f}",
    f"Test Pearson r: {svr_test_r:.4f}",
    "=================",
]))

results.append({
    "Model": "SVR (RBF)",
    "Test MSE": svr_test_mse,
    "Pearson r": svr_test_r,
    "Chosen Hyperparams": f"{svr_best_params}",
})


Fitting 1 folds for each of 24 candidates, totalling 24 fits


===== GridSearch Summary =====
Best params: {'regressor__C': 10.0, 'regressor__epsilon': 0.05, 'regressor__gamma': 'scale', 'regressor__kernel': 'rbf'}
Best validation MSE: 0.051284707497370144
=== SVR (RBF) ===
Best params: {'regressor__C': 10.0, 'regressor__epsilon': 0.05, 'regressor__gamma': 'scale', 'regressor__kernel': 'rbf'}
Best Val MSE:   0.0513
Test MSE:       0.0457
Test Pearson r: 0.7140


In [30]:
# Cell 9: k-nearest-neighbours regression tuned on standardized features

def build_knn_base():
    """Return a new KNeighborsRegressor with default settings."""
    return KNeighborsRegressor()

# keys carry the 'regressor__' prefix because the search runs on a Pipeline
knn_param_grid = dict(
    regressor__n_neighbors=[1, 3, 5, 7, 9, 11, 15, 21, 31, 41, 51],
    regressor__weights=["uniform", "distance"],
)

knn_best_estimator, knn_best_params, knn_best_val_mse = tune_with_validation_cv(
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    build_base_estimator_fn=build_knn_base,
    param_grid=knn_param_grid,
    scale=True,                          # distance-based -> scale
    scoring="neg_mean_squared_error",
    verbose=1,
)

knn_test_mse, knn_test_r, _ = eval_on_test(knn_best_estimator, X_test, y_test)

print("\n".join([
    "=== KNN Regression ===",
    f"Best params: {knn_best_params}",
    f"Best Val MSE:   {knn_best_val_mse:.4f}",
    f"Test MSE:       {knn_test_mse:.4f}",
    f"Test Pearson r: {knn_test_r:.4f}",
    "======================",
]))

results.append({
    "Model": "KNN Regression",
    "Test MSE": knn_test_mse,
    "Pearson r": knn_test_r,
    "Chosen Hyperparams": f"{knn_best_params}",
})


Fitting 1 folds for each of 22 candidates, totalling 22 fits
===== GridSearch Summary =====
Best params: {'regressor__n_neighbors': 9, 'regressor__weights': 'distance'}
Best validation MSE: 0.07754225319273925
=== KNN Regression ===
Best params: {'regressor__n_neighbors': 9, 'regressor__weights': 'distance'}
Best Val MSE:   0.0775
Test MSE:       0.0946
Test Pearson r: 0.3865


In [31]:
# Cell 10: single decision tree, tuned on the raw (unscaled) embeddings

def build_dt_base():
    """Return a new DecisionTreeRegressor seeded with RANDOM_STATE."""
    return DecisionTreeRegressor(random_state=RANDOM_STATE)

# bare parameter names: with scale=False the estimator is searched directly
dt_param_grid = dict(
    max_depth=[None, 3, 5, 7, 10, 15, 25, 40],
    min_samples_leaf=[1, 2, 5, 10, 20],
)

dt_best_estimator, dt_best_params, dt_best_val_mse = tune_with_validation_cv(
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    build_base_estimator_fn=build_dt_base,
    param_grid=dt_param_grid,
    scale=False,
    scoring="neg_mean_squared_error",
    verbose=1,
)

dt_test_mse, dt_test_r, _ = eval_on_test(dt_best_estimator, X_test, y_test)

print("\n".join([
    "=== Decision Tree ===",
    f"Best params: {dt_best_params}",
    f"Best Val MSE:   {dt_best_val_mse:.4f}",
    f"Test MSE:       {dt_test_mse:.4f}",
    f"Test Pearson r: {dt_test_r:.4f}",
    "=====================",
]))

results.append({
    "Model": "Decision Tree",
    "Test MSE": dt_test_mse,
    "Pearson r": dt_test_r,
    "Chosen Hyperparams": f"{dt_best_params}",
})


Fitting 1 folds for each of 40 candidates, totalling 40 fits
===== GridSearch Summary =====
Best params: {'max_depth': 3, 'min_samples_leaf': 1}
Best validation MSE: 0.0877905211354545
=== Decision Tree ===
Best params: {'max_depth': 3, 'min_samples_leaf': 1}
Best Val MSE:   0.0878
Test MSE:       0.0850
Test Pearson r: 0.3043


In [32]:
# Cell 11: random forest, tuned on the raw (unscaled) embeddings

def build_rf_base():
    """Return a new RandomForestRegressor seeded with RANDOM_STATE, n_jobs=-1."""
    return RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1)

# bare parameter names: with scale=False the estimator is searched directly
rf_param_grid = dict(
    n_estimators=[100, 500, 1000],
    max_depth=[None, 10, 20],
    min_samples_leaf=[1, 2, 5],
    max_features=["sqrt", "log2"],
)

rf_best_estimator, rf_best_params, rf_best_val_mse = tune_with_validation_cv(
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    build_base_estimator_fn=build_rf_base,
    param_grid=rf_param_grid,
    scale=False,
    scoring="neg_mean_squared_error",
    verbose=1,
)

rf_test_mse, rf_test_r, _ = eval_on_test(rf_best_estimator, X_test, y_test)

print("\n".join([
    "=== Random Forest ===",
    f"Best params: {rf_best_params}",
    f"Best Val MSE:   {rf_best_val_mse:.4f}",
    f"Test MSE:       {rf_test_mse:.4f}",
    f"Test Pearson r: {rf_test_r:.4f}",
    "====================",
]))

results.append({
    "Model": "Random Forest",
    "Test MSE": rf_test_mse,
    "Pearson r": rf_test_r,
    "Chosen Hyperparams": f"{rf_best_params}",
})


Fitting 1 folds for each of 54 candidates, totalling 54 fits
===== GridSearch Summary =====
Best params: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 500}
Best validation MSE: 0.0761410583493141
=== Random Forest ===
Best params: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 500}
Best Val MSE:   0.0761
Test MSE:       0.0710
Test Pearson r: 0.5168


In [33]:
# Cell 12: Final pretty results table

from tabulate import tabulate

# Build the table in one chain so re-running this cell always starts from
# `results` and gives the same frame.
df_results = (
    pd.DataFrame(results)
    # re-running a model cell appends a second row for that model;
    # keep only the most recent row per model
    .drop_duplicates(subset="Model", keep="last")
    .loc[:, ["Model", "Test MSE", "Pearson r", "Chosen Hyperparams"]]
    .astype({"Test MSE": float, "Pearson r": float})
    .round({"Test MSE": 4, "Pearson r": 4})
    .sort_values(by=["Pearson r", "Test MSE"], ascending=[False, True])
    .reset_index(drop=True)
)

print("=== Summary Table (sorted by Pearson r desc, then MSE asc) ===")
display(df_results)  # in notebook this shows a nice HTML table

print(tabulate(df_results, headers="keys", tablefmt="github", showindex=False))

# optional archive
# (named `results_dir`, not `dir`, so the builtin dir() stays usable)
results_dir = "/work/06782/ysu707/ls6/PRLE/results/stsb/llama_3.1_8B_Instr/wrapperbox"
os.makedirs(results_dir, exist_ok=True)
results_csv = os.path.join(results_dir, "results.csv")
df_results.to_csv(results_csv, index=False)
# report the path actually written
print(f"Saved {results_csv}")


=== Summary Table (sorted by Pearson r desc, then MSE asc) ===


Unnamed: 0,Model,Test MSE,Pearson r,Chosen Hyperparams
0,SVR (RBF),0.0457,0.714,"{'regressor__C': 10.0, 'regressor__epsilon': 0..."
1,Linear Regression,0.0816,0.5851,{}
2,Random Forest,0.071,0.5168,"{'max_depth': 20, 'max_features': 'sqrt', 'min..."
3,KNN Regression,0.0946,0.3865,"{'regressor__n_neighbors': 9, 'regressor__weig..."
4,Decision Tree,0.085,0.3043,"{'max_depth': 3, 'min_samples_leaf': 1}"


| Model             |   Test MSE |   Pearson r | Chosen Hyperparams                                                                                          |
|-------------------|------------|-------------|-------------------------------------------------------------------------------------------------------------|
| SVR (RBF)         |     0.0457 |      0.714  | {'regressor__C': 10.0, 'regressor__epsilon': 0.05, 'regressor__gamma': 'scale', 'regressor__kernel': 'rbf'} |
| Linear Regression |     0.0816 |      0.5851 | {}                                                                                                          |
| Random Forest     |     0.071  |      0.5168 | {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 500}                       |
| KNN Regression    |     0.0946 |      0.3865 | {'regressor__n_neighbors': 9, 'regressor__weights': 'distance'}                                             |
| Decision Tree     |     0.085  |      0.3043