In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from lazypredict.Supervised import LazyRegressor

In [2]:
metric_names =  pd.read_json("data/metric_names.json")
train_data = pd.read_json("data/train_data.json")
test_data = pd.read_json("data/test_data.json")
metric_name_embeddings = np.load("data/metric_name_embeddings.npy")
full_text_embeddings_train = np.load("full_text_embeddings_train.npy")

In [3]:
metric_names = metric_names[0].to_list()

metric_idx_map = {name:i for i,name in enumerate(metric_names)}

In [4]:
# def metric_name_to_embed_map(metric_name, metric_idx, metric_embeds):
#     idx = metric_idx[metric_name]
#     embed = np.array([metric_embeds[idx,:]])
#     return embed
    
idx = train_data['metric_name'].map(metric_idx_map)

In [5]:
train_metric_embeds = metric_name_embeddings[idx]

In [6]:
train_metric_embeds.shape

(5000, 768)

In [7]:
full_text_embeddings_train.shape

(5000, 768)

In [8]:
M = train_metric_embeds
T = full_text_embeddings_train  

In [9]:
diff   = M - T
prod   = M * T
adiff  = np.abs(diff)

In [10]:
eps = 1e-8
cos   = (M * T).sum(axis=1, keepdims=True) / (
          (np.linalg.norm(M, axis=1, keepdims=True)+eps) *
          (np.linalg.norm(T, axis=1, keepdims=True)+eps)
       )

In [11]:
l2    = np.linalg.norm(diff, axis=1, keepdims=True)
dot   = (M * T).sum(axis=1, keepdims=True)

In [12]:
X = np.hstack([M, T, adiff, prod, cos, l2, dot]) 
# X = cos

In [13]:
X.shape

(5000, 3075)

In [14]:
y = train_data['score'].astype(float)

In [15]:
bin_edges = [0, 4, 6, 8, 10.0001]
bins = np.digitize(y, bin_edges) - 1

counts = np.bincount(bins, minlength=4).astype(float)
inv_freq = 1.0 / counts[bins]  
w = inv_freq * (len(y) / inv_freq.sum()) 

In [16]:
from sklearn.model_selection import train_test_split
X_tr, X_va, y_tr, y_va, w_tr, w_va, bins_tr, bins_va = train_test_split(
    X, y, w, bins, test_size=0.2, random_state=42, stratify=bins
)


In [17]:
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import math

def eval_model(name, model, use_scaler=False):
    if use_scaler:
        model = make_pipeline(StandardScaler(with_mean=False), model)

    model.fit(X_tr, y_tr, **({"sample_weight": w_tr} if "sample_weight" in model.fit.__code__.co_varnames else {}))
    pred = model.predict(X_va)
    rmse = math.sqrt(mean_squared_error(y_va, pred))
    mae  = mean_absolute_error(y_va, pred)
    r2   = r2_score(y_va, pred)
    print(f"{name:30s}  RMSE={rmse:.3f}  MAE={mae:.3f}  R2={r2:.3f}")
    return pred

# 1) Linear baseline
eval_model("LinearRegression (weighted)", LinearRegression())

# # 2) ElasticNet (scaled)
# eval_model("ElasticNet (scaled, weighted)", ElasticNet(alpha=0.001, l1_ratio=0.2, max_iter=2000), use_scaler=True)

# # 3) Random Forest
# eval_model("RandomForestRegressor (weighted)",
#            RandomForestRegressor(n_estimators=500, max_depth=None, n_jobs=-1, random_state=42))

# # 4) HistGradientBoosting (fast, strong)
# eval_model("HistGradientBoostingRegressor (weighted)",
#            HistGradientBoostingRegressor(max_depth=None, learning_rate=0.08, max_bins=255, random_state=42))

# # 5) SVR RBF (scaled)
# eval_model("SVR RBF (scaled, weighted)", SVR(C=3.0, epsilon=0.1, gamma="scale"), use_scaler=True)

# # 6) MLP (scaled)
# eval_model("MLPRegressor (scaled, weighted)",
#            MLPRegressor(hidden_layer_sizes=(512,256), activation="relu", alpha=1e-4,
#                         batch_size=256, learning_rate_init=1e-3, max_iter=200, random_state=42),
#            use_scaler=True)


LinearRegression (weighted)     RMSE=1.415  MAE=1.052  R2=-1.241


array([ 9.321808 ,  8.869366 , 10.530363 ,  9.18033  ,  8.577536 ,
        8.58201  ,  8.428543 ,  7.6953735,  9.261969 ,  9.129917 ,
       10.085846 ,  8.981546 ,  9.240688 ,  8.249186 ,  7.5736637,
        7.715662 ,  9.899361 ,  9.374287 , 10.318861 ,  9.677103 ,
        8.568287 ,  7.537428 ,  9.533102 , 10.118229 ,  8.625946 ,
       12.502405 , 11.333282 , 11.410433 ,  8.040106 ,  9.014845 ,
        8.904837 ,  8.170691 ,  7.849352 ,  8.117895 ,  9.236612 ,
        7.560663 , 10.205755 ,  7.1390247,  5.218212 ,  8.573536 ,
        8.258514 ,  9.627865 ,  9.394449 ,  7.5009212,  9.918783 ,
        8.853926 ,  8.083349 ,  9.459524 , 10.035448 ,  8.732607 ,
        8.297321 ,  9.602369 ,  7.8452663,  8.647152 ,  9.948008 ,
        9.153061 ,  8.083044 ,  9.600542 ,  7.458454 ,  8.875422 ,
        9.712782 ,  9.179716 ,  9.455381 ,  9.169382 ,  8.765997 ,
       10.715017 ,  9.583601 , 11.454266 , 10.534342 ,  8.609747 ,
        9.877676 ,  9.054316 ,  9.463802 , 10.879103 ,  8.7935

In [18]:
linreg = LinearRegression()
linreg.fit(X_tr, y_tr, sample_weight=w_tr)

In [None]:
# # build the test embeddings and save it.

# from sentence_transformers import SentenceTransformer
# import numpy as np
# from huggingface_hub import login


# with open("info.json", "r") as file:
#     userdata = json.load(file)


# hf_token = userdata["hf_token"]
# login(hf_token)

# model = SentenceTransformer("google/embeddinggemma-300m", token=hf_token,)


# def build_full_text(user_prompt, response, system_prompt):
#     # Convert None to empty string
#     user_prompt = user_prompt if user_prompt is not None else ""
#     response = response if response is not None else ""
#     system_prompt = system_prompt if system_prompt is not None else ""
    
#     # Join with spaces and strip extra whitespace
#     full_text = f"{system_prompt} {user_prompt} {response}".strip()
#     return full_text


# test_data['full_text'] = test_data.apply(
#     lambda row: build_full_text(row['user_prompt'], row['response'], row['system_prompt']),
#     axis=1
# )

In [None]:
# train_data['full_text'] = train_data.apply(
#     lambda row: build_full_text(row['user_prompt'], row['response'], row['system_prompt']),
#     axis=1
# )

In [None]:
# embed_full_text_train = model.encode(train_data['full_text'].fillna("").tolist())
# np.save('full_text_embeddings_train.npy', embed_full_text_train)

In [None]:
# embed_full_text_test = model.encode(test_data['full_text'].fillna("").tolist())
# np.save('full_text_embeddings.npy', embed_full_text_test)

In [20]:
embed_full_text_test = np.load("full_text_embeddings.npy")
embed_full_text_test.shape

(3638, 768)

In [21]:
def build_features(metric_embeds, text_embeds):
    M = metric_embeds
    T = text_embeds
    
    diff = M - T
    adiff = np.abs(diff)
    prod = M * T
    
    eps = 1e-8
    cos = (M * T).sum(axis=1, keepdims=True) / (
        (np.linalg.norm(M, axis=1, keepdims=True) + eps) *
        (np.linalg.norm(T, axis=1, keepdims=True) + eps)
    )
    
    l2 = np.linalg.norm(diff, axis=1, keepdims=True)
    dot = (M * T).sum(axis=1, keepdims=True)
    
    return np.hstack([M, T, adiff, prod, cos, l2, dot])


In [22]:
idx_test = test_data['metric_name'].map(metric_idx_map)

test_metric_embeds = metric_name_embeddings[idx_test]

In [23]:
test_features = build_features(test_metric_embeds, embed_full_text_test)

In [30]:
test_features.shape

(3638, 3075)

In [24]:
test_pred = linreg.predict(test_features)

In [25]:
test_pred.max()

np.float32(15.377399)

In [None]:
test_pred = np.clip(test_pred, 0, 10)

In [27]:
test_data.index +1

RangeIndex(start=1, stop=3639, step=1)

In [None]:
import pandas as pd

submission = pd.DataFrame({
    "id": test_data.index +1,   # whatever ID column exists in test file
    "score": test_pred
})

submission.to_csv("submission_linear_reg_weighting.csv", index=False)
print("Submission saved: submission_linear.csv")


✅ Submission saved: submission_linear.csv


In [28]:
lazyreg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = lazyreg.fit(X_tr, X_va, y_tr, y_va)

  0%|          | 0/42 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.162071 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 699656
[LightGBM] [Info] Number of data points in the train set: 4000, number of used features: 3075
[LightGBM] [Info] Start training from score 9.119375


In [29]:
print(models)

                                  Adjusted R-Squared               R-Squared  \
Model                                                                          
SGDRegressor                  1173810582204484608.00 -2439270038695205376.00   
KernelRidge                                    47.23                  -95.06   
GaussianProcessRegressor                       44.52                  -89.44   
Lars                                            3.72                   -4.65   
LinearRegression                                2.14                   -1.36   
TransformedTargetRegressor                      2.14                   -1.36   
Ridge                                           2.10                   -1.29   
PassiveAggressiveRegressor                      2.00                   -1.08   
RidgeCV                                         1.98                   -1.04   
LinearSVR                                       1.98                   -1.03   
MLPRegressor                            

In [32]:
from sklearn.svm import SVR

svr_model = SVR(kernel='rbf', C=100, epsilon=0.1, gamma='scale')
eval_model("Support Vector Regression (weighted)", svr_model)

Support Vector Regression (weighted)  RMSE=0.996  MAE=0.659  R2=-0.111


array([ 9.53203101,  8.56915413,  9.77052189,  9.90581935,  8.33636424,
        8.93956429,  8.74241497,  9.01324132,  9.25483302,  9.30423862,
        9.29803079,  8.89972698,  9.48532981,  9.17398235,  9.01848221,
        6.92673814,  9.20798773,  8.97974409,  8.52785244,  9.62386443,
        8.51291612,  7.92771514,  9.80223344,  9.6188831 ,  9.50210306,
       11.19682614,  9.45150522,  9.13667806,  8.63916361,  8.84645909,
        9.51249247,  9.24288547,  8.53169629,  9.03218422,  9.15617862,
        8.35117372,  9.35822095,  8.59452698,  7.81746333,  9.52749606,
        8.05943629,  8.96217934,  8.67042849,  8.89692765, 10.07611844,
        8.48009479,  8.90674014,  9.50496166, 10.0062761 ,  9.33194505,
        8.68472085,  9.4366794 ,  8.54721019,  8.6255587 ,  9.53344381,
        9.82677158,  9.05581875,  9.50353565,  7.92201101,  8.76599516,
        9.12125743,  9.27818022,  8.74133669,  9.13680194,  8.95874544,
        9.65145499,  8.92170634,  9.7183812 ,  9.22847209,  9.36

In [34]:
svr_model = SVR(kernel='rbf', C=100, epsilon=0.1, gamma='scale')
svr_model.fit(X_tr, y_tr, sample_weight=w_tr)

In [35]:
import pandas as pd

test_pred_svr = svr_model.predict(test_features)
test_pred_svr = np.clip(test_pred_svr, 0, 10)
submission_svr = pd.DataFrame({
    "id": test_data.index +1,   # whatever ID column exists in test file
    "score": test_pred_svr
})

submission_svr.to_csv("submission_svr_weighting.csv", index=False)
print("Submission saved: submission_linear.csv")


Submission saved: submission_linear.csv


In [36]:
submission_svr.min()

id      1.00
score   5.06
dtype: float64

In [37]:
y.min()

np.float64(0.0)

In [38]:
submission_svr

Unnamed: 0,id,score
0,1,9.49
1,2,8.90
2,3,9.60
3,4,9.45
4,5,8.80
...,...,...
3633,3634,8.34
3634,3635,9.37
3635,3636,9.71
3636,3637,9.49
