In [24]:
import numpy as np
import pandas as pd 
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor 
from sklearn.preprocessing import MinMaxScaler 

In [25]:
# --- Input locations ---------------------------------------------------------
DATA_DIR = "./data"
# Name of the embedding model; only used here to build the embeddings folder name.
MODEL = 'dinov2_vitl14'
TRAIN_DATA_PATH = f"{DATA_DIR}/train.csv"
TEST_DATA_PATH = f"{DATA_DIR}/test.csv"
TRAIN_EMBEDDINGS_PATH = f"{DATA_DIR}/{MODEL}_data/train_embeddings.csv"
TEST_EMBEDDINGS_PATH = f"{DATA_DIR}/{MODEL}_data/test_embeddings.csv"

# Presumably the fraction of labelled rows used for fitting (rest held out) — TODO confirm.
TRAIN_DATA_RATIO = 0.8

# --- Column layout of the train/test CSV files -------------------------------
# Half-open positional ranges [START, END) that preprocessData feeds to `iloc`.
AUX_START = 1
AUX_END = 164  # original note read "#7"; looks like an earlier alternative value — TODO confirm
AUX_CATEGORIES = AUX_END - AUX_START

LABELS_START = 164
# Backward-compatible alias: the constant was originally misspelled and the
# rest of the notebook still refers to it under this name.
LABLES_START = LABELS_START
LABELS_END = 170
LABELS_CATEGORIES = LABELS_END - LABELS_START

In [26]:

def preprocessData(filename):
    """Read a CSV file and split it into ids, auxiliary columns and label columns.

    Parameters
    ----------
    filename : str or file-like
        Anything accepted by ``pd.read_csv``. The file must contain an ``id`` column.

    Returns
    -------
    imageIds : numpy.ndarray
        1-D array holding the ``id`` column, one entry per row.
    auxData : pandas.DataFrame
        Columns at positions ``[AUX_START, AUX_END)``.
    labels : pandas.DataFrame
        Columns at positions ``[LABLES_START, LABELS_END)``. Positional slicing
        past the last column yields an empty frame rather than an error.
    """
    df = pd.read_csv(filename)

    # Take the column as a Series so the result is always 1-D. Squeezing a
    # one-column frame collapses a single-row file to a 0-d array, which
    # cannot be iterated over.
    imageIds = df['id'].to_numpy()
    auxData = df.iloc[:, AUX_START:AUX_END]
    labels = df.iloc[:, LABLES_START:LABELS_END]

    return imageIds, auxData, labels

# Separate scalers for inputs and targets, both mapping into [0, 1].
dataScaler = MinMaxScaler((0,1))
labelScaler = MinMaxScaler((0,1))

train_ids, train_aux, train_labels_us = preprocessData(TRAIN_DATA_PATH)
# Fixed: the test split used to be read from TRAIN_DATA_PATH, which made the
# "test" variables a copy of the training data.
test_ids, test_aux, test_labels = preprocessData(TEST_DATA_PATH)

train_embed = pd.read_csv(TRAIN_EMBEDDINGS_PATH)
test_embed = pd.read_csv(TEST_EMBEDDINGS_PATH)

# Reorder the embedding rows to follow train_ids with one indexed lookup
# instead of scanning the whole table once per id. Mirrors the earlier
# behaviour: the first row wins when an img_id is repeated, and the leading
# column is dropped from the feature matrix.
# An id without an embedding row raises KeyError.
embed_unique = train_embed.drop_duplicates(subset='img_id', keep='first')
embed_lookup = embed_unique.iloc[:, 1:].set_axis(embed_unique['img_id'], axis=0)
train_embed_ii = embed_lookup.loc[train_ids].to_numpy()

# Unscaled design matrix: auxiliary columns followed by the embedding columns.
train_data_us = np.concatenate((train_aux, train_embed_ii), axis=1)

In [27]:
# Learn the per-column min/max from the unscaled arrays, then rescale them.
# NOTE(review): both scalers are fitted on every row before the hold-out split
# made further down, so the held-out rows influence the scaling — confirm this
# is intended.
train_data = dataScaler.fit(train_data_us).transform(train_data_us)
train_labels = labelScaler.fit(train_labels_us).transform(train_labels_us)

In [31]:
# Fixed seed so the hold-out split, and therefore the printed scores, can be
# reproduced after a kernel restart.
RANDOM_STATE = 42

# Split size comes from the config cell instead of a second hard-coded 0.2.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    train_labels,
    train_size=TRAIN_DATA_RATIO,
    random_state=RANDOM_STATE,
)

# Hyperparameter sets to compare: (n_estimators, max_depth, learning_rate).
# Previously tried and since disabled: (350, 6, 0.07), (300, 7, 0.06).
metaparams = [
    (100, 5, 0.1),
    (250, 6, 0.07),
    (350, 8, 0.06),
]

# Fitted models and their hold-out predictions, in the same order as metaparams.
models = []
preds = []

for n_estimators, max_depth, learning_rate in metaparams:

    model = XGBRegressor(
        objective='reg:squarederror',
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        random_state=RANDOM_STATE,
    )

    # Train model
    model.fit(X_train, y_train)
    models.append(model)

    # Score on the held-out rows; the targets here are the scaled labels.
    pred = model.predict(X_test)
    preds.append(pred)
    print(f"e:{n_estimators} d:{max_depth} lr:{learning_rate} r2:{r2_score(y_test, pred)}")

e:350 d:6 lr:0.07 r2:0.3825124280662952
e:300 d:7 lr:0.06 r2:0.38311374045206753
e:350 d:8 lr:0.06 r2:0.3854624573022669
