In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import torch
import torch.nn.functional as F
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Attach Google Drive to the Colab filesystem at /content/drive so the dataset
# path used by the loading cell resolves. force_remount=True is passed so the
# mount is redone even when a previous mount is still present.
from google.colab import drive
drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Read the preprocessed dataset (CSV stored on the mounted Drive) into `df`.
file_path = "/content/drive/MyDrive/Machine Learning in network science/processed_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# 5% sampling per brand (stratified by `brand_cleaned`).
# GroupBy.sample draws inside each brand directly, so no groupby.apply call is
# needed and the warning that apply raised on this line goes away.
# The draw is still reproducible (random_state=42), but the seed now feeds one
# generator shared by all brands instead of re-seeding every brand with 42, so
# the exact rows selected differ from the apply/lambda version.
# NOTE: this overwrites `df`; re-running the cell without reloading the CSV
# samples 5% of the already-sampled frame.
df = (
    df.groupby('brand_cleaned', group_keys=False)
    .sample(frac=0.05, random_state=42)
    .reset_index(drop=True)
)

  df = df.groupby('brand_cleaned', group_keys=False).apply(


# Random forest

In [None]:
# Regression target and the predictor columns used by the model.
target_column = "price_usd"
feature_columns = [
    "broad_type",
    "product_gender_target",
    "product_category",
    "product_color",
    "brand_cleaned",
    "product_condition",
    "material_cleaned",
    "product_like_count",
]

# Train on log(1 + price) rather than the raw price.
# This rewrites the column inside `df` itself, so executing the cell a second
# time without reloading the data applies the transform twice.
df[target_column] = np.log1p(df[target_column])

# Target vector and feature matrix taken from the (now log-scaled) frame.
y = df[target_column]
X = df[feature_columns]

In [None]:
# Preprocessing: one-hot encode every object-dtype feature column; columns of
# any other dtype are forwarded unchanged through remainder="passthrough".
categorical_cols = list(X.select_dtypes(include="object").columns)

preprocessor = ColumnTransformer(
    [
        # handle_unknown="ignore" is set so that categories absent at fit time
        # do not make transform fail on validation/test rows.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ],
    remainder="passthrough",
)

In [None]:
# Two-stage split into train / validation / test (roughly 70% / 15% / 15%).
# Stage 1: hold out 15% of all rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)
# Stage 2: carve the validation set out of the remaining 85%
# (0.1765 × 0.85 ≈ 0.15 of the full data); what is left is the training set.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1765, random_state=42
)

In [None]:
# Build pipeline and train.
# The configured ColumnTransformer must be bound to `preprocessor`: a bare
# ColumnTransformer(...) expression is built and thrown away, leaving the
# pipeline on the earlier preprocessor and the remainder-format warning active.
preprocessor = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)],
    remainder="passthrough",
    verbose_feature_names_out=False,  # optional, cleaner feature names
    # Option suggested by the warning about how remainder columns are stored.
    # NOTE(review): this flag is tied to specific scikit-learn releases —
    # confirm the installed version still accepts it.
    force_int_remainder_cols=False,
)

# Preprocessing and regressor are chained so that the encoder is fitted on the
# training rows only and re-applied consistently at prediction time.
model = Pipeline([
    ("preprocessing", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42))
])

model.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
# Run the fitted pipeline on the held-out test features. The model was trained
# on the log1p-transformed target, so these predictions are on that log scale.
y_pred = model.predict(X=X_test)

In [None]:
# Custom evaluation (convert back from log-price).
# The target was transformed with log1p before training, so both predictions
# and ground truth are inverted with expm1 to report errors in price units.
# These two names must be defined here; nothing earlier in the notebook
# creates them, so the cell would fail on a fresh kernel otherwise.
y_pred_exp = np.expm1(y_pred)   # NumPy array returned by model.predict
y_test_exp = np.expm1(y_test)   # pandas Series, index preserved

# Convert to float32 tensors for the torch loss functions
# (torch and F come from the import cell at the top of the notebook).
y_pred_exp_tensor = torch.tensor(y_pred_exp, dtype=torch.float32)
y_test_exp_tensor = torch.tensor(y_test_exp.to_numpy(), dtype=torch.float32)

# Compute evaluation metrics; .item() turns the 0-d tensors into Python floats.
mse = F.mse_loss(y_pred_exp_tensor, y_test_exp_tensor).item()
mae = F.l1_loss(y_pred_exp_tensor, y_test_exp_tensor).item()

# R² = 1 - SS_res / SS_tot, computed on the original price scale.
r2_numerator = ((y_test_exp_tensor - y_pred_exp_tensor) ** 2).sum()
r2_denominator = ((y_test_exp_tensor - y_test_exp_tensor.mean()) ** 2).sum()
r2 = (1 - r2_numerator / r2_denominator).item()  # float, like mse and mae

In [None]:
# Report the evaluation metrics, one per line: squared and absolute error with
# two decimals, R² with four.
print(
    f"MSE: {mse:.2f}",
    f"MAE: {mae:.2f}",
    f"R²:  {r2:.4f}",
    sep="\n",
)

MSE: 875084.94
MAE: 250.17
R²:  0.1335
