In [None]:
# SCRIPT 1: GENERATE PSEUDO-LABELS

import pandas as pd, numpy as np, os, lightgbm as lgb, gc
from sklearn.model_selection import KFold

print("--- STAGE 1: GENERATING PSEUDO-LABELS ---")

# --- 1. LOAD FEATURES ---
# Two parquet files are read from PROCESSED_DIR and joined column-wise by row
# position (both indexes are reset first). This assumes row i of the features
# file describes the same sample as row i of the embeddings file -- TODO
# confirm against the scripts that wrote these files.
PROCESSED_DIR = "../data/processed"
full_df_text_stats = pd.read_parquet(os.path.join(PROCESSED_DIR, "features_v1.parquet"))
text_embeddings_df = pd.read_parquet(os.path.join(PROCESSED_DIR, "text_embeddings_v1.parquet"))

# A positional concat does not fail on a row-count mismatch: it pads the
# shorter frame with NaN rows. Reject that case up front instead of training
# on silently misaligned data.
if len(full_df_text_stats) != len(text_embeddings_df):
    raise ValueError(
        "Row count mismatch between features_v1.parquet "
        f"({len(full_df_text_stats)}) and text_embeddings_v1.parquet "
        f"({len(text_embeddings_df)}); cannot join by position."
    )

full_df = pd.concat(
    [
        full_df_text_stats.reset_index(drop=True),
        text_embeddings_df.reset_index(drop=True),
    ],
    axis=1,
)

# --- 2. DATA PREP ---
# Split the combined frame into train/test rows using the `is_train` flag.
train_df = full_df[full_df['is_train'] == 1].copy()
test_df = full_df[full_df['is_train'] == 0].copy()

# Regression target is the `log_price` column of the training rows.
y = train_df['log_price']

# Identifier, target and split-flag columns are excluded from the model inputs.
features_to_drop = ['sample_id', 'price', 'log_price', 'is_train']
features = [col for col in full_df.columns if col not in features_to_drop]

# Take explicit copies: the columns below are re-assigned in place, and doing
# that on a bare column selection triggers pandas' SettingWithCopyWarning.
X = train_df[features].copy()
X_test = test_df[features].copy()

# Cast the categorical columns to pandas `category` dtype on both matrices.
categorical_features = ['extracted_unit', 'brand']
for col in categorical_features:
    X[col] = X[col].astype('category')
    X_test[col] = X_test[col].astype('category')

# --- 3. TRAIN AND PREDICT ---
# K-fold training: one LightGBM regressor is fitted per fold, early-stopped on
# that fold's held-out rows, and its test-set predictions are averaged into
# `test_predictions_log`.
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
test_predictions_log = np.zeros(len(X_test))

# Hyperparameters shared by every fold; only the seed varies per fold.
_base_lgb_params = {
    'objective': 'regression_l1',
    'metric': 'rmse',
    'n_estimators': 7000,
    'learning_rate': 0.015,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'num_leaves': 61,
    'verbose': -1,
    'n_jobs': -1,
}

for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
    print(f"\n--- Fold {fold+1}/{N_SPLITS} ---")

    # Positional row selection for this fold's train / validation partitions.
    X_train = X.iloc[train_index]
    X_val = X.iloc[val_index]
    y_train = y.iloc[train_index]
    y_val = y.iloc[val_index]

    # A different seed per fold (42, 43, ...).
    lgb_params = {**_base_lgb_params, 'seed': 42 + fold}
    model = lgb.LGBMRegressor(**lgb_params)

    # Stop when the validation metric has not improved for 150 rounds.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(150, verbose=False)],
    )

    # Each fold contributes 1/N_SPLITS of the final averaged prediction.
    test_predictions_log += model.predict(X_test) / N_SPLITS

# --- 4. SAVE PSEUDO-LABELS ---
# Apply expm1 to the averaged predictions before saving. This presumably
# inverts a log1p transform used to build `log_price` -- TODO confirm against
# the feature-engineering script.
pseudo_labels = np.expm1(test_predictions_log)

# Pair each test `sample_id` with its predicted price, keeping the test rows'
# order and index.
pseudo_df = test_df[['sample_id']].assign(price=pseudo_labels)

# Written one directory above the working directory, without the index column.
pseudo_df.to_csv("../pseudo_labels.csv", index=False)
print("\n STAGE 1 COMPLETE: 'pseudo_labels.csv' file created.")

--- STAGE 1: GENERATING PSEUDO-LABELS ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for col in categorical_features: X[col] = X[col].astype('category'); X_test[col] = X_test[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for col in categorical_features: X[col] = X[col].astype('category'); X_test[col] = X_test[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


--- Fold 1/5 ---
