In [None]:
# ==============================================================================
# 1. Import Libraries
# ==============================================================================
import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNetCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor

# ==============================================================================
# 2. Load Data
# ==============================================================================
# Load the training set, falling back to the preprocessed export when the raw
# file is absent. Only the training read is guarded so that a missing test
# file is reported directly instead of triggering the training fallback.
try:
    train_df = pd.read_csv("train.csv")
except FileNotFoundError:
    train_df = pd.read_csv("hotel_data_preprocessed.csv")

# The test set has no alternative source, so it is read exactly once.
test_df = pd.read_csv("test.csv")

# Test identifiers, reused as the 'Id' column of the submission file.
id_test = test_df['Id']
# Reference year from which the age-style features are measured.
current_year = 2025

# ==============================================================================
# 3. Feature Engineering
# ==============================================================================
# Derive the age-style features with the same code path on both frames so the
# train and test columns cannot drift apart.
for frame in (train_df, test_df):
    frame['Age'] = current_year - frame['ConstructionYear']
    # clip(lower=0) floors negative gaps at zero (vectorized; NaN stays NaN).
    frame['YearsSinceRemod'] = (current_year - frame['RenovationYear']).clip(lower=0)

# The models are fit on log1p(HotelValue); predictions are inverted later.
train_df['Log_HotelValue'] = np.log1p(train_df['HotelValue'])
y_train = train_df['Log_HotelValue']

# Identifier, target columns, and raw year columns are excluded from the
# feature matrix.
drop_cols = ['Id', 'HotelValue', 'Log_HotelValue', 'ConstructionYear',
             'RenovationYear', 'YearSold', 'ParkingConstructionYear']

# errors='ignore' lets the same list be applied to both frames: columns that
# are absent (e.g. the target in the test set) are skipped, and a target
# column that does appear in the test frame is never carried into the features.
X_train_raw = train_df.drop(columns=drop_cols, errors='ignore')
X_test_raw = test_df.drop(columns=drop_cols, errors='ignore')

# Stack train on top of test so later transforms and encoding see one frame;
# train_len marks the boundary used to split them apart again.
combined_df = pd.concat([X_train_raw, X_test_raw], axis=0, ignore_index=True)
train_len = len(X_train_raw)

# Apply log(1 + x) to the skewed numeric features that are actually present;
# missing entries are treated as zero before the transform.
skewed_cols = ['LandArea', 'FacadeArea', 'BasementFacilitySF1', 'BasementTotalSF',
               'GroundFloorArea', 'UpperFloorArea', 'ParkingArea']
available_columns = set(combined_df.columns)
for skewed_name in (name for name in skewed_cols if name in available_columns):
    filled_values = combined_df[skewed_name].fillna(0)
    combined_df[skewed_name] = np.log1p(filled_values)

# Interaction features, each built only when both of its inputs exist.
has_quality = 'OverallQuality' in combined_df.columns
if has_quality and 'LandArea' in combined_df.columns:
    # LandArea is in skewed_cols, so this multiplies the log-transformed area.
    combined_df['LogArea_x_Qual'] = combined_df['LandArea'].mul(combined_df['OverallQuality'])
if has_quality and 'Age' in combined_df.columns:
    combined_df['Age_x_Qual'] = combined_df['Age'].mul(combined_df['OverallQuality'])

# Impute missing values using statistics from the training rows only, so the
# fill values are not influenced by the test rows.
numeric_cols = combined_df.select_dtypes(include=np.number).columns
cat_cols = combined_df.select_dtypes(exclude=np.number).columns

# Slice the training rows once instead of re-slicing for every column.
train_rows = combined_df.iloc[:train_len]
train_means = train_rows[numeric_cols].mean()
combined_df[numeric_cols] = combined_df[numeric_cols].fillna(train_means)

for col in cat_cols:
    # mode() is computed a single time per column; it is empty only when the
    # column has no non-null training values, in which case "Missing" is used.
    col_modes = train_rows[col].mode()
    mode_val = "Missing" if col_modes.empty else col_modes[0]
    combined_df[col] = combined_df[col].fillna(mode_val)

# Every categorical NaN has been filled above, so a NaN indicator column
# (dummy_na=True) would be all zeros; it is therefore not requested.
combined_encoded = pd.get_dummies(combined_df, columns=cat_cols, drop_first=True)
# fillna(0) covers numeric columns whose training mean was itself NaN
# (entirely missing in the training rows), which the mean fill cannot repair.
X_train_all = combined_encoded.iloc[:train_len].fillna(0)
X_test_all = combined_encoded.iloc[train_len:].fillna(0)
# Both halves come from the same encoded frame; the align is a safety net that
# keeps the column sets identical.
X_train_all, X_test_all = X_train_all.align(X_test_all, join='inner', axis=1, fill_value=0)

# Fit the scaler on the training rows only, then apply it to both sets.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_all)
X_test_scaled = scaler.transform(X_test_all)

# ==============================================================================
# 4. Train Models
# ==============================================================================

# ElasticNet (Fast & Stable): alpha and l1_ratio are chosen by 5-fold
# shuffled cross-validation over the grids below.
alphas = np.logspace(-5, -2, num=50)
l1_ratios = [0.1, 0.5, 0.9]
cv_folds = KFold(n_splits=5, shuffle=True, random_state=42)

elastic_params = dict(
    alphas=alphas,
    l1_ratio=l1_ratios,
    cv=cv_folds,
    n_jobs=-1,
    max_iter=10000,
    tol=1e-4,
    random_state=42,
)
elastic_model = ElasticNetCV(**elastic_params)
elastic_model.fit(X_train_scaled, y_train)
y_test_pred_elastic = elastic_model.predict(X_test_scaled)

# LightGBM (Fast Gradient Boosting), trained on the same scaled matrix and
# the same log-transformed target.
lgb_params = dict(
    learning_rate=0.02,
    n_estimators=1500,
    max_depth=7,
    num_leaves=40,
    min_child_samples=10,
    random_state=42,
)
lgb_model = LGBMRegressor(**lgb_params)
lgb_model.fit(X_train_scaled, y_train)
y_test_pred_lgb = lgb_model.predict(X_test_scaled)

# ==============================================================================
# 5. Blending (ElasticNet 60% + LightGBM 40%)
# ==============================================================================
ELASTIC_WEIGHT = 0.6
LGB_WEIGHT = 0.4

# Weighted average of the two models' predictions, taken while they are still
# on the log1p scale of the training target.
weighted_elastic = ELASTIC_WEIGHT * y_test_pred_elastic
weighted_lgb = LGB_WEIGHT * y_test_pred_lgb

# expm1 undoes the log1p applied to the target; the result is floored at zero.
y_test_pred_blend = np.maximum(0, np.expm1(weighted_elastic + weighted_lgb))

# ==============================================================================
# 6. Submission
# ==============================================================================
# Pair each test Id with its blended prediction and write the result to CSV
# without the DataFrame index.
submission_columns = {"Id": id_test, "HotelValue": y_test_pred_blend}
submission = pd.DataFrame(submission_columns)

submission.to_csv("submission_fast_blended.csv", index=False)
print("✅ Submission Ready: submission_fast_blended.csv")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002662 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3710
[LightGBM] [Info] Number of data points in the train set: 1200, number of used features: 178
[LightGBM] [Info] Start training from score 12.031590
✅ Submission Ready: submission_fast_blended.csv
