In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the dataset from a CSV file located relative to the notebook's working directory.
df = pd.read_csv('synthetic_luxury_interior_dataset.csv')

In [3]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,area_sqft,num_rooms,space_type,style,material,add_3d,add_turnkey,add_smart_home,final_cost
0,3319,3,residential,scandinavian,standard,0,0,0,3982800
1,1119,10,residential,modern,custom_wood,0,0,0,1790400
2,1595,5,hospitality,modern,italian_marble,0,1,0,3401649
3,2539,6,residential,japandi,premium_tiles,1,1,1,3697650
4,1336,5,commercial,royal,italian_marble,1,0,1,3296240


In [4]:
# Total number of elements in the frame (rows x columns).
df.size

4500

In [5]:
# (rows, columns) of the raw frame.
df.shape

(500, 9)

In [6]:
# Features: every column except the target.
X = df.drop(columns=["final_cost"])

# Target: log1p(final_cost), so the models below are trained and scored on a log scale.
y = np.log1p(df["final_cost"])

In [7]:
# Preview the feature frame (the target column has been dropped).
X.head()

Unnamed: 0,area_sqft,num_rooms,space_type,style,material,add_3d,add_turnkey,add_smart_home
0,3319,3,residential,scandinavian,standard,0,0,0
1,1119,10,residential,modern,custom_wood,0,0,0
2,1595,5,hospitality,modern,italian_marble,0,1,0
3,2539,6,residential,japandi,premium_tiles,1,1,1
4,1336,5,commercial,royal,italian_marble,1,0,1


In [8]:
# String-valued columns; these are one-hot encoded by the preprocessing step.
categorical_features = [
    "space_type",
    "style",
    "material",
]

# Numeric columns (including the add_* flag columns); these are standardised.
numeric_features = [
    "area_sqft",
    "num_rooms",
    "add_3d",
    "add_turnkey",
    "add_smart_home",
]

In [9]:
# Inspect the categorical feature columns.
X[categorical_features]

Unnamed: 0,space_type,style,material
0,residential,scandinavian,standard
1,residential,modern,custom_wood
2,hospitality,modern,italian_marble
3,residential,japandi,premium_tiles
4,commercial,royal,italian_marble
...,...,...,...
495,residential,royal,italian_marble
496,residential,scandinavian,italian_marble
497,commercial,royal,italian_marble
498,commercial,scandinavian,italian_marble


In [10]:
# Inspect the numeric feature columns.
X[numeric_features]

Unnamed: 0,area_sqft,num_rooms,add_3d,add_turnkey,add_smart_home
0,3319,3,0,0,0
1,1119,10,0,0,0
2,1595,5,0,1,0
3,2539,6,1,1,1
4,1336,5,1,0,1
...,...,...,...,...,...
495,2054,5,0,1,0
496,1068,10,1,0,1
497,1912,6,0,0,1
498,2175,7,0,0,0


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [12]:
# Column-wise preprocessing:
#   - categoricals are one-hot encoded, dropping the first level of each feature
#   - numerics are standardised
# NOTE(review): with drop="first" and handle_unknown="ignore", a category not seen
# during fit is encoded as all zeros, i.e. the same as the dropped first level.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_features),
        ("num", StandardScaler(), numeric_features),
    ]
)

# Baseline: preprocessing followed by ordinary least squares.
model = Pipeline(
    [
        ("preprocess", preprocessor),
        ("regressor", LinearRegression()),
    ]
)

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Fit the preprocessing steps and the regressor on the training split only.
model.fit(X_train, y_train)


In [15]:
# Show the first rows of the feature frame again.
X.head()

Unnamed: 0,area_sqft,num_rooms,space_type,style,material,add_3d,add_turnkey,add_smart_home
0,3319,3,residential,scandinavian,standard,0,0,0
1,1119,10,residential,modern,custom_wood,0,0,0
2,1595,5,hospitality,modern,italian_marble,0,1,0
3,2539,6,residential,japandi,premium_tiles,1,1,1
4,1336,5,commercial,royal,italian_marble,1,0,1


In [16]:
# Score the fitted pipeline on the held-out split.
# Both metrics are on the log1p scale of the target.
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [17]:
# Held-out RMSE and R² from the evaluation above (log1p target scale).
rmse, r2

(np.float64(0.08792924065706104), 0.9620265132900365)

In [18]:
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Same preprocessing as the baseline: one-hot encode categoricals, standardise numerics.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_features),
        ("num", StandardScaler(), numeric_features),
    ]
)

# L2-regularised linear model; this cell only constructs it, it does not fit it.
ridge_model = Pipeline(
    [
        ("preprocess", preprocessor),
        ("regressor", Ridge(alpha=1.0)),
    ]
)

In [19]:
from sklearn.linear_model import Lasso
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Same preprocessing as the baseline: one-hot encode categoricals, standardise numerics.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_features),
        ("num", StandardScaler(), numeric_features),
    ]
)

# L1-regularised linear model; this cell only constructs it, it does not fit it.
lasso_model = Pipeline(
    [
        ("preprocess", preprocessor),
        ("regressor", Lasso(alpha=0.1)),
    ]
)

In [20]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error


# Preprocessing: one-hot encode the categorical columns and standardise the numeric ones.
preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_features),
    ("num", StandardScaler(), numeric_features)
])

# Gradient-boosting pipeline. It is bound to `model` so that later fit/predict
# cells, which all reference `model`, actually use this estimator.
model = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("regressor", GradientBoostingRegressor(
        n_estimators=200,     # number of boosting stages (this estimator has no max_iter)
        learning_rate=0.05,
        max_depth=5,
        random_state=42       # fixed seed for reproducible fits
    ))
])

In [21]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

# Preprocessing: one-hot encode categoricals, standardise numerics.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_features),
        ("num", StandardScaler(), numeric_features),
    ]
)

# Gradient-boosted trees on top of the shared preprocessing.
model = Pipeline(
    [
        ("preprocess", preprocessor),
        (
            "regressor",
            GradientBoostingRegressor(
                n_estimators=200,
                learning_rate=0.05,
                max_depth=5,
                random_state=42,
            ),
        ),
    ]
)


In [26]:
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Preprocessing: one-hot encode categoricals, standardise numerics.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_features),
        ("num", StandardScaler(), numeric_features),
    ]
)

# XGBoost regressor on top of the shared preprocessing; rebinds `model`.
model = Pipeline(
    [
        ("preprocess", preprocessor),
        (
            "regressor",
            XGBRegressor(
                objective="reg:squarederror",
                n_estimators=200,
                learning_rate=0.05,
                max_depth=5,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
            ),
        ),
    ]
)


In [27]:
# Fit the pipeline currently bound to `model` on the training split.
# The target is log1p(final_cost), so the model learns on a log scale.
model.fit(X_train, y_train)

# Predict on the held-out split; predictions are on the same log1p scale as y_test.
y_pred = model.predict(X_test)

# RMSE and R² are reported on the log1p scale the model was trained on.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# MAPE is a relative error, so it is only interpretable on the original cost
# scale: undo the log1p transform with expm1 on both sides before comparing.
mape = mean_absolute_percentage_error(np.expm1(y_test), np.expm1(y_pred))


In [28]:
# Held-out RMSE and R² from the preceding evaluation cell (log1p target scale).
rmse, r2

(np.float64(0.06499629635693469), 0.9792512923839978)

In [29]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import numpy as np

# Preprocessing: one-hot encode categoricals, standardise numerics.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), categorical_features),
        ("num", StandardScaler(), numeric_features),
    ]
)

# Base estimator for the search: preprocessing followed by an XGBoost regressor.
pipeline = Pipeline(
    [
        ("preprocess", preprocessor),
        ("regressor", XGBRegressor(objective="reg:squarederror", random_state=42)),
    ]
)

# Search space. The "regressor__" prefix routes each value to the pipeline's
# "regressor" step. 2 x 2 x 3 x 2 = 24 candidate combinations.
param_grid = dict(
    regressor__n_estimators=[100, 200],
    regressor__learning_rate=[0.05, 0.1],
    regressor__max_depth=[3, 5, 7],
    regressor__subsample=[0.8, 1.0],
)

# Exhaustive 5-fold search scored by (negated) RMSE, using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# Score the search's predictions on the held-out split (log1p target scale).
y_pred = grid_search.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report the winning configuration and its held-out metrics.
print("Best Params:", grid_search.best_params_)
print("RMSE:", rmse)
print("R² Score:", r2)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Params: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 3, 'regressor__n_estimators': 200, 'regressor__subsample': 0.8}
RMSE: 0.04865805170821558
R² Score: 0.9883715116657206


In [30]:
import pickle

# Persist the best pipeline found by the grid search (preprocessing + XGBoost).
# Pickle files can execute code when loaded, so only load this artifact from a trusted source.
with open("xgb_pipeline.pkl", mode="wb") as pkl_file:
    pickle.dump(grid_search.best_estimator_, pkl_file)

