In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

import joblib


In [2]:
from google.colab import drive
drive.mount("/content/drive")
BASE_PATH = "/content/drive/MyDrive/Satellite Imagery Based Property Valuation"



Mounted at /content/drive


## Tabular Model


In [3]:
df = pd.read_csv(f"{BASE_PATH}/train_multimodal_pca.csv", dtype={"id": "string"})
TARGET = "price"

X = df.drop(columns=[TARGET])
y = df[TARGET]

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

In [None]:
img_pca_cols = [c for c in X.columns if c.startswith("img_pca_")]
tabular_cols = [c for c in X.columns if c not in img_pca_cols + ["id"]]

numeric_tab = X[tabular_cols].select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_tab = X[tabular_cols].select_dtypes(include=["object", "category"]).columns.tolist()


In [None]:
tabular_preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_tab),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_tab)
    ]
)


In [None]:
tabular_models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=10.0),
    "Lasso": Lasso(alpha=0.001),
    "RandomForest": RandomForestRegressor(
        n_estimators=300,
        max_depth=20,
        random_state=42,
        n_jobs=-1
    ),
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=4,
        random_state=42
    ),
    "XGBoost": XGBRegressor(
        n_estimators=600,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        random_state=42,
        n_jobs=-1
    )
}


In [None]:
results = []

for name, model in tabular_models.items():
    pipe = Pipeline(steps=[
        ("preprocess", tabular_preprocessor),
        ("model", model)
    ])

    pipe.fit(X_train[tabular_cols], y_train)
    preds = pipe.predict(X_val[tabular_cols])

    results.append({
        "Type": "Tabular",
        "Model": name,
        "RMSE": np.sqrt(mean_squared_error(y_val, preds)),
        "MAE": mean_absolute_error(y_val, preds),
        "R2": r2_score(y_val, preds)
    })


  model = cd_fast.sparse_enet_coordinate_descent(


In [None]:
results_df = pd.DataFrame(results)
results_df.sort_values(["Type", "RMSE"])

Unnamed: 0,Type,Model,RMSE,MAE,R2
5,Tabular,XGBoost,105691.7565,63053.230469,0.908617
4,Tabular,GradientBoosting,114692.877876,69813.168566,0.89239
3,Tabular,RandomForest,119709.961346,69373.057726,0.882769
1,Tabular,Ridge,190439.052755,126733.23341,0.703317
2,Tabular,Lasso,191866.302768,128400.2191,0.698853
0,Tabular,LinearRegression,191867.111203,128405.707391,0.69885


## Multimodal Training

In [None]:
numeric_mm = numeric_tab + img_pca_cols
cat_mm = cat_tab

multimodal_preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_mm),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_mm)
    ]
)


In [None]:
multimodal_models = {
    "RandomForest": RandomForestRegressor(
        n_estimators=400,
        max_depth=25,
        random_state=42,
        n_jobs=-1
    ),
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=500,
        learning_rate=0.03,
        max_depth=4,
        random_state=42
    ),
    "XGBoost": XGBRegressor(
        n_estimators=800,
        learning_rate=0.03,
        max_depth=6,
        subsample=0.85,
        colsample_bytree=0.85,
        reg_alpha=0.1,
        reg_lambda=1.2,
        objective="reg:squarederror",
        random_state=42,
        n_jobs=-1
    )
}


In [None]:
for name, model in multimodal_models.items():
    pipe = Pipeline(steps=[
        ("preprocess", multimodal_preprocessor),
        ("model", model)
    ])

    pipe.fit(X_train.drop(columns=["id"]), y_train)
    preds = pipe.predict(X_val.drop(columns=["id"]))

    results.append({
        "Type": "Multimodal",
        "Model": name,
        "RMSE": np.sqrt(mean_squared_error(y_val, preds)),
        "MAE": mean_absolute_error(y_val, preds),
        "R2": r2_score(y_val, preds)
    })


In [None]:
results_df = pd.DataFrame(results)
results_df.sort_values(["Type", "RMSE"])


Unnamed: 0,Type,Model,RMSE,MAE,R2
8,Multimodal,XGBoost,109395.813302,65800.578125,0.9021
7,Multimodal,GradientBoosting,118642.788692,71827.79055,0.88485
6,Multimodal,RandomForest,127324.660908,74275.886886,0.867381
5,Tabular,XGBoost,105691.7565,63053.230469,0.908617
14,Tabular,XGBoost,105691.7565,63053.230469,0.908617
4,Tabular,GradientBoosting,114692.877876,69813.168566,0.89239
13,Tabular,GradientBoosting,114692.877876,69813.168566,0.89239
3,Tabular,RandomForest,119709.961346,69373.057726,0.882769
12,Tabular,RandomForest,119709.961346,69373.057726,0.882769
1,Tabular,Ridge,190439.052755,126733.23341,0.703317


In [None]:
best_model = results_df.sort_values("R2", ascending=False).iloc[0]
best_model


Unnamed: 0,5
Type,Tabular
Model,XGBoost
RMSE,105691.7565
MAE,63053.230469
R2,0.908617


In [None]:
test_df = pd.read_csv(f"{BASE_PATH}/test_multimodal_pca.csv", dtype={"id": "string"})
X_test = test_df.drop(columns=["id"])
best_model = multimodal_models[BEST_MODEL_NAME]

In [None]:
BEST_PIPELINE = Pipeline(steps=[
    ("preprocess", multimodal_preprocessor),
    ("model", best_model)
])


In [None]:
BEST_PIPELINE.fit(
    X_train.drop(columns=["id"]),
    y_train
)


In [None]:
joblib.dump(BEST_PIPELINE, "best_multimodal_model.pkl")
print("Best multimodal model saved")

Best multimodal model saved


### Predicting test data

In [None]:
test_preds = BEST_PIPELINE.predict(X_test)
submission = pd.DataFrame({
    "id": test_df["id"].values,
    "predicted_price": test_preds
})
submission.to_csv("final_submission.csv", index=False)
print("final_submission.csv saved")

final_submission.csv saved


In [None]:
print(submission.head())
print("Rows:", submission.shape)

           id  predicted_price
0  2591820310     4.098755e+05
1  7974200820     8.708013e+05
2  7701450110     1.296203e+06
3  9522300010     2.178258e+06
4  9510861140     7.286679e+05
Rows: (5396, 2)


In [None]:
DRIVE_PATH = "/content/drive/MyDrive/Satellite Imagery Based Property Valuation/final_submission.csv"
submission.to_csv(DRIVE_PATH, index=False)
print(f"Submission saved to: {DRIVE_PATH}")


Submission saved to: /content/drive/MyDrive/Satellite Imagery Based Property Valuation/final_submission.csv
